In [1]:
# Import libraries
# Common imports
import numpy as np
import pandas as pd

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 


# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

#Data analysing
from sklearn.preprocessing import MinMaxScaler

#Classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from matplotlib.colors import ListedColormap
from sklearn.naive_bayes import  GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [2]:
df_train = pd.read_csv("data_train.csv")
df_test = pd.read_csv("data_test.csv")

In [3]:
pd.set_option('display.max_columns',111)
df_train.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
32556,27,2,257302,6,12,1,10,2,0,1,0,0,38,0,0
32557,40,2,154374,1,9,1,9,1,0,0,0,0,40,0,1
32558,58,2,151910,1,9,6,0,4,0,1,0,0,40,0,0
32559,22,2,201490,1,9,0,0,3,0,0,0,0,20,0,0
32560,52,6,287927,1,9,1,1,2,0,1,15024,0,40,0,1


In [4]:
df = pd.concat([df_train, df_test], ignore_index=True)
print(df['income'].value_counts())
print(df_train['income'].value_counts())

0    37155
1    11687
Name: income, dtype: int64
0    24720
1     7841
Name: income, dtype: int64


In [5]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [6]:
#Drop the redundant columns
df = df.drop('education-num', axis=1)
df_train = df_train.drop('education-num', axis=1)
df_test = df_test.drop('education-num', axis=1)

In [7]:
##Separating categorical and continuous variables
#{cat: <45, cont:>45}
cat=[feature for feature in df.columns if df[feature].nunique()<45]
cont=[feature for feature in df.columns if df[feature].nunique()>45]

In [8]:
df_cat = df[cat].copy()
df_cat.shape

(48842, 9)

In [9]:
df_cont = df[cont].copy()
df_cont.shape

(48842, 5)

## Implementation: Data Exploration

As we can see that the last column from this dataset, 'income', will be our target label (whether an individual makes more than, or at most, $50,000 annually). All other columns are features about each individual in the census database.

A premilinary investigation of the dataset will determine how many individuals fit into either group, and will tell us about the percentage of these individuals making more than \$50,000. In the code cell below, you will need to compute the following:

- The total number of records, 'n_records'
- The number of individuals making more than \$50,000 annually, 'n_greater_50k'.
- The number of individuals making at most \$50,000 annually, 'n_at_most_50k'.
- The percentage of individuals making more than \$50,000 annually, 'greater_percent'.

In [12]:
# DONE: Total number of records
n_records = df['income'].count()

# DONE: Number of records where individual's income is more than $50,000
count = 0
count = [count+1 for income in df['income'] if income == 1]
n_greater_50k = sum(count)
 
# DONE: Number of records where individual's income is at most $50,000
n_at_most_50k = n_records - n_greater_50k

# DONE: Percentage of individuals whose income is more than $50,000
greater_percent = (n_greater_50k*100.0)/n_records

# Print the results
print("Total number of records: {}".format(n_records)) 
print("Individuals making more than $50,000: {}".format(n_greater_50k)) 
print("Individuals making at most $50,000: {}".format(n_at_most_50k)) 
print("Percentage of individuals making more than $50,000: {:.2f}%".format(greater_percent)) 

Total number of records: 48842
Individuals making more than $50,000: 11687
Individuals making at most $50,000: 37155
Percentage of individuals making more than $50,000: 23.93%


#### Take the combined dataframe and split it into train, validation and test set 

In [15]:
# Option2: Run for all dataset (categorical and continuous data) without Validation set
from sklearn.model_selection import train_test_split

X=df.copy()
X.drop('income',axis=1,inplace=True)
y=df['income']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [16]:
# Apply with X_train < X_train_val
from sklearn.ensemble import RandomForestClassifier

ran_forest = RandomForestClassifier(n_estimators=100, random_state=42)
ran_forest.fit(X_train, y_train)

y_pred_rf = ran_forest.predict(X_test)

#for name, score in zip(df.columns, ran_forest.feature_importances_):
#    print(name, score)


In [17]:
score = ran_forest.score(X_train, y_train)
print("Evaluating the model on the training set yields an accuracy of {}%".format(score*100))
score=ran_forest.score(X_test, y_test)
print("Evaluating the model on the testing set yields an accuracy of {:.2f}%".format(score*100))

Evaluating the model on the training set yields an accuracy of 99.9872034397154%
Evaluating the model on the testing set yields an accuracy of 85.81%


In [24]:
# DONE: Import 'GridSearchCV', 'make_scorer', and any other necessary libraries
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# DONE: Initialize the classifier
clf = RandomForestClassifier(random_state = 50)

# Done: Create the parameters list you wish to tune
parameters = {'min_samples_split' : [2,3,4,5],'min_samples_leaf' : [1,5,10,50,100,200,500] , 'n_jobs' : [1,-1]}

#print clf.get_params().keys()
# DONE: Make an fbeta_score scoring object
scorer = make_scorer(fbeta_score, beta=.5,average='micro')

# DONE: Perform grid search on the classifier using 'scorer' as the scoring method
grid_obj = GridSearchCV(clf, parameters,scoring=scorer)

# DONE: Fit the grid search object to the training data and find the optimal parameters
grid_fit = grid_obj.fit(X_train, y_train)

# Get the estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized and model
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions))) 
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5,average='micro'))) 
print("\nOptimized Model\n------") 
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))) 
print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 0.5,average='micro'))) 

NameError: name 'fbeta_score' is not defined

### DecisionTreeClassifier

In [18]:
# Repeat the actions above for different random states
for random_state in range(4):
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=random_state, test_size=0.2)
    classifier = DecisionTreeClassifier(random_state=1)
    classifier.fit(X_train, y_train)
    score=classifier.score(X_test, y_test)
    print("Evaluating the model on the testing set yields an accuracy of {:.2f}% with random state {}".format(score*100, random_state))

Evaluating the model on the testing set yields an accuracy of 81.92% with random state 0
Evaluating the model on the testing set yields an accuracy of 81.21% with random state 1
Evaluating the model on the testing set yields an accuracy of 80.73% with random state 2
Evaluating the model on the testing set yields an accuracy of 80.95% with random state 3


In [19]:
score = classifier.score(X_train, y_train)
print("Evaluating the model on the training set yields an accuracy of {}%".format(score*100))
score=classifier.score(X_test, y_test)
print("Evaluating the model on the testing set yields an accuracy of {:.2f}%".format(score*100))

Evaluating the model on the training set yields an accuracy of 99.99232206382925%
Evaluating the model on the testing set yields an accuracy of 80.95%


In [20]:
from sklearn.model_selection import cross_val_score
classifier = DecisionTreeClassifier(random_state=1)
scores = cross_val_score(classifier, X, y, cv=5) # cv is the number of folds (k)
print(scores)

# It is always a good practice to show the mean AND the standard deviation of the model accuracy
print("Accuracy: {:.2f}% (+/- {:.2f})".format(scores.mean() * 100, scores.std() * 100))

[0.81021599 0.81318456 0.81900082 0.81214169 0.80773956]
Accuracy: 81.25% (+/- 0.38)


### Finding the best hyper-parameters

In [44]:
params={'n_estimators':[100,300,500],
            'criterion':['gini','entropy'],
            'max_depth':[None,1,2,3,4,5],
           'max_features':['int','float','auto','log2']}

In [45]:
from sklearn.model_selection import GridSearchCV, train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=41, test_size=0.2)
ran_forest = RandomForestClassifier(random_state=42)


In [None]:
# Setting up the grid search that will test every combination of parameters
gridsearch = GridSearchCV(RandomForestClassifier(),params,cv=10

# As we are doing cross-validation on the training set, the testing set X_test is untouched
result = gridsearch.fit(X_train, y_train)

In [None]:
print("The best parameters are :", result.best_params_)
print("The best accuracy is {:.2f}%:".format(result.best_score_ * 100))

# We can now use the testing set with the optimal hyper-parameters to get the final generalization accuracy
ran_forest = result.best_estimator_
score = ran_forest.score(X_test, y_test)
print("The generalization accuracy of the model is {:.2f}%".format(score * 100))

In [None]:
def single_grid_search(X_train, y_train):
    """
    Performs a grid search using the training set given.
    """
    # Setting all the parameters we want to test
    params = {
        'max_features' : np.arange(0.1,1,0.1).tolist(), #Number of features to consider as a fraction of all features
        'max_depth': [1,2,4,8, None] # Depth of the tree
    }

    gridsearch = GridSearchCV(estimator = ran_forest,
                            param_grid = params,
                            scoring = 'accuracy', 
                            cv = 5, # Use 5 folds
                            verbose = 0,
                            n_jobs = -1 #Use all but one CPU core
                            )

    # As we are doing cross-validation on the training set, the testing set X_test is untouched    
    return gridsearch.fit(X_train, y_train)

In [None]:
# Redoing the same computation as before, but this time
# using the method we created to show that we have the same results
result = single_grid_search(X_train, y_train)
ran_forest = result.best_estimator_
score = ran_forest.score(X_test, y_test)
print("The generalization accuracy of the model is {:.2f}%".format(score * 100))

In [None]:
#SELECT MODEL

# Now we can create k train-test splits using KFold
from sklearn.model_selection import KFold

# Using KFold instead of calling multiple times train_test_split to ensure that each
# sample goes into a single split only
kf = KFold(n_splits=5, random_state=45, shuffle=True)

split = 0
scores = []
for train_index, test_index in kf.split(X):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    result = single_grid_search(X_train, y_train)
    
    decision_tree = result.best_estimator_
    score = decision_tree.score(X_test, y_test)
    scores.append(score)
    print("### Split {}: Accuracy is {:.2f}% ###".format(split := split + 1, score*100))
    
print("The mean generalization accuracy of the model is {:.2f}% (+/- {:.2f}%)".format(np.mean(scores) * 100, np.std(scores) * 100))

In [None]:
####TRY THIS WHEN HAVE TIME££££

In [12]:
#Function to find the best random state
def random_state(X_int,y_int):
    maxi=0
    model=RandomForestClassifier()
    for ran_st in range(1,201):
        xtrain, xtest, ytrain, ytest=train_test_split(X_int, y_int, test_size=0.20, random_state=ran_st)
        model.fit(xtrain,ytrain)
        p=model.predict(xtest)
        accu=accuracy_score(p,ytest)
        if accu > maxi:
            maxi = accu
            ran_st2=ran_st
    return ran_st2

In [13]:
#To evakuate performances of all the models
def performance(p,ytest,m,xtest,s):
    print('------',m,'------------------------------------')
    print('Accuracy',np.round(accuracy_score(p,ytest),4))
    print('--------------')
    print('Mean of Cross Validation Score',np.round(s.mean(),4))
    print('--------------')
    print('AUC_ROC Score',np.round(roc_auc_score(ytest,m.predict_proba(xtest)[:,1]),4))
    print('--------------')
    print('Confusion Matrix')
    print(confusion_matrix(p,ytest))
    print('--------------')
    print('Classification Report')
    print(classification_report(p,ytest))

In [14]:
#Creating a list of models which will be created one by one
models=[#GaussianNB(),KNeighborsClassifier(),
        #LogisticRegression(),DecisionTreeClassifier(),
        RandomForestClassifier(),#AdaBoostClassifier(),
        GradientBoostingClassifier()]

In [15]:
def create_model(X_int,y_int):
    xtrain,xtest,ytrain,ytest=train_test_split(X_int,y_int,test_size=0.2,random_state=randomstate(X_int,X_int))
    for i in models:
        model=i
        model.fit(xtrain,ytrain)
        p=model.predict(xtest)
        score=cross_val_score(model,X_int,X_int,cv=10)
        performance(p,ytest,model,xtest,score) 

In [None]:
create_model(X_train_val, y_train_val)

### Listing of attributes:

Target: income: >50K, <=50K.

 - age: continuous.
 - workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
 - fnlwgt (final weight): continuous.
 - education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
 - education-num: continuous.
 - marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
 - occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
 - relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
 - race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
 - sex: Female, Male.
 - capital-gain: continuous.
 - capital-loss: continuous.
 - hours-per-week: continuous.
 - native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.