In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [2]:
feature_dict = {i:label for i,label in zip(range(14),
                  ('age',
                  'workclass',
                  'fnlwgt',
                  'education',
                   'education-num',
                   'marital-status',
                   'occupation',
                   'relationship',
                   'race',
                   'sex',
                   'capital-gain',
                   'capital-loss',
                   'hours-per-week',
                   'native-country',
                  ))}
data_frame = pd.read_csv('adult.data',header = None)
data_frame.columns = [l for i,l in sorted(feature_dict.items())] + ['class-label']
data_frame.dropna(how="all", inplace=True) # to drop the empty line at file-end
data_frame.replace('<=50K', 0,regex=True,inplace=True)
data_frame.replace('>50K', 1,regex=True,inplace=True)
data_frame.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class-label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [3]:
x_train=data_frame

x_train = pd.concat([x_train, pd.get_dummies(x_train.workclass,prefix = 'workclass')], axis=1)
x_train.drop(['workclass'],axis = 1,inplace = True)

x_train = pd.concat([x_train, pd.get_dummies(x_train.education,prefix = 'education')], axis=1)
x_train.drop(['education'],axis = 1,inplace = True)

x_train = pd.concat([x_train, pd.get_dummies(x_train['marital-status'],prefix = 'marital-status')], axis=1)
x_train.drop(['marital-status'],axis = 1,inplace = True)

x_train = pd.concat([x_train, pd.get_dummies(x_train.occupation,prefix = 'occupation')], axis=1)
x_train.drop(['occupation'],axis = 1,inplace = True)

x_train = pd.concat([x_train, pd.get_dummies(x_train.relationship,prefix = 'relationship')], axis=1)
x_train.drop(['relationship'],axis = 1,inplace = True)

x_train = pd.concat([x_train, pd.get_dummies(x_train.race,prefix = 'race')], axis=1)
x_train.drop(['race'],axis = 1,inplace = True)

x_train = pd.concat([x_train, pd.get_dummies(x_train.sex,prefix = 'sex')], axis=1)
x_train.drop(['sex'],axis = 1,inplace = True)

x_train = pd.concat([x_train, pd.get_dummies(x_train['native-country'],prefix = 'native-country')], axis=1)
x_train.drop(['native-country'],axis = 1,inplace = True)

x_train.drop(['class-label'],axis = 1,inplace = True)

y_train=data_frame['class-label']
x_train

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32557,40,154374,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32558,58,151910,9,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
32559,22,201490,9,0,0,20,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
dt = DecisionTreeClassifier()
params={
    "max_depth":range(5,15),
    "min_samples_split":range(2,12),
    "min_samples_leaf":range(1,10)}

grid_tree = GridSearchCV(dt, params, cv = 5)
desicion_tree=grid_tree.fit(x_train, y_train)
print(desicion_tree.best_params_)
print(desicion_tree.best_score_)

{'max_depth': 10, 'min_samples_leaf': 9, 'min_samples_split': 4}
0.8577134098840684


In [5]:
rf=RandomForestClassifier()
forest_params = {
    'n_estimators': [50,100,110],
    'max_depth': [15,20,25],
    'max_features' : ['sqrt','log2',None ]}

grid_forest=GridSearchCV(rf, forest_params, cv = 5)
random_forest=grid_forest.fit(x_train, y_train)
print(random_forest.best_params_)
print(random_forest.best_score_)

{'max_depth': 15, 'max_features': None, 'n_estimators': 100}
0.8647463907942949


In [6]:
data_frame_test = pd.read_csv('adult.test',header = None)
data_frame_test.columns = [l for i,l in sorted(feature_dict.items())] + ['class-label']
data_frame_test.dropna(how="all", inplace=True) # to drop the empty line at file-end
data_frame_test.replace('<=50K.', 0,regex=True,inplace=True)
data_frame_test.replace('>50K.', 1,regex=True,inplace=True)
data_frame_test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class-label
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,0
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,0
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,0
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,0


In [7]:
x_test=data_frame_test

x_test = pd.concat([x_test, pd.get_dummies(x_test.workclass,prefix = 'workclass')], axis=1)
x_test.drop(['workclass'],axis = 1,inplace = True)

x_test = pd.concat([x_test, pd.get_dummies(x_test.education,prefix = 'education')], axis=1)
x_test.drop(['education'],axis = 1,inplace = True)

x_test = pd.concat([x_test, pd.get_dummies(x_test['marital-status'],prefix = 'marital-status')], axis=1)
x_test.drop(['marital-status'],axis = 1,inplace = True)

x_test = pd.concat([x_test, pd.get_dummies(x_test.occupation,prefix = 'occupation')], axis=1)
x_test.drop(['occupation'],axis = 1,inplace = True)

x_test = pd.concat([x_test, pd.get_dummies(x_test.relationship,prefix = 'relationship')], axis=1)
x_test.drop(['relationship'],axis = 1,inplace = True)

x_test = pd.concat([x_test, pd.get_dummies(x_test.race,prefix = 'race')], axis=1)
x_test.drop(['race'],axis = 1,inplace = True)

x_test = pd.concat([x_test, pd.get_dummies(x_test.sex,prefix = 'sex')], axis=1)
x_test.drop(['sex'],axis = 1,inplace = True)

x_test = pd.concat([x_test, pd.get_dummies(x_test['native-country'],prefix = 'native-country')], axis=1)
x_test.drop(['native-country'],axis = 1,inplace = True)

x_test.drop(['class-label'],axis = 1,inplace = True)

y_test=data_frame_test['class-label']


In [8]:
different_column = set( x_train.columns ) - set( x_test.columns )
index=x_train.columns.get_indexer(different_column)[0]
different_column=list(different_column)[0]
x_test.insert(loc = index.item(), column = different_column, value = np.zeros(x_test.shape[0]))

In [9]:
predict_dt = desicion_tree.predict(x_test)
accuracy_dt = accuracy_score(y_test, predict_dt)
print(accuracy_dt)
roc_dt = roc_auc_score(y_test, predict_dt)
print(roc_dt)

0.8621706283397825
0.7699582603328259


In [10]:
predict_rf = random_forest.predict(x_test)
accuracy_rf = accuracy_score(y_test, predict_rf)
print(accuracy_rf)
roc_rf = roc_auc_score(y_test, predict_rf)
print(roc_rf)

0.8633376328235366
0.7770977570104011
