# Dataset 1

## 1) Preparing Data:

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Load the labelled training set; the path is relative to the notebook's folder.
# NOTE(review): the preview shows an "Unnamed: 0" column, presumably a row index
# that was saved into the CSV. It is kept as a model feature downstream --
# confirm that this is intended.
df1 = pd.read_csv(filepath_or_buffer="../Dataset/Dataset1.csv")
# Preview the first five rows.
df1.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


### check for null values

In [None]:
# DataFrame.info() writes its report (dtypes and non-null counts per column)
# itself and returns None, so it is called bare: wrapping it in print() only
# appends a stray "None" line to the output.
df1.info()

### generalize numerical attributes

In [3]:
# Show the raw value ranges before they are coarsened.
print("Age values: ",set(df1["age"].values))
print("Hours-per-week values: ",set(df1["hours-per-week"].values))

# Bucket each numeric attribute by integer-dividing it by a fixed bin width
# (e.g. age 39 -> 3). Vectorised floor division replaces the per-element
# .map(lambda ...) calls and keeps all widths in one place.
# The columns are overwritten in place, so running this cell twice would bin
# the already-binned values again.
bin_widths = {
    "age": 10,
    "hours-per-week": 10,
    "fnlwgt": 100000,
    "capital-gain": 1000,
    "capital-loss": 1000,
}
for column, width in bin_widths.items():
    df1[column] = df1[column] // width

Age values:  {17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 87, 88, 90}
Hours-per-week values:  {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, 72, 73, 75, 76, 77, 78, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 98, 99}


### encode categorical attributes

In [4]:
# Columns holding string categories, in the order the encoder sees them.
categorical_cols = ["workclass", "marital-status", "occupation",
                    "relationship", "race", "sex", "native-country"]

# Fit one encoder on all categorical columns; `ordinal_enc` is kept so the
# same category-to-integer mapping can be re-applied to other data later.
ordinal_enc = OrdinalEncoder()
ordinal_vals = ordinal_enc.fit_transform(df1[categorical_cols])

# Write each encoded column back over its original, stored as int8.
# NOTE(review): int8 assumes fewer than 128 categories per column -- confirm.
for position, col_name in enumerate(categorical_cols):
    df1[col_name] = ordinal_vals[:, position].astype('int8')

### drop unwanted attributes

In [5]:
# Drop the textual 'education' column; the numeric 'education-num' column stays.
# NOTE(review): presumably the two carry the same information -- confirm.
df1 = df1.drop(columns=['education'])

In [6]:
# Show the distinct target labels before they are recoded to integers.
income_labels = set(df1["income"].to_numpy())
print(income_labels)

def replace_income(x):
    """Return the binary target for one row.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row that exposes an 'income' entry.

    Returns
    -------
    int
        0 when the income label is exactly '<=50K', 1 for any other value.
    """
    return 0 if x['income'] == '<=50K' else 1

# Recode the target to 0 ('<=50K') / 1 (anything else) -- the same mapping as
# replace_income, but as one vectorised comparison instead of a Python call
# per row via DataFrame.apply(axis=1).
# The column is overwritten in place: run this cell once per load, since a
# second run would compare the integers against the string label.
df1['income'] = df1['income'].ne('<=50K').astype('int64')

{'<=50K', '>50K'}


### separate features and target

In [7]:
# Target vector: the 0/1 income label.
y = df1['income']
# Feature matrix: every remaining column.
X = df1.drop(columns=['income'])
# Preview the fully numeric frame.
df1.head()


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,3,7,0,13,4,1,1,4,1,2,0,4,39,0
1,5,6,0,13,2,4,0,4,1,0,0,1,39,0
2,5,4,2,7,2,6,0,2,1,0,0,4,39,0
3,2,4,3,13,2,10,5,2,0,0,0,4,5,0
4,3,4,2,14,2,4,5,4,0,0,0,4,39,0


## 2) Classifying the Data

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score 
from sklearn.ensemble import RandomForestClassifier


from sklearn.tree import export_graphviz
from sklearn.datasets import load_wine
from IPython.display import SVG
from graphviz import Source
from IPython.display import display


### split into test and train sets

In [9]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every accuracy reported below, repeatable across
# "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)


### Decision Tree with gini

In [10]:
# Fit a decision tree with the Gini criterion and report hold-out accuracy.
# random_state fixes the estimator's internal randomness so the score is
# repeatable from run to run.
dtree_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
cls = dtree_gini.fit(X_train,y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Create the output folder if needed so the export cannot fail on a fresh
# checkout, and pass the feature names as a plain list rather than a pandas
# Index.
os.makedirs("./01", exist_ok=True)
export_graphviz(dtree_gini, out_file="./01/dtree_gini.dot",
                feature_names=list(X_train.columns),
                filled = True)

Accuracy: 0.8111324376199616


### Decision Tree with entropy

In [11]:
# Fit a decision tree with the entropy criterion and report hold-out accuracy.
# random_state fixes the estimator's internal randomness so the score is
# repeatable from run to run.
dtree_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
cls = dtree_entropy.fit(X_train,y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Create the output folder if needed so the export cannot fail on a fresh
# checkout, and pass the feature names as a plain list rather than a pandas
# Index.
os.makedirs("./01", exist_ok=True)
export_graphviz(dtree_entropy, out_file="./01/dtree_entropy.dot",
                feature_names=list(X_train.columns),
                filled = True)


Accuracy: 0.8159309021113244


### Random Forest with gini

In [12]:
# Fit a 50-tree random forest with the Gini criterion and report hold-out
# accuracy. random_state fixes the forest's internal randomness so the score
# is repeatable from run to run.
rf_gini = RandomForestClassifier(n_jobs=-1, n_estimators=50, criterion='gini',
                                 random_state=42)
rf_model = rf_gini.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Only the first tree of the forest is exported. Create the output folder if
# needed and pass the feature names as a plain list rather than a pandas Index.
os.makedirs("./01", exist_ok=True)
export_graphviz(rf_gini.estimators_[0], out_file="./01/rf_gini.dot",
                feature_names=list(X_train.columns),
                filled = True)

Accuracy: 0.8397312859884837


### Random Forest with entropy

In [13]:
# Fit a 50-tree random forest with the entropy criterion and report hold-out
# accuracy. random_state fixes the forest's internal randomness so the score
# is repeatable from run to run.
rf_entropy = RandomForestClassifier(n_jobs=-1, n_estimators=50, criterion='entropy',
                                    random_state=42)
rf_model = rf_entropy.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Only the first tree of the forest is exported. Create the output folder if
# needed and pass the feature names as a plain list rather than a pandas Index.
os.makedirs("./01", exist_ok=True)
export_graphviz(rf_entropy.estimators_[0], out_file="./01/rf_entropy.dot",
                feature_names=list(X_train.columns),
                filled = True)

Accuracy: 0.8406909788867563


### visualize

In [14]:
# In order to see each tree in jupyter notebook
# uncomment following lines and execute them in 
# separate cells

# graph = Source(export_graphviz(dtree_gini, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(dtree_entropy, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))

# random forests have many estimators, so we should either traverse them
# or just visualize one of them

# graph = Source(export_graphviz(rf_gini.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(rf_entropy.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


### grid search

In [15]:
# Hyper-parameter grid shared by all four grid searches below:
# 3 values of min_samples_split x 4 values of max_depth (None = unlimited depth).
param = dict(
    min_samples_split=[2, 10, 50],
    max_depth=[5, 10, 15, None],
)

grid search on decision tree with gini

In [16]:
# 5-fold cross-validated search over `param` for the Gini decision tree,
# run on the full labelled data (X, y).
gs1 = GridSearchCV(estimator=dtree_gini, param_grid=param, cv=5,
                   n_jobs=-1, return_train_score=True)
gs_fit1 = gs1.fit(X, y)

# Show the five parameter combinations with the best mean validation score.
cv_table1 = pd.DataFrame(gs_fit1.cv_results_)
cv_table1.sort_values(by='mean_test_score', ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
5,0.066794,0.004964,0.003101,0.000113,10,50,"{'max_depth': 10, 'min_samples_split': 50}",0.836852,0.846449,0.848752,...,0.846904,0.005499,1,0.861606,0.85959,0.857479,0.855799,0.858061,0.858507,0.001968
3,0.065288,0.000626,0.003263,0.000379,10,2,"{'max_depth': 10, 'min_samples_split': 2}",0.836084,0.844338,0.848369,...,0.845714,0.005341,2,0.8677,0.865493,0.864053,0.861798,0.863052,0.864419,0.002039
4,0.065988,0.003255,0.003156,0.000189,10,10,"{'max_depth': 10, 'min_samples_split': 10}",0.836852,0.843954,0.848177,...,0.845637,0.004875,3,0.865397,0.863429,0.86103,0.859254,0.860605,0.861943,0.00219
8,0.096029,0.00896,0.003556,0.000104,15,50,"{'max_depth': 15, 'min_samples_split': 50}",0.830518,0.845681,0.847025,...,0.844639,0.007258,4,0.870675,0.869619,0.870723,0.868324,0.870345,0.869937,0.000898
0,0.037278,0.0034,0.00356,0.000763,5,2,"{'max_depth': 5, 'min_samples_split': 2}",0.838196,0.843378,0.841267,...,0.843142,0.003912,5,0.845194,0.84409,0.845002,0.842747,0.84477,0.844361,0.000889


grid search on decision tree with entropy

In [17]:
# 5-fold cross-validated search over `param` for the entropy decision tree,
# run on the full labelled data (X, y).
gs2 = GridSearchCV(estimator=dtree_entropy, param_grid=param, cv=5,
                   n_jobs=-1, return_train_score=True)
gs_fit2 = gs2.fit(X, y)

# Show the five parameter combinations with the best mean validation score.
cv_table2 = pd.DataFrame(gs_fit2.cv_results_)
cv_table2.sort_values(by='mean_test_score', ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
5,0.054635,0.002416,0.002822,7.7e-05,10,50,"{'max_depth': 10, 'min_samples_split': 50}",0.836852,0.845681,0.850288,...,0.847672,0.006098,1,0.86055,0.856663,0.858486,0.855751,0.856382,0.857567,0.001747
3,0.05764,0.007337,0.002707,0.000318,10,2,"{'max_depth': 10, 'min_samples_split': 2}",0.835509,0.841843,0.848944,...,0.845599,0.006243,2,0.865637,0.862757,0.863237,0.85983,0.860509,0.862394,0.002073
4,0.049793,0.009795,0.002557,0.000543,10,10,"{'max_depth': 10, 'min_samples_split': 10}",0.835893,0.841459,0.849328,...,0.84556,0.006046,3,0.864149,0.860598,0.86127,0.858007,0.858781,0.860561,0.002148
8,0.068305,0.003241,0.003008,8.6e-05,15,50,"{'max_depth': 15, 'min_samples_split': 50}",0.830902,0.844146,0.849136,...,0.845484,0.00786,4,0.870723,0.868948,0.870723,0.866644,0.869482,0.869304,0.001501
2,0.039144,0.000947,0.002691,0.000236,5,50,"{'max_depth': 5, 'min_samples_split': 50}",0.83858,0.843378,0.841459,...,0.843295,0.003794,5,0.845194,0.843898,0.844474,0.842603,0.844434,0.844121,0.000864


grid search on random forest with gini

In [18]:
# 5-fold cross-validated search over `param` for the Gini random forest,
# run on the full labelled data (X, y).
gs3 = GridSearchCV(estimator=rf_gini, param_grid=param, cv=5,
                   n_jobs=-1, return_train_score=True)
gs_fit3 = gs3.fit(X, y)

# Show the five parameter combinations with the best mean validation score.
cv_table3 = pd.DataFrame(gs_fit3.cv_results_)
cv_table3.sort_values(by='mean_test_score', ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
7,0.757338,0.045023,0.137074,0.041696,15.0,10,"{'max_depth': 15, 'min_samples_split': 10}",0.846641,0.856814,0.855854,...,0.855925,0.004966,1,0.888574,0.886607,0.884975,0.883968,0.884693,0.885763,0.00165
6,0.70013,0.06085,0.162465,0.067344,15.0,2,"{'max_depth': 15, 'min_samples_split': 2}",0.845681,0.854319,0.856622,...,0.855158,0.005293,2,0.906665,0.905466,0.902634,0.902682,0.903743,0.904238,0.00159
11,0.676512,0.050685,0.149302,0.040302,,50,"{'max_depth': None, 'min_samples_split': 50}",0.847601,0.855278,0.852975,...,0.854966,0.004456,3,0.878545,0.879361,0.878257,0.877873,0.877687,0.878345,0.000589
8,0.687335,0.094663,0.17594,0.048819,15.0,50,"{'max_depth': 15, 'min_samples_split': 50}",0.845106,0.854894,0.854127,...,0.854697,0.005527,4,0.872691,0.870147,0.868948,0.870195,0.869242,0.870245,0.001318
10,0.807523,0.089279,0.127599,0.012619,,10,"{'max_depth': None, 'min_samples_split': 10}",0.848177,0.853743,0.848944,...,0.852777,0.004063,5,0.914967,0.913671,0.914631,0.912424,0.912572,0.913653,0.001036


grid search on random forest with entropy

In [19]:
# 5-fold cross-validated search over `param` for the entropy random forest,
# run on the full labelled data (X, y).
gs4 = GridSearchCV(estimator=rf_entropy, param_grid=param, cv=5,
                   n_jobs=-1, return_train_score=True)
gs_fit4 = gs4.fit(X, y)

# Show the five parameter combinations with the best mean validation score.
cv_table4 = pd.DataFrame(gs_fit4.cv_results_)
cv_table4.sort_values(by='mean_test_score', ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
6,0.672978,0.075479,0.141316,0.011818,15.0,2,"{'max_depth': 15, 'min_samples_split': 2}",0.847217,0.857198,0.85739,...,0.855733,0.004289,1,0.900955,0.901291,0.901147,0.899371,0.901919,0.900937,0.000847
7,0.77151,0.047699,0.196963,0.089327,15.0,10,"{'max_depth': 15, 'min_samples_split': 10}",0.846641,0.855086,0.85547,...,0.855618,0.005196,2,0.885407,0.884831,0.884399,0.882912,0.884501,0.88441,0.000828
11,0.661512,0.123361,0.1117,0.009651,,50,"{'max_depth': None, 'min_samples_split': 50}",0.847601,0.851248,0.856046,...,0.854543,0.004488,3,0.879793,0.8808,0.878545,0.878065,0.878215,0.879084,0.001052
8,0.762356,0.101459,0.186077,0.042309,15.0,50,"{'max_depth': 15, 'min_samples_split': 50}",0.845489,0.855278,0.852015,...,0.854121,0.00517,4,0.870771,0.869236,0.868708,0.86914,0.870202,0.869611,0.000758
10,0.966572,0.079334,0.209258,0.079508,,10,"{'max_depth': None, 'min_samples_split': 10}",0.847985,0.851248,0.848944,...,0.853085,0.004694,5,0.915687,0.915543,0.916071,0.915111,0.913388,0.91516,0.000938


# Test on Unknown Data

In [20]:
# Empty frame that collects one prediction column per model.
res1 = pd.DataFrame()
# Unlabelled rows to predict; the path is relative to the notebook's folder.
df1_val = pd.read_csv(filepath_or_buffer="../Dataset/Dataset1_Unknown.csv")


In [21]:
# Apply to the unknown data exactly the preprocessing used on the training
# frame, so its columns line up with what the models were fitted on.

# 1) Bucket the numeric attributes with the same bin widths. Vectorised floor
#    division replaces the per-element .map(lambda ...) calls.
#    The columns are overwritten in place, so run this cell once per load.
val_bin_widths = {
    "age": 10,
    "hours-per-week": 10,
    "fnlwgt": 100000,
    "capital-gain": 1000,
    "capital-loss": 1000,
}
for column, width in val_bin_widths.items():
    df1_val[column] = df1_val[column] // width

# 2) Encode the categorical attributes with the encoder fitted on the training
#    data (transform, not fit_transform) so the integer codes mean the same.
val_categorical_cols = ["workclass", "marital-status", "occupation",
                        "relationship", "race", "sex", "native-country"]
ordinal_vals = ordinal_enc.transform(df1_val[val_categorical_cols])
for position, column in enumerate(val_categorical_cols):
    df1_val[column] = ordinal_vals[:, position].astype('int8')

# 3) Drop the textual 'education' column, as was done for the training frame.
df1_val = df1_val.drop(['education'], axis=1)

df1_val.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,2,4,0,13,4,13,1,4,0,0,0,4,39
1,1,0,2,9,4,0,4,4,0,0,0,2,39
2,4,6,1,4,2,8,0,4,1,0,0,7,22
3,6,5,1,13,6,8,4,4,0,0,0,3,39
4,4,4,3,10,0,1,4,4,0,0,0,4,39


## predict with decision tree using gini

In [22]:
# Rebuild the Gini tree with the top-ranked grid-search setting shown above
# (max_depth=10, min_samples_split=50). random_state makes the fitted tree,
# and therefore the saved predictions, repeatable.
# NOTE(review): the model is fitted on X_train only, not on the full labelled
# set X -- confirm that this is intended for the final predictions.
dtree_gini = DecisionTreeClassifier(criterion='gini',
                                   max_depth=10,
                                   min_samples_split=50,
                                   random_state=42)
cls = dtree_gini.fit(X_train,y_train)
y_pred = cls.predict(df1_val)

# One column of predictions per model.
res1["dtree_gini"] = y_pred

## predict with decision tree using entropy

In [23]:
# Rebuild the entropy tree with the top-ranked grid-search setting shown above
# (max_depth=10, min_samples_split=50). random_state makes the fitted tree,
# and therefore the saved predictions, repeatable.
# NOTE(review): the model is fitted on X_train only, not on the full labelled
# set X -- confirm that this is intended for the final predictions.
dtree_entropy = DecisionTreeClassifier(criterion='entropy',
                                   max_depth=10,
                                   min_samples_split=50,
                                   random_state=42)
cls = dtree_entropy.fit(X_train,y_train)
y_pred = cls.predict(df1_val)

# One column of predictions per model.
res1["dtree_entropy"] = y_pred

## predict with random forest using gini

In [24]:
# Rebuild the Gini forest with the top-ranked grid-search setting shown above
# (max_depth=15, min_samples_split=10). random_state makes the fitted forest,
# and therefore the saved predictions, repeatable; n_jobs=-1 matches the
# earlier forest cells and only affects speed.
# NOTE(review): the model is fitted on X_train only, not on the full labelled
# set X -- confirm that this is intended for the final predictions.
rf_gini = RandomForestClassifier(criterion='gini',
                                 n_estimators=50,
                                 max_depth=15,
                                 min_samples_split=10,
                                 n_jobs=-1,
                                 random_state=42)
cls = rf_gini.fit(X_train,y_train)
y_pred = cls.predict(df1_val)

# One column of predictions per model.
res1["rf_gini"] = y_pred

## predict with random forest using entropy

In [25]:
# Rebuild the entropy forest with the top-ranked grid-search setting shown
# above (max_depth=15, min_samples_split=2). random_state makes the fitted
# forest, and therefore the saved predictions, repeatable; n_jobs=-1 matches
# the earlier forest cells and only affects speed.
# NOTE(review): the model is fitted on X_train only, not on the full labelled
# set X -- confirm that this is intended for the final predictions.
rf_entropy = RandomForestClassifier(criterion='entropy',
                                 n_estimators=50,
                                 max_depth=15,
                                 min_samples_split=2,
                                 n_jobs=-1,
                                 random_state=42)
cls = rf_entropy.fit(X_train,y_train)
y_pred = cls.predict(df1_val)

# One column of predictions per model.
res1["rf_entropy"] = y_pred

In [26]:
# Write all model predictions (one column per model) to disk. The output
# folder is created first so the save cannot fail on a fresh checkout.
os.makedirs("./01", exist_ok=True)
res1.to_csv("./01/prediction.csv")