# Dataset 1

## 1) Preparing Data:

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Read the training data and preview its first five rows.
df1 = pd.read_csv(filepath_or_buffer="../Dataset/Dataset1.csv")
df1.head(n=5)

### check for null values

In [None]:
# DataFrame.info() writes its report (dtypes, non-null counts) itself and
# returns None, so it is called directly instead of being wrapped in print(),
# which would append a stray "None" line to the output.
df1.info()

### generalize numerical attributes

In [None]:
# Show the distinct raw values of the two columns before they are bucketed.
print("Age values: ", set(df1["age"].values))
print("Hours-per-week values: ", set(df1["hours-per-week"].values))

# Coarsen each numeric column by floor-dividing it by its bucket width.
bucket_widths = {
    "age": 10,
    "hours-per-week": 10,
    "fnlwgt": 100000,
    "capital-gain": 1000,
    "capital-loss": 1000,
}
for column, width in bucket_widths.items():
    df1[column] = df1[column] // width

### encode categorical attributes

In [None]:
# Categorical columns, listed in the order they appear in the encoder output.
categorical_cols = ["workclass", "marital-status", "occupation",
                    "relationship", "race", "sex", "native-country"]

# The fitted encoder is reused later on a separate dataset. With the default
# settings, transform() raises on any category that was not present during
# fit; here such categories are encoded as -1 instead (this option requires
# scikit-learn >= 0.24). -1 still fits in the int8 columns written below.
ordinal_enc = OrdinalEncoder(handle_unknown="use_encoded_value",
                             unknown_value=-1)
ordinal_vals = ordinal_enc.fit_transform(df1[categorical_cols])

# Replace every categorical column with its integer codes.
for position, column in enumerate(categorical_cols):
    df1[column] = ordinal_vals[:, position].astype('int8')

### drop unwanted attributes

In [None]:
# Remove the 'education' column from the training frame.
df1 = df1.drop(columns=['education'])

In [None]:
# Show the distinct labels currently stored in the target column.
print(set(df1["income"].values))

def replace_income(x):
    """Return the binary income label for one row.

    Parameters
    ----------
    x : row-like object indexable by 'income'.

    Returns
    -------
    int
        0 when the income value is '<=50K', 1 for any other raw value.
        A value that is already 0 or 1 is returned unchanged (as int), so
        re-running this cell does not relabel every encoded 0 as 1.
    """
    income = x['income']
    if income in (0, 1):
        # Already encoded by a previous run of this cell.
        return int(income)
    return 0 if income == '<=50K' else 1

df1['income'] = df1.apply(replace_income, axis=1)

### separate features and target

In [None]:
# Target vector is the 'income' column; the feature matrix is everything else.
y = df1['income']
X = df1.drop(columns=['income'])
df1.head(n=5)


## 2) Classifying the Data

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score 
from sklearn.ensemble import RandomForestClassifier


from sklearn.tree import export_graphviz
from sklearn.datasets import load_wine
from IPython.display import SVG
from graphviz import Source
from IPython.display import display


### split into test and train sets

In [None]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore every accuracy computed from this split, repeatable
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)


### Decision Tree with gini

In [None]:
# Fit a Gini-criterion decision tree on the training split and score it on the
# held-out split. The fixed random_state makes the fitted tree repeatable for
# a given training split.
dtree_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
cls = dtree_gini.fit(X_train,y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Write the fitted tree in Graphviz DOT format. feature_names is passed as a
# plain list of column names rather than a pandas Index.
export_graphviz(dtree_gini, out_file="./01/dtree_gini.dot",
                feature_names=list(X_train.columns),
                filled = True)

### Decision Tree with entropy

In [None]:
# Fit an entropy-criterion decision tree on the training split and score it on
# the held-out split. The fixed random_state makes the fitted tree repeatable
# for a given training split.
dtree_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
cls = dtree_entropy.fit(X_train,y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Write the fitted tree in Graphviz DOT format. feature_names is passed as a
# plain list of column names rather than a pandas Index.
export_graphviz(dtree_entropy, out_file="./01/dtree_entropy.dot",
                feature_names=list(X_train.columns),
                filled = True)


### Random Forest with gini

In [None]:
# Fit a 50-tree Gini-criterion random forest on the training split and score
# it on the held-out split. The fixed random_state makes the bootstrap samples
# and per-split feature choices repeatable for a given training split.
rf_gini = RandomForestClassifier(n_jobs=-1, n_estimators=50, criterion='gini',
                                 random_state=42)
rf_model = rf_gini.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Export only the first tree of the forest in Graphviz DOT format.
# feature_names is passed as a plain list of column names rather than a
# pandas Index.
export_graphviz(rf_gini.estimators_[0], out_file="./01/rf_gini.dot",
                feature_names=list(X_train.columns),
                filled = True)

### Random Forest with entropy

In [None]:
# Fit a 50-tree entropy-criterion random forest on the training split and
# score it on the held-out split. The fixed random_state makes the bootstrap
# samples and per-split feature choices repeatable for a given training split.
rf_entropy = RandomForestClassifier(n_jobs=-1, n_estimators=50,
                                    criterion='entropy', random_state=42)
rf_model = rf_entropy.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Export only the first tree of the forest in Graphviz DOT format.
# feature_names is passed as a plain list of column names rather than a
# pandas Index.
export_graphviz(rf_entropy.estimators_[0], out_file="./01/rf_entropy.dot",
                feature_names=list(X_train.columns),
                filled = True)

### visualize

In [None]:
# In order to see each tree in jupyter notebook
# uncomment following lines and execute them in 
# separate cells

# graph = Source(export_graphviz(dtree_gini, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(dtree_entropy, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))

# random forests have many estimators, so we should traverse them
# or just visualize one of them

# graph = Source(export_graphviz(rf_gini.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(rf_entropy.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


### grid search

In [None]:
# Hyperparameter grid shared by the grid searches: every combination of
# min_samples_split and max_depth is evaluated (None = unlimited depth).
param = dict(
    min_samples_split=[2, 10, 50],
    max_depth=[5, 10, 15, None],
)

grid search on decision tree with gini

In [None]:
# 5-fold grid search over `param` for the Gini decision tree, fitted on the
# full X / y; the five best-scoring settings are displayed.
gs1 = GridSearchCV(estimator=dtree_gini, param_grid=param, cv=5,
                   n_jobs=-1, return_train_score=True)
gs_fit1 = gs1.fit(X, y)
(
    pd.DataFrame(gs_fit1.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(5)
)

grid search on decision tree with entropy

In [None]:
# 5-fold grid search over `param` for the entropy decision tree, fitted on the
# full X / y; the five best-scoring settings are displayed.
gs2 = GridSearchCV(estimator=dtree_entropy, param_grid=param, cv=5,
                   n_jobs=-1, return_train_score=True)
gs_fit2 = gs2.fit(X, y)
(
    pd.DataFrame(gs_fit2.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(5)
)

grid search on random forest with gini

In [None]:
# 5-fold grid search over `param` for the Gini random forest, fitted on the
# full X / y; the five best-scoring settings are displayed.
gs3 = GridSearchCV(estimator=rf_gini, param_grid=param, cv=5,
                   n_jobs=-1, return_train_score=True)
gs_fit3 = gs3.fit(X, y)
(
    pd.DataFrame(gs_fit3.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(5)
)

grid search on random forest with entropy

In [None]:
# 5-fold grid search over `param` for the entropy random forest, fitted on the
# full X / y; the five best-scoring settings are displayed.
gs4 = GridSearchCV(estimator=rf_entropy, param_grid=param, cv=5,
                   n_jobs=-1, return_train_score=True)
gs_fit4 = gs4.fit(X, y)
(
    pd.DataFrame(gs_fit4.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
    .head(5)
)

# Test on Unknown Data

In [None]:
# Empty frame that will collect one prediction column per model.
res1 = pd.DataFrame()
# Data the fitted models will be asked to label.
df1_val = pd.read_csv(filepath_or_buffer="../Dataset/Dataset1_Unknown.csv")


In [None]:
# Coarsen the numeric columns by floor-dividing each by its bucket width.
val_bucket_widths = {
    "age": 10,
    "hours-per-week": 10,
    "fnlwgt": 100000,
    "capital-gain": 1000,
    "capital-loss": 1000,
}
for column, width in val_bucket_widths.items():
    df1_val[column] = df1_val[column] // width

# Encode the categorical columns with the already-fitted encoder
# (transform only, no refit).
val_categorical_cols = ["workclass", "marital-status", "occupation",
                        "relationship", "race", "sex", "native-country"]
ordinal_vals = ordinal_enc.transform(df1_val[val_categorical_cols])
for position, column in enumerate(val_categorical_cols):
    df1_val[column] = ordinal_vals[:, position].astype('int8')

# Remove the 'education' column, then preview the prepared frame.
df1_val = df1_val.drop(columns=['education'])

df1_val.head()

## predict with decision tree using gini

In [None]:
# Refit a Gini decision tree with fixed hyperparameters (presumably taken from
# the grid search results - confirm) and label the unknown data.
# The fixed random_state makes the fitted tree, and so the predictions,
# repeatable for a given training split.
dtree_gini = DecisionTreeClassifier(criterion='gini',
                                   max_depth=10,
                                   min_samples_split=50,
                                   random_state=42)
cls = dtree_gini.fit(X_train,y_train)
# Pass exactly the training feature columns, in training order, so the
# prediction does not depend on the column layout of the unknown CSV.
y_pred = cls.predict(df1_val[X_train.columns])

res1["dtree_gini"] = y_pred

## predict with decision tree using entropy

In [None]:
# Refit an entropy decision tree with fixed hyperparameters (presumably taken
# from the grid search results - confirm) and label the unknown data.
# The fixed random_state makes the fitted tree, and so the predictions,
# repeatable for a given training split.
dtree_entropy = DecisionTreeClassifier(criterion='entropy',
                                   max_depth=10,
                                   min_samples_split=50,
                                   random_state=42)
cls = dtree_entropy.fit(X_train,y_train)
# Pass exactly the training feature columns, in training order, so the
# prediction does not depend on the column layout of the unknown CSV.
y_pred = cls.predict(df1_val[X_train.columns])

res1["dtree_entropy"] = y_pred

## predict with random forest using gini

In [None]:
# Refit a 50-tree Gini random forest with fixed hyperparameters (presumably
# taken from the grid search results - confirm) and label the unknown data.
# The fixed random_state makes the forest, and so the predictions, repeatable
# for a given training split.
rf_gini = RandomForestClassifier(criterion='gini',
                                 n_estimators=50,
                                 max_depth=15,
                                 min_samples_split=10,
                                 random_state=42)
cls = rf_gini.fit(X_train,y_train)
# Pass exactly the training feature columns, in training order, so the
# prediction does not depend on the column layout of the unknown CSV.
y_pred = cls.predict(df1_val[X_train.columns])

res1["rf_gini"] = y_pred

## predict with random forest using entropy

In [None]:
# Refit a 50-tree entropy random forest with fixed hyperparameters (presumably
# taken from the grid search results - confirm) and label the unknown data.
# The fixed random_state makes the forest, and so the predictions, repeatable
# for a given training split.
rf_entropy = RandomForestClassifier(criterion='entropy',
                                 n_estimators=50,
                                 max_depth=15,
                                 min_samples_split=2,
                                 random_state=42)
cls = rf_entropy.fit(X_train,y_train)
# Pass exactly the training feature columns, in training order, so the
# prediction does not depend on the column layout of the unknown CSV.
y_pred = cls.predict(df1_val[X_train.columns])

res1["rf_entropy"] = y_pred

In [None]:
# Persist the per-model prediction columns (row index included) as CSV.
res1.to_csv(path_or_buf="./01/prediction.csv")