# Dataset 2

## 1) Preparing Data:

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Read Dataset2.csv into a DataFrame, list its column labels,
# then preview the first rows as the cell's rich output.
dataset2_csv = "../Dataset/Dataset2.csv"
df2 = pd.read_csv(dataset2_csv)
print(df2.columns)
df2.head()

Index(['poisonous', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')


Unnamed: 0,poisonous,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
2,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
3,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
4,e,x,y,y,t,a,f,c,b,n,...,s,w,w,p,w,o,p,k,n,g


### check for null values

In [3]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6499 entries, 0 to 6498
Data columns (total 23 columns):
poisonous                   6499 non-null object
cap-shape                   6499 non-null object
cap-surface                 6499 non-null object
cap-color                   6499 non-null object
bruises                     6499 non-null object
odor                        6499 non-null object
gill-attachment             6499 non-null object
gill-spacing                6499 non-null object
gill-size                   6499 non-null object
gill-color                  6499 non-null object
stalk-shape                 6499 non-null object
stalk-root                  6499 non-null object
stalk-surface-above-ring    6499 non-null object
stalk-surface-below-ring    6499 non-null object
stalk-color-above-ring      6499 non-null object
stalk-color-below-ring      6499 non-null object
veil-type                   6499 non-null object
veil-color                  6499 non-null object
ring-number

### drop unwanted attributes

In [4]:
# Show the distinct values of every column. Sorting them keeps the printed
# output stable between runs (iteration order of a set of strings is not).
for column in df2.columns:
    print(column, ": ", sorted(df2[column].unique()))

# 'veil-type' only ever takes the value 'p', so it carries no information.
# errors='ignore' lets this cell be re-run after the column is already gone
# instead of failing with a KeyError.
df2 = df2.drop(columns=['veil-type'], errors='ignore')

poisonous :  {'p', 'e'}
cap-shape :  {'x', 's', 'k', 'f', 'c', 'b'}
cap-surface :  {'s', 'g', 'f', 'y'}
cap-color :  {'p', 'g', 'e', 'n', 'u', 'w', 'c', 'r', 'b', 'y'}
bruises :  {'f', 't'}
odor :  {'p', 'm', 's', 'a', 'n', 'f', 'c', 'l', 'y'}
gill-attachment :  {'f', 'a'}
gill-spacing :  {'w', 'c'}
gill-size :  {'b', 'n'}
gill-color :  {'p', 'g', 'k', 'b', 'n', 'u', 'e', 'o', 'w', 'r', 'h', 'y'}
stalk-shape :  {'t', 'e'}
stalk-root :  {'e', 'r', '?', 'c', 'b'}
stalk-surface-above-ring :  {'s', 'k', 'f', 'y'}
stalk-surface-below-ring :  {'s', 'k', 'f', 'y'}
stalk-color-above-ring :  {'p', 'g', 'n', 'e', 'o', 'w', 'c', 'b', 'y'}
stalk-color-below-ring :  {'p', 'g', 'n', 'e', 'o', 'w', 'c', 'b', 'y'}
veil-type :  {'p'}
veil-color :  {'y', 'o', 'w', 'n'}
ring-number :  {'o', 't', 'n'}
ring-type :  {'p', 'n', 'e', 'f', 'l'}
spore-print-color :  {'k', 'b', 'u', 'n', 'o', 'w', 'r', 'h', 'y'}
population :  {'s', 'a', 'v', 'n', 'c', 'y'}
habitat :  {'m', 'g', 'p', 'd', 'u', 'w', 'l'}


### encode categorical attributes

In [5]:
def replace_poison(x):
    """Encode one row's 'poisonous' label as an integer.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must support ``x['poisonous']``.

    Returns
    -------
    int
        1 when the label equals 'p', 0 for any other value.
    """
    return 1 if x['poisonous'] == 'p' else 0

# Encode the target as integers: 'p' (poisonous) -> 1, anything else -> 0.
# This is a single vectorised comparison instead of a Python call per row.
# The dtype guard keeps the cell safe to re-run: comparing an already-encoded
# integer column with 'p' would otherwise silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df2['poisonous']):
    df2['poisonous'] = (df2['poisonous'] == 'p').astype('int64')

# Feature matrix (every column except the target) and target vector.
X = df2.drop(columns=['poisonous'])
y = df2['poisonous']

In [6]:
# Always encode from the raw categorical columns held in df2 rather than from
# X itself: the original cell overwrote X with its own encoded output, so a
# re-run refitted the encoder on integer codes instead of the category letters.
features_raw = df2.drop(columns=['poisonous'])

ordinal_enc = OrdinalEncoder()
# fit_transform returns floats; the category codes are small, so store them as int8.
ordinal_vals = ordinal_enc.fit_transform(features_raw).astype('int8')

X = pd.DataFrame(ordinal_vals, columns=features_raw.columns)

## 2) Classifying the Data

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score 
from sklearn.ensemble import RandomForestClassifier


from sklearn.tree import export_graphviz
from sklearn.datasets import load_wine
from IPython.display import SVG
from graphviz import Source
from IPython.display import display

### split into test and train sets

In [8]:
# Hold out 20% of the rows for evaluation.
# random_state makes the split (and every score computed from it) repeatable;
# stratify=y keeps the class ratio of the target the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)


### Decision Tree with gini

In [9]:
# Fit a Gini-criterion decision tree on the training split and report its
# accuracy on the held-out split.
# random_state fixes the estimator's internal randomness so a re-run
# rebuilds the same tree.
dtree_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
cls = dtree_gini.fit(X_train, y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Write the fitted tree in Graphviz DOT format.
export_graphviz(dtree_gini, out_file="./02/dtree_gini.dot",
                feature_names=X_train.columns,
                filled=True)

Accuracy: 1.0


### Decision Tree with entropy

In [10]:
# Fit an entropy-criterion decision tree on the training split and report its
# accuracy on the held-out split.
# random_state fixes the estimator's internal randomness so a re-run
# rebuilds the same tree.
dtree_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
cls = dtree_entropy.fit(X_train, y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Write the fitted tree in Graphviz DOT format.
export_graphviz(dtree_entropy, out_file="./02/dtree_entropy.dot",
                feature_names=X_train.columns,
                filled=True)

Accuracy: 1.0


### Random Forest with gini

In [11]:
# Fit a 50-tree Gini random forest on the training split and report its
# accuracy on the held-out split.
# random_state fixes the bootstrap samples and per-split feature sampling,
# so the forest (and estimators_[0] exported below) is the same on every run.
rf_gini = RandomForestClassifier(n_jobs=-1, n_estimators=50,
                                 criterion='gini', random_state=42)

rf_model = rf_gini.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Only the first tree of the forest is exported, as a representative sample.
export_graphviz(rf_gini.estimators_[0], out_file="./02/rf_gini.dot",
                feature_names=X_train.columns,
                filled=True)

Accuracy: 1.0


### Random Forest with entropy

In [12]:
# Fit a 50-tree entropy random forest on the training split and report its
# accuracy on the held-out split.
# random_state fixes the bootstrap samples and per-split feature sampling,
# so the forest (and estimators_[0] exported below) is the same on every run.
rf_entropy = RandomForestClassifier(n_jobs=-1, n_estimators=50,
                                    criterion='entropy', random_state=42)
rf_model = rf_entropy.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Only the first tree of the forest is exported, as a representative sample.
export_graphviz(rf_entropy.estimators_[0], out_file="./02/rf_entropy.dot",
                feature_names=X_train.columns,
                filled=True)

Accuracy: 1.0


### visualize

In [13]:
# In order to see each tree in jupyter notebook
# uncomment following lines and execute them in 
# separate cells

# graph = Source(export_graphviz(dtree_gini, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(dtree_entropy, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))

# random forests have many estimators, so we should traverse them
# or just visualize one of them

# graph = Source(export_graphviz(rf_gini.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(rf_entropy.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))

In [14]:
# Hyper-parameter grid shared by every grid search below:
# minimum samples needed to split a node, and maximum tree depth
# (None leaves the depth unbounded).
param = dict(
    min_samples_split=[2, 4, 6, 8],
    max_depth=[5, 10, 15, 25, None],
)

### Grid search on decision tree with gini

In [15]:
# 5-fold grid search over `param` for the Gini decision tree, run on the full X / y.
gs1 = GridSearchCV(
    dtree_gini,
    param,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gs_fit1 = gs1.fit(X, y)

# Show the five candidates with the highest mean test score.
gs1_results = pd.DataFrame(gs_fit1.cv_results_)
gs1_results.sort_values(by='mean_test_score', ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
12,0.010141,0.000539,0.001691,0.000308,25,2,"{'max_depth': 25, 'min_samples_split': 2}",0.832437,1.0,1.0,...,0.954608,0.065289,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
9,0.009428,0.00116,0.001744,0.000201,15,4,"{'max_depth': 15, 'min_samples_split': 4}",0.832437,1.0,1.0,...,0.948454,0.067714,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,0.009594,0.000678,0.001771,0.000269,10,4,"{'max_depth': 10, 'min_samples_split': 4}",0.832437,1.0,1.0,...,0.947223,0.068455,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,0.011157,0.000767,0.002087,0.000424,5,8,"{'max_depth': 5, 'min_samples_split': 8}",0.832437,1.0,0.955385,...,0.938298,0.063738,4,0.998653,0.998269,0.989421,0.986346,0.997115,0.993961,0.005082
1,0.011179,0.001076,0.002229,0.000277,5,4,"{'max_depth': 5, 'min_samples_split': 4}",0.832437,1.0,0.955385,...,0.938298,0.063738,4,0.998653,0.998269,0.989421,0.986346,0.997115,0.993961,0.005082


### Grid search on decision tree with entropy

In [16]:
# 5-fold grid search over `param` for the entropy decision tree, run on the full X / y.
gs2 = GridSearchCV(
    dtree_entropy,
    param,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gs_fit2 = gs2.fit(X, y)

# Show the five candidates with the highest mean test score.
gs2_results = pd.DataFrame(gs_fit2.cv_results_)
gs2_results.sort_values(by='mean_test_score', ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
13,0.005377,0.000394,0.001036,1.5e-05,25.0,4,"{'max_depth': 25, 'min_samples_split': 4}",0.832437,1.0,1.0,...,0.954608,0.065289,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
19,0.007424,0.002079,0.001332,0.000324,,8,"{'max_depth': None, 'min_samples_split': 8}",0.832437,1.0,1.0,...,0.948454,0.067714,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,0.008006,0.000773,0.001586,0.000292,5.0,8,"{'max_depth': 5, 'min_samples_split': 8}",0.832437,0.991538,0.942308,...,0.935221,0.06104,3,0.998653,0.976919,0.988844,0.988077,0.985962,0.987691,0.006938
5,0.005451,0.00034,0.001046,1.8e-05,10.0,4,"{'max_depth': 10, 'min_samples_split': 4}",0.832437,1.0,1.0,...,0.913833,0.109775,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,0.005597,0.000547,0.001069,5.4e-05,15.0,2,"{'max_depth': 15, 'min_samples_split': 2}",0.832437,1.0,1.0,...,0.913833,0.109775,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0


### Grid search on random forest with gini

In [17]:
# 5-fold grid search over `param` for the Gini random forest, run on the full X / y.
gs3 = GridSearchCV(
    rf_gini,
    param,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gs_fit3 = gs3.fit(X, y)

# Show the five candidates with the highest mean test score.
gs3_results = pd.DataFrame(gs_fit3.cv_results_)
gs3_results.sort_values(by='mean_test_score', ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
17,0.126273,0.00315,0.110111,0.008311,,4,"{'max_depth': None, 'min_samples_split': 4}",0.847041,1.0,0.993846,...,0.905678,0.123773,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,0.249666,0.006407,0.11164,0.001339,5.0,6,"{'max_depth': 5, 'min_samples_split': 6}",0.847041,0.997692,0.959231,...,0.902447,0.111871,2,0.989419,0.991152,0.998846,0.988654,0.999808,0.993576,0.004775
19,0.124897,0.002938,0.109117,0.010219,,8,"{'max_depth': None, 'min_samples_split': 8}",0.847041,1.0,0.989231,...,0.899831,0.131899,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
10,0.123311,0.004034,0.103546,0.000376,15.0,6,"{'max_depth': 15, 'min_samples_split': 6}",0.847041,1.0,0.986154,...,0.897523,0.134538,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,0.15222,0.042898,0.108824,0.006992,10.0,6,"{'max_depth': 10, 'min_samples_split': 6}",0.847041,1.0,0.986154,...,0.896446,0.136488,5,1.0,1.0,1.0,1.0,1.0,1.0,0.0


### Grid search on random forest with entropy

In [18]:
# 5-fold grid search over `param` for the entropy random forest, run on the full X / y.
gs4 = GridSearchCV(
    rf_entropy,
    param,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
gs_fit4 = gs4.fit(X, y)

# Show the five candidates with the highest mean test score.
gs4_results = pd.DataFrame(gs_fit4.cv_results_)
gs4_results.sort_values(by='mean_test_score', ascending=False).head(5)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
18,0.127208,0.003025,0.113708,0.010005,,6,"{'max_depth': None, 'min_samples_split': 6}",0.847041,1.0,0.998462,...,0.905524,0.126344,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
13,0.177176,0.036644,0.103692,0.000533,25.0,4,"{'max_depth': 25, 'min_samples_split': 4}",0.847041,1.0,0.992308,...,0.904755,0.124642,2,1.0,1.0,1.0,1.0,1.0,1.0,0.0
16,0.130056,0.005746,0.117439,0.009073,,2,"{'max_depth': None, 'min_samples_split': 2}",0.847041,1.0,0.992308,...,0.903062,0.127645,3,1.0,1.0,1.0,1.0,1.0,1.0,0.0
15,0.130484,0.009171,0.111416,0.009915,25.0,8,"{'max_depth': 25, 'min_samples_split': 8}",0.847041,1.0,0.998462,...,0.897369,0.140974,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,0.209265,0.041511,0.108659,0.00704,10.0,4,"{'max_depth': 10, 'min_samples_split': 4}",0.847041,1.0,0.989231,...,0.896907,0.137177,5,1.0,1.0,1.0,1.0,1.0,1.0,0.0


# Test on Unknown Data

In [19]:
# Read the unlabeled samples to predict on, and create the (initially empty)
# frame that will collect one prediction column per model.
unknown_csv = "../Dataset/Dataset2_Unknown.csv"
df2_val = pd.read_csv(unknown_csv)
res2 = pd.DataFrame()


In [20]:
# Select exactly the feature columns the encoder was fitted on, in the same
# order. This drops 'veil-type' (and any other extra column) and raises a clear
# KeyError if an expected column is missing, instead of relying on the CSV
# happening to share the training layout.
val_features = df2_val[list(X.columns)]

# Reuse the encoder fitted on the training features so category codes match.
ordinal_vals = ordinal_enc.transform(val_features).astype('int8')

df2_val = pd.DataFrame(ordinal_vals, columns=val_features.columns)
df2_val.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,0,8,0,1,1,0,1,9,0,...,2,2,7,7,2,1,4,3,4,0
1,5,3,3,0,2,1,0,0,2,0,...,1,1,0,6,2,1,2,1,4,4
2,5,0,9,0,2,1,0,0,3,0,...,1,1,6,0,2,1,2,1,5,4
3,0,0,3,0,5,1,1,0,2,0,...,1,2,7,7,2,2,4,7,2,1
4,5,0,2,1,5,1,0,0,7,1,...,2,2,6,6,2,1,4,3,5,0


## predict with decision tree using gini

In [21]:
# Rebuild the Gini decision tree with the hyper-parameters chosen by its grid
# search (gs_fit1) instead of values copied by hand from a previous output,
# which go stale whenever the search is re-run. random_state keeps the
# fitted tree, and therefore the predictions, repeatable.
dtree_gini = DecisionTreeClassifier(criterion='gini',
                                    random_state=42,
                                    **gs_fit1.best_params_)
cls = dtree_gini.fit(X_train, y_train)
y_pred = cls.predict(df2_val)

res2["dtree_gini"] = y_pred

## predict with decision tree using entropy

In [22]:
# Rebuild the entropy decision tree with the hyper-parameters chosen by its
# grid search (gs_fit2) instead of values copied by hand from a previous
# output, which go stale whenever the search is re-run. random_state keeps
# the fitted tree, and therefore the predictions, repeatable.
dtree_entropy = DecisionTreeClassifier(criterion='entropy',
                                       random_state=42,
                                       **gs_fit2.best_params_)
cls = dtree_entropy.fit(X_train, y_train)
y_pred = cls.predict(df2_val)

res2["dtree_entropy"] = y_pred

## predict with random forest using gini

In [23]:
# Rebuild the Gini random forest with the hyper-parameters chosen by its grid
# search (gs_fit3) instead of values copied by hand from a previous output,
# which go stale whenever the search is re-run. random_state keeps the
# fitted forest, and therefore the predictions, repeatable.
rf_gini = RandomForestClassifier(criterion='gini',
                                 n_estimators=50,
                                 random_state=42,
                                 **gs_fit3.best_params_)
cls = rf_gini.fit(X_train, y_train)
y_pred = cls.predict(df2_val)

res2["rf_gini"] = y_pred

## predict with random forest using entropy

In [24]:
# Rebuild the entropy random forest with the hyper-parameters chosen by its
# grid search (gs_fit4) instead of values copied by hand from a previous
# output, which go stale whenever the search is re-run. random_state keeps
# the fitted forest, and therefore the predictions, repeatable.
rf_entropy = RandomForestClassifier(criterion='entropy',
                                    n_estimators=50,
                                    random_state=42,
                                    **gs_fit4.best_params_)
cls = rf_entropy.fit(X_train, y_train)
y_pred = cls.predict(df2_val)

res2["rf_entropy"] = y_pred

In [25]:
# Write the collected predictions (one column per model) to CSV.
predictions_csv = "./02/prediction.csv"
res2.to_csv(predictions_csv)