# Decision Trees

## 1) Preparing Data:

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Load the training dataset from disk.
df2 = pd.read_csv("./Data/5/Dataset2.csv")
# List the column names, then preview the first rows of the frame.
print(df2.columns)
df2.head()

### check for null values

In [None]:
# DataFrame.info() prints its summary (dtypes and non-null counts) itself and
# returns None, so wrapping it in print() only appended a stray "None" line.
df2.info()

### drop unwanted attributes

In [None]:
# Print the distinct values found in every column.
for column in df2.columns:
    distinct_values = set(df2[column].values)
    print(column, ": ", distinct_values)

# Drop the 'veil-type' column from the frame.
df2 = df2.drop(columns=['veil-type'])

### encode categorical attributes

In [None]:
def replace_poison(x):
    """Map a row's 'poisonous' label to a binary class.

    Returns 1 when ``x['poisonous']`` equals ``'p'`` and 0 for any other value.
    """
    return 1 if x['poisonous'] == 'p' else 0

# Binary target: 1 where the label is 'p', 0 otherwise.  The vectorized
# comparison yields the same values as applying replace_poison to every row,
# without invoking a Python function once per row.
df2['poisonous'] = df2['poisonous'].eq('p').astype('int64')

# Separate the feature matrix X from the target vector y.
X = df2.drop(['poisonous'], axis=1)
y = df2['poisonous']

In [None]:
# Fit an ordinal encoder on all feature columns and convert every category to
# an integer code stored as int8.  The fitted encoder is reused further down
# to encode the unknown dataset.
ordinal_enc = OrdinalEncoder()
ordinal_vals = ordinal_enc.fit_transform(X).astype('int8')

# Wrap the encoded array in a DataFrame carrying the original column names.
X = pd.DataFrame(data=ordinal_vals, columns=X.columns)

## 2) Classifying the Data

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score 

from sklearn.tree import export_graphviz
from sklearn.datasets import load_wine
from IPython.display import SVG
# from graphviz import Source
from IPython.display import display

### split into test and train sets

In [None]:
# Hold out 20% of the samples for testing.  A fixed random_state makes the
# split (and therefore the scores computed from it) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


### Decision Tree with gini

In [None]:
# Train a decision tree using the Gini criterion and report its accuracy on
# the held-out test set.
dtree_gini = DecisionTreeClassifier(criterion='gini')
cls = dtree_gini.fit(X_train, y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

# Write the fitted tree to a Graphviz .dot file with filled nodes.
export_graphviz(
    dtree_gini,
    out_file="./Data/5/dtree_gini.dot",
    feature_names=X_train.columns,
    filled=True,
)

### Decision Tree with entropy

In [None]:
# Train a decision tree using the entropy criterion and report its accuracy
# on the held-out test set.
dtree_entropy = DecisionTreeClassifier(criterion='entropy')
cls = dtree_entropy.fit(X_train, y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

# Write the fitted tree to a Graphviz .dot file with filled nodes.
export_graphviz(
    dtree_entropy,
    out_file="./Data/5/dtree_entropy.dot",
    feature_names=X_train.columns,
    filled=True,
)

### visualize

In [None]:
# To view each tree inside a Jupyter notebook, uncomment the following
# lines (and the `from graphviz import Source` import above) and execute
# each group in a separate cell.

# graph = Source(export_graphviz(dtree_gini, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(dtree_entropy, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))

# random forests have many estimators, so we should traverse them
# or just visualize one of them

# graph = Source(export_graphviz(rf_gini.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(rf_entropy.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# ------------ you can also try this: ------------ #
# from sklearn.externals.six import StringIO  
# from IPython.display import Image  
# from sklearn.tree import export_graphviz
# import pydotplus
# dot_data = StringIO()
# export_graphviz(clf, out_file=dot_data,  
#                 filled=True, rounded=True,
#                 special_characters=True, feature_names = feature_cols,class_names=['0','1'])
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# graph.write_png('diabetes.png')
# ------------------------------------------------ #



### Grid Search 

In [None]:
# Hyper-parameter grid handed to the GridSearchCV runs below.
param = dict(
    min_samples_split=[2, 4, 6, 8],
    max_depth=[5, 10, 15, 25, None],
)

grid search on decision tree with gini

In [None]:
# 5-fold cross-validated grid search over `param` for the Gini tree, run with
# n_jobs=-1 and with training scores recorded alongside the test scores.
gs1 = GridSearchCV(estimator=dtree_gini, param_grid=param, cv=5,
                   n_jobs=-1, return_train_score=True)
gs_fit1 = gs1.fit(X, y)

# Show the five parameter combinations with the highest mean test score.
(pd.DataFrame(gs_fit1.cv_results_)
   .sort_values(by='mean_test_score', ascending=False)
   .head(5))

grid search on decision tree with entropy

In [None]:
# 5-fold cross-validated grid search over `param` for the entropy tree, run
# with n_jobs=-1 and with training scores recorded alongside the test scores.
gs2 = GridSearchCV(estimator=dtree_entropy, param_grid=param, cv=5,
                   n_jobs=-1, return_train_score=True)
gs_fit2 = gs2.fit(X, y)

# Show the five parameter combinations with the highest mean test score.
(pd.DataFrame(gs_fit2.cv_results_)
   .sort_values(by='mean_test_score', ascending=False)
   .head(5))

# Test on Unknown Data

In [None]:
# Load the "unknown" dataset to predict on, and create an empty frame that
# collects one column of predictions per model.
df2_val = pd.read_csv("./Data/5/Dataset2_Unknown.csv")
res2 = pd.DataFrame()


In [None]:
# Drop the same 'veil-type' column that was removed from the training data.
df2_val = df2_val.drop(columns=['veil-type'])

# Encode with the already-fitted encoder (transform only, no refit) and store
# the codes as int8, mirroring the training features.
ordinal_vals = ordinal_enc.transform(df2_val).astype('int8')

# Rebuild the DataFrame with its column names and preview the first rows.
df2_val = pd.DataFrame(data=ordinal_vals, columns=df2_val.columns)
df2_val.head()

## predict with decision tree using gini

In [None]:
# Rebuild the Gini tree with hard-coded hyper-parameters
# (NOTE(review): presumably taken from the grid search results - confirm).
dtree_gini = DecisionTreeClassifier(
    criterion='gini',
    max_depth=25,
    min_samples_split=2,
)

# Train on the training split only; fit() returns the estimator itself.
dtree_gini.fit(X_train, y_train)
cls = dtree_gini

# Predict the unknown samples and store them as the "dtree_gini" column.
y_pred = cls.predict(df2_val)
res2["dtree_gini"] = y_pred

## predict with decision tree using entropy

In [None]:
# Rebuild the entropy tree with hard-coded hyper-parameters
# (NOTE(review): presumably taken from the grid search results - confirm).
dtree_entropy = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=25,
    min_samples_split=4,
)

# Train on the training split only; fit() returns the estimator itself.
dtree_entropy.fit(X_train, y_train)
cls = dtree_entropy

# Predict the unknown samples and store them as the "dtree_entropy" column.
y_pred = cls.predict(df2_val)
res2["dtree_entropy"] = y_pred

In [None]:
# Save the collected predictions to CSV (to_csv also writes the row index by
# default, so the file gets an unnamed leading index column).
res2.to_csv("./Data/5/prediction.csv")