# Dataset 3

## 1) Preparing Data:

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Read the Dataset 3 training table from disk and preview its first five rows.
df3 = pd.read_csv(filepath_or_buffer="../Dataset/Dataset3.csv")
df3.head(n=5)

### check for null values

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare: wrapping it in print() would append a stray "None" line.
# The per-column non-null counts in the report reveal any missing values.
df3.info()

### generalize numerical attributes

In [None]:
# List the distinct ages, then collapse every age to its decade (age // 10).
print("age: ", set(df3["age"].values))
df3["age"] = df3["age"].floordiv(10)

In [None]:
# Show the distinct trestbps values before they are binned.
print("trestbps: ", sorted(set(df3["trestbps"].values)))
# Bin index = offset from the column minimum, floor-divided by the column's
# standard deviation; the result is then stored as int8.
df3['trestbps'] = df3['trestbps'].sub(min(df3['trestbps'])).floordiv(np.std(df3['trestbps']))
df3['trestbps'] = df3['trestbps'].astype(np.int8)

In [None]:
# Show the distinct chol values before they are binned.
print("chol: ", sorted(set(df3["chol"].values)))
# Bin index = offset from the column minimum, floor-divided by the column's
# standard deviation; the result is then stored as int8.
df3['chol'] = df3['chol'].sub(min(df3['chol'])).floordiv(np.std(df3['chol']))
df3['chol'] = df3['chol'].astype(np.int8)

In [None]:
# Show the distinct thalach values before they are binned.
print("thalach: ", sorted(set(df3["thalach"].values)))
# Bin index = offset from the column minimum, floor-divided by the column's
# standard deviation; the result is then stored as int8.
df3['thalach'] = df3['thalach'].sub(min(df3['thalach'])).floordiv(np.std(df3['thalach']))
df3['thalach'] = df3['thalach'].astype(np.int8)

In [None]:
# Show the distinct oldpeak values before they are binned.
print("oldpeak: ", sorted(set(df3["oldpeak"].values)))
# Scale by 10 first, then bin exactly like the other numeric columns:
# offset from the (scaled) minimum, floor-divided by the (scaled) standard
# deviation, and finally stored as int8.
df3['oldpeak'] = df3['oldpeak'].mul(10)
df3['oldpeak'] = df3['oldpeak'].sub(min(df3['oldpeak'])).floordiv(np.std(df3['oldpeak']))
df3['oldpeak'] = df3['oldpeak'].astype(np.int8)

### clean data

In [None]:
# The 'disease' column is the target; every other column is a feature.
y = df3['disease']
X = df3.drop(columns=['disease'])
# Preview the processed frame (first five rows).
df3.head(n=5)


## 2) Classifying the Data

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score 
from sklearn.ensemble import RandomForestClassifier


from sklearn.tree import export_graphviz
from sklearn.datasets import load_wine
from IPython.display import SVG
from graphviz import Source
from IPython.display import display

### split into test and train sets

In [None]:
# Hold out 20% of the rows for testing. The split is seeded so that the same
# rows land in each partition on every run; without a seed the partitions
# (and every accuracy computed from them) change each time the cell runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


### Decision Tree with gini

In [None]:
# Fit a gini-criterion decision tree on the training split and report its
# accuracy on the held-out split.
# random_state fixes the order in which features are considered at each
# split, so ties between equally good splits resolve the same way on every
# run and the fitted tree is reproducible.
dtree_gini = DecisionTreeClassifier(criterion='gini', random_state=42)
cls = dtree_gini.fit(X_train, y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Write the fitted tree in Graphviz .dot format with filled (coloured) nodes.
# NOTE(review): assumes the ./03 directory already exists — confirm.
export_graphviz(dtree_gini, out_file="./03/dtree_gini.dot",
                feature_names=X_train.columns,
                filled=True)

### Decision Tree with entropy

In [None]:
# Fit an entropy-criterion decision tree on the training split and report
# its accuracy on the held-out split.
# random_state fixes the order in which features are considered at each
# split, so ties between equally good splits resolve the same way on every
# run and the fitted tree is reproducible.
dtree_entropy = DecisionTreeClassifier(criterion='entropy', random_state=42)
cls = dtree_entropy.fit(X_train, y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Write the fitted tree in Graphviz .dot format with filled (coloured) nodes.
# NOTE(review): assumes the ./03 directory already exists — confirm.
export_graphviz(dtree_entropy, out_file="./03/dtree_entropy.dot",
                feature_names=X_train.columns,
                filled=True)

### Random Forest with gini

In [None]:
# Fit a 50-tree gini random forest on the training split (using all CPU
# cores) and report its accuracy on the held-out split.
# random_state seeds the bootstrap sampling and per-split feature sampling,
# so the forest, its accuracy and the exported tree are reproducible.
rf_gini = RandomForestClassifier(n_jobs=-1, n_estimators=50,
                                 criterion='gini', random_state=42)
rf_model = rf_gini.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# A forest has many trees; only the first estimator is exported.
# NOTE(review): assumes the ./03 directory already exists — confirm.
export_graphviz(rf_gini.estimators_[0], out_file="./03/rf_gini.dot",
                feature_names=X_train.columns,
                filled=True)

### Random Forest with entropy

In [None]:
# Fit a 50-tree entropy random forest on the training split (using all CPU
# cores) and report its accuracy on the held-out split.
# random_state seeds the bootstrap sampling and per-split feature sampling,
# so the forest, its accuracy and the exported tree are reproducible.
rf_entropy = RandomForestClassifier(n_jobs=-1, n_estimators=50,
                                    criterion='entropy', random_state=42)
rf_model = rf_entropy.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# A forest has many trees; only the first estimator is exported.
# NOTE(review): assumes the ./03 directory already exists — confirm.
export_graphviz(rf_entropy.estimators_[0], out_file="./03/rf_entropy.dot",
                feature_names=X_train.columns,
                filled=True)

### visualize

In [None]:
# In order to see each tree in jupyter notebook
# uncomment following lines and execute them in 
# separate cells

# graph = Source(export_graphviz(dtree_gini, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(dtree_entropy, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))

# random forests have many estimators, so we should traverse them
# or just visualize one of them

# graph = Source(export_graphviz(rf_gini.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(rf_entropy.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))

### grid search

In [None]:
# Hyper-parameter grid: every combination of the minimum number of samples
# required to split a node and the maximum tree depth (None = unlimited).
param = dict(
    min_samples_split=[2, 5, 10],
    max_depth=[5, 10, 15, None],
)

grid search on decision tree with gini

In [None]:
# 5-fold cross-validated grid search over `param` for the gini decision tree,
# run on all CPU cores and keeping the training scores as well.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
gs1 = GridSearchCV(dtree_gini, param, cv=5,
                   n_jobs=-1,
                   return_train_score=True)
gs_fit1 = gs1.fit(X, y)
# Five best parameter combinations, ranked by mean cross-validated test score.
pd.DataFrame(gs_fit1.cv_results_).sort_values('mean_test_score',
                                             ascending=False).head(5)

grid search on decision tree with entropy

In [None]:
# 5-fold cross-validated grid search over `param` for the entropy decision
# tree, run on all CPU cores and keeping the training scores as well.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
gs2 = GridSearchCV(dtree_entropy, param, cv=5,
                   n_jobs=-1,
                   return_train_score=True)
gs_fit2 = gs2.fit(X, y)
# Five best parameter combinations, ranked by mean cross-validated test score.
pd.DataFrame(gs_fit2.cv_results_).sort_values('mean_test_score',
                                             ascending=False).head(5)

grid search on random forest with gini

In [None]:
# 5-fold cross-validated grid search over `param` for the gini random forest,
# run on all CPU cores and keeping the training scores as well.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
gs3 = GridSearchCV(rf_gini, param, cv=5,
                   n_jobs=-1,
                   return_train_score=True)
gs_fit3 = gs3.fit(X, y)
# Five best parameter combinations, ranked by mean cross-validated test score.
pd.DataFrame(gs_fit3.cv_results_).sort_values('mean_test_score',
                                             ascending=False).head(5)

grid search on random forest with entropy

In [None]:
# 5-fold cross-validated grid search over `param` for the entropy random
# forest, run on all CPU cores and keeping the training scores as well.
# The `iid` argument is intentionally not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises TypeError.
gs4 = GridSearchCV(rf_entropy, param, cv=5,
                   n_jobs=-1,
                   return_train_score=True)
gs_fit4 = gs4.fit(X, y)
# Five best parameter combinations, ranked by mean cross-validated test score.
pd.DataFrame(gs_fit4.cv_results_).sort_values('mean_test_score',
                                             ascending=False).head(5)

# Test on Unknown Data

In [None]:
# Load the rows to predict on (presumably unlabeled, judging by the file
# name — confirm) and create an empty frame that will collect one column of
# predictions per model.
df3_val = pd.read_csv(filepath_or_buffer="../Dataset/Dataset3_Unknown.csv")
res3 = pd.DataFrame(data=None)


In [None]:
def _bin_like_training(values, reference):
    """Discretise ``values`` with the offset and bin width of ``reference``.

    Parameters
    ----------
    values : pandas.Series
        Column to discretise.
    reference : pandas.Series
        Raw (un-binned) training column. Its minimum is the offset and its
        standard deviation, as returned by ``np.std``, is the bin width.

    Returns
    -------
    pandas.Series
        ``(values - min(reference)) // np.std(reference)`` cast to int8.
    """
    return ((values - min(reference)) // np.std(reference)).astype('int8')


# The bin offsets and widths must come from the *raw* training columns, but
# df3 has already been overwritten with binned values by the cells above, so
# the raw training table is read again. Deriving the statistics here replaces
# hand-copied, rounded constants and guarantees the unknown rows are binned
# with exactly the formula the training columns were binned with.
df3_train_raw = pd.read_csv("../Dataset/Dataset3.csv")

# Age is collapsed to its decade, as in the training data.
df3_val["age"] = df3_val["age"] // 10

for col in ('trestbps', 'chol', 'thalach'):
    df3_val[col] = _bin_like_training(df3_val[col], df3_train_raw[col])

# oldpeak is scaled by 10 before binning, on both the unknown and the
# reference side, mirroring the training preprocessing.
df3_val['oldpeak'] = _bin_like_training(df3_val['oldpeak'] * 10,
                                        df3_train_raw['oldpeak'] * 10)



## predict with decision tree using gini

In [None]:
# Refit the gini decision tree with max_depth=5 and min_samples_split=10
# (presumably the values chosen from the grid search — confirm) and predict
# the unknown rows. random_state makes the fitted tree, and therefore the
# saved predictions, reproducible across runs.
dtree_gini = DecisionTreeClassifier(criterion='gini',
                                    max_depth=5,
                                    min_samples_split=10,
                                    random_state=42)
cls = dtree_gini.fit(X_train, y_train)
y_pred = cls.predict(df3_val)

# Store this model's predictions as its own column of the results frame.
res3["dtree_gini"] = y_pred

## predict with decision tree using entropy

In [None]:
# Refit the entropy decision tree with max_depth=5 and min_samples_split=10
# (presumably the values chosen from the grid search — confirm) and predict
# the unknown rows. random_state makes the fitted tree, and therefore the
# saved predictions, reproducible across runs.
dtree_entropy = DecisionTreeClassifier(criterion='entropy',
                                       max_depth=5,
                                       min_samples_split=10,
                                       random_state=42)
cls = dtree_entropy.fit(X_train, y_train)
y_pred = cls.predict(df3_val)

# Store this model's predictions as its own column of the results frame.
res3["dtree_entropy"] = y_pred

## predict with random forest using gini

In [None]:
# Refit the 50-tree gini random forest with max_depth=10 and
# min_samples_split=5 (presumably the values chosen from the grid search —
# confirm) and predict the unknown rows. random_state seeds the bootstrap and
# feature sampling so the saved predictions are reproducible across runs.
rf_gini = RandomForestClassifier(criterion='gini',
                                 n_estimators=50,
                                 max_depth=10,
                                 min_samples_split=5,
                                 random_state=42)
cls = rf_gini.fit(X_train, y_train)
y_pred = cls.predict(df3_val)

# Store this model's predictions as its own column of the results frame.
res3["rf_gini"] = y_pred

## predict with random forest using entropy

In [None]:
# Refit the 50-tree entropy random forest with max_depth=5 and
# min_samples_split=5 (presumably the values chosen from the grid search —
# confirm) and predict the unknown rows. random_state seeds the bootstrap and
# feature sampling so the saved predictions are reproducible across runs.
rf_entropy = RandomForestClassifier(criterion='entropy',
                                    n_estimators=50,
                                    max_depth=5,
                                    min_samples_split=5,
                                    random_state=42)
cls = rf_entropy.fit(X_train, y_train)
y_pred = cls.predict(df3_val)

# Store this model's predictions as its own column of the results frame.
res3["rf_entropy"] = y_pred

In [None]:
# Persist every model's predictions, row index included, as a CSV file.
res3.to_csv(path_or_buf="./03/prediction.csv", index=True)