# Dataset 3

## 1) Preparing Data:

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

In [2]:
df3 = pd.read_csv("../Dataset/Dataset3.csv")
df3.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
0,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
1,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
2,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
3,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
4,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1


### check for null values

In [3]:
print(df3.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242 entries, 0 to 241
Data columns (total 14 columns):
age         242 non-null int64
sex         242 non-null int64
cp          242 non-null int64
trestbps    242 non-null int64
chol        242 non-null int64
fbs         242 non-null int64
restecg     242 non-null int64
thalach     242 non-null int64
exang       242 non-null int64
oldpeak     242 non-null float64
slope       242 non-null int64
ca          242 non-null int64
thal        242 non-null int64
disease     242 non-null int64
dtypes: float64(1), int64(13)
memory usage: 26.5 KB
None


### generalize numerical attributes

In [4]:
print("age: ",set(df3["age"].values))
df3["age"] = df3["age"].map(lambda x: x//10)

age:  {34, 35, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 76, 77}


In [5]:
print("trestbps: ",sorted(set(df3["trestbps"].values)))
df3['trestbps'] = (df3['trestbps']-min(df3['trestbps']))//np.std(df3['trestbps'])
df3['trestbps'] = df3['trestbps'].astype('int8')

trestbps:  [94, 100, 101, 102, 104, 105, 106, 108, 110, 112, 114, 115, 117, 118, 120, 122, 123, 124, 125, 126, 128, 129, 130, 132, 134, 135, 138, 140, 142, 144, 145, 146, 148, 150, 152, 154, 155, 156, 160, 164, 170, 172, 178, 180, 192]


In [6]:
print("chol: ",sorted(set(df3["chol"].values)))
df3['chol'] = (df3['chol']-min(df3['chol']))//np.std(df3['chol'])
df3['chol'] = df3['chol'].astype('int8')

chol:  [126, 131, 141, 149, 157, 160, 164, 166, 167, 168, 172, 174, 175, 176, 177, 178, 180, 182, 183, 186, 187, 188, 192, 193, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206, 207, 208, 209, 211, 212, 213, 214, 215, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 239, 240, 243, 244, 245, 246, 247, 248, 250, 252, 253, 254, 255, 256, 257, 258, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 273, 274, 275, 276, 277, 278, 281, 282, 283, 284, 286, 288, 289, 293, 294, 295, 298, 299, 300, 302, 303, 304, 305, 307, 308, 309, 311, 313, 315, 318, 321, 322, 325, 326, 327, 330, 335, 340, 341, 342, 354, 360, 394, 407, 409, 417, 564]


In [7]:
print("thalach: ",sorted(set(df3["thalach"].values)))
df3['thalach'] = (df3['thalach']-min(df3['thalach']))//np.std(df3['thalach'])
df3['thalach'] = df3['thalach'].astype('int8')

thalach:  [88, 90, 95, 96, 99, 103, 105, 108, 109, 111, 112, 113, 114, 115, 116, 117, 118, 120, 122, 125, 126, 128, 129, 130, 131, 132, 133, 134, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 178, 179, 180, 181, 182, 185, 186, 187, 188, 190, 195]


In [8]:
print("oldpeak: ",sorted(set(df3["oldpeak"].values)))
df3['oldpeak'] = df3['oldpeak']*10
df3['oldpeak'] = (df3['oldpeak']-min(df3['oldpeak']))//np.std(df3['oldpeak'])
df3['oldpeak'] = df3['oldpeak'].astype('int8')

oldpeak:  [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.8, 1.9, 2.0, 2.2, 2.4, 2.5, 2.6, 2.8, 2.9, 3.0, 3.1, 3.2, 3.4, 3.5, 3.6, 4.0, 4.2, 4.4, 5.6, 6.2]


### clean data

In [9]:
X = df3.drop(['disease'], axis=1)
y = df3['disease']
df3.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
0,3,1,2,2,2,0,1,4,0,2,0,0,2,1
1,5,1,1,1,2,0,1,4,0,0,2,0,2,1
2,5,0,0,1,4,0,1,3,1,0,2,0,2,1
3,5,1,0,2,1,0,1,2,0,0,1,0,1,1
4,5,0,1,2,3,0,0,2,0,1,1,0,2,1


## 2) Classifying the Data

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score 
from sklearn.ensemble import RandomForestClassifier


from sklearn.tree import export_graphviz
from sklearn.datasets import load_wine
from IPython.display import SVG
from graphviz import Source
from IPython.display import display

### split into test and train sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


### Decision Tree with gini

In [12]:
dtree_gini = DecisionTreeClassifier(criterion='gini')
cls = dtree_gini.fit(X_train,y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

export_graphviz(dtree_gini, out_file="./03/dtree_gini.dot",
                feature_names=X_train.columns,
                filled = True)

Accuracy: 0.8571428571428571


### Decision Tree with entropy

In [13]:
dtree_entropy = DecisionTreeClassifier(criterion='entropy')
cls = dtree_entropy.fit(X_train,y_train)
y_pred = cls.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

export_graphviz(dtree_entropy, out_file="./03/dtree_entropy.dot",
                feature_names=X_train.columns,
                filled = True)

Accuracy: 0.7755102040816326


### Random Forest with gini

In [14]:
rf_gini = RandomForestClassifier(n_jobs=-1, n_estimators=50, criterion='gini')
rf_model = rf_gini.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

export_graphviz(rf_gini.estimators_[0], out_file="./03/rf_gini.dot",
                feature_names=X_train.columns,
                filled = True)

Accuracy: 0.7959183673469388


### Random Forest with entropy

In [15]:
rf_entropy = RandomForestClassifier(n_jobs=-1, n_estimators=50, criterion='entropy')
rf_model = rf_entropy.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

export_graphviz(rf_entropy.estimators_[0], out_file="./03/rf_entropy.dot",
                feature_names=X_train.columns,
                filled = True)

Accuracy: 0.7755102040816326


### visualize

In [16]:
# In order to see each tree in jupyter notebook
# uncomment following lines and execute them in 
# separate cells

# graph = Source(export_graphviz(dtree_gini, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(dtree_entropy, out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))

# random forests have many estimators so we should travers them
# or just visualize one of them

# graph = Source(export_graphviz(rf_gini.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))


# graph = Source(export_graphviz(rf_entropy.estimators_[0], out_file=None,
#                                     feature_names=X_train.columns,
#                                     filled = True))
# display(SVG(graph.pipe(format='svg')))

### grid search

In [17]:
param = {'min_samples_split': [2, 5, 10],
        'max_depth': [5, 10, 15, None]}

grid search on decision tree with gini

In [18]:
gs1 = GridSearchCV(dtree_gini, param, cv=5,
                   n_jobs=-1, iid=True,
                   return_train_score=True)
gs_fit1 = gs1.fit(X, y)
pd.DataFrame(gs_fit1.cv_results_).sort_values('mean_test_score',
                                             ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
2,0.003972,0.000221,0.001858,3e-05,5.0,10,"{'max_depth': 5, 'min_samples_split': 10}",0.836735,0.918367,0.795918,...,0.822314,0.056929,1,0.88601,0.880829,0.906736,0.917526,0.887179,0.895656,0.014041
1,0.003906,0.000495,0.001846,0.000296,5.0,5,"{'max_depth': 5, 'min_samples_split': 5}",0.77551,0.877551,0.795918,...,0.797521,0.050068,2,0.906736,0.911917,0.927461,0.943299,0.928205,0.923524,0.013
11,0.00285,0.000525,0.001295,0.000298,,10,"{'max_depth': None, 'min_samples_split': 10}",0.816327,0.877551,0.734694,...,0.789256,0.06254,3,0.906736,0.901554,0.917098,0.917526,0.892308,0.907044,0.009574
0,0.00439,0.000252,0.002151,5.3e-05,5.0,2,"{'max_depth': 5, 'min_samples_split': 2}",0.734694,0.897959,0.77551,...,0.785124,0.062128,4,0.911917,0.92228,0.932642,0.948454,0.933333,0.929725,0.012205
8,0.003335,0.000494,0.001419,0.000269,15.0,10,"{'max_depth': 15, 'min_samples_split': 10}",0.816327,0.877551,0.714286,...,0.77686,0.078004,5,0.906736,0.901554,0.917098,0.917526,0.892308,0.907044,0.009574


grid search on decision tree with entropy

In [19]:
gs2 = GridSearchCV(dtree_entropy, param, cv=5,
                   n_jobs=-1, iid=True,
                   return_train_score=True)
gs_fit2 = gs2.fit(X, y)
pd.DataFrame(gs_fit2.cv_results_).sort_values('mean_test_score',
                                             ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
2,0.003905,0.000402,0.001708,0.000222,5,10,"{'max_depth': 5, 'min_samples_split': 10}",0.836735,0.877551,0.795918,...,0.793388,0.0617,1,0.88601,0.875648,0.906736,0.907216,0.902564,0.895635,0.012629
8,0.001869,0.000291,0.000823,0.000115,15,10,"{'max_depth': 15, 'min_samples_split': 10}",0.795918,0.836735,0.755102,...,0.768595,0.059515,2,0.906736,0.906736,0.917098,0.912371,0.907692,0.910127,0.004064
1,0.004693,0.000928,0.001905,0.000356,5,5,"{'max_depth': 5, 'min_samples_split': 5}",0.734694,0.857143,0.77551,...,0.764463,0.065014,3,0.906736,0.906736,0.927461,0.938144,0.938462,0.923508,0.014255
5,0.001914,6.5e-05,0.000848,3.2e-05,10,10,"{'max_depth': 10, 'min_samples_split': 10}",0.755102,0.836735,0.77551,...,0.760331,0.056813,4,0.906736,0.901554,0.917098,0.907216,0.912821,0.909085,0.005365
0,0.003239,0.000592,0.001452,0.000224,5,2,"{'max_depth': 5, 'min_samples_split': 2}",0.714286,0.836735,0.755102,...,0.752066,0.047197,5,0.917098,0.917098,0.937824,0.938144,0.94359,0.930751,0.011334


grid search on random forest with gini

In [20]:
gs3 = GridSearchCV(rf_gini, param, cv=5,
                   n_jobs=-1, iid=True,
                   return_train_score=True)
gs_fit3 = gs3.fit(X, y)
pd.DataFrame(gs_fit3.cv_results_).sort_values('mean_test_score',
                                             ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
4,0.186716,0.010457,0.111165,0.003896,10,5,"{'max_depth': 10, 'min_samples_split': 5}",0.897959,0.938776,0.77551,...,0.859504,0.056369,1,0.948187,0.948187,0.953368,0.953608,0.964103,0.95349,0.005812
2,0.158939,0.010751,0.105984,0.002601,5,10,"{'max_depth': 5, 'min_samples_split': 10}",0.897959,0.959184,0.816327,...,0.855372,0.064221,2,0.917098,0.911917,0.92228,0.92268,0.923077,0.919411,0.004333
1,0.155226,0.017278,0.105592,0.002009,5,5,"{'max_depth': 5, 'min_samples_split': 5}",0.877551,0.938776,0.816327,...,0.85124,0.050899,3,0.927461,0.906736,0.927461,0.927835,0.948718,0.927642,0.013277
0,0.147678,0.010194,0.104645,0.000509,5,2,"{'max_depth': 5, 'min_samples_split': 2}",0.857143,0.938776,0.795918,...,0.847107,0.054392,4,0.943005,0.937824,0.937824,0.948454,0.958974,0.945216,0.007924
5,0.18452,0.008824,0.104903,0.000476,10,10,"{'max_depth': 10, 'min_samples_split': 10}",0.877551,0.918367,0.77551,...,0.847107,0.048474,4,0.911917,0.927461,0.932642,0.938144,0.938462,0.929725,0.009777


grid search on random forest with entropy

In [21]:
gs4 = GridSearchCV(rf_entropy, param, cv=5,
                   n_jobs=-1, iid=True,
                   return_train_score=True)
gs_fit4 = gs4.fit(X, y)
pd.DataFrame(gs_fit4.cv_results_).sort_values('mean_test_score',
                                             ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
1,0.145746,0.012286,0.10539,0.001238,5.0,5,"{'max_depth': 5, 'min_samples_split': 5}",0.877551,0.938776,0.836735,...,0.863636,0.049882,1,0.92228,0.917098,0.927461,0.92268,0.933333,0.924571,0.005473
5,0.173521,0.010256,0.105176,0.000824,10.0,10,"{'max_depth': 10, 'min_samples_split': 10}",0.877551,0.918367,0.836735,...,0.855372,0.043306,2,0.92228,0.917098,0.92228,0.927835,0.94359,0.926617,0.009141
11,0.163835,0.005746,0.105837,0.000625,,10,"{'max_depth': None, 'min_samples_split': 10}",0.877551,0.918367,0.816327,...,0.855372,0.037885,2,0.911917,0.927461,0.917098,0.93299,0.948718,0.927637,0.012894
6,0.161257,0.013688,0.106127,0.003668,15.0,2,"{'max_depth': 15, 'min_samples_split': 2}",0.877551,0.938776,0.77551,...,0.847107,0.060276,4,1.0,1.0,1.0,1.0,1.0,1.0,0.0
0,0.144343,0.005004,0.104912,0.000893,5.0,2,"{'max_depth': 5, 'min_samples_split': 2}",0.836735,0.938776,0.816327,...,0.842975,0.051302,5,0.943005,0.92228,0.943005,0.93299,0.953846,0.939025,0.01066


# Test on Unknown Data

In [22]:
df3_val = pd.read_csv("../Dataset/Dataset3_Unknown.csv")
res3 = pd.DataFrame()


In [23]:
df3_val["age"] = df3_val["age"].map(lambda x: x//10)

df3_val['trestbps'] = (df3_val['trestbps']-94)//17.521
df3_val['trestbps'] = df3_val['trestbps'].astype('int8')

df3_val['chol'] = (df3_val['chol']-126)//54.007
df3_val['chol'] = df3_val['chol'].astype('int8')

df3_val['thalach'] = (df3_val['thalach']-88)//22.15
df3_val['thalach'] = df3_val['thalach'].astype('int8')

df3_val['oldpeak'] = df3_val['oldpeak']*10//11.762
df3_val['oldpeak'] = df3_val['oldpeak'].astype('int8')



## predict with decision tree using gini

In [25]:
dtree_gini = DecisionTreeClassifier(criterion='gini',
                                   max_depth=5,
                                   min_samples_split=10)
cls = dtree_gini.fit(X_train,y_train)
y_pred = cls.predict(df3_val)

res3["dtree_gini"] = y_pred

## predict with decision tree using entropy

In [26]:
dtree_entropy = DecisionTreeClassifier(criterion='entropy',
                                   max_depth=5,
                                   min_samples_split=10)
cls = dtree_entropy.fit(X_train,y_train)
y_pred = cls.predict(df3_val)

res3["dtree_entropy"] = y_pred

## predict with random forest using gini

In [28]:
rf_gini = RandomForestClassifier(criterion='gini',
                                 n_estimators=50,
                                 max_depth=10,
                                 min_samples_split=5)
cls = rf_gini.fit(X_train,y_train)
y_pred = cls.predict(df3_val)

res3["rf_gini"] = y_pred

## predict with random forest using entropy

In [29]:
rf_entropy = RandomForestClassifier(criterion='entropy',
                                 n_estimators=50,
                                 max_depth=5,
                                 min_samples_split=5)
cls = rf_entropy.fit(X_train,y_train)
y_pred = cls.predict(df3_val)

res3["rf_entropy"] = y_pred

In [30]:
res3.to_csv("./03/prediction.csv")