In [1]:
# import necessary library
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

In [2]:
# Load the heart-disease training CSV into a DataFrame
heart_disease_file = "tubes2_HeartDisease_train.csv"
df = pd.read_csv(filepath_or_buffer=heart_disease_file)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14
0,54,1,4,125,216,0,0,140,0,0.0,?,?,?,1
1,55,1,4,158,217,0,0,110,1,2.5,2,?,?,1
2,54,0,3,135,304,1,0,170,0,0.0,1,0,3,0
3,48,0,3,120,195,0,0,125,0,0.0,?,?,?,0
4,50,1,4,120,0,0,1,156,1,0.0,1,?,6,3


In [3]:
# Separate the target column (Column14) from the predictor columns
label = df["Column14"]
feature = df.drop(columns=["Column14"])

### Pre-Processing Data

In [4]:
# Impute missing values (the raw data marks them with '?')
# NOTE(review): both imputers are fitted on the full data set, before the
# train/test split performed further down -- confirm this is intended.

# Keep the column order; it is reused to rebuild the DataFrame after scaling
header = feature.columns.tolist()

# Turn the '?' placeholders into real NaN so the imputers can detect them
feature = feature.replace(to_replace='?', value=np.nan)

# Column groups: the first is filled by mode, the second by mean
discrete_value = ['Column1','Column2','Column3','Column6','Column7','Column9','Column13']
continues_value = ['Column4','Column5','Column8','Column10','Column11','Column12']

# Discrete columns: fill gaps with the most frequent value of each column
imputer_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
feature[discrete_value] = imputer_mode.fit_transform(feature[discrete_value])

# Continuous columns: fill gaps with the mean of each column
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
feature[continues_value] = imputer_mean.fit_transform(feature[continues_value])

feature.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,54,1,4,125.0,216.0,0,0,140.0,0,0.0,1.762089,0.686792,3
1,55,1,4,158.0,217.0,0,0,110.0,1,2.5,2.0,0.686792,3
2,54,0,3,135.0,304.0,1,0,170.0,0,0.0,1.0,0.0,3
3,48,0,3,120.0,195.0,0,0,125.0,0,0.0,1.762089,0.686792,3
4,50,1,4,120.0,0.0,0,1,156.0,1,0.0,1.0,0.686792,6


In [5]:
# Scale every feature column, then wrap the resulting array back into a
# DataFrame that carries the original column names
scaled_values = preprocessing.scale(feature)
feature = pd.DataFrame(data=scaled_values, columns=header)
feature.head()

  


Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,0.051624,0.532316,0.794607,-0.396834,0.145063,-0.407637,-0.745075,0.065227,-0.765871,-0.522566,-4.385103e-16,0.0,-0.57954
1,0.156899,0.532316,0.794607,1.383611,0.154309,-0.407637,-0.745075,-1.119498,1.305703,-0.19077,0.4698444,0.0,-0.57954
2,0.051624,-1.878582,-0.28567,0.142695,0.958673,2.453165,-0.745075,1.249952,-0.765871,-0.522566,-1.50503,-1.261202,-0.57954
3,-0.580027,-1.878582,-0.28567,-0.666598,-0.049093,-0.407637,-0.745075,-0.527135,-0.765871,-0.522566,-4.385103e-16,0.0,-0.57954
4,-0.369477,0.532316,0.794607,-0.666598,-1.851978,-0.407637,0.492481,0.69708,1.305703,-0.522566,-1.50503,0.0,1.219112


In [6]:
# Feature selection
# Print the correlation between each feature column and the label
for column_name in feature.columns:
    print(column_name, feature[column_name].corr(label))

# Remove the three features with the weakest correlation to the label
feature.drop(columns=["Column4", "Column6", "Column7"], inplace=True)

Column1 0.3550066873718202
Column2 0.2591122494555664
Column3 0.387827436918273
Column4 0.11219603416675161
Column5 -0.2266061937175543
Column6 0.12959342746947924
Column7 0.14499388173393074
Column8 -0.350979434996953
Column9 0.35525623299589737
Column10 0.2198676461058815
Column11 0.26465461503796067
Column12 0.3088923320458529
Column13 0.29820868614856394


In [7]:
# Hold out part of the data for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    feature,
    label,
    random_state=0,
)

In [8]:
# Fit a decision tree (entropy criterion) on the training split.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at each split; without a seed the fitted tree -- and therefore the
# accuracy printed below -- can differ from one run to the next.
dt_model = tree.DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
dt_model = dt_model.fit(X_train, y_train)

# Predict the labels of the held-out split
y_predict = dt_model.predict(X_test)

# Report accuracy on the held-out split, rounded to two decimals
print('Accuracy score: ',round(accuracy_score(y_test, y_predict),2))

Accuracy score:  0.48


In [9]:
# 10-fold cross-validation of a depth-limited decision tree.
# Tracks the best and worst fold models, keeps every fold model in `models`,
# and accumulates the fold accuracies in `sum_accuracy`
# (mean accuracy = sum_accuracy / len(models)).
# NOTE: the loop rebinds X_train / X_test / y_train / y_test / y_predict, so
# after this cell they refer to the last fold, not to the earlier hold-out split.
kf = KFold(n_splits=10)

best_model = None
worst_model = None
# Sentinels outside the [0, 1] accuracy range, so the first fold always seeds
# both trackers -- even when a fold scores exactly 0.0 or 1.0.
best_accuracy = float("-inf")
worst_accuracy = float("inf")
sum_accuracy = 0.0

models = []

# Join features and label so one set of positional indices slices both
df_to_fold = pd.concat([feature,label], axis=1)

for train_idx,test_idx in kf.split(df_to_fold):
    train = df_to_fold.iloc[train_idx]
    test = df_to_fold.iloc[test_idx]

    X_train = train.drop('Column14',axis=1)
    y_train = train['Column14']

    X_test = test.drop('Column14',axis=1)
    y_test = test['Column14']

    # random_state is pinned so every fold's tree is reproducible across runs
    cur_model = tree.DecisionTreeClassifier(
        criterion='entropy',
        min_samples_leaf=34,
        max_depth=5,
        splitter="best",
        random_state=0,
    ).fit(X_train, y_train)

    y_predict = cur_model.predict(X_test)
    cur_accuracy = accuracy_score(y_test, y_predict)

    # Accumulate so the mean fold accuracy can be derived after the loop
    sum_accuracy += cur_accuracy

    if (cur_accuracy > best_accuracy):
        best_model = cur_model
        best_accuracy = cur_accuracy

    if (cur_accuracy < worst_accuracy):
        worst_model = cur_model
        worst_accuracy = cur_accuracy

    models.append(cur_model)

# Accuracy of the best single fold (an optimistic figure compared to the mean)
print(best_accuracy)

0.6282051282051282
