In [143]:
# import necessary library
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [144]:
# import csv dataset
file = "tubes2_HeartDisease_train.csv"
df = pd.read_csv(file)

In [145]:
# split feature and label
feature = df.drop("Column14",inplace=False,axis=1)
label = df["Column14"]

In [146]:
# handle missing value

header = feature.columns.values.tolist()
feature_impute = feature.replace('?',np.nan)

imputer_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

discrete_value = ['Column1','Column2','Column3','Column6','Column7','Column9','Column13']
continues_value = ['Column4','Column5','Column8','Column10','Column11','Column12']

imputer_mode.fit(feature_impute[discrete_value])
feature_impute[discrete_value] = imputer_mode.transform(feature_impute[discrete_value])

imputer_mean.fit(feature_impute[continues_value])
feature_impute[continues_value] = imputer_mean.transform(feature_impute[continues_value])

feature_impute['Column13'] = pd.to_numeric(feature_impute['Column13'])
feature_impute.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,54,1,4,125.0,216.0,0,0,140.0,0,0.0,1.762089,0.686792,3
1,55,1,4,158.0,217.0,0,0,110.0,1,2.5,2.0,0.686792,3
2,54,0,3,135.0,304.0,1,0,170.0,0,0.0,1.0,0.0,3
3,48,0,3,120.0,195.0,0,0,125.0,0,0.0,1.762089,0.686792,3
4,50,1,4,120.0,0.0,0,1,156.0,1,0.0,1.0,0.686792,6


In [149]:
idx_to_drop = []

for item in continues_value:
    mean = feature_impute[item].mean()
    std = feature_impute[item].std()
    low_threshold = mean - 2*std
    high_threshold = mean + 2*std
    
#     print(item, low_threshold, high_threshold)
    
    for i in range(feature_impute[item].shape[0]):
        cur_value = feature_impute[item].iloc[i]
        if (cur_value < low_threshold or cur_value > high_threshold):
            idx_to_drop.append(i)

feature_impute.drop(feature_impute.index[idx_to_drop],inplace=True)
feature_impute.describe()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column8,Column10,Column11,Column12,Column13
count,478.0,478.0,478.0,478.0,478.0,478.0,478.0,478.0,478.0,478.0
mean,52.029289,0.780335,3.209205,129.416284,198.48029,141.270555,1.230037,1.655492,0.575953,3.709205
std,9.136926,0.414454,0.900826,11.438446,101.494904,22.348225,1.743816,0.381227,0.284683,1.485485
min,28.0,0.0,1.0,106.0,0.0,95.0,-2.0,1.0,0.0,3.0
25%,45.0,1.0,2.0,120.0,180.25,124.25,0.0,1.762089,0.686792,3.0
50%,53.0,1.0,3.0,130.0,221.0,140.0,0.0,1.762089,0.686792,3.0
75%,58.0,1.0,4.0,140.0,266.75,160.0,2.0,2.0,0.686792,3.0
max,77.0,1.0,4.0,154.0,394.0,186.0,9.0,2.0,1.0,7.0


In [5]:
# feature scale
feature_scale = pd.DataFrame(preprocessing.scale(feature_impute), columns=header)
feature_scale.head()

  


Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,0.051624,0.532316,0.794607,-0.396834,0.145063,-0.407637,-0.745075,0.065227,-0.765871,-0.522566,-4.385103e-16,0.0,-0.57954
1,0.156899,0.532316,0.794607,1.383611,0.154309,-0.407637,-0.745075,-1.119498,1.305703,-0.19077,0.4698444,0.0,-0.57954
2,0.051624,-1.878582,-0.28567,0.142695,0.958673,2.453165,-0.745075,1.249952,-0.765871,-0.522566,-1.50503,-1.261202,-0.57954
3,-0.580027,-1.878582,-0.28567,-0.666598,-0.049093,-0.407637,-0.745075,-0.527135,-0.765871,-0.522566,-4.385103e-16,0.0,-0.57954
4,-0.369477,0.532316,0.794607,-0.666598,-1.851978,-0.407637,0.492481,0.69708,1.305703,-0.522566,-1.50503,0.0,1.219112


In [6]:
# display feature correlation with label
for feat in feature_scale:
    print(feat, feature_scale[feat].corr(label))

Column1 0.35500668737181995
Column2 0.2591122494555663
Column3 0.387827436918273
Column4 0.11219603416675152
Column5 -0.22660619371755436
Column6 0.12959342746947924
Column7 0.1449938817339308
Column8 -0.35097943499695294
Column9 0.3552562329958971
Column10 0.21986764610588136
Column11 0.26465461503796067
Column12 0.3088923320458529
Column13 0.29820868614856366


In [32]:
# drop feature with low correlation
# feature_scale.drop("Column4",axis=1,inplace=True)
# feature_scale.drop("Column6",axis=1,inplace=True)
# feature_scale.drop("Column7",axis=1,inplace=True)

import itertools
feature_combinations = []

# print(list(itertools.combinations(header,1)))
for i in range(1,14):
    feature_combinations.append(list(itertools.combinations(header,i)))

In [8]:
# split train and test 
X_train,X_test,y_train,y_test = train_test_split(feature_scale,label,test_size=0.1)

In [9]:
# construct model
naive_bayes_model = GaussianNB(var_smoothing=1e-6).fit(X_train,y_train)

# predict x_test
y_predict = naive_bayes_model.predict(X_test)

# display accuracy score
print('Accuracy score: ',round(accuracy_score(y_test, y_predict),2))

Accuracy score:  0.62


In [57]:
kf = KFold(n_splits=9)

best_model = None
worst_model = None
best_accuracy = 0.0
worst_accuracy = 1.0
sum_accuracy = 0.0

best_header = None

models = []

# print(feature_combinations[12])

# print(feature_combinations[3])

i = 5

for j in range(len(feature_combinations[i])):
    cur_header = []
    for k in range(len(feature_combinations[i][j])):
        cur_header.append(feature_combinations[i][j][k])

    cur_feature = feature_scale[cur_header]
    df_to_fold = pd.concat([cur_feature,label],axis=1)

    for train_idx,test_idx in kf.split(df_to_fold):
        train = df_to_fold.iloc[train_idx]
        test = df_to_fold.iloc[test_idx]

        X_train = train.drop('Column14',axis=1)
        y_train = train['Column14']

        X_test = test.drop('Column14',axis=1)
        y_test = test['Column14']

        cur_model = GaussianNB(var_smoothing=0.001).fit(X_train, y_train)
        y_predict = cur_model.predict(X_test)
        cur_accuracy = accuracy_score(y_test, y_predict)

    #     print(cur_accuracy)

        if (cur_accuracy > best_accuracy):
            best_model = cur_model
            best_accuracy = cur_accuracy
            best_header = cur_header

        if (cur_accuracy < worst_accuracy):
            worst_model = cur_model
            worst_accuracy = cur_accuracy

        models.append(cur_model)
    
        
#     print(i)
    
# df_to_fold = pd.concat([feature_scale,label], axis=1)

print(best_header)
print(best_accuracy)

['Column1', 'Column2', 'Column5', 'Column9', 'Column10', 'Column13']
0.7241379310344828
