In [20]:
# import necessary library
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [21]:
# import csv dataset
file = "tubes2_HeartDisease_train.csv"
df = pd.read_csv(file)

In [22]:
# split feature and label
feature = df.drop("Column14",inplace=False,axis=1)
label = df["Column14"]

In [23]:
# handle missing value

header = feature.columns.values.tolist()
feature_impute = feature.replace('?',np.nan)

imputer_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

discrete_value = ['Column1','Column2','Column3','Column6','Column7','Column9','Column13']
continues_value = ['Column4','Column5','Column8','Column10','Column11','Column12']

imputer_mode.fit(feature_impute[discrete_value])
feature_impute[discrete_value] = imputer_mode.transform(feature_impute[discrete_value])

imputer_mean.fit(feature_impute[continues_value])
feature_impute[continues_value] = imputer_mean.transform(feature_impute[continues_value])

feature_impute['Column13'] = pd.to_numeric(feature_impute['Column13'])
feature_impute.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,54,1,4,125.0,216.0,0,0,140.0,0,0.0,1.762089,0.686792,3
1,55,1,4,158.0,217.0,0,0,110.0,1,2.5,2.0,0.686792,3
2,54,0,3,135.0,304.0,1,0,170.0,0,0.0,1.0,0.0,3
3,48,0,3,120.0,195.0,0,0,125.0,0,0.0,1.762089,0.686792,3
4,50,1,4,120.0,0.0,0,1,156.0,1,0.0,1.0,0.686792,6


In [45]:
idx_to_drop = []

for item in continues_value:
    mean = feature_impute[item].mean()
    std = feature_impute[item].std()
    low_threshold = mean - 2*std
    high_threshold = mean + 2*std
    
#     print(item, low_threshold, high_threshold)
    
    for i in range(feature_impute[item].shape[0]):
        cur_value = feature_impute[item].iloc[i]
        if (cur_value < low_threshold or cur_value > high_threshold):
            idx_to_drop.append(i)

feature_impute.drop(feature_impute.index[idx_to_drop],inplace=True)
label.drop(label.index[idx_to_drop],inplace=True)
# feature_impute.describe()

In [46]:
# feature scale
feature_scale = pd.DataFrame(preprocessing.scale(feature_impute), columns=header)
feature_scale.head()

  


Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,0.179906,0.471025,0.838627,-0.396379,0.248428,-0.406994,-0.663378,0.120344,-0.687654,-0.765248,0.090167,-0.247896,-0.454556
1,-0.457029,-2.123028,-0.260263,-0.840702,0.052644,-0.406994,-0.663378,-0.58634,-0.687654,-0.765248,0.090167,-0.247896,-0.454556
2,-0.244717,0.471025,0.838627,-0.840702,-1.765353,-0.406994,0.892693,0.874141,1.454219,-0.765248,-2.383673,-0.247896,1.651923
3,1.13531,0.471025,0.838627,0.047944,1.10615,-0.406994,-0.663378,0.026119,1.454219,0.662111,0.862457,-0.247896,-0.454556
4,0.60453,0.471025,-1.359154,0.047944,0.574736,-0.406994,-0.663378,-1.293025,-0.687654,-0.765248,0.090167,-0.247896,-0.454556


In [47]:
# display feature correlation with label
for feat in feature_scale:
    print(feat, feature_scale[feat].corr(label))

Column1 -0.01507737770502659
Column2 0.0343546927378733
Column3 0.013242427166461701
Column4 -0.022986392733475473
Column5 0.05659714512994221
Column6 -0.0634634852005589
Column7 0.16250345170248143
Column8 0.05202597139561553
Column9 -0.05049990743093237
Column10 -0.06653854433481521
Column11 0.01834681816188873
Column12 0.03230400915858137
Column13 0.08533736753424677


In [48]:
# drop feature with low correlation
# feature_scale.drop("Column4",axis=1,inplace=True)
# feature_scale.drop("Column6",axis=1,inplace=True)
# feature_scale.drop("Column7",axis=1,inplace=True)

import itertools
feature_combinations = []

# print(list(itertools.combinations(header,1)))
for i in range(1,14):
    feature_combinations.append(list(itertools.combinations(header,i)))

In [49]:
# split train and test 
X_train,X_test,y_train,y_test = train_test_split(feature_scale,label,test_size=0.1)

In [50]:
# construct model
naive_bayes_model = GaussianNB(var_smoothing=1e-6).fit(X_train,y_train)

# predict x_test
y_predict = naive_bayes_model.predict(X_test)

# display accuracy score
print('Accuracy score: ',round(accuracy_score(y_test, y_predict),2))

Accuracy score:  0.42


In [57]:
kf = KFold(n_splits=9)

best_model = None
worst_model = None
best_accuracy = 0.0
worst_accuracy = 1.0
sum_accuracy = 0.0

best_header = None

models = []

# print(feature_combinations[12])

# print(feature_combinations[3])

i = 8

for j in range(len(feature_combinations[i])):
    cur_header = []
    for k in range(len(feature_combinations[i][j])):
        cur_header.append(feature_combinations[i][j][k])

    cur_feature = feature_scale[cur_header]
#     df_to_fold = pd.concat([cur_feature,label],axis=1)
#     print(df_to_fold.isnull().sum())
    for train_idx,test_idx in kf.split(cur_feature):
#         train = df_to_fold.iloc[train_idx]
#         test = df_to_fold.iloc[test_idx]

        X_train = cur_feature.iloc[train_idx] #train.drop('Column14',axis=1)
        y_train = label.iloc[train_idx]       #train['Column14']

        X_test = cur_feature.iloc[test_idx] #test.drop('Column14',axis=1)
        y_test = label.iloc[test_idx] #test['Column14']

        cur_model = GaussianNB(var_smoothing=0.001).fit(X_train, y_train)
        y_predict = cur_model.predict(X_test)
        cur_accuracy = accuracy_score(y_test, y_predict)

    #     print(cur_accuracy)

        if (cur_accuracy > best_accuracy):
            best_model = cur_model
            best_accuracy = cur_accuracy
            best_header = cur_header

        if (cur_accuracy < worst_accuracy):
            worst_model = cur_model
            worst_accuracy = cur_accuracy

        models.append(cur_model)
    
        
#     print(i)
    
# df_to_fold = pd.concat([feature_scale,label], axis=1)

print(best_header)
print(best_accuracy)

Column1     190
Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column9     190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column10    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column11    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column12    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column9     190
Column1

Column1     190
Column2     190
Column3     190
Column4     190
Column6     190
Column7     190
Column9     190
Column11    190
Column12    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column4     190
Column6     190
Column7     190
Column9     190
Column11    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column4     190
Column6     190
Column7     190
Column9     190
Column12    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column4     190
Column6     190
Column7     190
Column10    190
Column11    190
Column12    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column4     190
Column6     190
Column7     190
Column10    190
Column11    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column4     190
Column6     190
Column7     190
Column10    190
Column12    190
Column1

Column1     190
Column2     190
Column3     190
Column5     190
Column7     190
Column8     190
Column11    190
Column12    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column5     190
Column7     190
Column9     190
Column10    190
Column11    190
Column12    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column5     190
Column7     190
Column9     190
Column10    190
Column11    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column5     190
Column7     190
Column9     190
Column10    190
Column12    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column5     190
Column7     190
Column9     190
Column11    190
Column12    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column3     190
Column5     190
Column7     190
Column10    190
Column11    190
Column12    190
Column1

Column1     190
Column2     190
Column4     190
Column5     190
Column7     190
Column8     190
Column10    190
Column11    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column4     190
Column5     190
Column7     190
Column8     190
Column10    190
Column12    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column4     190
Column5     190
Column7     190
Column8     190
Column11    190
Column12    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column4     190
Column5     190
Column7     190
Column9     190
Column10    190
Column11    190
Column12    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column4     190
Column5     190
Column7     190
Column9     190
Column10    190
Column11    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column2     190
Column4     190
Column5     190
Column7     190
Column9     190
Column10    190
Column12    190
Column1

Column1     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column9     190
Column12    190
Column14    190
dtype: int64
Column1     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column9     190
Column13    190
Column14    190
dtype: int64
Column1     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column10    190
Column11    190
Column14    190
dtype: int64
Column1     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column10    190
Column12    190
Column14    190
dtype: int64
Column1     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column10    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column11    190
Column1

Column1     190
Column3     190
Column5     190
Column6     190
Column7     190
Column8     190
Column9     190
Column11    190
Column12    190
Column14    190
dtype: int64
Column1     190
Column3     190
Column5     190
Column6     190
Column7     190
Column8     190
Column9     190
Column11    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column3     190
Column5     190
Column6     190
Column7     190
Column8     190
Column9     190
Column12    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column3     190
Column5     190
Column6     190
Column7     190
Column8     190
Column10    190
Column11    190
Column12    190
Column14    190
dtype: int64
Column1     190
Column3     190
Column5     190
Column6     190
Column7     190
Column8     190
Column10    190
Column11    190
Column13    190
Column14    190
dtype: int64
Column1     190
Column3     190
Column5     190
Column6     190
Column7     190
Column8     190
Column10    190
Column12    190
Column1

Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column10    190
Column11    190
Column14    190
dtype: int64
Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column10    190
Column12    190
Column14    190
dtype: int64
Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column10    190
Column13    190
Column14    190
dtype: int64
Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column11    190
Column12    190
Column14    190
dtype: int64
Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column11    190
Column13    190
Column14    190
dtype: int64
Column2     190
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column12    190
Column1

Column2     190
Column3     190
Column5     190
Column6     190
Column7     190
Column8     190
Column10    190
Column12    190
Column13    190
Column14    190
dtype: int64
Column2     190
Column3     190
Column5     190
Column6     190
Column7     190
Column8     190
Column11    190
Column12    190
Column13    190
Column14    190
dtype: int64
Column2     190
Column3     190
Column5     190
Column6     190
Column7     190
Column9     190
Column10    190
Column11    190
Column12    190
Column14    190
dtype: int64
Column2     190
Column3     190
Column5     190
Column6     190
Column7     190
Column9     190
Column10    190
Column11    190
Column13    190
Column14    190
dtype: int64
Column2     190
Column3     190
Column5     190
Column6     190
Column7     190
Column9     190
Column10    190
Column12    190
Column13    190
Column14    190
dtype: int64
Column2     190
Column3     190
Column5     190
Column6     190
Column7     190
Column9     190
Column11    190
Column12    190
Column1

Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column8     190
Column11    190
Column12    190
Column13    190
Column14    190
dtype: int64
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column9     190
Column10    190
Column11    190
Column12    190
Column14    190
dtype: int64
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column9     190
Column10    190
Column11    190
Column13    190
Column14    190
dtype: int64
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column9     190
Column10    190
Column12    190
Column13    190
Column14    190
dtype: int64
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column9     190
Column11    190
Column12    190
Column13    190
Column14    190
dtype: int64
Column3     190
Column4     190
Column5     190
Column6     190
Column7     190
Column10    190
Column11    190
Column12    190
Column1