In [231]:
# import necessary library
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

In [232]:
# Load the training CSV into a DataFrame; `file` holds the relative path.
file = "tubes2_HeartDisease_train.csv"
df = pd.read_csv(filepath_or_buffer=file)

In [233]:
# Separate the target ("Column14") from the predictor columns.
# `df` itself is left untouched: drop() returns a new frame.
label = df["Column14"]
feature = df.drop(columns=["Column14"])

In [234]:
# Handle missing values: '?' placeholders are converted to NaN, then every
# NaN is filled with the mean of its column.
# NOTE(review): the imputer is fitted on all rows, before the train/test
# split performed later in the notebook — confirm this is intended.
header = feature.columns.tolist()
feature = df[header].replace('?', np.nan)

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
feature_impute = pd.DataFrame(imputer.fit_transform(feature), columns=header)

feature_impute.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,54.0,1.0,4.0,125.0,216.0,0.0,0.0,140.0,0.0,0.0,1.762089,0.686792,5.02965
1,55.0,1.0,4.0,158.0,217.0,0.0,0.0,110.0,1.0,2.5,2.0,0.686792,5.02965
2,54.0,0.0,3.0,135.0,304.0,1.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0
3,48.0,0.0,3.0,120.0,195.0,0.0,0.0,125.0,0.0,0.0,1.762089,0.686792,5.02965
4,50.0,1.0,4.0,120.0,0.0,0.0,1.0,156.0,1.0,0.0,1.0,0.686792,6.0


In [235]:
# Standardize the imputed features column by column with
# sklearn.preprocessing.scale, then wrap the resulting array back into a
# DataFrame that carries the original column names.
# NOTE(review): scaling statistics come from all rows, before the train/test
# split performed later in the notebook — confirm this is intended.
scaled_values = preprocessing.scale(feature_impute)
feature_scale = pd.DataFrame(scaled_values, columns=header)
feature_scale.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,0.051624,0.532316,0.794607,-0.396834,0.145063,-0.457241,-0.747528,0.065227,-0.826357,-0.522566,-4.385103e-16,0.0,0.0
1,0.156899,0.532316,0.794607,1.383611,0.154309,-0.457241,-0.747528,-1.119498,1.282574,-0.19077,0.4698444,0.0,0.0
2,0.051624,-1.878582,-0.28567,0.142695,0.958673,2.43038,-0.747528,1.249952,-0.826357,-0.522566,-1.50503,-1.261202,-1.532348
3,-0.580027,-1.878582,-0.28567,-0.666598,-0.049093,-0.457241,-0.747528,-0.527135,-0.826357,-0.522566,-4.385103e-16,0.0,0.0
4,-0.369477,0.532316,0.794607,-0.666598,-1.851978,-0.457241,0.490914,0.69708,1.282574,-0.522566,-1.50503,0.0,0.732597


In [236]:
# Print each scaled feature's correlation with the label, one line per column
# (Series.corr defaults to the Pearson coefficient).
for column_name, column_values in feature_scale.items():
    print(column_name, column_values.corr(label))

Column1 0.35500668737181995
Column2 0.2591122494555663
Column3 0.387827436918273
Column4 0.11219603416675152
Column5 -0.22660619371755436
Column6 0.14721822095303116
Column7 0.14511491279481586
Column8 -0.35097943499695294
Column9 0.37793788864053984
Column10 0.21986764610588136
Column11 0.26465461503796067
Column12 0.3088923320458529
Column13 0.3101321042169782


In [237]:
# Drop the three features with the weakest correlation to the label
# (Column4, Column6, Column7 — see the correlations printed above).
# The frame is rebound rather than mutated in place, and errors="ignore"
# makes the cell idempotent: re-running it no longer raises KeyError because
# the columns were already removed by a previous run.
# Caveat: errors="ignore" also silences a misspelled column name, so keep
# this list in sync with the actual headers.
feature_scale = feature_scale.drop(
    columns=["Column4", "Column6", "Column7"], errors="ignore"
)

In [238]:
# Split into train (90%) and test (10%) sets.
# A fixed random_state makes the shuffle deterministic, so the split — and
# therefore the accuracy reported below — is reproducible across kernel
# restarts instead of changing on every run.
X_train, X_test, y_train, y_test = train_test_split(
    feature_scale, label, test_size=0.1, random_state=42
)

In [239]:
# Build a Gaussian Naive Bayes classifier and fit it on the training split.
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_predict = naive_bayes_model.predict(X_test)

# Report test-set accuracy, rounded to two decimal places.
print('Accuracy score: ', round(accuracy_score(y_true=y_test, y_pred=y_predict), 2))

Accuracy score:  0.63
