In [1]:
# import necessary library
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
# import csv dataset
file = "tubes2_HeartDisease_train.csv"
df = pd.read_csv(file)

In [3]:
# split feature and label
feature = df.drop("Column14",inplace=False,axis=1)
label = df["Column14"]

In [4]:
# handle missing value

header = feature.columns.values.tolist()
feature_impute = feature.replace('?',np.nan)


imputer_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

discrete_value = ['Column1','Column2','Column3','Column6','Column7','Column9','Column13']
continues_value = ['Column4','Column5','Column8','Column10','Column11','Column12']

imputer_mode.fit(feature_impute[discrete_value])
feature_impute[discrete_value] = imputer_mode.transform(feature_impute[discrete_value])

imputer_mean.fit(feature_impute[continues_value])
feature_impute[continues_value] = imputer_mean.transform(feature_impute[continues_value])

feature_impute.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,54,1,4,125.0,216.0,0,0,140.0,0,0.0,1.762089,0.686792,3
1,55,1,4,158.0,217.0,0,0,110.0,1,2.5,2.0,0.686792,3
2,54,0,3,135.0,304.0,1,0,170.0,0,0.0,1.0,0.0,3
3,48,0,3,120.0,195.0,0,0,125.0,0,0.0,1.762089,0.686792,3
4,50,1,4,120.0,0.0,0,1,156.0,1,0.0,1.0,0.686792,6


In [5]:
# feature scale
feature_scale = pd.DataFrame(preprocessing.scale(feature_impute), columns=header)
feature_scale.head()

  


Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,0.051624,0.532316,0.794607,-0.396834,0.145063,-0.407637,-0.745075,0.065227,-0.765871,-0.522566,-4.385103e-16,0.0,-0.57954
1,0.156899,0.532316,0.794607,1.383611,0.154309,-0.407637,-0.745075,-1.119498,1.305703,-0.19077,0.4698444,0.0,-0.57954
2,0.051624,-1.878582,-0.28567,0.142695,0.958673,2.453165,-0.745075,1.249952,-0.765871,-0.522566,-1.50503,-1.261202,-0.57954
3,-0.580027,-1.878582,-0.28567,-0.666598,-0.049093,-0.407637,-0.745075,-0.527135,-0.765871,-0.522566,-4.385103e-16,0.0,-0.57954
4,-0.369477,0.532316,0.794607,-0.666598,-1.851978,-0.407637,0.492481,0.69708,1.305703,-0.522566,-1.50503,0.0,1.219112


In [6]:
# display feature correlation with label
for feat in feature_scale:
    print(feat, feature_scale[feat].corr(label))

Column1 0.35500668737181995
Column2 0.2591122494555663
Column3 0.387827436918273
Column4 0.11219603416675152
Column5 -0.22660619371755436
Column6 0.12959342746947924
Column7 0.1449938817339308
Column8 -0.35097943499695294
Column9 0.3552562329958971
Column10 0.21986764610588136
Column11 0.26465461503796067
Column12 0.3088923320458529
Column13 0.29820868614856366


In [7]:
# drop feature with low correlation
feature_scale.drop("Column4",axis=1,inplace=True)
feature_scale.drop("Column6",axis=1,inplace=True)
feature_scale.drop("Column7",axis=1,inplace=True)

In [8]:
# split train and test 
X_train,X_test,y_train,y_test = train_test_split(feature_scale,label,test_size=0.1)

In [9]:
# # construct model
# mlp = MLPClassifier(solver='adam', alpha=0.1,
#                      hidden_layer_sizes=(11, 17, 11), activation='logistic', learning_rate='adaptive')
# mlp_model = mlp.fit(X_train, y_train)
# mlp_model


In [10]:
# # predict x_test
# y_predict = mlp_model.predict(X_test)

# # display accuracy score
# print('Accuracy score: ',round(accuracy_score(y_test, y_predict),2))

In [11]:
kf = KFold(n_splits=10)

best_model = None
worst_model = None
best_accuracy = 0.0
worst_accuracy = 1.0
sum_accuracy = 0.0

models = []

df_to_fold = pd.concat([feature_scale,label], axis=1)

for train_idx,test_idx in kf.split(df_to_fold):
    train = df_to_fold.iloc[train_idx]
    test = df_to_fold.iloc[test_idx]
    
    X_train = train.drop('Column14',axis=1)
    y_train = train['Column14']
    
    X_test = test.drop('Column14',axis=1)
    y_test = test['Column14']
    
    cur_model = MLPClassifier(solver='lbfgs', alpha=0.1,
                     hidden_layer_sizes=(7, 10, 7), activation='logistic', learning_rate='constant').fit(X_train, y_train)
    y_predict = cur_model.predict(X_test)
    cur_accuracy = accuracy_score(y_test, y_predict)
    
#     print(cur_accuracy)
    
    if (cur_accuracy > best_accuracy):
        best_model = cur_model
        best_accuracy = cur_accuracy
        
    if (cur_accuracy < worst_accuracy):
        worst_model = cur_model
        worst_accuracy = cur_accuracy
        
    models.append(cur_model)

print(best_accuracy)

0.6282051282051282


In [24]:
# construct model
mlp = MLPClassifier(max_iter=100)
parameter_space = {
    'hidden_layer_sizes' : [(7,11,13,7), (7,11,7), (7,5)],
    'activation' : ['logistic'],
    'solver' : ['sgd', 'adam', 'lbfgs'],
    'alpha' : [0.00001, 0.0001, 0.001, 0.01, 0.1],
    'learning_rate' : ['constant','adaptive']
}

from sklearn.model_selection import GridSearchCV

mlp_model = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
mlp_model.fit(X_train, y_train)

print('Best parameters found:\n',mlp_model.best_params_)
means = mlp_model.cv_results_['mean_test_score']
stds = mlp_model.cv_results_['std_test_score']
# for mean, std, params in zip(means, stds, mlp_model.cv_results_['params']):
#     print("%0.3f (+/-%0.03f) for %r" % (mean, std*2, params))

Best parameters found:
 {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (7, 11, 7), 'learning_rate': 'constant', 'solver': 'lbfgs'}




In [16]:
# # predict x_test
# y_predict = mlp_model.predict(X_test)

# # display accuracy score
# print('Accuracy score: ',round(accuracy_score(y_test, y_predict),2))

In [14]:
# y_true, y_pred = y_test, mlp_model.predict(X_test)

# from sklearn.metrics import classification_report
# print('Result on the test set:\n')
# print(classification_report(y_true, y_pred))