In [1]:
# import necessary library
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
# import csv dataset
file = "tubes2_HeartDisease_train.csv"
df = pd.read_csv(file)

In [3]:
# split feature and label
feature = df.drop("Column14",inplace=False,axis=1)
label = df["Column14"]

In [4]:
# handle missing value

header = feature.columns.values.tolist()
feature_impute = feature.replace('?',np.nan)

imputer_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

discrete_value = ['Column1','Column2','Column3','Column6','Column7','Column9','Column13']
continues_value = ['Column4','Column5','Column8','Column10','Column11','Column12']

imputer_mode.fit(feature_impute[discrete_value])
feature_impute[discrete_value] = imputer_mode.transform(feature_impute[discrete_value])

imputer_mean.fit(feature_impute[continues_value])
feature_impute[continues_value] = imputer_mean.transform(feature_impute[continues_value])

feature_impute['Column13'] = pd.to_numeric(feature_impute['Column13'])
feature_impute.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,54,1,4,125.0,216.0,0,0,140.0,0,0.0,1.762089,0.686792,3
1,55,1,4,158.0,217.0,0,0,110.0,1,2.5,2.0,0.686792,3
2,54,0,3,135.0,304.0,1,0,170.0,0,0.0,1.0,0.0,3
3,48,0,3,120.0,195.0,0,0,125.0,0,0.0,1.762089,0.686792,3
4,50,1,4,120.0,0.0,0,1,156.0,1,0.0,1.0,0.686792,6


In [5]:
idx_to_drop = []

for item in continues_value:
    mean = feature_impute[item].mean()
    std = feature_impute[item].std()
    low_threshold = mean - 2*std
    high_threshold = mean + 2*std
    
#     print(item, low_threshold, high_threshold)
    
    for i in range(feature_impute[item].shape[0]):
        cur_value = feature_impute[item].iloc[i]
        if (cur_value < low_threshold or cur_value > high_threshold):
            idx_to_drop.append(i)

feature_impute.drop(feature_impute.index[idx_to_drop],inplace=True)
label.drop(label.index[idx_to_drop],inplace=True)
# feature_impute.describe()

In [6]:
# feature scale
feature_scale = pd.DataFrame(preprocessing.scale(feature_impute), columns=header)
feature_scale.head()

  


Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,0.176725,0.555458,0.852692,-0.380985,0.149048,-0.387298,-0.693683,-0.01527,-0.700908,-0.567213,0.253637,0.396182,-0.510509
1,0.285116,0.555458,0.852692,1.952471,0.158792,-0.387298,-0.693683,-1.299403,1.426721,0.036852,0.86704,0.396182,-0.510509
2,0.176725,-1.800315,-0.236859,0.326123,1.006587,2.581989,-0.693683,1.268864,-0.700908,-0.567213,-1.711246,-1.883308,-0.510509
3,-0.473623,-1.800315,-0.236859,-0.734539,-0.055593,-0.387298,-0.693683,-0.657336,-0.700908,-0.567213,0.253637,0.396182,-0.510509
4,-0.25684,0.555458,0.852692,-0.734539,-1.955823,-0.387298,0.619047,0.669601,1.426721,-0.567213,-1.711246,0.396182,1.417604


In [7]:
# display feature correlation with label
for feat in feature_scale:
    print(feat, feature_scale[feat].corr(label))

Column1 0.00950298929003149
Column2 0.002169923491672243
Column3 -0.0019815812945992647
Column4 -0.06693641003884787
Column5 -0.1177640309076227
Column6 -0.03524705294950225
Column7 -0.05383415418850949
Column8 0.029848809975586067
Column9 -0.05664339720406507
Column10 -0.08849686660664911
Column11 -0.0795584160413824
Column12 -0.02364959703871219
Column13 0.061955162333051754


In [8]:
# # drop feature with low correlation
# # feature_scale.drop("Column1",axis=1,inplace=True)
# # feature_scale.drop("Column2",axis=1,inplace=True)
# # feature_scale.drop("Column3",axis=1,inplace=True)
# # feature_scale.drop("Column4",axis=1,inplace=True)
# # feature_scale.drop("Column5",axis=1,inplace=True)
# # feature_scale.drop("Column6",axis=1,inplace=True)
# # feature_scale.drop("Column7",axis=1,inplace=True)
# # feature_scale.drop("Column8",axis=1,inplace=True)
# # feature_scale.drop("Column9",axis=1,inplace=True)
# # feature_scale.drop("Column10",axis=1,inplace=True)
# # feature_scale.drop("Column11",axis=1,inplace=True)
# # feature_scale.drop("Column12",axis=1,inplace=True)
# # feature_scale.drop("Column13",axis=1,inplace=True)


import itertools
feature_combinations = []

# print(list(itertools.combinations(header,1)))
for i in range(1,14):
    feature_combinations.append(list(itertools.combinations(header,i)))

In [9]:
# split train and test 
X_train,X_test,y_train,y_test = train_test_split(feature_scale,label,test_size=0.1)

In [None]:
kf = KFold(n_splits=10)

best_model = None
worst_model = None
best_accuracy = 0.0
worst_accuracy = 1.0
sum_accuracy = 0.0

best_header = None

models = []

# print(feature_combinations[12])

# print(feature_combinations[3])

i = 3
s = 50
e = 100
for j in range(s,e):
    cur_header = []
    for k in range(len(feature_combinations[i][j])):
        cur_header.append(feature_combinations[i][j][k])

    cur_feature = feature_scale[cur_header]
#     df_to_fold = pd.concat([cur_feature,label],axis=1)
#     print(df_to_fold.isnull().sum())
    for train_idx,test_idx in kf.split(cur_feature):
#         train = df_to_fold.iloc[train_idx]
#         test = df_to_fold.iloc[test_idx]

        X_train = cur_feature.iloc[train_idx] #train.drop('Column14',axis=1)
        y_train = label.iloc[train_idx]       #train['Column14']

        X_test = cur_feature.iloc[test_idx] #test.drop('Column14',axis=1)
        y_test = label.iloc[test_idx] #test['Column14']

        cur_model = MLPClassifier(solver='lbfgs', alpha=0.001,
                     hidden_layer_sizes=(7, 11, 7), activation='logistic', learning_rate='constant', random_state=100).fit(X_train,y_train)
        y_predict = cur_model.predict(X_test)
        cur_accuracy = accuracy_score(y_test, y_predict)

    #     print(cur_accuracy)

        if (cur_accuracy > best_accuracy):
            best_model = cur_model
            best_accuracy = cur_accuracy
            best_header = cur_header

        if (cur_accuracy < worst_accuracy):
            worst_model = cur_model
            worst_accuracy = cur_accuracy

        models.append(cur_model)
    
        
#     print(i)
    
# df_to_fold = pd.concat([feature_scale,label], axis=1)

print(best_header)
print(best_accuracy)

In [11]:
# # construct model
# mlp = MLPClassifier(solver='adam', alpha=0.1,
#                      hidden_layer_sizes=(11, 17, 11), activation='logistic', learning_rate='adaptive')
# mlp_model = mlp.fit(X_train, y_train)
# mlp_model


In [12]:
# # predict x_test
# y_predict = mlp_model.predict(X_test)

# # display accuracy score
# print('Accuracy score: ',round(accuracy_score(y_test, y_predict),2))

In [13]:
# # construct model
# mlp = MLPClassifier(max_iter=100)
# parameter_space = {
#     'hidden_layer_sizes' : [(7,11,7), (100,)],
#     'activation' : ['logistic','tanh','relu'],
#     'solver' : ['sgd', 'adam', 'lbfgs'],
#     'alpha' : [0.00001, 0.0001, 0.001, 0.01, 0.1],
#     'learning_rate' : ['constant','adaptive']
# }

# from sklearn.model_selection import GridSearchCV

# mlp_model = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=10)
# mlp_model.fit(X_train, y_train)

# print('Best parameters found:\n',mlp_model.best_params_)
# # print(mlp_model.cv_results_)
# means = mlp_model.cv_results_['mean_test_score']
# stds = mlp_model.cv_results_['std_test_score']
# # for mean, std, params in zip(means, stds, mlp_model.cv_results_['params']):
# #     print("%0.3f (+/-%0.03f) for %r" % (mean, std*2, params))

In [14]:
# # predict x_test
# y_predict = mlp_model.predict(X_test)

# # display accuracy score
# print('Accuracy score: ',(accuracy_score(y_test, y_predict)))

In [15]:
# y_true, y_pred = y_test, mlp_model.predict(X_test)

# from sklearn.metrics import classification_report
# print('Result on the test set:\n')
# print(classification_report(y_true, y_pred))