In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

#Plot
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

testData = pd.read_csv("test.csv")
testY = pd.read_csv("output.csv")
testData['stroke'] = testY['stroke']
trainData = pd.read_csv("train.csv")

#Not Significant for Strokes

trainData = trainData.drop( columns=['ever_married', 'gender', 'id','avg_glucose_level'] )
testData = testData.drop( columns=['ever_married', 'gender', 'id','avg_glucose_level'] )

#Filling Missing BMI Data === train

temp = trainData['bmi'].mean() + trainData['bmi'].median()
temp /= 2
temp
trainData['bmi'] = trainData['bmi'].fillna(temp)

#Filling Missing BMI Data ===test

temp = testData['bmi'].mean() + testData['bmi'].median()
temp /= 2
temp
testData['bmi'] = testData['bmi'].fillna(temp)

# Segregating into Two Models Smoking & Non-Smoking

trainDataNS = trainData[ trainData['smoking_status'].isna() ]
trainDataNS = trainDataNS.drop( columns=['smoking_status'] )
trainDataS = trainData[ trainData['smoking_status'].notna() ]

testDataNS = testData[ testData['smoking_status'].isna() ]
testDataNS = testDataNS.drop( columns=['smoking_status'] )
testDataS = testData[ testData['smoking_status'].notna() ]

# X and Y division

yTrainDataNS = trainDataNS['stroke']
xTrainDataNS = trainDataNS.drop( columns=['stroke'] )

yTrainDataS = trainDataS['stroke']
xTrainDataS = trainDataS.drop( columns=['stroke'] )

yTestDataNS = testDataNS['stroke']
xTestDataNS = testDataNS.drop( columns=['stroke'] )

yTestDataS = testDataS['stroke']
xTestDataS = testDataS.drop( columns=['stroke'] )

#Nominal Categories to one-hot encoding === Train

#Smoke Data
xTrainDataS = pd.get_dummies( xTrainDataS, 
                             columns=[ 'work_type' , 'Residence_type', 'smoking_status']
                            , prefix= ['work_type' , 'res_type', 'smoke'] )


#No Smoke Data
xTrainDataNS = pd.get_dummies( xTrainDataNS,
                              columns=[ 'work_type' , 'Residence_type']
                            , prefix=['work_type' , 'res_type'] )

#Nominal Categories to one-hot encoding ===Test

#Smoke Data
xTestDataS = pd.get_dummies( xTestDataS,
                            columns=['work_type' , 'Residence_type', 'smoking_status']
                            , prefix=['work_type' , 'res_type', 'smoke'] )


#No Smoke Data
xTestDataNS = pd.get_dummies( xTestDataNS,
                             columns=['work_type' , 'Residence_type']
                            , prefix=['work_type' , 'res_type'] )

smokeCol = xTrainDataS.columns
nsmokeCol = xTrainDataNS.columns


#Sampling for removing imbalance

from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
smote = RandomOverSampler(random_state=0)

xTrainDataNS, yTrainDataNS = smote.fit_sample(xTrainDataNS, yTrainDataNS)
xTrainDataS, yTrainDataS = smote.fit_sample(xTrainDataS, yTrainDataS)

xTrainDataS = pd.DataFrame( xTrainDataS , columns=smokeCol )
xTrainDataNS = pd.DataFrame( xTrainDataNS , columns=nsmokeCol )
yTrainDataS = pd.DataFrame( yTrainDataS , columns=['stroke'])
yTrainDataNS = pd.DataFrame( yTrainDataNS , columns=['stroke'])

In [6]:
a = { 'b' : 4 }
a['b']

4

In [8]:
xTrainDataS.tail()

Unnamed: 0,age,hypertension,heart_disease,bmi,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,res_type_Rural,res_type_Urban,smoke_formerly smoked,smoke_never smoked,smoke_smokes
58935,39.0,0.0,0.0,26.3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
58936,78.0,1.0,0.0,32.6,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
58937,52.0,0.0,0.0,26.4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
58938,80.0,1.0,0.0,28.4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
58939,71.0,0.0,1.0,26.4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [3]:
from sklearn.model_selection import train_test_split

#Train Splitter

x_train, x_test, y_train, y_test = train_test_split(xTrainDataNS, np.ravel( yTrainDataNS , order='C' ) 
                                                    , test_size=0.35, random_state=0)

x_trainS, x_testS, y_trainS, y_testS = train_test_split(xTrainDataS, np.ravel( yTrainDataS , order='C' ),
                                                        test_size=0.35, random_state=0)

In [4]:
from sklearn.preprocessing import StandardScaler
import sys

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
xTestDataNS = scaler.transform( xTestDataNS )



scaler = StandardScaler()
scaler.fit(x_trainS)

x_trainS = scaler.transform(x_trainS)
x_testS = scaler.transform(x_testS)
xTestDataS = scaler.transform( xTestDataS )


  


In [5]:
#MLP

from sklearn.neural_network import MLPClassifier

sModel = MLPClassifier( hidden_layer_sizes=(16,64,64) , max_iter=300 , alpha=0.00005  )


sModel.fit(x_trainS, y_trainS)


predictions = sModel.predict(x_testS)

print(sModel.score(x_testS, y_testS))
print(metrics.confusion_matrix(y_testS, predictions))


#Blind Test

predictions = sModel.predict(xTestDataS)


print(sModel.score(xTestDataS, yTestDataS))
print(metrics.confusion_matrix(yTestDataS, predictions))




0.9166707062872655
[[ 8857  1411]
 [  308 10053]]
0.6640466926070039
[[8115 1328]
 [2989  418]]


In [None]:

"""
0.7962576954772408
[[7369 2899]
 [1304 9057]]
0.5964980544747082
[[6704 2739]
 [2446  961]]

$ hidden_layer_sizes=(16,64,64,64) , max_iter=300 , alpha=0.00015 

0.9261234184885355
[[ 8849  1419]
 [  105 10256]]
0.6644357976653696
[[8089 1354]
 [2958  449]]

$ dropped glucose hidden_layer_sizes=(16,64,64) , max_iter=300 , alpha=0.00015

0.8628144844636192
[[8092 2176]
 [ 654 9707]]
0.6327626459143969
[[7447 1996]
 [2723  684]]

$ hidden_layer_sizes=(16,64,64) , max_iter=300 , alpha=0.0003

0.8783266275631393
[[ 8073  2195]
 [  315 10046]]
0.6292607003891051
[[7398 2045]
 [2719  688]]

$ hidden_layer_sizes=(16,64,64) , max_iter=300 , alpha=0.0006

0.8612632701536672
[[7882 2386]
 [ 476 9885]]
0.6214007782101167
[[7214 2229]
 [2636  771]]

$ hidden_layer_sizes=(16,64,64) , max_iter=300 , alpha=0.0006 scaled

0.9007222841630714
[[9067 1201]
 [ 847 9514]]
0.674863813229572
[[8312 1131]
 [3047  360]]

$ hidden_layer_sizes=(16,64,64) , max_iter=300 , alpha=0.0001 scaled

0.9171069853119395
[[ 8657  1611]
 [   99 10262]]
0.6598443579766538
[[7974 1469]
 [2902  505]]

$ hidden_layer_sizes=(16,64,64) , max_iter=300 , alpha=0.00005 scaled ==best

0.9002375296912114
[[ 8299  1969]
 [   89 10272]]
0.6368093385214008
[[7576 1867]
 [2800  607]]

$ hidden_layer_sizes=(16,64,64) , max_iter=300 , alpha=0.00005  smote

0.9247257097613392
[[8210  570]
 [ 761 8141]]
0.7001556420233463
[[8789  654]
 [3199  208]]


"""

In [10]:
from joblib import dump

dump( sModel , 'smokemlp.joblib' )

['smokemlp.joblib']

In [18]:
#MLP

from sklearn.neural_network import MLPClassifier

model = MLPClassifier( hidden_layer_sizes=(16,24,24) , max_iter=450 , alpha=0.001  )


model.fit(x_train, y_train)


predictions = model.predict(x_test)

print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))


#Blind Test

predictions = model.predict(xTestDataNS)


print(model.score(xTestDataNS, yTestDataNS))
print(metrics.confusion_matrix(yTestDataNS, predictions))



0.955254151349981
[[3562  353]
 [   0 3974]]
0.688923665449487
[[3806  401]
 [1388  156]]


In [14]:
dump( model , 'nosmokemlp.joblib' )

['nosmokemlp.joblib']

In [36]:
from collections import Counter

Counter(y_testS)

Counter({1: 8902, 0: 8780})