In [5]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

#Plot
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

testData = pd.read_csv("test.csv")
testY = pd.read_csv("output.csv")
testData['stroke'] = testY['stroke']
trainData = pd.read_csv("train.csv")

#Not Significant for Strokes

trainData = trainData.drop( columns=['ever_married', 'gender', 'id'] )
testData = testData.drop( columns=['ever_married', 'gender', 'id'] )

#Filling Missing BMI Data === train

temp = trainData['bmi'].mean() + trainData['bmi'].median()
temp /= 2
temp
trainData['bmi'] = trainData['bmi'].fillna(temp)

#Filling Missing BMI Data ===test

temp = testData['bmi'].mean() + testData['bmi'].median()
temp /= 2
temp
testData['bmi'] = testData['bmi'].fillna(temp)

# Segregating into Two Models Smoking & Non-Smoking

trainDataNS = trainData[ trainData['smoking_status'].isna() ]
trainDataNS = trainDataNS.drop( columns=['smoking_status'] )
trainDataS = trainData[ trainData['smoking_status'].notna() ]

testDataNS = testData[ testData['smoking_status'].isna() ]
testDataNS = testDataNS.drop( columns=['smoking_status'] )
testDataS = testData[ testData['smoking_status'].notna() ]

# X and Y division

yTrainDataNS = trainDataNS['stroke']
xTrainDataNS = trainDataNS.drop( columns=['stroke'] )

yTrainDataS = trainDataS['stroke']
xTrainDataS = trainDataS.drop( columns=['stroke'] )

yTestDataNS = testDataNS['stroke']
xTestDataNS = testDataNS.drop( columns=['stroke'] )

yTestDataS = testDataS['stroke']
xTestDataS = testDataS.drop( columns=['stroke'] )

#Nominal Categories to one-hot encoding === Train

#Smoke Data
xTrainDataS = pd.get_dummies( xTrainDataS, 
                             columns=[ 'work_type' , 'Residence_type', 'smoking_status']
                            , prefix= ['work_type' , 'res_type', 'smoke'] )


#No Smoke Data
xTrainDataNS = pd.get_dummies( xTrainDataNS,
                              columns=[ 'work_type' , 'Residence_type']
                            , prefix=['[work_type' , 'res_type'] )

#Nominal Categories to one-hot encoding ===Test

#Smoke Data
xTestDataS = pd.get_dummies( xTestDataS,
                            columns=['work_type' , 'Residence_type', 'smoking_status']
                            , prefix=['work_type' , 'res_type', 'smoke'] )


#No Smoke Data
xTestDataNS = pd.get_dummies( xTestDataNS,
                             columns=['work_type' , 'Residence_type']
                            , prefix=['work_type' , 'res_type'] )

smokeCol = xTrainDataS.columns
nsmokeCol = xTrainDataNS.columns


#Sampling for removing imbalance

from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
smote = RandomOverSampler(random_state=0)

xTrainDataNS, yTrainDataNS = smote.fit_sample(xTrainDataNS, yTrainDataNS)
xTrainDataS, yTrainDataS = smote.fit_sample(xTrainDataS, yTrainDataS)

xTrainDataS = pd.DataFrame( xTrainDataS , columns=smokeCol )
xTrainDataNS = pd.DataFrame( xTrainDataNS , columns=nsmokeCol )
yTrainDataS = pd.DataFrame( yTrainDataS , columns=['stroke'])
yTrainDataNS = pd.DataFrame( yTrainDataNS , columns=['stroke'])

In [8]:
from sklearn.model_selection import train_test_split

#Train Splitter

x_train, x_test, y_train, y_test = train_test_split(xTrainDataNS, np.ravel( yTrainDataNS , order='C' ) 
                                                    , test_size=0.35, random_state=0)

x_trainS, x_testS, y_trainS, y_testS = train_test_split(xTrainDataS, np.ravel( yTrainDataS , order='C' ),
                                                        test_size=0.35, random_state=0)

In [9]:
#MLP

from sklearn.neural_network import MLPClassifier

sModel = MLPClassifier(alpha=0.00015 )


sModel.fit(x_trainS, y_trainS)


predictions = sModel.predict(x_testS)

print(sModel.score(x_testS, y_testS))
print(metrics.confusion_matrix(y_testS, predictions))


#Blind Test

predictions = sModel.predict(xTestDataS)


print(sModel.score(xTestDataS, yTestDataS))
print(metrics.confusion_matrix(yTestDataS, predictions))


"""
0.7962576954772408
[[7369 2899]
 [1304 9057]]
0.5964980544747082
[[6704 2739]
 [2446  961]]


"""

0.7962576954772408
[[7369 2899]
 [1304 9057]]
0.5964980544747082
[[6704 2739]
 [2446  961]]


In [10]:
from joblib import dump

dump( sModel , 'smokemlp.joblib' )

['smokemlp.joblib']

In [13]:

model = MLPClassifier(alpha=0.0002, max_iter=400 )


model.fit(x_train, y_train)


predictions = model.predict(x_test)

print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))

predictions = model.predict(xTestDataNS)


print(model.score(xTestDataNS, yTestDataNS))
print(metrics.confusion_matrix(yTestDataNS, predictions))


"""
0.8876453330435727
[[3606  972]
 [  62 4563]]
0.6395409494001043
[[3340  867]
 [1206  338]]

"""

0.8876453330435727
[[3606  972]
 [  62 4563]]
0.6395409494001043
[[3340  867]
 [1206  338]]


In [14]:
dump( model , 'nosmokemlp.joblib' )

['nosmokemlp.joblib']