In [1]:
# Project Name: AI Based Early Stroke Detection 
# Author List: Nehal Kalnad,Ashley Lobo, e-Yantra Team 
# Filename: stroke-mlp-final.py 
# Functions: None, 
# Global Variables:testData,testY,trainData,temp,scaler,trainDataNS,trainDataS,testDataNS,testDataS,xTrainDataNS,yTrainDataNS,model,predictions, 
#                  xTrainDataS,yTrainDataS,smokeCol,nsmokeCol,x_train,x_test,y_train,y_test,x_trainS,x_testS,y_trainS,y_testS

import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

#Plot
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

testData = pd.read_csv("test.csv")
testY = pd.read_csv("output.csv")
testData['stroke'] = testY['stroke']
trainData = pd.read_csv("train.csv")

#Not Significant for Strokes

trainData = trainData.drop( columns=['ever_married', 'gender', 'id','avg_glucose_level'] )
testData = testData.drop( columns=['ever_married', 'gender', 'id','avg_glucose_level'] )

#Filling Missing BMI Data === train

temp = trainData['bmi'].mean() + trainData['bmi'].median()
temp /= 2
temp
trainData['bmi'] = trainData['bmi'].fillna(temp)

#Filling Missing BMI Data ===test

temp = testData['bmi'].mean() + testData['bmi'].median()
temp /= 2
temp
testData['bmi'] = testData['bmi'].fillna(temp)

# Segregating into Two Models Smoking & Non-Smoking

trainDataNS = trainData[ trainData['smoking_status'].isna() ]
trainDataNS = trainDataNS.drop( columns=['smoking_status'] )
trainDataS = trainData[ trainData['smoking_status'].notna() ]

testDataNS = testData[ testData['smoking_status'].isna() ]
testDataNS = testDataNS.drop( columns=['smoking_status'] )
testDataS = testData[ testData['smoking_status'].notna() ]

# X and Y division

yTrainDataNS = trainDataNS['stroke']
xTrainDataNS = trainDataNS.drop( columns=['stroke'] )

yTrainDataS = trainDataS['stroke']
xTrainDataS = trainDataS.drop( columns=['stroke'] )

yTestDataNS = testDataNS['stroke']
xTestDataNS = testDataNS.drop( columns=['stroke'] )

yTestDataS = testDataS['stroke']
xTestDataS = testDataS.drop( columns=['stroke'] )

#Nominal Categories to one-hot encoding === Train

#Smoke Data
xTrainDataS = pd.get_dummies( xTrainDataS, 
                             columns=[ 'work_type' , 'Residence_type', 'smoking_status']
                            , prefix= ['work_type' , 'res_type', 'smoke'] )


#No Smoke Data
xTrainDataNS = pd.get_dummies( xTrainDataNS,
                              columns=[ 'work_type' , 'Residence_type']
                            , prefix=['work_type' , 'res_type'] )

#Nominal Categories to one-hot encoding ===Test

#Smoke Data
xTestDataS = pd.get_dummies( xTestDataS,
                            columns=['work_type' , 'Residence_type', 'smoking_status']
                            , prefix=['work_type' , 'res_type', 'smoke'] )


#No Smoke Data
xTestDataNS = pd.get_dummies( xTestDataNS,
                             columns=['work_type' , 'Residence_type']
                            , prefix=['work_type' , 'res_type'] )

smokeCol = xTrainDataS.columns
nsmokeCol = xTrainDataNS.columns


#Sampling for removing imbalance


# Function Name:imblearn.over_sampling.smote.fit_sample 
# Input:X:{array-like, sparse matrix}, shape (n_samples, n_features)
#               y:array-like, shape (n_samples,) 
# Output:X_resampled:{array-like, sparse matrix}, shape (n_samples_new, n_features) 
#               y_resampled:array-like, shape (n_samples_new,) 
# Logic:Used to balance dataset
# Example Call: X_res, y_res = sm.fit_sample(X, y) 

from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
smote = RandomOverSampler(random_state=0)

xTrainDataNS, yTrainDataNS = smote.fit_sample(xTrainDataNS, yTrainDataNS)
xTrainDataS, yTrainDataS = smote.fit_sample(xTrainDataS, yTrainDataS)

xTrainDataS = pd.DataFrame( xTrainDataS , columns=smokeCol )
xTrainDataNS = pd.DataFrame( xTrainDataNS , columns=nsmokeCol )
yTrainDataS = pd.DataFrame( yTrainDataS , columns=['stroke'])
yTrainDataNS = pd.DataFrame( yTrainDataNS , columns=['stroke'])

In [2]:
from sklearn.model_selection import train_test_split

#Train Splitter

x_train, x_test, y_train, y_test = train_test_split(xTrainDataNS, np.ravel( yTrainDataNS , order='C' ) 
                                                    , test_size=0.35, random_state=0)

x_trainS, x_testS, y_trainS, y_testS = train_test_split(xTrainDataS, np.ravel( yTrainDataS , order='C' ),
                                                        test_size=0.35, random_state=0)

In [3]:
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
import sys

# Function Name:sklearn.neural_network.MLPClassifier.predict, 
# Input:X : {array-like, sparse matrix}, shape (n_samples, n_features)              
# Output:y : array-like, shape (n_samples,) or (n_samples, n_classes)
# Logic:Predict using the multi-layer perceptron classifier
# Example Call: X_res= sm.preditc(X), 

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
xTestDataNS = scaler.transform( xTestDataNS )



scaler = StandardScaler()
scaler.fit(x_trainS)

x_trainS = scaler.transform(x_trainS)
x_testS = scaler.transform(x_testS)
xTestDataS = scaler.transform( xTestDataS )


In [4]:
#MLP

from sklearn.neural_network import MLPClassifier

sModel = MLPClassifier( hidden_layer_sizes=(16,64,64) , max_iter=300 , alpha=0.00008 )


sModel.fit(x_trainS, y_trainS)


# Function Name:sklearn.neural_network.MLPClassifier.predict, 
# Input:X : {array-like, sparse matrix}, shape (n_samples, n_features)              
# Output:y : array-like, shape (n_samples,) or (n_samples, n_classes)
# Logic:Predict using the multi-layer perceptron classifier
# Example Call: X_res= sModel.preditc(X), 

#Synthetic Test

predictions = sModel.predict(x_testS)
print(sModel.score(x_testS, y_testS))
print(metrics.confusion_matrix(y_testS, predictions))


#Blind Test

predictions = sModel.predict(xTestDataS)
print(sModel.score(xTestDataS, yTestDataS))
print(metrics.confusion_matrix(yTestDataS, predictions))




"""
 
0.9208395947452616
[[ 8687  1581]
 [   52 10309]]
0.6601556420233463
[[7998 1445]
 [2922  485]]



"""


0.8661108148722672
[[8390 1878]
 [ 884 9477]]
0.6378210116731517
[[7614 1829]
 [2825  582]]


'\n \n0.9208395947452616\n[[ 8687  1581]\n [   52 10309]]\n0.6601556420233463\n[[7998 1445]\n [2922  485]]\n\n\n\n'

In [5]:
#Creates Model for Flask running

from joblib import dump

dump( sModel , 'smokemlp.joblib' )

ModuleNotFoundError: No module named 'joblib'

In [None]:
######## No Smoke Data - mlp didnt work


from sklearn import linear_model

model = linear_model.RidgeClassifier()

model.fit(x_train, y_train)


# Synthetic DataTest

predictions = model.predict(x_test)
print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))

#BlindTest

predictions = model.predict(xTestDataNS)
print(model.score(xTestDataNS, yTestDataNS))
print(metrics.confusion_matrix(yTestDataNS, predictions))


In [None]:
dump( model , 'nosmokemlp.joblib' )