In [3]:
# Project Name: AI Based Early Stroke Detection 
# Author List: Nehal Kalnad,Ashley Lobo, e-Yantra Team 
# Filename: neural-network.ipynb
# Functions: imblearn.over_sampling.smote.fit_sample ,sklearn.preprocessing.StandardScaler, sklearn.neural_network.MLPClassifier.predict
# Global Variables:testData,testY,trainData,temp,scaler,trainDataNS,trainDataS,testDataNS,testDataS,xTrainDataNS,yTrainDataNS,model,predictions, 
#                  xTrainDataS,yTrainDataS,smokeCol,nsmokeCol,x_train,x_test,y_train,y_test,x_trainS,x_testS,y_trainS,y_testS

import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split

#Plot
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

# Data set obtained from
# https://www.kaggle.com/asaumya/healthcare-dataset-stroke-data

testData = pd.read_csv("data/test.csv")
testY = pd.read_csv("data/output.csv")
testData['stroke'] = testY['stroke']
trainData = pd.read_csv("data/train.csv")

#Not Significant for Strokes

trainData = trainData.drop( columns=['ever_married', 'gender', 'id','avg_glucose_level'] )
testData = testData.drop( columns=['ever_married', 'gender', 'id','avg_glucose_level'] )

#Filling Missing BMI Data === train

temp = trainData['bmi'].mean() + trainData['bmi'].median()
temp /= 2
temp
trainData['bmi'] = trainData['bmi'].fillna(temp)

#Filling Missing BMI Data ===test

temp = testData['bmi'].mean() + testData['bmi'].median()
temp /= 2
temp
testData['bmi'] = testData['bmi'].fillna(temp)

# Segregating into Two Models Smoking & Non-Smoking


trainDataS = trainData[ trainData['smoking_status'].notna() ]
testDataS = testData[ testData['smoking_status'].notna() ]

# X and Y division

yTrainDataS = trainDataS['stroke']
xTrainDataS = trainDataS.drop( columns=['stroke'] )

yTestDataS = testDataS['stroke']
xTestDataS = testDataS.drop( columns=['stroke'] )

#Nominal Categories to one-hot encoding === Train

#Smoke Data
xTrainDataS = pd.get_dummies( xTrainDataS, 
                             columns=[ 'work_type' , 'Residence_type', 'smoking_status']
                            , prefix= ['work_type' , 'res_type', 'smoke'] )


#Nominal Categories to one-hot encoding ===Test

#Smoke Data
xTestDataS = pd.get_dummies( xTestDataS,
                            columns=['work_type' , 'Residence_type', 'smoking_status']
                            , prefix=['work_type' , 'res_type', 'smoke'] )




smokeCol = xTrainDataS.columns
# nsmokeCol = xTrainDataNS.columns


#Sampling for removing imbalance


# Function Name:imblearn.over_sampling.smote.fit_sample 
# Input:X:{array-like, sparse matrix}, shape (n_samples, n_features)
#               y:array-like, shape (n_samples,) 
# Output:X_resampled:{array-like, sparse matrix}, shape (n_samples_new, n_features) 
#               y_resampled:array-like, shape (n_samples_new,) 
# Logic:Used to balance dataset
# Example Call: X_res, y_res = sm.fit_sample(X, y) 

from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
smote = RandomOverSampler(random_state=0)


xTrainDataS, yTrainDataS = smote.fit_sample(xTrainDataS, yTrainDataS)

xTrainDataS = pd.DataFrame( xTrainDataS , columns=smokeCol )
yTrainDataS = pd.DataFrame( yTrainDataS , columns=['stroke'])

In [6]:
from sklearn.model_selection import train_test_split

#Train Splitter

x_train, x_test, y_train, y_test = train_test_split(xTrainDataS, np.ravel( yTrainDataS , order='C' ),
                                                        test_size=0.35, random_state=0)

In [14]:
#Naive Base
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model.fit(x_train, y_train)


predictions = model.predict(x_test)

print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))

#naiva base on test data

predictions = model.predict(xTestDataS)


print(model.score(xTestDataS, yTestDataS))
print(metrics.confusion_matrix(yTestDataS, predictions))

print("=======")
print(metrics.f1_score(yTestDataS, predictions))
print(metrics.accuracy_score(yTestDataS, predictions))
print(metrics.precision_score(yTestDataS, predictions))

0.5184449076542731
[[  334  9934]
 [    0 10361]]
0.3145525291828794
[[ 382 8807]
 [   1 3660]]
0.45386904761904767
0.3145525291828794
0.29357503810058555


In [15]:
## RidgeClassifier

from sklearn import linear_model

model = linear_model.RidgeClassifier()

model.fit(x_train, y_train)


predictions = model.predict(x_test)

print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))

## Blindtest

predictions = model.predict(xTestDataS)

print(model.score(xTestDataS, yTestDataS))
print(metrics.confusion_matrix(yTestDataS, predictions))


print("=======")
print(metrics.f1_score(yTestDataS, predictions))
print(metrics.accuracy_score(yTestDataS, predictions))
print(metrics.precision_score(yTestDataS, predictions))

0.7460371321925445
[[7277 2991]
 [2248 8113]]
0.9592217898832684
[[8825  364]
 [ 160 3501]]
0.9303747010364072
0.9592217898832684
0.9058214747736093


In [19]:
#Logistic Reg =====Best

from sklearn.linear_model import LogisticRegression as LR

model = LR()

model.fit(x_train, y_train)


predictions = model.predict(x_test)

print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))

#LR on Test

predictions = model.predict(xTestDataS)


print(model.score(xTestDataS, yTestDataS))
print(metrics.confusion_matrix(yTestDataS, predictions))


print("=======")
print(metrics.f1_score(yTestDataS, predictions))
print(metrics.accuracy_score(yTestDataS, predictions))
print(metrics.precision_score(yTestDataS, predictions))



0.7458432304038005
[[7436 2832]
 [2411 7950]]
0.966614785992218
[[8968  221]
 [ 208 3453]]
0.9415132924335378
0.966614785992218
0.9398475775721284


In [21]:
import joblib as jl 
jl.dump(model, 'regmodel.joblib')

['regmodel.joblib']

In [17]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier



model = DecisionTreeClassifier()

model.fit(x_train, y_train)


predictions = model.predict(x_test)

print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))

#LR on Test

predictions = model.predict(xTestDataS)


print(model.score(xTestDataS, yTestDataS))
print(metrics.confusion_matrix(yTestDataS, predictions))


print("=======")
print(metrics.f1_score(yTestDataS, predictions))
print(metrics.accuracy_score(yTestDataS, predictions))
print(metrics.precision_score(yTestDataS, predictions))


0.9837607251926899
[[ 9933   335]
 [    0 10361]]
0.7322178988326848
[[9103   86]
 [3355  306]]
0.15099925980754997
0.7322178988326848
0.7806122448979592


In [18]:
from sklearn import svm

model = svm.SVC()

model.fit(x_train, y_train)


predictions = model.predict(x_test)

print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))

#LR on Test

predictions = model.predict(xTestDataS)


print(model.score(xTestDataS, yTestDataS))
print(metrics.confusion_matrix(yTestDataS, predictions))


print("=======")
print(metrics.f1_score(yTestDataS, predictions))
print(metrics.accuracy_score(yTestDataS, predictions))
print(metrics.precision_score(yTestDataS, predictions))



0.8668864220272432
[[7972 2296]
 [ 450 9911]]
0.8309727626459144
[[8462  727]
 [1445 2216]]
0.6711084191399151
0.8309727626459144
0.7529731566428814
