In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#Plot
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
sns.set(style="whitegrid", color_codes=True)

testData = pd.read_csv("test.csv")
testY = pd.read_csv("output.csv")
testData['stroke'] = testY['stroke']
trainData = pd.read_csv("train.csv")

#Not Significant for Strokes

trainData = trainData.drop( columns=['ever_married', 'gender', 'id'] )
testData = testData.drop( columns=['ever_married', 'gender', 'id'] )

#Filling Missing BMI Data === train

temp = trainData['bmi'].mean() + trainData['bmi'].median()
temp /= 2
temp
trainData['bmi'] = trainData['bmi'].fillna(temp)
trainData['smoking_status'] = trainData['smoking_status'].fillna("unknown")

#Filling Missing BMI Data ===test

temp = testData['bmi'].mean() + testData['bmi'].median()
temp /= 2
temp
testData['bmi'] = testData['bmi'].fillna(temp)
testData['smoking_status'] = testData['smoking_status'].fillna("unknown")

In [35]:
data = trainData.append(testData)

from collections import Counter

Counter( data['stroke'] )

Counter({0: 56267, 1: 5734})

In [68]:
data.to_csv(path_or_buf='data.csv')

In [36]:
from sklearn.model_selection import train_test_split

#Train Splitter

x_train, x_test, y_train, y_test = train_test_split(data.drop( columns=['stroke'] ),
                                                    np.ravel( data['stroke'] , order='C' )  ,
                                                    test_size=0.25, random_state=0)

In [32]:
# print( Counter( y_train ) )
# print( Counter( y_test ) )
print( x_train.describe() )

                age  hypertension  heart_disease  avg_glucose_level  \
count  46500.000000  46500.000000   46500.000000       46500.000000   
mean      42.153671      0.093527       0.047935         104.332569   
std       22.544443      0.291173       0.213632          42.845126   
min        0.080000      0.000000       0.000000          55.000000   
25%       24.000000      0.000000       0.000000          77.480000   
50%       44.000000      0.000000       0.000000          91.600000   
75%       60.000000      0.000000       0.000000         112.142500   
max       82.000000      1.000000       1.000000         291.050000   

                bmi  work_type_Govt_job  work_type_Never_worked  \
count  46500.000000        46500.000000            46500.000000   
mean      28.564693            0.125054                0.003892   
std        7.648760            0.330783                0.062269   
min       10.100000            0.000000                0.000000   
25%       23.400000      

In [37]:
#Train Data

x_train = pd.get_dummies( x_train, 
                             columns=[ 'work_type' , 'Residence_type', 'smoking_status']
                            , prefix= ['work_type' , 'res_type', 'smoke'] )


#Test Data
x_test = pd.get_dummies( x_test,
                             columns=[ 'work_type' , 'Residence_type', 'smoking_status']
                            , prefix= ['work_type' , 'res_type', 'smoke'] )



In [39]:
#Sampling for removing imbalance

from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
smote = RandomOverSampler(random_state=0)

xcols = x_train.columns
x_train, y_train = smote.fit_sample(x_train, y_train)

x_train = pd.DataFrame( x_train , columns=xcols )
y_train = pd.DataFrame( y_train , columns=['stroke'] )

  y = column_or_1d(y, warn=True)


In [41]:
import warnings
warnings.filterwarnings("ignore")

In [43]:
#SVM
from sklearn import metrics
from sklearn.svm import LinearSVC

model = LinearSVC()

model.fit(x_train, y_train)


predictions = model.predict(x_test)

print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))

0.09470356751177343
[[    0 14033]
 [    0  1468]]


In [44]:
## RidgeClassifier

from sklearn import linear_model

model = linear_model.RidgeClassifier()

model.fit(x_train, y_train)


predictions = model.predict(x_test)

print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))


0.5919618089155538
[[8480 5553]
 [ 772  696]]


In [66]:
#Logistic Reg

from sklearn.linear_model import LogisticRegression as LR

model = LR()

good = ['age', 'avg_glucose_level',
        'bmi', 'work_type_Private', 'res_type_Urban', 'smoke_never smoked', 'smoke_smokes', 'smoke_unknown']

X = x_train
x = x_test
model.fit(X, y_train)


predictions = model.predict(x)

print(model.score(x, y_test))
print(metrics.confusion_matrix(y_test, predictions))

0.5920263208825237
[[8481 5552]
 [ 772  696]]


In [57]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE

model = DecisionTreeClassifier()
# create the RFE model and select 8 attributes
rfe = RFE(model, 12)
rfe = rfe.fit(x_train, y_train)
# summarize the selection of the attributes
print('Selected features: %s' % list(x_train.columns[rfe.support_]))

Selected features: ['age', 'hypertension', 'avg_glucose_level', 'bmi', 'work_type_Govt_job', 'work_type_Private', 'work_type_Self-employed', 'res_type_Urban', 'smoke_formerly smoked', 'smoke_never smoked', 'smoke_smokes', 'smoke_unknown']


In [62]:
good = ['age', 'avg_glucose_level',
        'bmi', 'work_type_Private', 'res_type_Urban', 'smoke_never smoked', 'smoke_smokes', 'smoke_unknown']


X = x_train
x = x_test
model.fit(X, y_train)


predictions = model.predict(x)

print(model.score(x, y_test))
print(metrics.confusion_matrix(y_test, predictions))

0.8271079285207406
[[12674  1359]
 [ 1321   147]]
