In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Plotting setup (no figures are drawn in the cells visible here).
import matplotlib.pyplot as plt 
# NOTE(review): the seaborn calls below may reset this font size — confirm it still takes effect.
plt.rc("font", size=14)
import seaborn as sns
sns.set(style="white") #white background style for seaborn plots
# NOTE(review): this second call selects style="whitegrid", which presumably
# supersedes the "white" style set on the previous line — confirm and keep only one.
sns.set(style="whitegrid", color_codes=True)

def _fill_missing(frame):
    """Return a copy of ``frame`` with its missing values filled.

    Missing ``bmi`` values are replaced by the midpoint of that frame's own
    BMI mean and median; missing ``smoking_status`` values become "unknown".
    """
    bmi_fill = (frame['bmi'].mean() + frame['bmi'].median()) / 2
    return frame.assign(
        bmi=frame['bmi'].fillna(bmi_fill),
        smoking_status=frame['smoking_status'].fillna("unknown"),
    )


# Load the raw files; the labels for the test rows live in a separate file.
testData = pd.read_csv("test.csv")
testY = pd.read_csv("output.csv")
# Assigned by row label: both frames carry the default index from read_csv,
# so this relies on the two files listing their rows in the same order.
testData['stroke'] = testY['stroke']
trainData = pd.read_csv("train.csv")

# Columns the author treats as not significant for strokes.
DROPPED_COLUMNS = ['ever_married', 'gender', 'id']

# Each frame is imputed with its own statistics.
trainData = _fill_missing(trainData.drop(columns=DROPPED_COLUMNS))
testData = _fill_missing(testData.drop(columns=DROPPED_COLUMNS))

In [2]:
from collections import Counter

# Stack the two source files into one frame; the train/test split is redone
# further down. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0. Like append, it keeps each frame's own row labels, so index
# values repeat in the combined frame.
data = pd.concat([trainData, testData])

# Class balance of the combined target column.
Counter( data['stroke'] )

Counter({0: 56267, 1: 5734})

In [68]:
# Write the combined frame (including its index column) to data.csv.
data.to_csv('data.csv')

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the combined rows for evaluation (fixed seed for repeatability).
stroke_features = data.drop(columns=['stroke'])
stroke_labels = np.ravel(data['stroke'], order='C')

x_train, x_test, y_train, y_test = train_test_split(
    stroke_features,
    stroke_labels,
    test_size=0.25,
    random_state=0,
)

In [33]:
# Optional check of the class balance in each split:
# print( Counter( y_train ) )
# print( Counter( y_test ) )

# Feature columns left once the target has been dropped.
print(x_train.columns)

Index(['age', 'hypertension', 'heart_disease', 'work_type', 'Residence_type',
       'avg_glucose_level', 'bmi', 'smoking_status'],
      dtype='object')


In [4]:
# One-hot encode the categorical features of both splits.
CATEGORICAL_COLUMNS = ['work_type', 'Residence_type', 'smoking_status']
DUMMY_PREFIXES = ['work_type', 'res_type', 'smoke']

#Train Data
x_train = pd.get_dummies(x_train, columns=CATEGORICAL_COLUMNS, prefix=DUMMY_PREFIXES)

#Test Data
x_test = pd.get_dummies(x_test, columns=CATEGORICAL_COLUMNS, prefix=DUMMY_PREFIXES)

# Each split is encoded on its own, so a category that is missing from one
# split would leave the two frames with different columns. Align the test
# frame to the training layout: dummies absent from the test split become
# all-zero columns, and any dummy never seen in training is dropped.
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)



In [5]:
#Sampling for removing imbalance

from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
# Despite the variable name, this is plain random oversampling, not SMOTE.
smote = RandomOverSampler(random_state=0)

# Remember the feature names so they can be restored after resampling.
xcols = x_train.columns
# fit_resample is the supported entry point; the older fit_sample alias has
# been removed from imbalanced-learn.
x_train, y_train = smote.fit_resample(x_train, y_train)

# Re-wrap the results so the feature names and the target name are kept
# whether the sampler returned arrays or pandas objects.
x_train = pd.DataFrame( x_train , columns=xcols )
y_train = pd.DataFrame( y_train , columns=['stroke'] )

In [15]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transform to the held-out split.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [16]:
import warnings

# Silence every warning for the rest of the session. This is a blanket filter,
# so it also hides any warnings raised by the model-fitting cells below.
warnings.simplefilter("ignore")

In [31]:
# Multi-layer perceptron (neural network) classifier
from sklearn import metrics
from sklearn.neural_network import MLPClassifier

# L2 regularisation strength passed to the network.
alpha = 0.00038
print(" using alpha =" , alpha)
# Seed the weight initialisation so repeated runs give the same scores.
model = MLPClassifier(solver='lbfgs', alpha=alpha, random_state=0)

# np.ravel hands fit() a 1-D label vector even when y_train is a one-column frame.
model.fit(x_train, np.ravel(y_train))


predictions = model.predict(x_test)

# Accuracy on the held-out split, followed by the confusion matrix.
print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))
print("**********************")
    


 using alpha = 0.00038
0.6076382168892329
[[8826 5207]
 [ 875  593]]
**********************


In [None]:
# Scratch log of earlier classifier runs, kept as an inert string literal:
# each entry is a score followed by a confusion matrix.
# NOTE(review): the bare 0.000085 heading the first entry is presumably the
# alpha used for that run — confirm and label it like the second entry.
"""
0.000085

0.6274433907489839
[[9174 4859]
 [ 916  552]]

**********************

 using alpha = 0.00038
0.6076382168892329
[[8826 5207]
 [ 875  593]]
**********************



"""

In [34]:
from joblib import dump

# Persist the current `model` object to disk. Only the estimator is saved;
# the fitted scaler and the column layout are not part of this file.
dump(model, filename='nosplit.joblib')

['nosplit.joblib']

In [8]:
# Class balance of the held-out labels (Counter is imported in an earlier cell).
Counter(y_test)

Counter({0: 14033, 1: 1468})

In [44]:
## RidgeClassifier

from sklearn import linear_model

# Fit a ridge classifier with default settings; fit() returns the estimator.
model = linear_model.RidgeClassifier().fit(x_train, y_train)

# Evaluate on the held-out split: accuracy first, then the confusion matrix.
predictions = model.predict(x_test)
print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))


0.5919618089155538
[[8480 5553]
 [ 772  696]]


In [66]:
#Logistic Reg

from sklearn.linear_model import LogisticRegression as LR

# Candidate feature subset. NOTE: it is defined here but never applied —
# the model below is trained on every column.
good = [
    'age', 'avg_glucose_level', 'bmi',
    'work_type_Private', 'res_type_Urban',
    'smoke_never smoked', 'smoke_smokes', 'smoke_unknown',
]

# Aliases for the full training / held-out feature matrices.
X, x = x_train, x_test

# fit() returns the estimator, so construction and training can be chained.
model = LR().fit(X, y_train)
predictions = model.predict(x)

# Accuracy on the held-out split, followed by the confusion matrix.
print(model.score(x, y_test))
print(metrics.confusion_matrix(y_test, predictions))

0.5920263208825237
[[8481 5552]
 [ 772  696]]


In [57]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE

# Fixed seed so the tree, and therefore the feature ranking, is repeatable.
model = DecisionTreeClassifier(random_state=0)
# Recursive feature elimination: keep the 12 highest-ranked attributes.
# n_features_to_select must be passed by keyword in current scikit-learn.
rfe = RFE(model, n_features_to_select=12)
# np.ravel gives fit() a 1-D label vector even when y_train is a one-column frame.
rfe = rfe.fit(x_train, np.ravel(y_train))
# summarize the selection of the attributes.
# After scaling, x_train is a plain array without column labels, so fall back
# to the names captured in `xcols` before oversampling.
feature_names = getattr(x_train, 'columns', xcols)
print('Selected features: %s' % list(feature_names[rfe.support_]))

Selected features: ['age', 'hypertension', 'avg_glucose_level', 'bmi', 'work_type_Govt_job', 'work_type_Private', 'work_type_Self-employed', 'res_type_Urban', 'smoke_formerly smoked', 'smoke_never smoked', 'smoke_smokes', 'smoke_unknown']


In [62]:
# Decision tree trained and evaluated on the full feature set.
from sklearn.tree import DecisionTreeClassifier

# Build the estimator here instead of reusing whatever `model` the previously
# executed cell left behind, so the score does not depend on execution order.
# The fixed seed makes the result repeatable.
model = DecisionTreeClassifier(random_state=0)
# np.ravel gives fit() a 1-D label vector even when y_train is a one-column frame.
model.fit(x_train, np.ravel(y_train))


predictions = model.predict(x_test)

# Accuracy on the held-out split, followed by the confusion matrix.
print(model.score(x_test, y_test))
print(metrics.confusion_matrix(y_test, predictions))

0.8271079285207406
[[12674  1359]
 [ 1321   147]]
