In [117]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split

testData = pd.read_csv("test.csv")
testY = pd.read_csv("output.csv")
testData['stroke'] = testY['stroke']
trainData = pd.read_csv("train.csv")

In [118]:
trainData = trainData.drop( columns=['ever_married', 'gender'] )
testData = testData.drop( columns=['ever_married', 'gender'] )

In [119]:
#Filling Missing BMI Data === train

temp = trainData['bmi'].mean() + trainData['bmi'].median()
temp /= 2
temp
trainData['bmi'] = trainData['bmi'].fillna(temp)

In [120]:
#Filling Missing BMI Data ===test

temp = testData['bmi'].mean() + testData['bmi'].median()
temp /= 2
temp
testData['bmi'] = testData['bmi'].fillna(temp)

In [None]:
print( "Empty Smoking Status(Train) in " , 
     len( list( filter( lambda x: x == True , pd.isna(trainData['smoking_status']) ) ) )
     )


print( "Empty Smoking Status(Test) in " , 
     len( list( filter( lambda x: x == True , pd.isna(testData['smoking_status']) ) ) )
     )

In [121]:
# Segregating into Two Models

trainDataNS = trainData[ trainData['smoking_status'].isna() ]
trainDataNS = trainDataNS.drop( columns=['smoking_status'] )
trainDataS = trainData[ trainData['smoking_status'].notna() ]

testDataNS = testData[ testData['smoking_status'].isna() ]
testDataNS = testDataNS.drop( columns=['smoking_status'] )
testDataS = testData[ testData['smoking_status'].notna() ]

In [None]:
print( "Imbalance in train is ", trainDataS['stroke'].value_counts() )
print( "Imbalance in test is ", testDataS['stroke'].value_counts() )

In [122]:
# X and Y division

yTrainDataNS = trainDataNS['stroke']
xTrainDataNS = trainDataNS.drop( columns=['stroke'] )

yTrainDataS = trainDataS['stroke']
xTrainDataS = trainDataS.drop( columns=['stroke'] )

In [123]:
yTestDataNS = testDataNS['stroke']
xTestDataNS = testDataNS.drop( columns=['stroke'] )

yTestDataS = testDataS['stroke']
xTestDataS = testDataS.drop( columns=['stroke'] )

In [124]:
#Nominal Categories to one-hot encoding === Train

#Smoke Data
xTrainDataS = pd.get_dummies( xTrainDataS, 
                             columns=[ 'work_type' , 'Residence_type', 'smoking_status']
                            , prefix= ['work_type' , 'res_type', 'smoke'] )


#No Smoke Data
xTrainDataNS = pd.get_dummies( xTrainDataNS,
                              columns=[ 'work_type' , 'Residence_type']
                            , prefix=['[work_type' , 'res_type'] )



In [125]:
#Nominal Categories to one-hot encoding ===Test

#Smoke Data
xTestDataS = pd.get_dummies( xTestDataS,
                            columns=['work_type' , 'Residence_type', 'smoking_status']
                            , prefix=['work_type' , 'res_type', 'smoke'] )


#No Smoke Data
xTestDataNS = pd.get_dummies( xTestDataNS,
                             columns=['work_type' , 'Residence_type']
                            , prefix=['work_type' , 'res_type'] )

In [126]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)

xTrainDataNS, yTrainDataNS = smote.fit_sample(xTrainDataNS, yTrainDataNS)
xTrainDataS, yTrainDataS = smote.fit_sample(xTrainDataS, yTrainDataS)


In [127]:
from sklearn.model_selection import train_test_split

#Train Splitter

x_train, x_test, y_train, y_test = train_test_split(xTrainDataNS, yTrainDataNS, test_size=0.35, random_state=0)

x_trainS, x_testS, y_trainS, y_testS = train_test_split(xTrainDataS, yTrainDataS, test_size=0.35, random_state=0)

In [128]:
#Decison Tree

from sklearn import tree
clf = tree.DecisionTreeClassifier()

#No Smoke
# clf = clf.fit(x_train, y_train)
#Smoke
clf = clf.fit(x_trainS, y_trainS)

In [129]:
predictions = clf.predict(x_testS)
clf.score(x_testS, y_testS)

0.9703815017693538

0.9854395305878518

In [130]:
from sklearn import metrics

metrics.confusion_matrix(y_testS, predictions)

array([[ 9943,   325],
       [  286, 10075]])

In [73]:
xTestDataS.info()

(12850, 18)

In [75]:
xTrainDataS.info()

AttributeError: 'numpy.ndarray' object has no attribute 'info'

In [131]:
#Testing
predictions = clf.predict(xTestDataS)
print(clf.score(xTestDataS, yTestDataS))

metrics.confusion_matrix(yTestDataS, predictions)

0.7186770428015564


array([[9112,  331],
       [3284,  123]])

In [133]:
###XGBOOST

from xgboost import XGBClassifier as XGBC

model = XGBC()

model.fit(x_trainS, y_trainS)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [134]:
predictions = model.predict(x_testS)
print(model.score(x_testS, y_testS))

metrics.confusion_matrix(y_testS, predictions)

0.9686363856706578


array([[10268,     0],
       [  647,  9714]])

In [135]:
predictions = model.predict(xTestDataS.values)
print(model.score(xTestDataS.values, yTestDataS.values))

metrics.confusion_matrix(yTestDataS, predictions)

0.7347859922178989


array([[9442,    1],
       [3407,    0]])

Imbalance in test is  0    9443
1    3407
Name: stroke, dtype: int64


In [138]:
##Logistic Regression

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

rfe = RFE(logreg, 20)
rfe = rfe.fit(x_trainS, y_trainS)
print(rfe.support_)
print(rfe.ranking_)



[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [139]:
import statsmodels.api as sm
logit_model=sm.Logit(x_trainS, y_trainS)
result=logit_model.fit()
print(result.summary2())

ModuleNotFoundError: No module named 'statsmodels'

In [141]:
model = logreg.fit(x_trainS, y_trainS)

predictions = model.predict(x_testS)
print(model.score(x_testS, y_testS))

metrics.confusion_matrix(y_testS, predictions)



0.7772553201803286


array([[7537, 2731],
       [1864, 8497]])

In [142]:
predictions = model.predict(xTestDataS.values)
print(model.score(xTestDataS.values, yTestDataS.values))

metrics.confusion_matrix(yTestDataS, predictions)

0.5978988326848249


array([[6798, 2645],
       [2522,  885]])