In [66]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split

testData = pd.read_csv("test.csv")
testY = pd.read_csv("output.csv")
testData['stroke'] = testY['stroke']
trainData = pd.read_csv("train.csv")

In [67]:
#Filling Missing BMI Data === train

temp = trainData['bmi'].mean() + trainData['bmi'].median()
temp /= 2
temp
trainData['bmi'] = trainData['bmi'].fillna(temp)

In [68]:
#Filling Missing BMI Data ===test

temp = testData['bmi'].mean() + testData['bmi'].median()
temp /= 2
temp
testData['bmi'] = testData['bmi'].fillna(temp)

In [69]:
print( "Empty Smoking Status(Train) in " , 
     len( list( filter( lambda x: x == True , pd.isna(trainData['smoking_status']) ) ) )
     )


print( "Empty Smoking Status(Test) in " , 
     len( list( filter( lambda x: x == True , pd.isna(testData['smoking_status']) ) ) )
     )

Empty Smoking Status(Train) in  13292
Empty Smoking Status(Test) in  5751


In [70]:
# Segregating into Two Models

trainDataNS = trainData[ trainData['smoking_status'].isna() ]
trainDataNS = trainDataNS.drop( columns=['smoking_status'] )
trainDataS = trainData[ trainData['smoking_status'].notna() ]

testDataNS = testData[ testData['smoking_status'].isna() ]
testDataNS = testDataNS.drop( columns=['smoking_status'] )
testDataS = testData[ testData['smoking_status'].notna() ]

In [71]:
print( "Imbalance in train is ", trainDataS['stroke'].value_counts() )
print( "Imbalance in test is ", testDataS['stroke'].value_counts() )

Imbalance in train is  0    29470
1      638
Name: stroke, dtype: int64
Imbalance in test is  0    9443
1    3407
Name: stroke, dtype: int64


In [72]:
# X and Y division

yTrainDataNS = trainDataNS['stroke']
xTrainDataNS = trainDataNS.drop( columns=['stroke'] )

yTrainDataS = trainDataS['stroke']
xTrainDataS = trainDataS.drop( columns=['stroke'] )

In [73]:
yTestDataNS = testDataNS['stroke']
xTestDataNS = testDataNS.drop( columns=['stroke'] )

yTestDataS = testDataS['stroke']
xTestDataS = testDataS.drop( columns=['stroke'] )

In [74]:
#Nominal Categories to one-hot encoding === Train

#Smoke Data
xTrainDataS = pd.get_dummies( xTrainDataS, 
                             columns=['gender', 'ever_married', 'work_type' , 'Residence_type', 'smoking_status']
                            , prefix=['sex', 'married', 'work_type' , 'res_type', 'smoke'] )


#No Smoke Data
xTrainDataNS = pd.get_dummies( xTrainDataNS,
                              columns=['gender', 'ever_married', 'work_type' , 'Residence_type']
                            , prefix=['sex', 'married', 'work_type' , 'res_type'] )



In [75]:
#Nominal Categories to one-hot encoding ===Test

#Smoke Data
xTestDataS = pd.get_dummies( xTestDataS,
                            columns=['gender', 'ever_married', 'work_type' , 'Residence_type', 'smoking_status']
                            , prefix=['sex', 'married', 'work_type' , 'res_type', 'smoke'] )


#No Smoke Data
xTestDataNS = pd.get_dummies( xTestDataNS,
                             columns=['gender', 'ever_married', 'work_type' , 'Residence_type']
                            , prefix=['sex', 'married', 'work_type' , 'res_type'] )

In [76]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)

xTrainDataNS, yTrainDataNS = smote.fit_sample(xTrainDataNS, yTrainDataNS)
xTrainDataS, yTrainDataS = smote.fit_sample(xTrainDataS, yTrainDataS)


In [61]:
from sklearn.model_selection import train_test_split

#Train Splitter

x_train, x_test, y_train, y_test = train_test_split(xTrainDataNS, yTrainDataNS, test_size=0.35, random_state=0)

x_trainS, x_testS, y_trainS, y_testS = train_test_split(xTrainDataS, yTrainDataS, test_size=0.35, random_state=0)

In [62]:
#Decison Tree

from sklearn import tree
clf = tree.DecisionTreeClassifier()

#No Smoke
clf = clf.fit(x_train, y_train)
#Smoke
# clfS = clf.fit(x_trainS, y_trainS)

In [63]:
predictions = clf.predict(x_test)
clf.score(x_test, y_test)

0.9855481908073455

In [64]:
from sklearn import metrics

metrics.confusion_matrix(y_test, predictions)

array([[4498,   80],
       [  53, 4572]])

In [65]:
#Testing
predictions = clf.predict(xTestDataNS)
print(clf.score(xTestDataNS, yTestDataNS))

metrics.confusion_matrix(yTestDataNS, predictions)

0.7219613980177361


array([[4121,   86],
       [1513,   31]])

In [88]:
###XGBOOST

import xgboost as xgb

data_dmatrix = xgb.DMatrix(data=xTrainDataNS,label=yTrainDataNS)

param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }

cv_results = xgb.train(dtrain=data_dmatrix, params=params,
                    num_boost_round=50)



[23:16:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=5
[23:16:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 0 pruned nodes, max_depth=5
[23:16:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=5
[23:16:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 0 pruned nodes, max_depth=5
[23:16:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 0 pruned nodes, max_depth=5
[23:16:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 0 pruned nodes, max_depth=5
[23:16:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[23:16:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 0 pruned nodes, max_depth=5
[23:16:38] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 42 extra nodes, 0 pruned nodes, max_

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17'] ['id', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi', 'sex_Female', 'sex_Male', 'sex_Other', 'married_No', 'married_Yes', 'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'res_type_Rural', 'res_type_Urban']
expected f7, f11, f13, f1, f3, f4, f9, f5, f12, f15, f0, f6, f8, f2, f16, f10, f14, f17 in input data
training data did not have the following fields: hypertension, sex_Male, avg_glucose_level, married_Yes, sex_Other, res_type_Urban, work_type_Govt_job, work_type_Private, work_type_children, id, heart_disease, age, res_type_Rural, bmi, married_No, work_type_Never_worked, work_type_Self-employed, sex_Female