# Stroke Prediction - Feature Selection and Logistic Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the stroke dataset; the first CSV column is used as the row index.
dataframe = pd.read_csv('healthcare-dataset-stroke-data.csv', index_col=0)

# Fill missing BMI values with the column mean, rounded to two decimals.
# NOTE(review): the mean is computed over the full dataset, i.e. before the
# train/test split done in a later cell -- confirm this leakage is acceptable.
media_bmi = np.round(dataframe["bmi"].mean(), 2)
dataframe['bmi'] = dataframe['bmi'].fillna(media_bmi)

# Drop any row that still has a missing value in some other column.
X = dataframe.dropna(axis=0, how='any')

# One-hot encode all categorical columns in a single call. get_dummies
# prefixes each indicator with its source column name ("gender_Male", ...),
# appends the indicators after the remaining columns, and drops the
# original categorical columns.
categorical_columns = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
X = pd.get_dummies(X, columns=categorical_columns)

# Explicit column order for the modelling frame: numeric inputs first, then
# the indicator columns, with the 'stroke' target last. Selecting by name
# raises a KeyError if an expected category is absent from the data.
feature_columns = [
    'age',
    'hypertension',
    'heart_disease',
    'avg_glucose_level',
    'bmi',
    'gender_Female',
    'gender_Male',
    'gender_Other',
    'ever_married_No',
    'ever_married_Yes',
    'work_type_Govt_job',
    'work_type_Never_worked',
    'work_type_Private',
    'work_type_Self-employed',
    'work_type_children',
    'Residence_type_Rural',
    'Residence_type_Urban',
    'smoking_status_Unknown',
    'smoking_status_formerly smoked',
    'smoking_status_never smoked',
    'smoking_status_smokes',
]
df = X[feature_columns + ['stroke']]


In [3]:
# Model input columns: the numeric fields followed by every one-hot indicator.
labels = [
    'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
    'gender_Female', 'gender_Male', 'gender_Other',
    'ever_married_No', 'ever_married_Yes',
    'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private',
    'work_type_Self-employed', 'work_type_children',
    'Residence_type_Rural', 'Residence_type_Urban',
    'smoking_status_Unknown', 'smoking_status_formerly smoked',
    'smoking_status_never smoked', 'smoking_status_smokes',
]

# Replace the existing index with a fresh 0..n-1 range, then separate the
# inputs from the 'stroke' target.
df = df.reset_index(drop=True)
X, y = df[labels], df['stroke']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=66
)

# Resample the training portion only; the held-out test rows are left untouched.
sm = SMOTE(random_state=5)
X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

# Re-wrap the resampled data as DataFrames (features keep their column names,
# the target becomes a single 'stroke' column) with a clean 0..n-1 index.
X_train = pd.DataFrame(
    X_train_oversampled, columns=X_train.columns
).reset_index(drop=True)
y_train = pd.DataFrame(
    y_train_oversampled, columns=['stroke']
).reset_index(drop=True)

In [4]:
# Feature scoring runs on the training data prepared in the previous cell.
X = X_train  # independent columns (features)
y = y_train  # target column, i.e. the 'stroke' label

# Fit SelectKBest with the chi-squared score function. k=10 only affects
# which columns the selector would keep on transform(); scores_ holds one
# score per input column regardless of k.
# NOTE(review): scikit-learn documents chi2 for non-negative features such as
# booleans or frequencies; 'age', 'avg_glucose_level' and 'bmi' are
# continuous, so their scores may not be comparable with the indicator
# columns -- confirm this is intended.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)

# Pair each feature name with its score in a single two-column frame.
featureScores = pd.DataFrame({'Specs': list(X.columns), 'Score': fit.scores_})

# Print the 20 highest-scoring features, best first.
print(featureScores.nlargest(20, 'Score'))

                             Specs         Score
0                              age  24267.092824
3                avg_glucose_level  11680.633450
8                  ever_married_No   1181.241012
17          smoking_status_Unknown    708.552301
14              work_type_children    532.840278
15            Residence_type_Rural    362.914245
19     smoking_status_never smoked    355.370407
10              work_type_Govt_job    316.160279
5                    gender_Female    263.403990
6                      gender_Male    256.013710
20           smoking_status_smokes    214.072727
16            Residence_type_Urban    160.794045
18  smoking_status_formerly smoked    150.085221
12               work_type_Private    125.767742
9                 ever_married_Yes     69.931034
13         work_type_Self-employed     67.626834
4                              bmi     51.357325
11          work_type_Never_worked     18.000000
1                     hypertension      9.417957
2                   