In [None]:
#sklearn
from sklearn.metrics import roc_curve,auc,classification_report,precision_score,recall_score,f1_score,plot_confusion_matrix,confusion_matrix,make_scorer
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.preprocessing import FunctionTransformer,LabelEncoder,OneHotEncoder,StandardScaler,QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.utils import shuffle,resample
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

#plot
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")


import  numpy as np
import pandas as pd
import scipy
from copy import deepcopy
import itertools

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw stroke dataset into `df`; the path is relative to the notebook
# (presumably the Kaggle input directory layout -- adjust when running elsewhere).
df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')

In [None]:
# The 'id' column is removed from the frame.
df=df.drop(columns=['id'])

# Remove every row whose gender is recorded as 'Other'.
other_rows=df.index[df['gender']=='Other']
df=df.drop(index=other_rows)

In [None]:
# Class balance of the target column.
df['stroke'].value_counts()

### Missing Data

In [None]:
# How many rows have / lack a bmi value.
df['bmi'].isna().value_counts()

In [None]:
# Split the rows by whether bmi is known, and keep a view of each part
# without the three continuous columns (i.e. categorical columns plus the target).
continuous_cols=['age','bmi','avg_glucose_level']
bmi_is_missing=df['bmi'].isna()

missing=df[bmi_is_missing]
missing_no_con=missing.drop(columns=continuous_cols)

no_missing=df[~bmi_is_missing]
no_missing_no_con=no_missing.drop(columns=continuous_cols)

In [None]:
# Group the rows with a missing bmi by their full profile of non-continuous values.
#
# comb   : every possible combination of the per-column unique values (taken from
#          the whole frame), one tuple per combination, ordered like
#          missing_no_con.columns.
# groups : {combination tuple -> list of row labels in `missing_no_con` having it};
#          only combinations that actually occur are present.
#
# The product is unpacked from the column list and the grouping is delegated to
# groupby, so nothing here depends on there being exactly eight columns and the
# frame is scanned once instead of once per combination.
profile_cols=list(missing_no_con.columns)

comb=list(itertools.product(*(tuple(np.unique(df[col])) for col in profile_cols)))

groups={key:list(rows) for key,rows in missing_no_con.groupby(profile_cols).groups.items()}

In [None]:
# Impute missing bmi by copying it from the most similar complete row.
#
# For every profile in `groups`, the donors are the rows with a known bmi that share
# the same non-continuous values. Similarity is the Euclidean distance on
# standardized (age, avg_glucose_level). Profiles without any donor are left
# untouched here (their bmi stays NaN).
std=StandardScaler()
nn=NearestNeighbors(n_neighbors=1)
num_cols=['age','avg_glucose_level']
profile_cols=list(no_missing_no_con.columns)
for k,v in groups.items():
  # donor rows must match every value of this profile; k is ordered like profile_cols
  same_profile=np.logical_and.reduce(
      [(no_missing_no_con[col]==val).to_numpy() for col,val in zip(profile_cols,k)])
  donor_idx=no_missing_no_con.index[same_profile]
  if len(donor_idx)==0:
    continue

  # the scaler is fitted on the donors only ...
  nn.fit(std.fit_transform(df.loc[donor_idx,num_cols]))
  # ... and merely applied to the targets. Re-fitting it on a single target row
  # would map that row to the origin, so the donor closest to the group mean
  # would be returned no matter what the target looks like.
  targets=std.transform(missing.loc[v,num_cols])
  nearest=nn.kneighbors(targets,1,return_distance=False)[:,0]
  # to_numpy() avoids index alignment between donor labels and target labels
  df.loc[v,'bmi']=df.loc[donor_idx[nearest],'bmi'].to_numpy()

In [None]:
# Remaining missing bmi values after the nearest-neighbour step.
df['bmi'].isna().value_counts()

In [None]:
# Any bmi still missing falls back to the column median.
df['bmi']=df['bmi'].fillna(df['bmi'].median())

In [None]:
# Check that no bmi value is missing any more.
df['bmi'].isna().value_counts()

### Visualize 

In [None]:
# One histogram per continuous feature, coloured by stroke outcome.
fig,ax=plt.subplots(ncols=3,figsize=(40,10))

for axis,col in zip(ax,['age','avg_glucose_level','bmi']):
  sns.histplot(data=df,x=col,hue='stroke',ax=axis)

In [None]:
# Same histograms on the natural-log scale.
fig,ax=plt.subplots(ncols=3,figsize=(40,10))
log_cols=['age','avg_glucose_level','bmi']

# np.log returns a new frame, so `df` itself is left untouched
df_=np.log(df[log_cols])
df_['stroke']=df['stroke']

for axis,col in zip(ax,log_cols):
  sns.histplot(data=df_,x=col,hue='stroke',ax=axis)

* BMI

In [None]:

# bmi distribution per level of every non-continuous feature, split by stroke.
cat_features=df.drop(columns=['stroke','bmi','age','avg_glucose_level']).columns
fig,ax=plt.subplots(ncols=len(cat_features),figsize=(40,10))
for axis,col in zip(ax,cat_features):
  sns.boxplot(data=df,x=col,y='bmi',hue='stroke',ax=axis)

* glucose level

In [None]:
# avg_glucose_level distribution per level of every non-continuous feature, split by stroke.
cat_features=df.drop(columns=['stroke','bmi','age','avg_glucose_level']).columns
fig,ax=plt.subplots(ncols=len(cat_features),figsize=(40,10))
for axis,col in zip(ax,cat_features):
  sns.boxplot(data=df,x=col,y='avg_glucose_level',hue='stroke',ax=axis)

### Statistical Testing

In [None]:
from scipy.stats import chi2_contingency,f,ttest_ind

* Chi-squared independence test
  * Test whether each categorical variable is independent of stroke occurrence

In [None]:
# Chi-squared test of independence between every non-continuous feature and stroke.
# p_vals collects one p-value per feature, in column order.
# The loop variable is deliberately not called `f`: that name is bound to the
# F distribution imported from scipy.stats and would be overwritten.
p_vals=[]
for col in df.drop(['age','bmi','avg_glucose_level','stroke'],axis=1).columns:
  # contingency table: feature level x stroke outcome
  obs=pd.crosstab(df[col],df.stroke)
  chi2, p, dof, ex=chi2_contingency(obs)
  if p<=0.05:
    print(f'({col} and Stroke)--- Reject--- independence Assumption , p-val :{p}')
  else:
    print(f'({col} and Stroke)---Accept--- independence Assumption , p-val :{p}')

  p_vals.append(p)

In [None]:
# Bar chart of -log(p) per tested feature: longer bar = smaller p-value.
tested_features=df.drop(columns=['age','bmi','avg_glucose_level','stroke']).columns
neg_log_p=-np.log(p_vals)
plt.barh(tested_features,neg_log_p)
plt.title('-log(p value)')
plt.show()

* two sample t-test
  * Test whether stroke patients have a higher bmi or glucose level than patients without a stroke

In [None]:
# Log-transformed copy of the continuous columns plus the untouched target.
# np.log returns a new frame, so `df` is not modified.
df_=np.log(df[['age','avg_glucose_level','bmi']])

df_['stroke']=df['stroke']

In [None]:
#avg glucose level
# Compare log(avg_glucose_level) between stroke and no-stroke rows:
#   1) two-sided F-test for equal variances, used to choose between the pooled
#      (equal_var=True) and Welch (equal_var=False) t-test;
#   2) one-sided t-test of H1: mean(stroke) > mean(no stroke).

stroke=df_[df_.stroke==1]['avg_glucose_level']

no_stroke=df_[df_.stroke==0]['avg_glucose_level']

# the F statistic is the ratio of the unbiased sample variances (ddof=1) ...
F=np.var(stroke,ddof=1)/np.var(no_stroke,ddof=1)

# ... and each sample contributes n-1 degrees of freedom
df1=len(stroke)-1
df2=len(no_stroke)-1


alpha = 0.05
# two-sided p-value: a ratio far below 1 contradicts equal variances just as much
# as a ratio far above 1, so both tails have to be considered
p_value = min(1.0, 2*min(scipy.stats.f.sf(F, df1, df2), scipy.stats.f.cdf(F, df1, df2)))
print('p:',p_value)
if p_value > alpha:

  print('accept VarX=VarY')
  t,p=ttest_ind(stroke,no_stroke,equal_var=True)
else:
  print('reject VarX=VarY')
  t,p=ttest_ind(stroke,no_stroke,equal_var=False)


# ttest_ind reports a two-sided p-value; halving it is only the one-sided p-value
# when t points in the hypothesised direction (stroke > no stroke)
p_one_sided = p/2 if t>0 else 1-p/2
if p_one_sided<alpha:
    print(f't-test p :{p_one_sided} , Reject H0')
else:
    print(f't-test p :{p_one_sided} , Accept H0')

In [None]:
#bmi
# Compare log(bmi) between stroke and no-stroke rows:
#   1) two-sided F-test for equal variances, used to choose between the pooled
#      (equal_var=True) and Welch (equal_var=False) t-test;
#   2) one-sided t-test of H1: mean(stroke) > mean(no stroke).

stroke=df_[df_.stroke==1]['bmi']

no_stroke=df_[df_.stroke==0]['bmi']

# the F statistic is the ratio of the unbiased sample variances (ddof=1) ...
F=np.var(stroke,ddof=1)/np.var(no_stroke,ddof=1)

# ... and each sample contributes n-1 degrees of freedom
df1=len(stroke)-1
df2=len(no_stroke)-1


alpha = 0.05
# two-sided p-value: a ratio far below 1 contradicts equal variances just as much
# as a ratio far above 1, so both tails have to be considered
p_value = min(1.0, 2*min(scipy.stats.f.sf(F, df1, df2), scipy.stats.f.cdf(F, df1, df2)))
print('p:',p_value)
if p_value > alpha:

  print('accept VarX=VarY')
  t,p=ttest_ind(stroke,no_stroke,equal_var=True)
else:
  print('reject VarX=VarY')
  t,p=ttest_ind(stroke,no_stroke,equal_var=False)


# ttest_ind reports a two-sided p-value; halving it is only the one-sided p-value
# when t points in the hypothesised direction (stroke > no stroke)
p_one_sided = p/2 if t>0 else 1-p/2
if p_one_sided<alpha:
    print(f't-test p :{p_one_sided} , Reject H0')
else:
    print(f't-test p :{p_one_sided} , Accept H0')

### Augment Function

* Encode Categorical 
 * If a feature has more than 2 categories, use one-hot encoding
 * Otherwise, use label encoding

* Continuous
 * Polynomial Feature with only interaction

In [None]:
class Augmentor:
  '''
  Stateful feature builder, callable on a DataFrame.

  The first call fits the encoders / polynomial expansion on the frame it receives;
  later calls re-use the fitted objects, so the same columns are produced.

  Steps performed by a call, in order:
    1. one-hot encode 'work_type' and 'smoking_status' (only if use_ohe)
    2. label-encode every remaining column that is neither continuous nor listed
       in assign_drop
    3. replace the continuous columns by their polynomial expansion (only if degree>1)
    4. drop the columns listed in assign_drop

  Parameters
  ----------
  degree : int
      Degree handed to PolynomialFeatures; no expansion is done when degree<=1.
  interaction_only : bool
      Handed to PolynomialFeatures.
  use_ohe : bool
      Whether step 1 is performed.
  assign_drop : str, list of str or None
      Column(s) removed from the result. Continuous columns named here are also
      excluded from the polynomial expansion.
  '''
  def __init__(self,degree=2,
        interaction_only=False,
        use_ohe=True,
        assign_drop=None):
    self.Label_Enc={}
    self.OneHot_Enc={}
    self.Poly=None
    self.degree=degree
    self.interaction_only=interaction_only
    self.use_ohe=use_ohe
    self.assign_drop=assign_drop

    # normalised list of the columns to drop (a single name is accepted as well)
    if assign_drop is None:
      self._dropped=[]
    elif isinstance(assign_drop,str):
      self._dropped=[assign_drop]
    else:
      self._dropped=list(assign_drop)

    # built by filtering: removing items from a list while iterating over it
    # skips the element that follows each removal
    self.continuous=[c for c in ['age','avg_glucose_level','bmi'] if c not in self._dropped]

  def __call__(self,X,y=None):
    '''
    Return the augmented copy of X; X itself is not modified. y is ignored.
    '''
    X_=deepcopy(X)

    if self.use_ohe:
      X_=self.one_hot_encoder(X_)

    # modifies X_ in place, nothing is returned
    self.label_encoder(X_)
    if self.degree>1:
      X_=self.polynomial(X_)

    if self.assign_drop is not None:
      X_=X_.drop(self.assign_drop,axis=1)
    return X_

  @staticmethod
  def _feature_names(transformer):
    '''
    Output column names of a fitted transformer, as a list.

    Uses get_feature_names_out when the transformer offers it and falls back to
    get_feature_names otherwise.
    '''
    getter=getattr(transformer,'get_feature_names_out',None)
    if getter is None:
      getter=transformer.get_feature_names
    return list(getter())

  def one_hot_encoder(self,X):
    '''
    Append the one-hot columns of 'work_type' / 'smoking_status' to X and return
    X without those source columns. Source columns absent from X are skipped.
    An encoder is fitted the first time its column is seen and re-used afterwards.
    '''
    present=[f for f in X.columns if f in ['work_type','smoking_status']]
    for f in present:
      values=X[f].values.reshape(-1,1)
      if f not in self.OneHot_Enc:
        self.OneHot_Enc[f]=OneHotEncoder().fit(values)
      encoder=self.OneHot_Enc[f]
      X[self._feature_names(encoder)]=encoder.transform(values).toarray()

    return X.drop(columns=present)


  def label_encoder(self,X):
    '''
    input : Dataframe

    Label-encode, in place, every column that is neither continuous nor about to
    be dropped. An encoder is fitted the first time its column is seen; afterwards
    it is only applied, so labels unseen at fit time raise the ValueError of
    LabelEncoder.transform instead of silently triggering a re-fit with a
    different mapping.
    '''
    for f in X.columns:
      if f in self.continuous or f in self._dropped:
        continue
      if f not in self.Label_Enc:
        self.Label_Enc[f]=LabelEncoder().fit(X[f])
      X[f]=self.Label_Enc[f].transform(X[f])


  def polynomial(self,X):
    '''
    Replace the continuous columns of X by their polynomial expansion and return
    the resulting frame. Returns X unchanged when no continuous column is left.
    '''
    if not self.continuous:
      return X

    # plain arrays are used so the generated names stay positional ('x0', 'x0 x1', ...)
    # and can never coincide with the continuous columns dropped below
    values=X[self.continuous].to_numpy()
    if self.Poly is None:
      self.Poly=PolynomialFeatures(degree=self.degree,interaction_only=self.interaction_only,include_bias=False)
      self.Poly.fit(values)
    X[self._feature_names(self.Poly)]=self.Poly.transform(values)
    return X.drop(columns=self.continuous)



* Custom Scorer

* Since the data is imbalanced, we need to measure performance with per-class percentages rather than raw sample counts

  * e.g. what percentage of the negatives I predict correctly 

In [None]:
def scaled_f1(y_true,y_pred,labels=(0,1)):
    '''
    F1 score computed on the row-normalised confusion matrix.

    Every row (true class) of the confusion matrix is divided by its total, so the
    entries are per-class rates instead of counts; precision, recall and F1 are
    then derived from those rates.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels.
    labels : pair
        (negative label, positive label). Passing them explicitly keeps the matrix
        2x2 even when one class is absent from both y_true and y_pred.

    Returns
    -------
    float
        The scaled F1 score; 0.0 when there is no true positive, where the plain
        formula would evaluate 0/0.
    '''
    C=confusion_matrix(y_true,y_pred,labels=list(labels)).astype('float32')
    # a class without any true sample keeps an all-zero row instead of dividing by zero
    row_totals=C.sum(axis=1,keepdims=True)
    C=np.divide(C,row_totals,out=np.zeros_like(C),where=row_totals>0)
    TP=C[1][1]
    FP=C[0][1]
    FN=C[1][0]
    if TP==0:
        return 0.0
    precision=TP/(TP+FP)
    recall=TP/(TP+FN)
    f1=2*precision*recall/(precision+recall)
    return float(f1)

### Modeling

* 70% Training , 30% Testing

* Don't resample before splitting, because the evaluation would then have no meaningful interpretation 
  * A 95% precision / recall would be fake, because the model has already seen these "copied" samples in the training set 
  
  
 * Regularization
 
   * elastic net for model selection purpose
     * L1+L2 term : since I use polynomial feature and expand categorical by one hot encoding , feature number is large , so I use L1 term for feature selection and L2 term for feature weighting
     
   
   * class weight for imbalance data 
   
   
* Tuning 

  * $\alpha$ : strength of the regularization term
  
  * l1-ratio $\lambda$ : $\frac{(1-\lambda)}{2}*L2+\lambda*L1$ 
    * If $\lambda$=0 , only L2 term
    
    * If $\lambda$=1 , only L1 term
    
  * class weight
  
  * 30-Fold Cross Validation

In [None]:
# 70/30 shuffled split with a fixed seed; features are every column except the target.
X_train,X_test,y_train,y_test=train_test_split(
    df.drop(columns=['stroke']),
    df['stroke'],
    test_size=0.3,
    random_state=0,
    shuffle=True,
)

In [None]:
# Pipeline: feature augmentation -> standardisation -> logistic loss fitted by averaged SGD.
lr=make_pipeline(
    FunctionTransformer(Augmentor(degree=3,interaction_only=True)),
    StandardScaler(),
    SGDClassifier(loss='log',
                  average=True,
                  max_iter=200,
                  early_stopping=True,
                  random_state=0),
)

# Search space; the keys address the last pipeline step by its auto-generated name.
params=dict(
    sgdclassifier__alpha=[1e-4,1e-3,0.01],
    sgdclassifier__class_weight=[{0:1,1:10},{0:1,1:15}],
    sgdclassifier__penalty=['elasticnet'],
    sgdclassifier__l1_ratio=[0,0.15,0.5,1],
)

scorer=make_scorer(scaled_f1)

# Exhaustive search with 30-fold cross validation; fit returns the fitted search object.
clf=GridSearchCV(lr,params,scoring=scorer,n_jobs=-1,cv=30,verbose=0).fit(X_train,y_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
clf.best_params_

* Training Set score

In [None]:
# Per-class precision / recall / f1 on the training split.
y_pred=clf.predict(X_train)
print(classification_report(y_true=y_train,y_pred=y_pred))

* Testing Set Score

In [None]:
# Per-class precision / recall / f1 on the held-out split.
y_pred=clf.predict(X_test)
print(classification_report(y_true=y_test,y_pred=y_pred))

* Confusion Matrix

In [None]:
# Confusion matrix (raw counts) of the fitted search object on the held-out split.
plot_confusion_matrix(clf,X_test,y_test)
plt.show()

* score
  

In [None]:
# Confusion matrix of the test predictions with every row (true class) scaled to sum to 1.
C=confusion_matrix(y_test,y_pred).astype('float32')
C/=C.sum(axis=1,keepdims=True)

C

In [None]:
# Rows of C are the true classes (0, 1); columns are the predicted classes.
(TN,FP),(FN,TP)=C

precision=TP/(TP+FP)
recall=TP/(TP+FN)
f1=2*precision*recall/(precision+recall)

print('test precision:',precision)
print('test recall',recall)
print('test f1-score :' ,f1)

* Model interpretation (fix other variable)

* For Logistic Regression

  * if age +1 , log odds ratio +13
  
  * if children +1 , log odds ratio -8

In [None]:
# Coefficient of every augmented feature in the best pipeline found by the search.
best_pipeline=clf.best_estimator_
augment_step=best_pipeline.steps[0][1]
classifier_step=best_pipeline.steps[2][1]

feature_names=augment_step.transform(df.drop(columns=['stroke'])).columns
weights=classifier_step.coef_[0]

plt.figure(figsize=(12,12))
plt.barh(feature_names,weights)
plt.title('Regression Weight')
plt.show()

In [None]:
nan