In [None]:
# import libraries

import os
from operator import index

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings
warnings.filterwarnings('ignore')


In [None]:
os.chdir('res')

In [None]:
# load data and EDA
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df.drop(columns=['id'],inplace=True)
df.head()

In [None]:
# stroke labels
plt.figure()
sns.countplot(x = 'stroke', data=df)
plt.title('Distribution of Stroke Class')
plt.show()

In [None]:
# missing value : decision tree regressor
df.isna().sum()
# bmi <-> age/ gender

steps = [('scale',StandardScaler()), # data standardization
         ('dtr',DecisionTreeRegressor(random_state=42))] # dtr model

DT_bmi_pipe = Pipeline(steps=steps)

In [None]:
# work type, age, smoking_status, avg_glucose_level
X = df[['work_type','age','smoking_status','avg_glucose_level','bmi']].copy()
X.info()

In [None]:
work_type_dict = {'Private' : 1,
                  'Self-employed' : 3,
                  'Govt_job' : 2,
                  'children' : -1,
                  'Never_worked' : 0}

smoking_status_dict = {'never smoked' : 0,
                       'smokes' : 1,
                       'formerly smoked' : 2,
                       'Unknown' : -1}

In [None]:
def cnvrt_cat2num(col,col_dict,data = X):
    data[col].replace(col_dict,inplace=True)
    data[col] = pd.to_numeric(data[col],errors='coerce')
    
X['work_type'].replace(work_type_dict,inplace=True)
X['smoking_status'].replace(smoking_status_dict,inplace=True)

In [None]:
# nan bmi values rows
missing_data = X[X.bmi.isna()]
X = X[~X.bmi.isna()]
y = X.pop('bmi')
y

In [None]:
X

In [None]:
# train model with full data
DT_bmi_pipe.fit(X,y)

# predict missing bmi data
# use gender and age

params = ['work_type','age','smoking_status','avg_glucose_level']
predicted_bmi = pd.Series(DT_bmi_pipe.predict(missing_data[params]), index = missing_data.index)
df.loc[missing_data.index, 'bmi'] = predicted_bmi
df.isna().sum()

In [None]:
df_copied = df.copy()
df_copied

In [None]:
work_type_dict = {'Private' : 1,
                  'Self-employed' : 3,
                  'Govt_job' : 2,
                  'children' : -1,
                  'Never_worked' : 0}

residence_type_dict = {'Urban' : 0,
                       'Rural' : 1 }

smoking_status_dict = {'never smoked' : 0,
                       'smokes' : 1,
                       'formerly smoked' : 2,
                       'Unknown' : -1}

gender_dict = {'Male' : 1,
               'Female' : 0,
               'Other' : -1}

ever_married_dict = {'No' : 0 , 'Yes' : 1}

In [None]:
df_copied.info()

In [None]:
cnvrt_cat2num('work_type',work_type_dict,df_copied)
cnvrt_cat2num('smoking_status',smoking_status_dict,df_copied)
cnvrt_cat2num('Residence_type',residence_type_dict,df_copied)
cnvrt_cat2num('gender',gender_dict,df_copied)
cnvrt_cat2num('ever_married',ever_married_dict,df_copied)

In [None]:
df_copied.info()

In [None]:
plt.figure(figsize=(15,20))
sns.heatmap(df_copied.select_dtypes(include=['number']).corr()[['stroke']].sort_values(by='stroke',ascending=False),
            annot=True, linewidths=2, linecolor='white',cbar=False, cmap = 'coolwarm')
plt.title('STROKE CORRELATION ANALYSIS')
plt.show()

In [None]:
X = df_copied[['age','heart_disease','avg_glucose_level','hypertension','ever_married']]
y = df_copied['stroke']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
steps = [('scale',StandardScaler()), ('LR',LogisticRegression())]
log_reg_pipe = Pipeline(steps=steps)

In [None]:
# model training
log_reg_pipe.fit(X_train, y_train)

# model test
y_pred = log_reg_pipe.predict(X_test)


In [30]:
print("accuracy : \n",accuracy_score(y_test, y_pred))
print("cr : \n",classification_report(y_test, y_pred))   
print("cm : \n",confusion_matrix(y_test, y_pred))


accuracy : 
 0.9452054794520548
cr : 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       483
           1       0.00      0.00      0.00        28

    accuracy                           0.95       511
   macro avg       0.47      0.50      0.49       511
weighted avg       0.89      0.95      0.92       511

cm : 
 [[483   0]
 [ 28   0]]
