In [2]:
# import library
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [3]:
# load dataset
dataset = 'healthcare-dataset-stroke-data.csv'

df = pd.read_csv(dataset)
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Data Dictionary

 | Column Name         | Description                                           |
|---------------------|-------------------------------------------------------|
| id   | unique identifier                     |
| gender            | "Male", "Female" or "Other"           |
| age       | age of the patient     |
| hypertension            | 0 if the patient doesn't have hypertension, 1 if the patient has hypertension               |
| heart_disease       | 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease                      |
| ever_married       |  "No" or "Yes"                      |
| work_type   | "children", "Govt_jov", "Never_worked", "Private" or "Self-employed"                  |
| residence_type   | "Rural" or "Urban"                  |
| avg_glucose_level   | average glucose level in blood                  |
| bmi   | body mass index                  |
| smoke_status   | "formerly smoked", "never smoked", "smokes" or "Unknown"                  |
| stroke   | 1 if the patient had a stroke or 0 if not                 |

In [4]:
df.shape

(5110, 12)

In [5]:
df.drop('id', axis=1, inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


In [7]:
nums = [i for i in df.columns if df[i].dtypes != 'object']
cats = [i for i in df.columns if df[i].dtypes == 'object']

In [8]:
df[nums].describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.245,23.5,0.0
50%,45.0,0.0,0.0,91.885,28.1,0.0
75%,61.0,0.0,0.0,114.09,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


In [9]:
df[cats].describe()

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
count,5110,5110,5110,5110,5110
unique,3,2,5,2,4
top,Female,Yes,Private,Urban,never smoked
freq,2994,3353,2925,2596,1892


In [10]:
def print_cat(df):
  for i in cats:
    print(f'column: {i}')
    print(df[i].value_counts())
    print()

In [11]:
df2 = df[(df.gender != 'Other') & (df.smoking_status != 'formerly smoked') & (df.smoking_status != 'Unknown')]

In [12]:
df2.isnull().sum()

gender                0
age                   0
hypertension          0
heart_disease         0
ever_married          0
work_type             0
Residence_type        0
avg_glucose_level     0
bmi                  92
smoking_status        0
stroke                0
dtype: int64

In [13]:
df2.bmi = df2.bmi.fillna(df2.bmi.mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.bmi = df2.bmi.fillna(df2.bmi.mean())


In [14]:
df2.bmi.isnull().sum()

0

In [15]:
def unique_val(df2):
  for col in df2:
    if df2[col].dtypes == 'object':
      print(f'{col} = {df2[col].unique()}')

In [16]:
unique_val(df2)

gender = ['Female' 'Male']
ever_married = ['Yes' 'No']
work_type = ['Self-employed' 'Private' 'Govt_job' 'children' 'Never_worked']
Residence_type = ['Rural' 'Urban']
smoking_status = ['never smoked' 'smokes']


In [17]:
df2['smoking_status'] = df2['smoking_status'].replace({'never smoked': 'No', 'smokes': 'Yes'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['smoking_status'] = df2['smoking_status'].replace({'never smoked': 'No', 'smokes': 'Yes'})


In [18]:
df2['gender'] = df2['gender'].replace({'Female': 0, 'Male': 1})
df2['Residence_type'] = df2['Residence_type'].replace({'Rural': 0, 'Urban': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['gender'] = df2['gender'].replace({'Female': 0, 'Male': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Residence_type'] = df2['Residence_type'].replace({'Rural': 0, 'Urban': 1})


In [19]:
unique_val(df2)

ever_married = ['Yes' 'No']
work_type = ['Self-employed' 'Private' 'Govt_job' 'children' 'Never_worked']
smoking_status = ['No' 'Yes']


In [20]:
col_yes_no = ['ever_married', 'smoking_status']

for i in col_yes_no:
  df2[i].replace({'No': 0, 'Yes':1}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[i].replace({'No': 0, 'Yes':1}, inplace=True)


In [21]:
df3 = pd.get_dummies(df2, columns=['work_type'])

In [22]:
df3 = df3.astype('int64')

In [23]:
df3.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children
1,0,61,0,0,1,0,202,30,0,1,0,0,0,1,0
2,1,80,0,1,1,0,105,32,0,1,0,0,1,0,0
3,0,49,0,0,1,1,171,34,1,1,0,0,1,0,0
4,0,79,1,0,1,0,174,24,0,1,0,0,0,1,0
6,1,74,1,1,1,0,70,27,0,1,0,0,1,0,0


In [24]:
df3 = df3.rename(columns={'work_type_Self-employed': 'self_employed', 'work_type_Govt_job': 'govt_job', 'work_type_Never_worked': 'never_worked', 'work_type_Private': 'private', 'work_type_children': 'children'})

In [25]:
df3 = pd.concat([df3.drop(columns=['stroke']), df[['stroke']]], axis=1)
df3.sample(1, random_state=0)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,govt_job,never_worked,private,self_employed,children,stroke
76,1.0,80.0,1.0,0.0,1.0,1.0,240.0,27.0,0.0,0.0,0.0,1.0,0.0,0.0,1


In [26]:
df3.dropna(inplace=True)

In [27]:
X = df3.drop('stroke', axis=1)
y = df3['stroke']

In [28]:
y.value_counts()

stroke
0    2549
1     132
Name: count, dtype: int64

In [29]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_sampling, y_sampling = smote.fit_resample(X, y)

y_sampling.value_counts()

stroke
1    2549
0    2549
Name: count, dtype: int64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_sampling, y_sampling, test_size=0.2, random_state=42)

In [31]:
y_test.value_counts()

stroke
0    517
1    503
Name: count, dtype: int64

In [32]:
y_train.value_counts()

stroke
1    2046
0    2032
Name: count, dtype: int64

In [34]:
from sklearn.model_selection import GridSearchCV

dtr = DecisionTreeClassifier()

params = {
    'max_depth': [4,8,12,16],
    'min_samples_leaf': [2,4,6,8],
    'min_samples_split': [2,4,6,8],
    'criterion': ['gini', 'entropy'],
    'random_state': [0,42]
}

grid_dtr = GridSearchCV(dtr, param_grid=params, cv = 5, scoring = 'roc_auc', n_jobs = -1, verbose = 2)
grid_dtr.fit(X_train, y_train)

print('Best parameters found: ', grid_dtr.best_params_)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


Best parameters found:  {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 8, 'min_samples_split': 2, 'random_state': 0}


In [40]:
dtr = DecisionTreeClassifier(criterion= 'entropy', max_depth= 16, min_samples_leaf= 8, min_samples_split= 2, random_state= 0)

dtr.fit(X_train, y_train)

In [41]:
print('training accuracy: ',dtr.score(X_train,y_train))
print()
print('testing accuracy: ', dtr.score(X_test,y_test))

training accuracy:  0.9693477194703286

testing accuracy:  0.9480392156862745


In [42]:
y_pred = dtr.predict(X_test)

In [43]:
print('Decision Tree Classifier: \n', classification_report(y_test, y_pred))

Decision Tree Classifier: 
               precision    recall  f1-score   support

           0       0.93      0.97      0.95       517
           1       0.96      0.93      0.95       503

    accuracy                           0.95      1020
   macro avg       0.95      0.95      0.95      1020
weighted avg       0.95      0.95      0.95      1020



In [39]:
import pickle

pickle.dump(dtr, open('model_stroke.sav', 'wb'))