In [207]:
import pandas as pd

# Read the stroke dataset from the CSV file located next to the notebook.
df = pd.read_csv("brain_stroke.csv")

# Number of missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

# Show the loaded frame as the cell's rich output.
df

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
4976,Male,41.0,0,0,No,Private,Rural,70.15,29.8,formerly smoked,0
4977,Male,40.0,0,0,Yes,Private,Urban,191.15,31.1,smokes,0
4978,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,31.8,smokes,0
4979,Male,40.0,0,0,Yes,Private,Rural,83.94,30.0,smokes,0


There is no missing data in the dataset.

In [208]:
# Number of records in each target class (0 = no stroke, 1 = stroke).
df.groupby("stroke").size()

stroke
0    4733
1     248
dtype: int64

The classes are heavily imbalanced: only 248 of the 4,981 records (about 5%) are stroke cases. This will have to be taken into account when training the model.

In [209]:
# Print the category sizes of each text column in turn.
for column in ("Residence_type", "ever_married", "gender"):
    print(df.groupby([column]).size())

Residence_type
Rural    2449
Urban    2532
dtype: int64
ever_married
No     1701
Yes    3280
dtype: int64
gender
Female    2907
Male      2074
dtype: int64


Each of these columns takes only two values, so they can easily be encoded as binary features.

In [210]:
# Category sizes of the work_type column.
work_type_counts = df.groupby(["work_type"]).size()
print(work_type_counts)

# smoking_status broken down by the stroke target.
smoking_by_stroke = df.groupby(["smoking_status", "stroke"]).size()
print(smoking_by_stroke)

work_type
Govt_job          644
Private          2860
Self-employed     804
children          673
dtype: int64
smoking_status   stroke
Unknown          0         1453
                 1           47
formerly smoked  0          797
                 1           70
never smoked     0         1749
                 1           89
smokes           0          734
                 1           42
dtype: int64


The work type can be encoded with one-hot encoding. The smoking_status column looks useless, because a large share of its values (1,500 of 4,981) is "Unknown".

In [211]:
from sklearn.preprocessing import minmax_scale, OrdinalEncoder

binary_cols = ["Residence_type", "ever_married", "gender"]
numeric_cols = ["age", "avg_glucose_level", "bmi"]

# Replace the two-valued text columns with numeric codes.
enc = OrdinalEncoder()
df[binary_cols] = enc.fit_transform(df[binary_cols])

# One-hot encode work_type, then discard smoking_status.
df = pd.get_dummies(df, columns=["work_type"]).drop(columns=["smoking_status"])

# Rescale the continuous features with min-max scaling.
# NOTE(review): the scaling is computed on the whole frame, i.e. before the
# train/test split performed in a later cell.
df[numeric_cols] = minmax_scale(df[numeric_cols])
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Private,work_type_Self-employed,work_type_children
0,1.0,0.816895,0,1,1.0,1.0,0.801265,0.647564,1,0,1,0,0
1,1.0,0.975586,0,1,1.0,0.0,0.234512,0.530086,1,0,1,0,0
2,0.0,0.597168,0,0,1.0,1.0,0.536008,0.584527,1,0,1,0,0
3,0.0,0.963379,1,0,1.0,0.0,0.549349,0.286533,1,0,0,1,0
4,1.0,0.987793,0,0,1.0,1.0,0.605161,0.429799,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4976,1.0,0.499512,0,0,0.0,0.0,0.069384,0.452722,0,0,1,0,0
4977,1.0,0.487305,0,0,1.0,1.0,0.627966,0.489971,0,0,1,0,0
4978,0.0,0.548340,1,0,1.0,0.0,0.184194,0.510029,0,1,0,0,0
4979,1.0,0.487305,0,0,1.0,0.0,0.133044,0.458453,0,0,1,0,0


In [274]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# One seed for every stochastic step in this cell, so that a fresh
# "Restart & Run All" reproduces both the split and the synthetic samples.
RANDOM_STATE = 5

# Features are every column except the target.
X = df.drop("stroke", axis=1)
y = df["stroke"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, random_state=RANDOM_STATE
)

# Oversample the training split only; X_test / y_test are not resampled.
# SMOTE draws random synthetic samples, so it is seeded as well; without a
# seed the training set (and every metric computed afterwards) would change
# from run to run.
oversample = SMOTE(random_state=RANDOM_STATE)
X_train, y_train = oversample.fit_resample(X_train, y_train)

To deal with the unbalanced dataset, I performed oversampling (SMOTE) on the training dataset only; the test dataset is left unchanged.

In [275]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logistic regression fitted on the training data; max_iter=1000 caps the
# number of solver iterations.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_pred = lr.predict(X_test)
y_pred_train = lr.predict(X_train)

# Test report first, then the training report for comparison.
for y_true, y_hat in ((y_test, y_pred), (y_train, y_pred_train)):
    print(classification_report(y_true, y_hat))

              precision    recall  f1-score   support

           0       0.98      0.74      0.85      1182
           1       0.14      0.75      0.23        64

    accuracy                           0.74      1246
   macro avg       0.56      0.75      0.54      1246
weighted avg       0.94      0.74      0.81      1246

              precision    recall  f1-score   support

           0       0.82      0.73      0.77      3551
           1       0.76      0.84      0.80      3551

    accuracy                           0.79      7102
   macro avg       0.79      0.79      0.79      7102
weighted avg       0.79      0.79      0.79      7102



In [278]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees limited to depth 4 and a fixed seed.
rfc = RandomForestClassifier(max_depth=4, random_state=0).fit(X_train, y_train)

y_pred = rfc.predict(X_test)
y_pred_train = rfc.predict(X_train)

# Test report first, then the training report for comparison.
for y_true, y_hat in ((y_test, y_pred), (y_train, y_pred_train)):
    print(classification_report(y_true, y_hat))

              precision    recall  f1-score   support

           0       0.98      0.71      0.82      1182
           1       0.13      0.80      0.22        64

    accuracy                           0.71      1246
   macro avg       0.56      0.75      0.52      1246
weighted avg       0.94      0.71      0.79      1246

              precision    recall  f1-score   support

           0       0.88      0.70      0.78      3551
           1       0.75      0.91      0.82      3551

    accuracy                           0.80      7102
   macro avg       0.82      0.80      0.80      7102
weighted avg       0.82      0.80      0.80      7102



The max_depth of the trees in the random forest had to be significantly reduced to avoid overfitting. The shallow forest reaches a higher recall for the stroke class on the test set than logistic regression (0.80 vs. 0.75 in the reports above), at the cost of slightly lower accuracy (0.71 vs. 0.74). The recall is at a good level, which is satisfactory for this type of classification: in this case it is better to have more false positives (poor precision) than many false negatives.