# Homework 1

In [111]:
%pip install -U sklearn pandas imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.9.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.3/199.3 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.9.1 imblearn-0.0
Note: you may need to restart the kernel to use updated packages.


# Preprocessing

In [63]:
import pandas as pd
import functools

data = pd.read_csv("brain_stroke.csv")

data["ever_married"] = data["ever_married"] == "Yes"
data["urban"] = data["Residence_type"] == "Urban"
data = data.drop("Residence_type", axis=1)
data["male"] = data["gender"] == "Male"
data = data.drop("gender", axis=1)

def one_hot(data, column):
  return data.drop(column, axis=1).join(pd.get_dummies(data[column], prefix=column))

data = functools.reduce(one_hot, ["work_type", "smoking_status"], data)
data = data.rename(lambda x: x.lower(), axis=1)

data.head()

Unnamed: 0,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke,urban,male,work_type_govt_job,work_type_private,work_type_self-employed,work_type_children,smoking_status_unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,True,228.69,36.6,1,True,True,0,1,0,0,0,1,0,0
1,80.0,0,1,True,105.92,32.5,1,False,True,0,1,0,0,0,0,1,0
2,49.0,0,0,True,171.23,34.4,1,True,False,0,1,0,0,0,0,0,1
3,79.0,1,0,True,174.12,24.0,1,False,False,0,0,1,0,0,0,1,0
4,81.0,0,0,True,186.21,29.0,1,True,True,0,1,0,0,0,1,0,0


# Train/test split

In [99]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.1)
train_X, train_y = train.drop("stroke", axis=1), train["stroke"]
test_X, test_y = test.drop("stroke", axis=1), test["stroke"]

# normalization
mean, std = train_X.mean(), train_X.std()
train_X = (train_X - mean) / std
test_X = (test_X - mean) / std

In [100]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

def metrics(name, pred, gt):
  print(f"Results for {name}")
  print(" - accuracy: ", accuracy_score(gt, pred))
  print(" - precision: ", precision_score(gt, pred))
  print(" - recall: ", recall_score(gt, pred))
  print(" - f1 score: ", f1_score(gt, pred))

# Logistic regression

Applied balanced class weight due to class imbalance.

In [101]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

model = LogisticRegression(max_iter=1000, class_weight="balanced").fit(train_X, train_y)

pred = model.predict(test_X)

metrics("Logistic Regression", pred, test_y)

Results for Logistic Regression
 - accuracy:  0.7194388777555111
 - precision:  0.1518987341772152
 - recall:  0.8
 - f1 score:  0.2553191489361702


# Random forest

Similarly to previous, applied balanced class weight. Reduced max depth in order to prevent overfitting.

In [102]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(class_weight="balanced", max_depth=6).fit(train_X, train_y)

pred = model.predict(test_X)

metrics("Random Forest", pred, test_y)

Results for Random Forest
 - accuracy:  0.7635270541082164
 - precision:  0.16153846153846155
 - recall:  0.7
 - f1 score:  0.2625


# Neural network

This time in order to account for class imbalance applied oversampling. Also reduced hidden layer sizes to reduce overfitting.

In [129]:
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE

oversample = SMOTE()

X, y = oversample.fit_resample(train_X, train_y)

model = MLPClassifier(max_iter=10_000, hidden_layer_sizes=(7,)).fit(X, y)

pred = model.predict(test_X)

metrics("Random Forest", pred, test_y)

Results for Random Forest
 - accuracy:  0.7434869739478958
 - precision:  0.15
 - recall:  0.7
 - f1 score:  0.24705882352941178
