In [1]:
# Data Manipulation
import numpy as np
import pandas as pd

# Misc
import os

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Model selection
from sklearn.model_selection import StratifiedKFold
import optuna

## Load the data

In [2]:
train_df = pd.read_csv(os.path.join("..", "data", "train_augmented_50.csv"))
test_df = pd.read_csv(os.path.join("..", "data", "test.csv"))

# Encode categorical data but save encodings to dict.
# The category -> integer mapping is derived from the TRAIN split only and then
# applied unchanged to the test split. Encoding each split on its own (as
# `astype("category")` does per frame) assigns codes by the sorted categories
# present in that frame, so the same integer could mean different categories in
# train and test whenever one split lacks a category.
# Test values that never occur in train (and missing values) are encoded as -1.
cat_codes_train = {}
cat_codes_test = {}
for col in ["gender", "work_type", "smoking_status"]:
    shared_dtype = pd.CategoricalDtype(train_df[col].astype("category").cat.categories)

    cat_codes_train[col] = dict(enumerate(shared_dtype.categories))
    # Kept as a separate dict for backward compatibility; identical by construction.
    cat_codes_test[col] = dict(cat_codes_train[col])

    train_df[col] = train_df[col].astype(shared_dtype).cat.codes
    test_df[col] = test_df[col].astype(shared_dtype).cat.codes

# Separate labels from data
X_train = train_df.drop("stroke", axis=1).to_numpy()
y_train = train_df.stroke.to_numpy()

X_test = test_df.drop("stroke", axis=1).to_numpy()
y_test = test_df.stroke.to_numpy()

In [3]:
# Preview the first rows of the training frame after the categorical columns were integer-encoded
train_df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,urban_residence,is_obese,has_disease,stroke
0,0,36.0,False,False,True,2,68.48,24.3,2,True,False,False,0
1,0,27.0,False,False,False,2,104.21,35.7,2,False,True,True,0
2,0,40.0,False,False,True,2,72.99,46.4,0,True,True,True,0
3,0,44.0,False,False,True,2,124.06,20.8,2,True,False,False,0
4,0,81.0,False,False,True,5,95.84,21.5,2,True,False,False,1


In [4]:
# Report the share of stroke-labelled rows in each split as a percentage
for split_name, frame in (("train", train_df), ("test", test_df)):
    stroke_pct = 100 * frame.stroke.sum() / len(frame)
    print(f"Fraction of stroke samples in {split_name} data: {stroke_pct:.2f}%")

Fraction of stroke samples in train data: 33.33%
Fraction of stroke samples in test data: 4.90%


## Model selection

We have a plethora of models to choose from. In addition, we can try different train sets (differing in the strength of augmentation). The models we will try are the following:

- Decision Tree
- Random Forest
- kNN
- SVM

Keep in mind that, in order to be 100% correct, we would need to conduct a nested cross-validation (CV) in order to perform both hyperparameter selection and model selection on the train dataset. We will skip this and use a single CV to find hyperparameters for each model. The resulting model will be tested in the test dataset.

For the selection of hyperparameters we will use Optuna. This framework lets us search the hyperparameter space with Bayesian optimization, as opposed to random search or grid search. The metric we'll use is the F1-score. Being a standard classification metric, we can use it under the assumption that precision (the fraction of predicted strokes that are actual strokes; lowered by false positives) and recall (the fraction of actual strokes that are detected; lowered by false negatives) are both equally important to us.

In [5]:
def perform_cv(model_class, params, score, X=None, y=None, n_splits=5, random_state=42):
    """
    Performs a stratified k-fold CV with varying models, hyperparameters and scoring function.

    Parameters
    ----------
    model_class : estimator class
        Called as ``model_class(**params)``; a fresh instance is fitted per fold.
    params : dict
        Keyword arguments forwarded to ``model_class``.
    score : callable
        Called as ``score(y_true, y_pred)`` on each validation fold.
    X, y : array-like, optional
        Features and labels to cross-validate on. Default to the notebook-level
        ``X_train`` / ``y_train`` so existing three-argument calls keep working.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed for the shuffled fold assignment.

    Returns
    -------
    Mean of the per-fold scores.

    NOTE(review): if the data contains oversampled/synthetic rows, near-duplicate
    points can end up on both sides of a split, which makes the CV score optimistic.
    """
    X = X_train if X is None else X
    y = y_train if y is None else y

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_scores = []

    for train_index, val_index in cv.split(X, y):
        # New, unfitted model for every fold so no state leaks between folds
        model = model_class(**params)
        model.fit(X[train_index], y[train_index])

        y_pred = model.predict(X[val_index])
        fold_scores.append(score(y[val_index], y_pred))

    return np.mean(fold_scores)

In [6]:
def print_result_summary(y_test, y_pred):
    """
    Print the per-class classification report followed by the four confusion-matrix counts.

    Parameters
    ----------
    y_test : array-like of 0/1 labels (0 = "Healthy", 1 = "Stroke"), the ground truth.
    y_pred : array-like of 0/1 labels, the predictions.

    The label set is pinned to ``[0, 1]`` so the confusion matrix is always 2x2.
    Without it, inputs containing a single class yield a 1x1 matrix and the zip
    below would silently print that one count under the name "TN".
    """
    labels = [0, 1]
    print(classification_report(y_test, y_pred, labels=labels, target_names=["Healthy", "Stroke"]))

    print("=======================================================\n")
    categories = ["TN", "FP", "FN", "TP"]
    # Row-major flattening of [[TN, FP], [FN, TP]] matches the order of `categories`
    conf_mat = confusion_matrix(y_test, y_pred, labels=labels).flatten()

    for name, count in zip(categories, conf_mat):
        print(f"{name}: {count}")

### Decision Tree

In [7]:
# Estimator class to tune in this section; `score` is set to the F1 metric
model_class = DecisionTreeClassifier
score = f1_score

In [8]:
study = optuna.create_study(direction="maximize")  # We want to maximize the F1 score

def objective_cv(trial):
    """Optuna objective: cross-validated score of a decision tree for the sampled hyperparameters."""
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 5),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", None]),
        # Fixed (not tuned), so it does not appear in `trial.params`
        "random_state": 42,
    }

    # Perform cross validation with the metric configured for this section
    return perform_cv(model_class, params, score)

[32m[I 2021-05-07 11:20:27,194][0m A new study created in memory with name: no-name-e6c4f6bf-26bf-4d7c-9e26-56f3cf155795[0m


In [9]:
# Run 200 trials of the objective, using up to 6 parallel jobs
study.optimize(objective_cv, n_trials=200, n_jobs=6)

[32m[I 2021-05-07 11:20:27,317][0m Trial 1 finished with value: 0.6671447101940743 and parameters: {'max_depth': 3, 'min_samples_split': 2, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.6671447101940743.[0m
[32m[I 2021-05-07 11:20:27,351][0m Trial 3 finished with value: 0.7985162260490701 and parameters: {'max_depth': 18, 'min_samples_split': 5, 'max_features': 'sqrt'}. Best is trial 3 with value: 0.7985162260490701.[0m
[32m[I 2021-05-07 11:20:27,367][0m Trial 0 finished with value: 0.7953658681328213 and parameters: {'max_depth': 30, 'min_samples_split': 4, 'max_features': 'sqrt'}. Best is trial 3 with value: 0.7985162260490701.[0m
[32m[I 2021-05-07 11:20:27,377][0m Trial 2 finished with value: 0.7677833459572319 and parameters: {'max_depth': 7, 'min_samples_split': 4, 'max_features': None}. Best is trial 3 with value: 0.7985162260490701.[0m
[32m[I 2021-05-07 11:20:27,393][0m Trial 4 finished with value: 0.7745655677369419 and parameters: {'max_depth': 8, 'min_s

[32m[I 2021-05-07 11:20:28,533][0m Trial 38 finished with value: 0.8094591631051944 and parameters: {'max_depth': 13, 'min_samples_split': 2, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:28,571][0m Trial 40 finished with value: 0.6983190617757998 and parameters: {'max_depth': 3, 'min_samples_split': 2, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:28,589][0m Trial 39 finished with value: 0.6987857202187675 and parameters: {'max_depth': 4, 'min_samples_split': 2, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:28,627][0m Trial 42 finished with value: 0.6987857202187675 and parameters: {'max_depth': 4, 'min_samples_split': 2, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:28,651][0m Trial 41 finished with value: 0.8107315625425817 and parameters: {'max_depth': 22, 'mi

[32m[I 2021-05-07 11:20:29,863][0m Trial 76 finished with value: 0.8066669032837742 and parameters: {'max_depth': 18, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:29,892][0m Trial 77 finished with value: 0.8066669032837742 and parameters: {'max_depth': 18, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:29,949][0m Trial 78 finished with value: 0.8151531308063259 and parameters: {'max_depth': 15, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:29,999][0m Trial 81 finished with value: 0.8151531308063259 and parameters: {'max_depth': 15, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:30,000][0m Trial 79 finished with value: 0.8106003920681472 and parameters: {'max_depth': 16, 

[32m[I 2021-05-07 11:20:31,221][0m Trial 114 finished with value: 0.8151531308063259 and parameters: {'max_depth': 15, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:31,255][0m Trial 115 finished with value: 0.8151531308063259 and parameters: {'max_depth': 15, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:31,344][0m Trial 116 finished with value: 0.8151531308063259 and parameters: {'max_depth': 15, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:31,366][0m Trial 117 finished with value: 0.8119186439279709 and parameters: {'max_depth': 14, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:31,380][0m Trial 118 finished with value: 0.8119186439279709 and parameters: {'max_depth':

[32m[I 2021-05-07 11:20:32,561][0m Trial 152 finished with value: 0.8119186439279709 and parameters: {'max_depth': 14, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:32,609][0m Trial 153 finished with value: 0.8119186439279709 and parameters: {'max_depth': 14, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:32,638][0m Trial 154 finished with value: 0.8119186439279709 and parameters: {'max_depth': 14, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:32,663][0m Trial 155 finished with value: 0.8119186439279709 and parameters: {'max_depth': 14, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:32,665][0m Trial 157 finished with value: 0.8119186439279709 and parameters: {'max_depth':

[32m[I 2021-05-07 11:20:33,934][0m Trial 191 finished with value: 0.8151531308063259 and parameters: {'max_depth': 15, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:33,954][0m Trial 193 finished with value: 0.8151531308063259 and parameters: {'max_depth': 15, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:33,957][0m Trial 192 finished with value: 0.8151531308063259 and parameters: {'max_depth': 15, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:33,957][0m Trial 190 finished with value: 0.8151531308063259 and parameters: {'max_depth': 15, 'min_samples_split': 3, 'max_features': None}. Best is trial 14 with value: 0.8151531308063259.[0m
[32m[I 2021-05-07 11:20:34,088][0m Trial 194 finished with value: 0.8151531308063259 and parameters: {'max_depth':

In [10]:
# Objective value (mean CV score) of the best trial in the study
print(f"With the best trial we achieve a mean F1 score of {study.best_trial.value:.4f} over all splits")

With the best trial we achieve a mean F1 score of 0.8152 over all splits


In [11]:
# Refit the best configuration on the full training set.
# `random_state` was fixed inside the objective rather than suggested by the trial,
# so it is not part of `best_trial.params`; pass it explicitly so the final model
# matches the configuration scored during CV and the result is reproducible.
tuned_model = model_class(**study.best_trial.params, random_state=42)
tuned_model.fit(X_train, y_train)

# Single evaluation on the held-out test split
y_pred = tuned_model.predict(X_test)
f1_test = f1_score(y_test, y_pred)

print(f"On the independent test set, we achieve an F1 score of {f1_test:.4f} with {model_class.__name__}")

On the independent test set, we achieve an F1 score of 0.2138 with DecisionTreeClassifier


In [12]:
# Classification report and confusion-matrix counts for the test-set predictions above
print_result_summary(y_test, y_pred)

              precision    recall  f1-score   support

     Healthy       0.96      0.91      0.93       971
      Stroke       0.16      0.34      0.21        50

    accuracy                           0.88      1021
   macro avg       0.56      0.62      0.57      1021
weighted avg       0.92      0.88      0.90      1021


TN: 879
FP: 92
FN: 33
TP: 17


### RandomForest

In [13]:
# Estimator class to tune in this section; `score` is set to the F1 metric
model_class = RandomForestClassifier
score = f1_score

In [14]:
study = optuna.create_study(direction="maximize")  # We want to maximize the F1 score

def objective_cv(trial):
    """Optuna objective: cross-validated score of a random forest for the sampled hyperparameters."""
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 25),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 5),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", None]),
        "n_estimators": trial.suggest_int("n_estimators", 5, 100),
        # Fixed (not tuned), so it does not appear in `trial.params`
        "random_state": 42,
    }

    # Perform cross validation with the metric configured for this section
    return perform_cv(model_class, params, score)

[32m[I 2021-05-07 11:20:34,178][0m A new study created in memory with name: no-name-d9577a57-35e9-4718-a2ab-281aec93143e[0m


In [15]:
# Run 200 trials of the objective, using up to 6 parallel jobs
study.optimize(objective_cv, n_trials=200, n_jobs=6)

[32m[I 2021-05-07 11:20:35,428][0m Trial 4 finished with value: 0.7292478438120533 and parameters: {'max_depth': 3, 'min_samples_split': 4, 'max_features': 'sqrt', 'n_estimators': 35}. Best is trial 4 with value: 0.7292478438120533.[0m
[32m[I 2021-05-07 11:20:35,626][0m Trial 3 finished with value: 0.7276261541758624 and parameters: {'max_depth': 3, 'min_samples_split': 3, 'max_features': 'sqrt', 'n_estimators': 42}. Best is trial 4 with value: 0.7292478438120533.[0m
[32m[I 2021-05-07 11:20:35,874][0m Trial 5 finished with value: 0.8625382704490431 and parameters: {'max_depth': 16, 'min_samples_split': 5, 'max_features': 'sqrt', 'n_estimators': 29}. Best is trial 5 with value: 0.8625382704490431.[0m
[32m[I 2021-05-07 11:20:36,215][0m Trial 2 finished with value: 0.755115628646327 and parameters: {'max_depth': 5, 'min_samples_split': 5, 'max_features': 'sqrt', 'n_estimators': 51}. Best is trial 5 with value: 0.8625382704490431.[0m
[32m[I 2021-05-07 11:20:37,277][0m Trial 8

[32m[I 2021-05-07 11:20:59,478][0m Trial 35 finished with value: 0.8748256586213566 and parameters: {'max_depth': 17, 'min_samples_split': 3, 'max_features': 'sqrt', 'n_estimators': 74}. Best is trial 31 with value: 0.8798651673616449.[0m
[32m[I 2021-05-07 11:21:00,179][0m Trial 36 finished with value: 0.8744023625989499 and parameters: {'max_depth': 18, 'min_samples_split': 3, 'max_features': 'sqrt', 'n_estimators': 79}. Best is trial 31 with value: 0.8798651673616449.[0m
[32m[I 2021-05-07 11:21:01,106][0m Trial 37 finished with value: 0.8754021117963626 and parameters: {'max_depth': 23, 'min_samples_split': 3, 'max_features': 'sqrt', 'n_estimators': 78}. Best is trial 31 with value: 0.8798651673616449.[0m
[32m[I 2021-05-07 11:21:01,709][0m Trial 38 finished with value: 0.8743998848226948 and parameters: {'max_depth': 18, 'min_samples_split': 3, 'max_features': 'sqrt', 'n_estimators': 77}. Best is trial 31 with value: 0.8798651673616449.[0m
[32m[I 2021-05-07 11:21:01,944]

[32m[I 2021-05-07 11:21:17,527][0m Trial 67 finished with value: 0.8613014732276504 and parameters: {'max_depth': 12, 'min_samples_split': 2, 'max_features': 'sqrt', 'n_estimators': 59}. Best is trial 69 with value: 0.8822954285837232.[0m
[32m[I 2021-05-07 11:21:19,397][0m Trial 70 finished with value: 0.8746407272525456 and parameters: {'max_depth': 16, 'min_samples_split': 2, 'max_features': None, 'n_estimators': 44}. Best is trial 69 with value: 0.8822954285837232.[0m
[32m[I 2021-05-07 11:21:19,479][0m Trial 71 finished with value: 0.871397775142548 and parameters: {'max_depth': 17, 'min_samples_split': 2, 'max_features': None, 'n_estimators': 46}. Best is trial 69 with value: 0.8822954285837232.[0m
[32m[I 2021-05-07 11:21:19,683][0m Trial 72 finished with value: 0.8737398187155263 and parameters: {'max_depth': 16, 'min_samples_split': 2, 'max_features': None, 'n_estimators': 43}. Best is trial 69 with value: 0.8822954285837232.[0m
[32m[I 2021-05-07 11:21:20,210][0m Tr

[32m[I 2021-05-07 11:21:39,025][0m Trial 102 finished with value: 0.8798651673616449 and parameters: {'max_depth': 22, 'min_samples_split': 2, 'max_features': 'sqrt', 'n_estimators': 94}. Best is trial 69 with value: 0.8822954285837232.[0m
[32m[I 2021-05-07 11:21:40,010][0m Trial 104 finished with value: 0.8745554780032517 and parameters: {'max_depth': 22, 'min_samples_split': 3, 'max_features': 'sqrt', 'n_estimators': 100}. Best is trial 69 with value: 0.8822954285837232.[0m
[32m[I 2021-05-07 11:21:41,062][0m Trial 108 finished with value: 0.8766590642430871 and parameters: {'max_depth': 23, 'min_samples_split': 2, 'max_features': 'sqrt', 'n_estimators': 41}. Best is trial 69 with value: 0.8822954285837232.[0m
[32m[I 2021-05-07 11:21:41,114][0m Trial 105 finished with value: 0.8738923339566196 and parameters: {'max_depth': 22, 'min_samples_split': 3, 'max_features': 'sqrt', 'n_estimators': 93}. Best is trial 69 with value: 0.8822954285837232.[0m
[32m[I 2021-05-07 11:21:41

[32m[I 2021-05-07 11:22:00,004][0m Trial 134 finished with value: 0.8757563905961169 and parameters: {'max_depth': 18, 'min_samples_split': 2, 'max_features': 'sqrt', 'n_estimators': 73}. Best is trial 128 with value: 0.8828503895567692.[0m
[32m[I 2021-05-07 11:22:00,761][0m Trial 138 finished with value: 0.8738882124222803 and parameters: {'max_depth': 17, 'min_samples_split': 2, 'max_features': 'sqrt', 'n_estimators': 46}. Best is trial 128 with value: 0.8828503895567692.[0m
[32m[I 2021-05-07 11:22:01,176][0m Trial 139 finished with value: 0.8722255180151169 and parameters: {'max_depth': 15, 'min_samples_split': 2, 'max_features': 'sqrt', 'n_estimators': 45}. Best is trial 128 with value: 0.8828503895567692.[0m
[32m[I 2021-05-07 11:22:01,755][0m Trial 140 finished with value: 0.8722255180151169 and parameters: {'max_depth': 15, 'min_samples_split': 2, 'max_features': 'sqrt', 'n_estimators': 45}. Best is trial 128 with value: 0.8828503895567692.[0m
[32m[I 2021-05-07 11:22

[32m[I 2021-05-07 11:22:18,262][0m Trial 171 finished with value: 0.8807925701035684 and parameters: {'max_depth': 16, 'min_samples_split': 2, 'max_features': 'sqrt', 'n_estimators': 68}. Best is trial 152 with value: 0.8832346662271716.[0m
[32m[I 2021-05-07 11:22:18,340][0m Trial 172 finished with value: 0.8811907057608682 and parameters: {'max_depth': 16, 'min_samples_split': 2, 'max_features': 'sqrt', 'n_estimators': 67}. Best is trial 152 with value: 0.8832346662271716.[0m
[32m[I 2021-05-07 11:22:18,813][0m Trial 173 finished with value: 0.8775701809670678 and parameters: {'max_depth': 15, 'min_samples_split': 2, 'max_features': 'sqrt', 'n_estimators': 67}. Best is trial 152 with value: 0.8832346662271716.[0m
[32m[I 2021-05-07 11:22:19,390][0m Trial 174 finished with value: 0.8775701809670678 and parameters: {'max_depth': 15, 'min_samples_split': 2, 'max_features': 'sqrt', 'n_estimators': 67}. Best is trial 152 with value: 0.8832346662271716.[0m
[32m[I 2021-05-07 11:22

In [16]:
# Objective value (mean CV score) of the best trial in the study
print(f"With the best trial we achieve a mean F1 score of {study.best_trial.value:.4f} over all splits")

With the best trial we achieve a mean F1 score of 0.8832 over all splits


In [17]:
# Refit the best configuration on the full training set.
# `random_state` was fixed inside the objective rather than suggested by the trial,
# so it is not part of `best_trial.params`; pass it explicitly so the final forest
# matches the configuration scored during CV and the result is reproducible.
tuned_model = model_class(**study.best_trial.params, random_state=42)
tuned_model.fit(X_train, y_train)

# Single evaluation on the held-out test split
y_pred = tuned_model.predict(X_test)
f1_test = f1_score(y_test, y_pred)

print(f"On the independent test set, we achieve an F1 score of {f1_test:.4f} with {model_class.__name__}")

On the independent test set, we achieve an F1 score of 0.2576 with RandomForestClassifier


In [18]:
# Classification report and confusion-matrix counts for the test-set predictions above
print_result_summary(y_test, y_pred)

              precision    recall  f1-score   support

     Healthy       0.96      0.93      0.95       971
      Stroke       0.21      0.34      0.26        50

    accuracy                           0.90      1021
   macro avg       0.59      0.64      0.60      1021
weighted avg       0.93      0.90      0.91      1021


TN: 906
FP: 65
FN: 33
TP: 17


### k-Nearest Neighbor

In [19]:
# Estimator class to tune in this section; `score` is set to the F1 metric
model_class = KNeighborsClassifier
score = f1_score

In [20]:
study = optuna.create_study(direction="maximize")  # We want to maximize the F1 score

def objective_cv(trial):
    """Optuna objective: cross-validated score of a kNN classifier for the sampled hyperparameters."""
    params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 3, 10),
        "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
    }

    # Perform cross validation with the metric configured for this section
    return perform_cv(model_class, params, score)

[32m[I 2021-05-07 11:22:32,506][0m A new study created in memory with name: no-name-ec2718f7-2c87-4f07-ad90-d4f4c1cbd8a4[0m


In [21]:
# 16 trials equals the size of the discrete search space (8 neighbour counts x 2 weightings).
# NOTE(review): the study's sampler is not a grid sampler, so configurations may repeat — confirm all 16 were visited.
study.optimize(objective_cv, n_trials=16, n_jobs=6)

[32m[I 2021-05-07 11:22:32,794][0m Trial 3 finished with value: 0.8491419505215116 and parameters: {'n_neighbors': 3, 'weights': 'distance'}. Best is trial 3 with value: 0.8491419505215116.[0m
[32m[I 2021-05-07 11:22:32,804][0m Trial 0 finished with value: 0.8378977239317973 and parameters: {'n_neighbors': 8, 'weights': 'distance'}. Best is trial 3 with value: 0.8491419505215116.[0m
[32m[I 2021-05-07 11:22:32,812][0m Trial 5 finished with value: 0.8323638724956341 and parameters: {'n_neighbors': 9, 'weights': 'distance'}. Best is trial 3 with value: 0.8491419505215116.[0m
[32m[I 2021-05-07 11:22:32,835][0m Trial 2 finished with value: 0.8405171513544263 and parameters: {'n_neighbors': 5, 'weights': 'distance'}. Best is trial 3 with value: 0.8491419505215116.[0m
[32m[I 2021-05-07 11:22:32,842][0m Trial 4 finished with value: 0.852516761699248 and parameters: {'n_neighbors': 4, 'weights': 'distance'}. Best is trial 4 with value: 0.852516761699248.[0m
[32m[I 2021-05-07 11:

In [22]:
# Objective value (mean CV score) of the best trial in the study
print(f"With the best trial we achieve a mean F1 score of {study.best_trial.value:.4f} over all splits")

With the best trial we achieve a mean F1 score of 0.8525 over all splits


In [23]:
# Rebuild the estimator from the best trial's parameters and fit it on all training rows
tuned_model = model_class(**study.best_trial.params).fit(X_train, y_train)

# Predict the held-out test split once and score those predictions
y_pred = tuned_model.predict(X_test)
f1_test = f1_score(y_true=y_test, y_pred=y_pred)

print(
    f"On the independent test set, we achieve an F1 score of {f1_test:.4f} with {model_class.__name__}"
)

On the independent test set, we achieve an F1 score of 0.2176 with KNeighborsClassifier


In [24]:
# Classification report and confusion-matrix counts for the test-set predictions above
print_result_summary(y_test, y_pred)

              precision    recall  f1-score   support

     Healthy       0.97      0.87      0.92       971
      Stroke       0.15      0.42      0.22        50

    accuracy                           0.85      1021
   macro avg       0.56      0.65      0.57      1021
weighted avg       0.93      0.85      0.88      1021


TN: 849
FP: 122
FN: 29
TP: 21


### Support Vector Machine

In [25]:
# Estimator class to tune in this section; `score` is set to the F1 metric
model_class = SVC
score = f1_score

In [26]:
study = optuna.create_study(direction="maximize")  # We want to maximize the F1 score

def objective_cv(trial):
    """Optuna objective: cross-validated score of an SVC for the sampled hyperparameters."""
    params = {
        # `suggest_float` without `log`/`step` samples uniformly, like the
        # deprecated `suggest_uniform` it replaces.
        "C": trial.suggest_float("C", 1e-5, 1),
        "kernel": trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"]),
        # Only the "poly" kernel uses `degree`; it is still sampled for every
        # trial so that `best_trial.params` always has the same keys.
        "degree": trial.suggest_int("degree", 3, 10),
    }

    # Perform cross validation with the metric configured for this section
    return perform_cv(model_class, params, score)

[32m[I 2021-05-07 11:22:35,455][0m A new study created in memory with name: no-name-8b420175-8fb4-4b3f-a4e8-a104c5b9726f[0m


In [27]:
# Run 30 trials of the objective, using up to 6 parallel jobs
study.optimize(objective_cv, n_trials=30, n_jobs=6)

[32m[I 2021-05-07 11:22:37,580][0m Trial 3 finished with value: 0.6833592618127287 and parameters: {'C': 0.8195204062870496, 'kernel': 'rbf', 'degree': 3}. Best is trial 3 with value: 0.6833592618127287.[0m
[32m[I 2021-05-07 11:22:37,619][0m Trial 2 finished with value: 0.6803505102677475 and parameters: {'C': 0.2738085890956289, 'kernel': 'rbf', 'degree': 3}. Best is trial 3 with value: 0.6833592618127287.[0m
[32m[I 2021-05-07 11:22:38,259][0m Trial 4 finished with value: 0.6835102632068962 and parameters: {'C': 0.7547421547318046, 'kernel': 'rbf', 'degree': 10}. Best is trial 4 with value: 0.6835102632068962.[0m
[32m[I 2021-05-07 11:22:39,883][0m Trial 6 finished with value: 0.6833592618127287 and parameters: {'C': 0.7618543411821297, 'kernel': 'rbf', 'degree': 10}. Best is trial 4 with value: 0.6835102632068962.[0m
[32m[I 2021-05-07 11:22:40,306][0m Trial 5 finished with value: 0.3138392900298324 and parameters: {'C': 0.6147870314962345, 'kernel': 'sigmoid', 'degree': 

In [28]:
# Objective value (mean CV score) of the best trial in the study
print(f"With the best trial we achieve a mean F1 score of {study.best_trial.value:.4f} over all splits")

With the best trial we achieve a mean F1 score of 0.7312 over all splits


In [29]:
# Rebuild the estimator from the best trial's parameters and fit it on all training rows
tuned_model = model_class(**study.best_trial.params).fit(X_train, y_train)

# Predict the held-out test split once and score those predictions
y_pred = tuned_model.predict(X_test)
f1_test = f1_score(y_true=y_test, y_pred=y_pred)

print(
    f"On the independent test set, we achieve an F1 score of {f1_test:.4f} with {model_class.__name__}"
)

On the independent test set, we achieve an F1 score of 0.2796 with SVC


In [30]:
# Classification report and confusion-matrix counts for the test-set predictions above
print_result_summary(y_test, y_pred)

              precision    recall  f1-score   support

     Healthy       0.97      0.89      0.93       971
      Stroke       0.19      0.52      0.28        50

    accuracy                           0.87      1021
   macro avg       0.58      0.70      0.60      1021
weighted avg       0.93      0.87      0.90      1021


TN: 861
FP: 110
FN: 24
TP: 26


## Conclusion

As you can see, every one of our classifiers achieves an accuracy of >= 85%. Now we can go get a beer and celebrate, we solved the task!

Jokes aside, in practice this would just be the starting point of modelling. Because of the imbalanced class problem (see "support" of Healthy and Stroke samples in the result summaries) accuracy is by no means suited as a metric for this problem.

A better metric is the F1-Score (although this is debatable, because it assumes that we assign the same importance to FNs (recall) as to FPs (precision). In practice, especially in medicine, we might have other assumptions, e.g. "FPs are not so bad, but we have to catch every stroke possible"). We see that, in terms of F1-Score, the classifiers are all pretty similar. The same is true for their performance on "Healthy" samples (precision/recall are similar for all classifiers).

The difference between the approaches is their performance on the "Stroke" class. We see the SVC model achieves the best F1-Score of all the models by having the highest recall (0.52) but a comparatively low precision (0.19) on the strokes. This is also shown by the confusion matrix: it has the most TPs overall (26) and the second-most FPs (110; only kNN has more, with 122). The tree-based approaches are more conservative, having a lower number of both TPs and FPs, resulting in a more balanced precision and recall score.

---

We didn't fully solve the task of predicting stroke patients, but we made the first step towards it. The main problem here is the sparsity of stroke patients, which we tackled via SMOTE. In addition, the features we have about the patients don't seem to be the most useful (e.g. work type, residence type). However, we did the data exploration and found some interesting patterns in the data, created new features and came up with classifiers which are by no means perfect but still an improvement. In this case the SVM classifier is the best one, especially if (like I described above) we make the assumption that in the medical domain the recall is more important than the precision.

What I didn't show in the notebooks is the comparison between different levels of augmentation in the training dataset. There we see that the best case is the one where SMOTE augments so much data that we end up with a 50:50 class distribution.