In [1]:
# Data Manipulation
import numpy as np
import pandas as pd

# Misc
import os

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Model selection
from sklearn.model_selection import StratifiedKFold
import optuna

## Load the data

In [2]:
# Load the augmented training split and the held-out test split.
train_df = pd.read_csv(os.path.join("..", "data", "train_augmented_20.csv"))  # Without SMOTE
test_df = pd.read_csv(os.path.join("..", "data", "test.csv"))

# Encode categorical data but save encodings to dict.
# The label -> code mapping is derived from the TRAIN split only and then applied
# to both splits. Encoding each split on its own would give the same label a
# different integer whenever the two splits do not contain exactly the same set
# of values, silently corrupting the test features.
cat_codes_train = {}
cat_codes_test = {}
for col in ["gender", "work_type", "smoking_status"]:
    categories = train_df[col].astype("category").cat.categories

    cat_codes_train[col] = dict(enumerate(categories))
    # Kept so existing lookups keep working; it is now the same mapping as train.
    cat_codes_test[col] = dict(cat_codes_train[col])

    train_df[col] = pd.Categorical(train_df[col], categories=categories).codes
    # Test labels that never occur in the train split are encoded as -1.
    test_df[col] = pd.Categorical(test_df[col], categories=categories).codes

# Separate labels from data
X_train = train_df.drop("stroke", axis=1).to_numpy()
y_train = train_df.stroke.to_numpy()

X_test = test_df.drop("stroke", axis=1).to_numpy()
y_test = test_df.stroke.to_numpy()

In [3]:
# Preview the first five rows of the encoded training frame.
train_df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,avg_glucose_level,bmi,smoking_status,urban_residence,is_obese,has_disease,stroke
0,0,36.0,False,False,True,2,68.48,24.3,2,True,False,False,0
1,0,27.0,False,False,False,2,104.21,35.7,2,False,True,True,0
2,0,40.0,False,False,True,2,72.99,46.4,0,True,True,True,0
3,0,44.0,False,False,True,2,124.06,20.8,2,True,False,False,0
4,0,81.0,False,False,True,5,95.84,21.5,2,True,False,False,1


In [21]:
# Share of positive (stroke == 1) rows in each split, expressed in percent.
train_stroke_pct = 100 * train_df.stroke.sum() / len(train_df)
test_stroke_pct = 100 * test_df.stroke.sum() / len(test_df)

print(f"Fraction of stroke samples in train data: {train_stroke_pct:.2f}%")
print(f"Fraction of stroke samples in test data: {test_stroke_pct:.2f}%")

Fraction of stroke samples in train data: 16.65%
Fraction of stroke samples in test data: 4.90%


## Model selection

We have a plethora of models to choose from. In addition, we can try different train sets (differing in the strength of augmentation). The models we will try are the following:

- Decision Tree
- Random Forest
- kNN
- SVM

Keep in mind that, to be fully rigorous, we would need a nested cross-validation (CV) to perform both hyperparameter selection and model selection on the train dataset. We will skip this and use a single CV to find the hyperparameters for each model. The resulting model will then be evaluated on the test dataset.

For the selection of hyperparameters we will use Optuna. This framework lets us search the hyperparameter space with Bayesian optimization rather than random search or grid search. The metric we'll use is the F1-score, the harmonic mean of precision (the fraction of predicted positives that are truly positive) and recall (the fraction of actual positives that the model detects). It is a standard classification metric, and it is appropriate under the assumption that precision and recall are equally important to us.

In [5]:
def perform_cv(model_class, params, score, X=None, y=None, n_splits=5, random_state=42):
    """
    Performs a stratified k-fold CV with varying models, hyperparameters and scoring function

    A fresh model is built and fitted for every fold, so no state is shared
    between folds.

    Parameters
    ----------
    model_class : callable
        Called as ``model_class(**params)`` once per fold. The returned object
        must provide ``fit(X, y)`` and ``predict(X)``.
    params : dict
        Keyword arguments forwarded to ``model_class``.
    score : callable
        Called as ``score(y_true, y_pred)`` on each validation fold.
    X, y : array-like, optional
        Features and labels to cross-validate on. When omitted, the
        notebook-level ``X_train`` / ``y_train`` are used (original behaviour).
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed for the shuffling done by ``StratifiedKFold``.

    Returns
    -------
    The mean of the per-fold scores.

    NOTE(review): the default data comes from ``train_augmented_20.csv``. If that
    file contains augmented variants of original rows, related samples can fall
    into both the fit and the validation part of a fold and inflate this score.
    Confirm how the augmentation was produced.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    X = np.asarray(X)
    y = np.asarray(y)

    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_results = []

    for train_index, val_index in cv.split(X, y):
        model = model_class(**params)
        model.fit(X[train_index], y[train_index])

        y_pred = model.predict(X[val_index])
        cv_results.append(score(y[val_index], y_pred))

    return np.mean(cv_results)

### Decision Tree

In [6]:
# Estimator and metric used by the Decision Tree cells below.
model_class, score = DecisionTreeClassifier, f1_score

In [7]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# TPESampler is what Optuna uses by default for single-objective studies; only the
# seed is new. Running study.optimize with n_jobs > 1 can still reorder trials.
study = optuna.create_study(
    direction="maximize",  # We want to maximize the F1 score
    sampler=optuna.samplers.TPESampler(seed=42),
)


def objective_cv(trial):
    """
    Optuna objective for the Decision Tree: mean CV score of one sampled configuration.

    Reads the notebook-level ``model_class`` and ``score`` at call time.
    """
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 5),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", None]),
        # Fixed, not suggested: it will NOT appear in study.best_trial.params.
        "random_state": 42,
    }

    # Perform cross validation with the metric selected in the config cell
    return perform_cv(model_class, params, score)

[32m[I 2021-05-02 12:27:32,350][0m A new study created in memory with name: no-name-b9bd8d90-7c19-415d-9817-c9f4c8c4fb5c[0m


In [None]:
# Run the hyperparameter search: 200 trials spread over 6 parallel workers.
study.optimize(func=objective_cv, n_trials=200, n_jobs=6)

In [9]:
# Mean cross-validated score of the best trial found by the study.
best_cv_f1 = study.best_trial.value
print(f"With the best trial we achieve a mean F1 score of {best_cv_f1:.4f} over all splits")

With the best trial we achieve a mean F1 score of 0.6489 over all splits


In [10]:
# study.best_trial.params only contains the values suggested by the trial, so the
# fixed random_state used in every CV fit has to be passed again explicitly.
# Without it the final model is unseeded: it differs from the tuned configuration
# and the test score changes from run to run.
tuned_model = model_class(**study.best_trial.params, random_state=42)
tuned_model.fit(X_train, y_train)

# Score the refitted model on the held-out test split.
y_pred = tuned_model.predict(X_test)
f1_test = f1_score(y_test, y_pred)

print(f"On the independent test set, we achieve an F1 score of {f1_test:.4f} with {model_class.__name__}")

On the independent test set, we achieve an F1 score of 0.2105 with DecisionTreeClassifier


### RandomForest

In [11]:
# Estimator and metric used by the Random Forest cells below.
model_class, score = RandomForestClassifier, f1_score

In [12]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# TPESampler is what Optuna uses by default for single-objective studies; only the
# seed is new. Running study.optimize with n_jobs > 1 can still reorder trials.
study = optuna.create_study(
    direction="maximize",  # We want to maximize the F1 score
    sampler=optuna.samplers.TPESampler(seed=42),
)


def objective_cv(trial):
    """
    Optuna objective for the Random Forest: mean CV score of one sampled configuration.

    Reads the notebook-level ``model_class`` and ``score`` at call time.
    """
    params = {
        "max_depth": trial.suggest_int("max_depth", 3, 25),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 5),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", None]),
        "n_estimators": trial.suggest_int("n_estimators", 5, 100),
        # Fixed, not suggested: it will NOT appear in study.best_trial.params.
        "random_state": 42,
    }

    # Perform cross validation with the metric selected in the config cell
    return perform_cv(model_class, params, score)

[32m[I 2021-05-02 12:27:38,454][0m A new study created in memory with name: no-name-5f0deea2-f5d3-490c-bc8c-943ef0f8eab7[0m


In [None]:
# Run the hyperparameter search: 200 trials spread over 6 parallel workers.
study.optimize(func=objective_cv, n_trials=200, n_jobs=6)

In [14]:
# Mean cross-validated score of the best trial found by the study.
best_cv_f1 = study.best_trial.value
print(f"With the best trial we achieve a mean F1 score of {best_cv_f1:.4f} over all splits")

With the best trial we achieve a mean F1 score of 0.7206 over all splits


In [15]:
# study.best_trial.params only contains the values suggested by the trial, so the
# fixed random_state used in every CV fit has to be passed again explicitly.
# Without it the final forest is unseeded: it differs from the tuned configuration
# and the test score changes from run to run.
tuned_model = model_class(**study.best_trial.params, random_state=42)
tuned_model.fit(X_train, y_train)

# Score the refitted model on the held-out test split.
y_pred = tuned_model.predict(X_test)
f1_test = f1_score(y_test, y_pred)

print(f"On the independent test set, we achieve an F1 score of {f1_test:.4f} with {model_class.__name__}")

On the independent test set, we achieve an F1 score of 0.2020 with RandomForestClassifier


### k-Nearest Neighbor

In [16]:
# Estimator and metric used by the k-Nearest Neighbor cells below.
model_class, score = KNeighborsClassifier, f1_score

In [17]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# TPESampler is what Optuna uses by default for single-objective studies; only the
# seed is new. Running study.optimize with n_jobs > 1 can still reorder trials.
study = optuna.create_study(
    direction="maximize",  # We want to maximize the F1 score
    sampler=optuna.samplers.TPESampler(seed=42),
)


def objective_cv(trial):
    """
    Optuna objective for kNN: mean CV score of one sampled configuration.

    Reads the notebook-level ``model_class`` and ``score`` at call time.

    NOTE(review): the feature matrix is built from the raw frame without any
    scaling, so large-valued columns likely dominate the distances. Consider
    standardizing the features before kNN.
    """
    params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 3, 10),
        "weights": trial.suggest_categorical("weights", ["uniform", "distance"])
    }

    # Perform cross validation with the metric selected in the config cell
    return perform_cv(model_class, params, score)

[32m[I 2021-05-02 12:29:49,908][0m A new study created in memory with name: no-name-c39a324b-c425-4e0f-a011-df5b165199f1[0m


In [None]:
# Run the hyperparameter search: 16 trials spread over 6 parallel workers.
study.optimize(func=objective_cv, n_trials=16, n_jobs=6)

In [19]:
# Mean cross-validated score of the best trial found by the study.
best_cv_f1 = study.best_trial.value
print(f"With the best trial we achieve a mean F1 score of {best_cv_f1:.4f} over all splits")

With the best trial we achieve a mean F1 score of 0.7332 over all splits


In [20]:
# Rebuild the estimator from the best trial's hyperparameters and fit it on the
# complete training split (fit returns the estimator itself).
best_params = study.best_trial.params
tuned_model = model_class(**best_params).fit(X_train, y_train)

# Score the refitted model on the held-out test split.
y_pred = tuned_model.predict(X_test)
f1_test = f1_score(y_true=y_test, y_pred=y_pred)

print(f"On the independent test set, we achieve an F1 score of {f1_test:.4f} with {model_class.__name__}")

On the independent test set, we achieve an F1 score of 0.2446 with KNeighborsClassifier
