In [1]:
### Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pickle

In [2]:
### Helper functions to keep the experiments below short and clean

In [3]:
def scaler(method, x_data):
    """Fit a feature scaler on ``x_data`` and return the scaled values.

    Parameters
    ----------
    method : str
        ``"minmax"`` to use ``MinMaxScaler`` or ``"standard"`` to use
        ``StandardScaler``.
    x_data : array-like
        Feature matrix handed straight to the scaler's ``fit_transform``.

    Returns
    -------
    The result of ``fit_transform(x_data)`` for the selected scaler.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == "minmax":
        transformer = MinMaxScaler()
    elif method == "standard":
        transformer = StandardScaler()
    else:
        # Fail loudly with a clear message instead of running into an
        # UnboundLocalError on the return statement below.
        raise ValueError(
            f"Unknown scaling method {method!r}; expected 'minmax' or 'standard'."
        )
    return transformer.fit_transform(x_data)

In [4]:
def sampler(method, x_data, y_data):
    """Resample ``x_data``/``y_data`` with a random over- or under-sampler.

    Parameters
    ----------
    method : str
        ``"over"`` to use ``RandomOverSampler`` or ``"under"`` to use
        ``RandomUnderSampler``. Both are created with ``random_state=0``.
    x_data : array-like
        Feature matrix passed to ``fit_resample``.
    y_data : array-like
        Target labels passed to ``fit_resample``.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == "over":
        resampler = RandomOverSampler(random_state=0)
    elif method == "under":
        resampler = RandomUnderSampler(random_state=0)
    else:
        # Fail loudly with a clear message instead of running into an
        # UnboundLocalError on the return statement below.
        raise ValueError(
            f"Unknown sampling method {method!r}; expected 'over' or 'under'."
        )
    X_resampled, y_resampled = resampler.fit_resample(x_data, y_data)
    return X_resampled, y_resampled

In [5]:
def rand_forest(depth, x_data, y_data, metric="f1"):
    """Train a random forest on a train/test split and return its test score.

    The data is split with ``train_test_split(..., random_state=0)``, a
    ``RandomForestClassifier(random_state=0)`` is fitted on the training part
    and evaluated on the held-out part.

    Parameters
    ----------
    depth : int or None
        Forwarded to the classifier as ``max_depth``.
    x_data : array-like
        Feature matrix.
    y_data : array-like
        Target labels.
    metric : str, default "f1"
        ``"f1"`` returns ``f1_score`` on the test split; any other value
        returns the classifier's own ``score`` on the test split.

    Returns
    -------
    float
        The selected score computed on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, random_state=0)
    # Honour the caller's depth; it used to be ignored in favour of a
    # hard-coded max_depth of 3.
    random_forest = RandomForestClassifier(random_state=0, max_depth=depth)
    random_forest.fit(X_train, y_train)
    if metric == "f1":
        y_pred = random_forest.predict(X_test)
        return f1_score(y_test, y_pred)
    # Every other metric name falls back to the estimator's built-in score.
    return random_forest.score(X_test, y_test)

In [6]:
# Load the preprocessed stroke dataset, using the first CSV column as the index.
df = pd.read_csv(
    "../data/stroke-data-preprocessed.csv",
    index_col=0,
)

In [7]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,1,67.0,0,1,1,2,1,228.69,36.6,0,1
51676,0,61.0,0,0,1,3,0,202.21,31.45,1,1
31112,1,80.0,0,1,1,2,0,105.92,32.5,1,1
60182,0,49.0,0,0,1,2,1,171.23,34.4,2,1
1665,0,79.0,1,0,1,3,0,174.12,24.0,1,1


In [8]:
# Separate the target column ("stroke") from the feature columns.
y = df["stroke"]
X = df.drop(columns=["stroke"])

In [9]:
# Baseline: fit a depth-3 random forest on the raw (unsampled) data and show its F1.
# NOTE(review): the recorded result is 0.0, i.e. no positive case in the test
# split was predicted correctly — presumably an effect of class imbalance.
rand_forest(depth=3, x_data=X, y_data=y)

0.0

In [10]:
### Handle imbalanced data by sampling

In [11]:
# Random over-sampling, then score a depth-3 random forest on the result.
# NOTE(review): the data is resampled before rand_forest() makes its
# train/test split, so duplicated rows can land on both sides of the split and
# inflate the score — consider resampling the training portion only.
X_sampled, y_sampled = sampler(method="over", x_data=X, y_data=y)
result = rand_forest(depth=3, x_data=X_sampled, y_data=y_sampled)
print(f"{result:.2}")

0.81


In [12]:
# Random under-sampling, then score a depth-3 random forest on the result.
# NOTE(review): the test split is taken from the under-sampled data, so the
# score is measured on the resampled class distribution rather than the
# original one — consider resampling the training portion only.
X_sampled, y_sampled = sampler(method="under", x_data=X, y_data=y)
result = rand_forest(depth=3, x_data=X_sampled, y_data=y_sampled)
print(f"{result:.2}")

0.83


In [13]:
### Let's see whether scaling affects the score

In [14]:
# Min-max scaling applied to the most recently sampled features
# (the under-sampled data when the cells are run top to bottom).
X_scaled = scaler(method="minmax", x_data=X_sampled)
result = rand_forest(depth=3, x_data=X_scaled, y_data=y_sampled)
print(f"{result:.2}")

0.83


In [15]:
# Standard scaling applied to the most recently sampled features
# (the under-sampled data when the cells are run top to bottom).
X_scaled = scaler(method="standard", x_data=X_sampled)
result = rand_forest(depth=3, x_data=X_scaled, y_data=y_sampled)
print(f"{result:.2}")

0.83


In [16]:
# So scaling is not necessary.

In [17]:
# Final model, using the setup chosen in the experiments above:
# random under-sampling, no feature scaling, depth-3 random forest.

# Rebuild the target and the feature matrix from the loaded frame.
y = df["stroke"]
X = df.drop(columns=["stroke"])

# Under-sample, then hold out a test split from the resampled data.
# NOTE(review): because the split happens after resampling, the F1 below is
# measured on the resampled class distribution rather than the original one —
# consider splitting first and resampling only the training portion.
X_sampled, y_sampled = sampler(method="under", x_data=X, y_data=y)
X_train, X_test, y_train, y_test = train_test_split(
    X_sampled,
    y_sampled,
    random_state=0,
)

# Fit the classifier; this is the object that gets pickled afterwards.
random_forest = RandomForestClassifier(max_depth=3, random_state=0)
random_forest.fit(X_train, y_train)

# Report F1 on the held-out split.
y_pred = random_forest.predict(X_test)
score = f1_score(y_test, y_pred)
print(f"F1 Score is: {score:.2}")

F1 Score is: 0.83


In [18]:
# Persist the fitted model so it can be used in production.
# pickle.dump streams straight into the open file instead of first building
# the whole serialized byte string in memory.
# NOTE: only unpickle this file from a trusted location — loading a pickle can
# execute arbitrary code.
with open("../results/model.pckl", "wb") as f:
    pickle.dump(random_forest, f)