In [1]:
import numpy as np
import pandas as pd 
from imblearn.over_sampling import RandomOverSampler
from sklearn import model_selection as ms
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder

In [2]:
## load dataset
input_dataset = "../data/01_raw/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(input_dataset)

In [3]:
## preprocess data
mean_val = df['bmi'].mean()
df['bmi'].fillna(value=mean_val, inplace=True)

columns_to_encode = ['gender', 'work_type', 'ever_married', 'Residence_type', 'smoking_status']
ohe = OneHotEncoder(handle_unknown="ignore")
for col in columns_to_encode:
    unique_vals = df[col].unique()
    enc_df = pd.DataFrame(ohe.fit_transform(df[[col]]).toarray())
    enc_df.columns = [f'{col}_{i}' for i in range(len(unique_vals))]
    df = df.join(enc_df)
    df = df.drop([col], axis=1)

In [4]:
## split dataset into test and train
X = df.drop(columns=['stroke','id'])
y = df['stroke']

ros = RandomOverSampler(random_state=42)
X_os, y_os = ros.fit_resample(X,y)
X_train, X_test, y_train, y_test = ms.train_test_split(X_os, y_os, test_size=0.3, random_state=42)

In [5]:
## train model
lr = linear_model.LinearRegression()
lr.fit(X_train, y_train)

y_pred_proba = lr.predict(X_test)
y_pred = (y_pred_proba >= 0.3).astype(np.float32)

In [6]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"AUC: {roc_auc_score(y_test, y_pred_proba)}")
print(f"Mean squared error: {mean_squared_error(y_test, y_pred_proba)}")
print(f"F1 score: {f1_score(y_test, y_pred)}")

Accuracy: 0.7363729859444635
AUC: 0.8509237408448587
Mean squared error: 0.15509371645425327
F1 score: 0.7824611032531825
