In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [12]:
# Load the dataset
data_path = 'stroke.csv'
data = pd.read_csv(data_path)

# Fill missing values for 'bmi' with the median
data['bmi'].fillna(data['bmi'].median(), inplace=True)

# Convert categorical variables to numeric using one-hot encoding
data = pd.get_dummies(data, drop_first=True)

# Split the data into features and target
X = data.drop('stroke', axis=1)
y = data['stroke']

# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [10]:
# Define the models and hyperparameters with pipelines including SMOTE and class weights
models = {
    'Logistic Regression': Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', LogisticRegression(max_iter=500, solver='liblinear', class_weight='balanced'))
    ]),
    'Random Forest': Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', RandomForestClassifier(class_weight='balanced'))
    ]),
    'SVM': Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('classifier', SVC(class_weight='balanced'))
    ])
}

param_grid = {
    'Logistic Regression': {'classifier__C': [0.01, 0.1, 1, 10, 100]},
    'Random Forest': {'classifier__n_estimators': [50, 100, 200], 'classifier__max_depth': [None, 10, 20, 30]},
    'SVM': {'classifier__C': [0.01, 0.1, 1, 10, 100], 'classifier__kernel': ['linear', 'rbf']}
}

# Fine-tune the models using GridSearchCV
best_models = {}
for model_name in models:
    grid_search = GridSearchCV(models[model_name], param_grid[model_name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

# Evaluate the models
for model_name in best_models:
    y_pred = best_models[model_name].predict(X_test)
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))
    print("\n")


Model: Logistic Regression
Accuracy: 0.7553816046966731
              precision    recall  f1-score   support

           0       0.98      0.75      0.85       960
           1       0.17      0.81      0.29        62

    accuracy                           0.76      1022
   macro avg       0.58      0.78      0.57      1022
weighted avg       0.93      0.76      0.82      1022



Model: Random Forest
Accuracy: 0.9168297455968689
              precision    recall  f1-score   support

           0       0.94      0.97      0.96       960
           1       0.15      0.08      0.11        62

    accuracy                           0.92      1022
   macro avg       0.55      0.53      0.53      1022
weighted avg       0.89      0.92      0.90      1022



Model: SVM
Accuracy: 0.8913894324853229
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       960
           1       0.14      0.16      0.15        62

    accuracy                    