# In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Loading
# Read the stroke CSV, keep every row whose gender is not 'Other', and
# discard the 'id' column.
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df = df.loc[df['gender'] != 'Other'].drop(columns='id')

# Encode Categorical Variables
from sklearn.preprocessing import LabelEncoder

# One encoder instance is re-fitted for each column in turn, so after this
# section `le` only remembers the classes of the last column it saw.
le = LabelEncoder()
for label_col in ('gender', 'ever_married', 'Residence_type'):
    df[label_col] = le.fit_transform(df[label_col])

# The remaining categorical columns are one-hot encoded; drop_first=True omits
# the first level of each so the dummy columns are not perfectly collinear.
df = pd.get_dummies(
    df,
    columns=['work_type', 'smoking_status'],
    drop_first=True,
)

# Impute Missing Values with KNNImputer
from sklearn.impute import KNNImputer

# Keep the target out of the imputer: if 'stroke' took part in the neighbour
# search, the imputed feature values would carry label information into the
# models trained further down (target leakage).
feature_cols = [c for c in df.columns if c != 'stroke']

imputer = KNNImputer(n_neighbors=5)
df_imputed = pd.DataFrame(imputer.fit_transform(df[feature_cols]), columns=feature_cols)

# Re-attach the target by position (df_imputed has a fresh RangeIndex, df does
# not) as float, then restore the original column order.
df_imputed['stroke'] = df['stroke'].to_numpy(dtype=float)
df_imputed = df_imputed[list(df.columns)]
# NOTE(review): the imputer is still fitted before the train/test split, so
# test rows influence the imputed training values — consider fitting it on the
# training split only.

# Remove Outliers
def remove_outliers(df, col, *, factor=1.5):
    """Return the rows of *df* whose *col* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[col]``. Both fences are
    inclusive. Rows whose *col* value is NaN never satisfy the range test and
    are therefore dropped as well.

    Args:
        df: DataFrame to filter; it is not modified.
        col: Name of the numeric column used to compute the fences.
        factor: Multiplier applied to the IQR when placing the fences.
            The default of 1.5 reproduces the previously hard-coded value.

    Returns:
        The filtered DataFrame, keeping the index labels of the retained rows.
    """
    q1, q3 = df[col].quantile([0.25, 0.75])
    spread = q3 - q1
    lower = q1 - factor * spread
    upper = q3 + factor * spread
    return df[df[col].between(lower, upper)]

# The filter is applied one column at a time; each pass sees only the rows
# that survived the previous pass, so later fences are computed on data that
# has already been trimmed.
outlier_columns = ('age', 'avg_glucose_level', 'bmi')
for col in outlier_columns:
    df_imputed = remove_outliers(df_imputed, col)

# Split Features and Target
# 'stroke' is the label; every other column is used as a predictor.
y = df_imputed['stroke']
X = df_imputed.drop(columns=['stroke'])

# Train-Test Split
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed. stratify=y keeps the stroke / no-stroke ratio
# the same in both partitions; the pipeline resamples the training set with
# SMOTE further down, so an unstratified split could leave the test set with a
# distorted (or nearly empty) minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Apply SMOTE to Training Data
from imblearn.over_sampling import SMOTE

# Only the training split is resampled; X_test / y_test are passed to the
# models unchanged.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled

# SCALING (for models that need it)
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the resampled training features only; the fitted
# scaler is then applied to both the training and the test features.
scaler = StandardScaler().fit(X_train_res)
X_train_res_scaled = scaler.transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# MODEL DEFINITIONS
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Unscaled Models (trained on SMOTE data)
# These estimators are fitted on the resampled training features as-is and
# evaluated on the untouched, unscaled test features.
unscaled_models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Naive Bayes": GaussianNB(),
}

print("\n--- Unscaled Models with SMOTE ---")
for name in unscaled_models:
    # fit() returns the estimator itself, so the dict entry is trained in place.
    model = unscaled_models[name].fit(X_train_res, y_train_res)
    y_pred = model.predict(X_test)

    # Text report: headline accuracy followed by per-class metrics.
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # Graphical report: one confusion-matrix figure per model.
    cm = confusion_matrix(y_test, y_pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
    cm_display.plot()
    plt.title(f'{name} Confusion Matrix')
    plt.show()

# Scaled Models (trained on scaled SMOTE data)
# These estimators are fitted on the standardized, resampled training features
# and evaluated on the test features transformed by the same scaler.
scaled_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "KNN": KNeighborsClassifier(),
    # probability=True makes SVC calibrate class probabilities with an internal
    # cross-validation that draws on random_state; seed it like the other
    # estimators so repeated runs give the same fitted model.
    "SVC": SVC(probability=True, random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=500, random_state=42)
}

print("\n--- Scaled Models with SMOTE ---")
for name, model in scaled_models.items():
    model.fit(X_train_res_scaled, y_train_res)
    y_pred = model.predict(X_test_scaled)

    # Text report: headline accuracy followed by per-class metrics.
    print(f"\n{name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))

    # Graphical report: one confusion-matrix figure per model.
    cm = confusion_matrix(y_test, y_pred)
    ConfusionMatrixDisplay(cm, display_labels=[0, 1]).plot()
    plt.title(f'{name} Confusion Matrix')
    plt.show()

# Optional: Accuracy Comparison Plot
# Bar labels: unscaled models first, then scaled models, in dict order.
model_names = [*unscaled_models, *scaled_models]

# Collect accuracies
# The already-fitted models are re-scored here; the order must mirror
# model_names so that each bar lines up with its label.
accuracies = [
    accuracy_score(y_test, fitted.predict(X_test))
    for fitted in unscaled_models.values()
]
accuracies.extend(
    accuracy_score(y_test, fitted.predict(X_test_scaled))
    for fitted in scaled_models.values()
)

# Plotting
# NOTE(review): recent seaborn releases may warn when `palette` is given
# without `hue` — confirm against the installed version.
plt.figure(figsize=(12, 6))
sns.barplot(x=model_names, y=accuracies, palette='viridis')
plt.xticks(rotation=45)
plt.ylabel('Accuracy')
plt.title('Model Accuracies After Applying SMOTE')
# The y-axis starts at 0.4 rather than 0, which magnifies differences between bars.
plt.ylim(0.4, 1.0)
plt.tight_layout()
plt.show()
