In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import TomekLinks
from sklearn.utils.class_weight import compute_class_weight

In [67]:
df = pd.read_csv('cerebral_stroke_imbalanced.csv')
df.dropna(inplace=True)
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
6,52800,Female,52.0,0,0,Yes,Private,Urban,77.59,17.7,formerly smoked,0
7,41413,Female,75.0,0,1,Yes,Self-employed,Rural,243.53,27.0,never smoked,0
8,15266,Female,32.0,0,0,Yes,Private,Rural,77.67,32.3,smokes,0
...,...,...,...,...,...,...,...,...,...,...,...,...
43395,56196,Female,10.0,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,5450,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,28375,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,27973,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [69]:

# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Initialize Label Encoder
label_encoder = LabelEncoder()

# Apply Label Encoding to each categorical column
for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])
    
# Separate features and target
X = df.drop('stroke', axis=1)  # Replace with your actual target column name
y = df['stroke']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,30468,1,58.0,1,0,1,2,1,87.96,39.2,1,0
3,56543,0,70.0,0,0,1,2,0,69.04,35.9,0,0
6,52800,0,52.0,0,0,1,2,1,77.59,17.7,0,0
7,41413,0,75.0,0,1,1,3,0,243.53,27.0,1,0
8,15266,0,32.0,0,0,1,2,0,77.67,32.3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
43395,56196,0,10.0,0,0,0,4,1,58.64,20.4,1,0
43396,5450,0,56.0,0,0,1,0,1,213.61,55.4,0,0
43397,28375,0,82.0,1,0,1,2,1,91.94,28.9,0,0
43398,27973,1,40.0,0,0,1,2,1,99.16,33.2,1,0


In [71]:
# Function to evaluate model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    return accuracy, f1, auc

In [73]:
# 1. Original Model and Grid Performance
model = LogisticRegression(max_iter=2000, solver='liblinear')
original_metrics = evaluate_model(model, X_train, y_train, X_test, y_test)

# 2. Random Over-Sampling
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
over_sampling_metrics = evaluate_model(model, X_train_resampled, y_train_resampled, X_test, y_test)

# 3. Random Under-Sampling
rus = RandomUnderSampler()
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)
under_sampling_metrics = evaluate_model(model, X_train_resampled, y_train_resampled, X_test, y_test)

# 4. SMOTE
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
smote_metrics = evaluate_model(model, X_train_resampled, y_train_resampled, X_test, y_test)

# 5. Tomek Links
tl = TomekLinks()
X_train_resampled, y_train_resampled = tl.fit_resample(X_train, y_train)
tomek_metrics = evaluate_model(model, X_train_resampled, y_train_resampled, X_test, y_test)

# 6. Class Weighing
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
model = LogisticRegression(class_weight='balanced')
weighted_metrics = evaluate_model(model, X_train, y_train, X_test, y_test)


# Print the results
print("Original Metrics:", original_metrics)
print("Over-Sampling Metrics:", over_sampling_metrics)
print("Under-Sampling Metrics:", under_sampling_metrics)
print("SMOTE Metrics:", smote_metrics)
print("Tomek Links Metrics:", tomek_metrics)
print("Class Weighing Metrics:", weighted_metrics)

Original Metrics: (0.982802109607888, 0.0, 0.6468074350598849)
Over-Sampling Metrics: (0.7295345104333868, 0.09304113802383698, 0.8349198942292736)
Under-Sampling Metrics: (0.7305663838569135, 0.09406322282189669, 0.8311502566495567)
SMOTE Metrics: (0.780440265994038, 0.0832934418382001, 0.7377772592938249)
Tomek Links Metrics: (0.982802109607888, 0.0, 0.6607131746772438)
Class Weighing Metrics: (0.7204769548268746, 0.08552138034508627, 0.8068354331933427)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [74]:
results = {
    'Original': original_metrics,
    'Random Over-Sampling': over_sampling_metrics,
    'Random Under-Sampling': under_sampling_metrics,
    'SMOTE': smote_metrics,
    'Tomek Links': tomek_metrics, 
    'Class Weighing': weighted_metrics  
}

# Create a DataFrame from the dictionary
results_df = pd.DataFrame.from_dict(results, orient='index', columns=['Accuracy', 'F1 Score', 'AUC'])

# Display the results
print(results_df[['Accuracy', 'F1 Score', 'AUC']])

# Compare performance metrics
best_technique = results_df.loc[results_df['AUC'].idxmax()]
print(f"\nBest Technique Based on AUC:\n{best_technique}")

                       Accuracy  F1 Score       AUC
Original               0.982802  0.000000  0.646807
Random Over-Sampling   0.729535  0.093041  0.834920
Random Under-Sampling  0.730566  0.094063  0.831150
SMOTE                  0.780440  0.083293  0.737777
Tomek Links            0.982802  0.000000  0.660713
Class Weighing         0.720477  0.085521  0.806835

Best Technique Based on AUC:
Accuracy    0.729535
F1 Score    0.093041
AUC         0.834920
Name: Random Over-Sampling, dtype: float64
