In [None]:
# -*- coding: utf-8 -*-
"""
Script to analyze label noise effects on stroke dataset using Random Oversampling:
 - Maintains the original Accuracy metric
 - Computes TPR (Sensitivity) and TNR (Specificity)
 - Prints summed Confusion Matrices for each noise level (over 10 runs)
 - Generates grouped bar plots vs. Percentage of Labels Correctly Classified
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler

import warnings
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------
# 1. SETUP AND DATA LOADING
# ---------------------------------------------------------------------
# Base seed; the evaluation loop below derives one seed per iteration from it.
RANDOM_STATE = 42


# Fixed bar colour per model so every plot uses the same colour for a model.
model_colors = {
    'Logistic Regression': 'blue',
    'Decision Tree': 'green',
    'Random Forest': 'orange',
    'Naive Bayes': 'purple',
    'K-Nearest Neighbors': 'brown',
    'Support Vector Machine': 'red'
}

print("Loading stroke dataset...")
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

print("\nFirst 5 rows of the dataset:")
print(df.head())

print("\nDataset Info:")
df.info()

print("\nMissing Values in Each Column:")
print(df.isnull().sum())


# Fill missing BMI values with the column mean.
# NOTE(review): the imputer (and the scaler further down) are fitted on the
# full dataset, i.e. before the train/test splits made later in this file, so
# test-set statistics leak into preprocessing. Left as-is to keep results
# comparable; consider fitting both on the training split only.
imputer = SimpleImputer(strategy='mean')
df['bmi'] = imputer.fit_transform(df[['bmi']])

# Drop rows with gender as 'Other' (if any)
df = df[df['gender'] != 'Other']

# One-hot encoding of categorical features (first level of each dropped)
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Prepare feature matrix X and target vector y ('id' is an identifier, not a feature)
X = df.drop(['id', 'stroke'], axis=1)
y = df['stroke'].astype(int)

# Scale numerical features to zero mean / unit variance
scaler = StandardScaler()
numerical_features = ['age', 'avg_glucose_level', 'bmi']
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Convert to NumPy arrays for further processing.
# get_dummies can emit boolean indicator columns (pandas >= 2.0); a plain
# `.values` on the resulting float/int/bool frame then yields an object-dtype
# array. Cast once to float so every estimator receives a numeric matrix.
X_np = X.to_numpy(dtype=float)
y_np = y.to_numpy()

# Display class distribution
print("\nClass Distribution in Original Dataset:")
print(y.value_counts())


def compute_metrics_from_confusion(tn, fp, fn, tp):
    """
    Compute Accuracy, TPR (Sensitivity) and TNR (Specificity) from
    confusion-matrix counts.

    Args:
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tp: Number of true positives.

    Returns:
        A tuple ``(accuracy, tpr, tnr)`` of floats expressed as percentages.
        A rate whose denominator is zero is reported as 0.0, and all three
        values are 0.0 when the counts sum to zero.
    """
    total = tn + fp + fn + tp
    if total == 0:
        # Return floats here too, so the result type matches every other path.
        return 0.0, 0.0, 0.0

    actual_positives = tp + fn
    actual_negatives = tn + fp

    accuracy = (tp + tn) / total
    tpr = tp / actual_positives if actual_positives > 0 else 0.0
    tnr = tn / actual_negatives if actual_negatives > 0 else 0.0
    return accuracy * 100.0, tpr * 100.0, tnr * 100.0

def plot_metric_sorted(dataset_name, metric_name, metric_data, x_labels):
    """
    Draw a grouped bar chart of one metric for every model.

    Each entry of ``x_labels`` becomes one group on the x-axis and each model
    in ``metric_data`` contributes one bar per group. Models are ordered by
    the first value of their series, highest first (the 0%-noise baseline,
    assuming each series is ordered by increasing noise).

    Args:
        dataset_name: Name shown at the start of the plot title.
        metric_name: Metric name used in the y-axis label and the title.
        metric_data: Mapping of model name to its sequence of metric values.
        x_labels: Tick labels, one per group.
    """
    width = 0.12

    # Rank the models by the first value of their series, best first.
    ranked = sorted(metric_data.keys(), key=lambda name: metric_data[name][0], reverse=True)
    positions = np.arange(len(x_labels))
    # Offset that places each tick under the middle of its group of bars.
    tick_shift = (len(ranked) / 2 - 0.5) * width

    plt.figure(figsize=(12, 6))
    for slot, name in enumerate(ranked):
        colour = model_colors.get(name, 'grey')  # grey for models without a fixed colour
        plt.bar(positions + slot * width, metric_data[name],
                width=width, color=colour, label=name)

    plt.title(f'{dataset_name} - {metric_name} vs. % Accurate Labels\n(Models sorted by baseline performance)')
    plt.xlabel('% Accurate Labels (Descending)')
    plt.ylabel(f'{metric_name} (%)')
    plt.xticks(positions + tick_shift, x_labels)
    plt.ylim(0, 100)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title='Model')
    plt.tight_layout()
    plt.show()

# ---------------------------------------------------------------------
# 3. DEFINE MODELS AND SETUP THE NOISE LEVELS FOR ANALYSIS
# ---------------------------------------------------------------------
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(probability=True, random_state=RANDOM_STATE, class_weight='balanced')
}

# Noise levels: from 0% to 40% label noise (0% noise = 100% correct labels).
# linspace states the number of levels explicitly; with a float step the
# length of np.arange depends on floating-point rounding. The nine values
# produced are the same: 0.00, 0.05, ..., 0.40.
noise_levels = np.linspace(0.0, 0.40, 9)

# Dictionaries to store metrics for each model (one value per noise level)
acc_results = {m: [] for m in models}
tpr_results = {m: [] for m in models}
tnr_results = {m: [] for m in models}

# ---------------------------------------------------------------------
# 4. TRAINING, EVALUATION, AND METRIC COMPUTATION UNDER NOISE
# ---------------------------------------------------------------------
print("\nEvaluating each model under different noise levels (10 runs each)...")
for model_name, model in models.items():
    print(f"\n=== Model: {model_name} ===")
    for noise_level in noise_levels:
        # Percentage used in the printed labels. Rounded rather than truncated
        # so a level stored as e.g. 28.999999999999996 is not shown as 28.
        noise_pct = int(round(noise_level * 100))

        # Confusion-matrix counts accumulated over the 10 runs of this level
        sum_tn, sum_fp, sum_fn, sum_tp = 0, 0, 0, 0

        for iteration in range(10):
            # One seed per run, shared by the split, the label flips and the
            # oversampler, so every model sees the same data for a given run.
            seed = RANDOM_STATE + iteration

            # 50/50 train/test split with stratification
            X_train, X_test, y_train, y_test = train_test_split(
                X_np, y_np, test_size=0.5, stratify=y_np, random_state=seed
            )

            # Introduce label noise in the training set by flipping a fraction
            # of the binary labels; the test labels stay clean.
            y_train_noisy = y_train.copy()
            num_noisy = int(noise_level * len(y_train_noisy))
            # Reseeds NumPy's global RNG so the flipped indices are reproducible.
            np.random.seed(seed)
            noisy_indices = np.random.choice(len(y_train_noisy), size=num_noisy, replace=False)
            y_train_noisy[noisy_indices] = 1 - y_train_noisy[noisy_indices]

            # Apply Random Oversampling to balance the (noisy) training data
            ros = RandomOverSampler(random_state=seed)
            X_train_bal, y_train_bal = ros.fit_resample(X_train, y_train_noisy)
            if iteration == 0:
                # Show the class counts after balancing once per noise level
                print(f"Noise {noise_pct}% - Iteration {iteration+1}: ",
                      np.bincount(y_train_bal))

            # Train the model on the balanced data
            model.fit(X_train_bal, y_train_bal)

            # Predict on the test set
            y_pred = model.predict(X_test)

            # Compute confusion matrix components; labels=[0, 1] keeps the
            # matrix 2x2 even if one class is missing from the predictions.
            tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
            sum_tn += tn
            sum_fp += fp
            sum_fn += fn
            sum_tp += tp

        # Metrics are computed from the summed counts, not averaged per run
        ACC, TPR, TNR = compute_metrics_from_confusion(sum_tn, sum_fp, sum_fn, sum_tp)
        acc_results[model_name].append(ACC)
        tpr_results[model_name].append(TPR)
        tnr_results[model_name].append(TNR)

        print(f"Noise {noise_pct}% => Confusion Matrix Sum (10 runs):")
        print(f"   [[TN={sum_tn}, FP={sum_fp}], [FN={sum_fn}, TP={sum_tp}]]")
        print(f"   Accuracy: {ACC:.2f}%, TPR: {TPR:.2f}%, TNR: {TNR:.2f}%")

Loading stroke dataset...

First 5 rows of the dataset:
      id  gender   age  hypertension  heart_disease ever_married  \
0   9046    Male  67.0             0              1          Yes   
1  51676  Female  61.0             0              0          Yes   
2  31112    Male  80.0             0              1          Yes   
3  60182  Female  49.0             0              0          Yes   
4   1665  Female  79.0             1              0          Yes   

       work_type Residence_type  avg_glucose_level   bmi   smoking_status  \
0        Private          Urban             228.69  36.6  formerly smoked   
1  Self-employed          Rural             202.21   NaN     never smoked   
2        Private          Rural             105.92  32.5     never smoked   
3        Private          Urban             171.23  34.4           smokes   
4  Self-employed          Rural             174.12  24.0     never smoked   

   stroke  
0       1  
1       1  
2       1  
3       1  
4       1  
