In [1]:
# -*- coding: utf-8 -*-
"""
Pima Indians Diabetes Label Noise Analysis (Without SMOTE)
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import warnings
# Suppress every warning category for the remainder of the session. This also
# hides deprecation/future warnings raised by pandas and scikit-learn, so
# upcoming API behaviour changes will not be reported while this is active.
warnings.filterwarnings('ignore')

# Base seed: used for the estimators' random_state and as the offset for the
# per-run split seeds, so that reruns reproduce the same numbers.
RANDOM_STATE = 42

# Colour assigned to each model, keyed by the model's display name
# (presumably consumed by plotting code elsewhere in the notebook).
model_colors = dict([
    ('Logistic Regression', 'blue'),
    ('Decision Tree', 'green'),
    ('Random Forest', 'orange'),
    ('Naive Bayes', 'purple'),
    ('K-Nearest Neighbors', 'brown'),
    ('Support Vector Machine', 'red'),
])

# Download the Pima Indians Diabetes CSV. It is read with header=None, so the
# labels listed below are applied as the column names.
print("Loading Pima Indians Diabetes dataset...")
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
column_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
data = pd.read_csv(url, names=column_names, header=None)

# Quick inspection: first rows, then column dtypes and non-null counts
print("\nFirst 5 rows of the dataset:", data.head(), sep="\n")
print("\nDataset Info:")
data.info()

# Zeros in these measurement columns are treated as missing values: they are
# converted to NaN and then filled with the column median.
# NOTE: the medians are computed over the full dataset, i.e. before the
# train/test splits that are drawn further down.
missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[missing_cols] = data[missing_cols].replace(0, np.nan)
print("\nMissing values before imputation:")
print(data.isnull().sum())
for col in missing_cols:
    # Assign the filled column back rather than calling
    # data[col].fillna(..., inplace=True): the in-place call acts on the
    # intermediate object returned by data[col], which is deprecated in
    # pandas 2.x and leaves `data` unchanged under copy-on-write.
    data[col] = data[col].fillna(data[col].median())
print("\nMissing values after imputation:")
print(data.isnull().sum())

# Split the frame into the integer target and the feature columns, then
# standardise the features and keep both DataFrame and ndarray views.
# NOTE(review): the scaler is fitted on every row of the dataset, before the
# train/test splits are drawn later on, so the scaled features carry
# statistics from rows that end up in the test halves.
y = data['Outcome'].astype(int)
feature_columns = data.columns.drop('Outcome')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data[feature_columns])
X = pd.DataFrame(X_scaled, columns=feature_columns)
X_np, y_np = X.to_numpy(), y.to_numpy()

print("\nOriginal Class Distribution:", y.value_counts(), sep="\n")

# Fractions of training labels to flip: 0.00, 0.05, ..., 0.40
# (the stop value 0.45 is excluded by np.arange).
noise_levels = np.arange(0.0, 0.45, 0.05)

# Classifiers under comparison, keyed by display name. The estimators that are
# constructed with a random_state all share RANDOM_STATE.
# NOTE(review): probability=True makes SVC do extra calibration work during
# fit; only predict() is called in the visible code -- confirm that
# probability estimates are actually needed elsewhere.
seed_kwargs = {'random_state': RANDOM_STATE}
models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, **seed_kwargs)),
    ('Decision Tree', DecisionTreeClassifier(**seed_kwargs)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, **seed_kwargs)),
    ('Naive Bayes', GaussianNB()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Support Vector Machine', SVC(probability=True, **seed_kwargs)),
])

def _empty_table(make_cell):
    """Build {model name: {noise level: make_cell()}} with a fresh cell per entry."""
    return {name: {level: make_cell() for level in noise_levels} for name in models}


# Per-run metric lists, one list for each (model, noise level) pair
ACC_values = _empty_table(list)
TPR_values = _empty_table(list)
TNR_values = _empty_table(list)
SENS_values = _empty_table(list)
SPEC_values = _empty_table(list)

# 2x2 integer confusion-matrix totals for each (model, noise level) pair
confusion_matrix_sums = _empty_table(lambda: np.zeros((2, 2), dtype=int))

# Number of repeated train/test splits evaluated per (model, noise level) pair
NUM_RUNS = 10
print("\nStarting training and evaluation with simulated label noise...\n")

# Evaluate every model at every noise level over NUM_RUNS stratified 50/50
# train/test splits. Only the training labels are corrupted; predictions are
# scored against the untouched test labels.

# A split depends only on its run seed, so each one is built once here rather
# than once per (model, noise level) pair.
# X_np was standardised with statistics of the full dataset; re-fitting a
# StandardScaler on the training half alone replaces them with train-only
# statistics (standardising is unaffected by an earlier per-feature affine
# rescaling), so no test-set information leaks into the training features.
run_seeds = [RANDOM_STATE + run_idx for run_idx in range(NUM_RUNS)]
run_splits = []
for random_seed in run_seeds:
    X_train, X_test, y_train, y_test = train_test_split(
        X_np, y_np, test_size=0.5, stratify=y_np, random_state=random_seed
    )
    split_scaler = StandardScaler().fit(X_train)
    run_splits.append(
        (split_scaler.transform(X_train), split_scaler.transform(X_test), y_train, y_test)
    )

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    for noise_level in noise_levels:
        conf_matrix_sum = np.zeros((2, 2), dtype=int)
        for run_idx, random_seed in enumerate(run_seeds):
            X_train, X_test, y_train, y_test = run_splits[run_idx]

            # Flip the 0/1 label of a randomly chosen noise_level fraction of
            # the training rows. y_train is copied because the split is reused
            # for every model and noise level. A local generator seeded per
            # run is intended to give the same draw as
            # np.random.seed(random_seed) without resetting NumPy's global
            # random state.
            y_train_noisy = y_train.copy()
            num_noisy = int(noise_level * len(y_train_noisy))
            rng = np.random.RandomState(random_seed)
            noisy_indices = rng.choice(len(y_train_noisy), size=num_noisy, replace=False)
            y_train_noisy[noisy_indices] = 1 - y_train_noisy[noisy_indices]

            # Train on the noisy labels, score against the clean test labels
            model.fit(X_train, y_train_noisy)
            y_pred = model.predict(X_test)
            cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
            conf_matrix_sum += cm

            # With labels=[0, 1] the flattened matrix is unpacked as TN, FP, FN, TP
            TN, FP, FN, TP = cm.ravel()
            acc = accuracy_score(y_test, y_pred)
            # Guard against an empty class in the test labels (rate reported as 0.0)
            tpr = TP / (TP + FN) if (TP + FN) > 0 else 0.0
            tnr = TN / (TN + FP) if (TN + FP) > 0 else 0.0

            ACC_values[model_name][noise_level].append(acc)
            TPR_values[model_name][noise_level].append(tpr)
            TNR_values[model_name][noise_level].append(tnr)
            # Sensitivity and specificity are stored as the same values as TPR and TNR
            SENS_values[model_name][noise_level].append(tpr)
            SPEC_values[model_name][noise_level].append(tnr)

        confusion_matrix_sums[model_name][noise_level] = conf_matrix_sum
    print(f"Completed evaluations for {model_name}\n")

print("All models evaluated under different noise levels.")

# Print, for each model and noise level, the confusion matrix summed over all
# runs (flattened layout: TN, FP, FN, TP).
print("\n=== Summed Confusion Matrices for Each Model & Noise Level ===")
for model_name in models:
    print(f"\nModel: {model_name}")
    for noise_level in noise_levels:
        # Format as a rounded percentage: int(noise_level * 100) truncates, so
        # a product landing just below a whole number (for example
        # 0.29 * 100 == 28.999999999999996) would be shown one percent too low.
        print(f"  Noise Level: {noise_level:.0%}")
        print(confusion_matrix_sums[model_name][noise_level])

Loading Pima Indians Diabetes dataset...

First 5 rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0 