In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, cohen_kappa_score
import pickle  # For saving models

# Load the dataset.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this exact file exists.
data = pd.read_csv('C:/Users/CSE/Desktop/Data/Thyroid/hypothyroid.csv')

# Encode every object-typed column as integer codes.  A fresh LabelEncoder is
# fitted per column and kept in `label_encoders`, so each column's mapping can
# still be inverted later.  `enc` stays bound to the most recently fitted
# encoder.
# NOTE(review): any column that pandas reads as object (for example a numeric
# column containing a textual missing-value marker) is label-encoded here, so
# its values become category codes rather than measurements -- confirm that
# 'age', 'TT4', 'T4U' and 'FTI' are parsed as numbers in this CSV.
label_encoders = {}
enc = LabelEncoder()
for column in data.columns:
    if data[column].dtype == 'object':
        enc = LabelEncoder()
        data[column] = enc.fit_transform(data[column])
        label_encoders[column] = enc

# Remove exact duplicate rows (after encoding, so equal rows compare equal).
data = data.drop_duplicates()

# Min-max scale the selected features to [0, 1].
for col in ['age', 'TT4', 'T4U', 'FTI']:
    col_min = data[col].min()
    col_range = data[col].max() - col_min
    if col_range == 0:
        # A constant column would divide 0 by 0 and become all-NaN, which the
        # samplers and estimators cannot consume; map it to 0 instead.
        data[col] = 0.0
    else:
        data[col] = (data[col] - col_min) / col_range

# Visualize class distribution of target variable before balancing.
plt.figure(figsize=(8, 6))
# The labels must be passed as `x=`: recent seaborn releases accept only
# `data` positionally, so a positional Series is not counted per class.
sns.countplot(x=data['binaryClass'])
plt.title('Class Distribution of Target Variable (Before Balancing)')
plt.xlabel('Class')
plt.ylabel('Count')
# Tick 0 / 1 are labelled Negative / Positive.
# NOTE(review): this assumes the encoded target uses 0 for the negative class
# -- confirm against the label encoding of 'binaryClass'.
plt.xticks([0, 1], ['Negative', 'Positive'])
plt.show()

# Augment the dataset with ADASYN until it holds at least 1000 rows.
# The loop condition alone decides whether augmentation runs; the sampler is
# seeded so repeated runs generate the same synthetic rows.
# NOTE(review): fit_resample appears to return the input rows together with
# the synthetic ones, so concatenating its output onto `data` would also
# duplicate the existing rows on every pass -- confirm this is intended.
# NOTE(review): rows synthesised here are created before any train/test split,
# so they can end up on both sides of a later split.
ada = ADASYN(sampling_strategy=0.5, random_state=42)  # Adjust sampling_strategy as needed
while len(data) < 1000:
    features = data.drop('binaryClass', axis=1)
    x_aug, y_aug = ada.fit_resample(features, data['binaryClass'])
    aug_data = pd.DataFrame(x_aug, columns=features.columns)
    aug_data['binaryClass'] = y_aug
    data = pd.concat([data, aug_data], ignore_index=True)

# Visualize class distribution of target variable after augmentation.
plt.figure(figsize=(8, 6))
# The labels must be passed as `x=`: recent seaborn releases accept only
# `data` positionally, so a positional Series is not counted per class.
sns.countplot(x=data['binaryClass'])
plt.title('Class Distribution of Target Variable (After Augmentation)')
plt.xlabel('Class')
plt.ylabel('Count')
# Tick 0 / 1 are labelled Negative / Positive.
# NOTE(review): this assumes the encoded target uses 0 for the negative class
# -- confirm against the label encoding of 'binaryClass'.
plt.xticks([0, 1], ['Negative', 'Positive'])
plt.show()

# Split first, resample second.
# Resampling the whole dataset before splitting lets synthetic test rows be
# interpolated from training rows (and vice versa), which leaks information
# and inflates the "balanced" scores.  The held-out test rows are therefore
# set aside before any sampler sees the data.
x = data.drop('binaryClass', axis=1)
y = data['binaryClass']
x_train_orig, x_test_orig, y_train_orig, y_test_orig = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42)

# ADASYN oversamples the minority class, then RandomUnderSampler trims the
# majority class.  Both are seeded so the balanced training set is the same
# on every run.
ada = ADASYN(sampling_strategy='minority', random_state=42)
rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)

# Balance the training portion only.
x_balanced, y_balanced = ada.fit_resample(x_train_orig, y_train_orig)
x_balanced, y_balanced = rus.fit_resample(x_balanced, y_balanced)

# Visualize class distribution of the (training) target after balancing.
plt.figure(figsize=(8, 6))
# The labels must be passed as `x=`: recent seaborn releases accept only
# `data` positionally, so a positional Series is not counted per class.
sns.countplot(x=y_balanced)
plt.title('Class Distribution of Target Variable (After Balancing)')
plt.xlabel('Class')
plt.ylabel('Count')
plt.xticks([0, 1], ['Negative', 'Positive'])
plt.show()

# Models trained on balanced data are scored on the same untouched test rows
# as models trained on the original data, so the two sets of metrics are
# directly comparable.
x_train_balanced, y_train_balanced = x_balanced, y_balanced
x_test_balanced, y_test_balanced = x_test_orig, y_test_orig

# Classifiers to compare, keyed by the display name used in reports, plot
# titles and saved-model file names.  Every estimator is seeded so that the
# reported metrics are reproducible from run to run.
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Neural Networks': MLPClassifier(random_state=42),
    # probability=True enables predict_proba, which the ROC curves rely on.
    'Support Vector Machine': SVC(probability=True, random_state=42),
    'J48 (Decision Tree)': DecisionTreeClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
}

# Per-classifier metrics, keyed by classifier name: one dict for models
# trained on the original data, one for models trained on balanced data.
results_orig = {}
results_balanced = {}

def _evaluate_classifier(classifier, name, x_train, y_train, x_test, y_test,
                         file_tag, title_tag):
    """Fit one classifier on one split, then score, save and plot it.

    Args:
        classifier: Estimator exposing fit, predict and predict_proba; it is
            fitted in place.
        name: Display name, used in the pickle file name and the plot title.
        x_train, y_train: Training features and labels.
        x_test, y_test: Evaluation features and labels.
        file_tag: Infix of the pickle file name (``{name}_{file_tag}_model.pkl``).
        title_tag: Text shown in parentheses in the confusion-matrix title.

    Returns:
        Dict with keys 'accuracy', 'kappa', 'classification_report', 'fpr',
        'tpr' and 'roc_auc'.
    """
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    metrics = {
        'accuracy': accuracy_score(y_test, predictions),
        'kappa': cohen_kappa_score(y_test, predictions),
        'classification_report': classification_report(
            y_test, predictions, target_names=['Negative', 'Positive']),
    }

    # Persist the model now: the caller may refit this same object afterwards.
    with open(f'{name}_{file_tag}_model.pkl', 'wb') as model_file:
        pickle.dump(classifier, model_file)

    # Confusion matrix heat map.
    plt.figure(figsize=(6, 4))
    sns.heatmap(confusion_matrix(y_test, predictions), annot=True, cmap='Blues',
                fmt='g', cbar=False,
                xticklabels=['Negative', 'Positive'],
                yticklabels=['Negative', 'Positive'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {name} ({title_tag})')
    plt.show()

    # ROC curve from the probability in column 1 of predict_proba.
    # NOTE(review): this treats the second class as the positive one --
    # confirm that matches the label encoding of the target.
    y_score = classifier.predict_proba(x_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    metrics['fpr'] = fpr
    metrics['tpr'] = tpr
    metrics['roc_auc'] = auc(fpr, tpr)
    return metrics


# Evaluate each classifier twice -- trained on the original split, then
# refitted on the balanced split -- and print both reports side by side.
for name, classifier in classifiers.items():
    orig = results_orig[name] = _evaluate_classifier(
        classifier, name, x_train_orig, y_train_orig,
        x_test_orig, y_test_orig, 'orig', 'Original')
    balanced = results_balanced[name] = _evaluate_classifier(
        classifier, name, x_train_balanced, y_train_balanced,
        x_test_balanced, y_test_balanced, 'balanced', 'Balanced')

    # Print results
    print(f'Classifier: {name}')
    print('Classification Report Before Balancing:')
    print(orig['classification_report'])
    print(f'Accuracy Before Balancing: {orig["accuracy"]:.4f}, Kappa Before Balancing: {orig["kappa"]:.4f}')

    print('Classification Report After Balancing:')
    print(balanced['classification_report'])
    print(f'Accuracy After Balancing: {balanced["accuracy"]:.4f}, Kappa After Balancing: {balanced["kappa"]:.4f}')
    print('-' * 50)

# Draw one ROC figure per training regime: first for the models trained on
# the original data, then for the models trained on the balanced data.  Each
# figure shows the chance diagonal plus one curve per classifier, labelled
# with its AUC.
for roc_results, roc_title in (
    (results_orig, 'Receiver Operating Characteristic (Before Balancing)'),
    (results_balanced, 'Receiver Operating Characteristic (After Balancing)'),
):
    plt.figure(figsize=(12, 8))
    # Dashed red diagonal: the ROC of a random guess.
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=0.8)
    for name, result in roc_results.items():
        curve_label = f'{name} (AUC = {result["roc_auc"]:.2f})'
        plt.plot(result['fpr'], result['tpr'], lw=2, label=curve_label)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(roc_title)
    plt.legend(loc='lower right')
    plt.show()
