In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

# Create the output directories up front (no error if they already exist).
for output_dir in ('outputs/figures', 'outputs/results', 'outputs/gridsearch'):
    os.makedirs(output_dir, exist_ok=True)

# Notebook-wide plotting defaults: seaborn grid style and a wide default figure.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

print("✓ Libraries loaded successfully")

## 1. Load and Preprocess Data

In [None]:
# Read the thyroid table; the file has no header row, so columns are named here.
df = pd.read_csv('thyroid/new-thyroid.csv', header=None)
df.columns = ['Class', 'T3_resin', 'Thyroxin', 'Triiodothyronine', 'TSH', 'TSH_diff']

print(f"Dataset shape: {df.shape}")
print("\nOriginal Class distribution:", df['Class'].value_counts(), sep='\n')

# Collapse the three original classes into a binary label:
# class 1 -> 1 (normal), classes 2 and 3 -> 0 (abnormal).
df['Binary_Class'] = df['Class'].eq(1).astype(int)

print("\nBinary Class distribution (1=Normal, 0=Abnormal):", df['Binary_Class'].value_counts(), sep='\n')

# Target is the binary label; features are every column except the two label columns.
y = df['Binary_Class']
X = df.drop(columns=['Class', 'Binary_Class'])

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Standardise using statistics learned from the training split only,
# then apply the same transform to the test split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n✓ Data preprocessed")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
for split_label, split_target in (('Train', y_train), ('Test', y_test)):
    print(f"Class distribution - {split_label}: {np.bincount(split_target)}")

## 2. Define Hyperparameter Grids

In [None]:
# Hyperparameter search space for each classifier, keyed by the same display
# name used in `classifiers`. Every inner mapping is an estimator-keyword ->
# list-of-candidate-values grid in the form GridSearchCV accepts.
param_grids = {
    'KNN': dict(
        n_neighbors=[3, 5, 7, 9, 11, 15, 21],
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan'],
    ),
    'Logistic Regression': dict(
        C=[0.001, 0.01, 0.1, 1, 10, 100],
        penalty=['l1', 'l2'],
        solver=['liblinear', 'saga'],
        max_iter=[1000],
    ),
    'SVM': dict(
        C=[0.1, 1, 10, 100],
        kernel=['linear', 'rbf', 'poly'],
        gamma=['scale', 'auto', 0.001, 0.01, 0.1],
    ),
    'MLP': dict(
        hidden_layer_sizes=[(50,), (100,), (100, 50), (100, 100), (150, 100, 50)],
        activation=['relu', 'tanh'],
        alpha=[0.0001, 0.001, 0.01],
        learning_rate=['constant', 'adaptive'],
        max_iter=[500],
    ),
    'Decision Tree': dict(
        max_depth=[3, 5, 7, 10, 15, 20, None],
        min_samples_split=[2, 5, 10, 20],
        min_samples_leaf=[1, 2, 4, 8],
        criterion=['gini', 'entropy'],
    ),
    'Naive Bayes': dict(
        var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3],
    ),
}

# Untuned base estimators, one per entry in `param_grids` (same keys, same order).
# Estimators that are given a seed get random_state=42; KNN and Naive Bayes
# are constructed with no arguments.
classifiers = {
    label: estimator_cls(**init_kwargs)
    for label, estimator_cls, init_kwargs in (
        ('KNN', KNeighborsClassifier, {}),
        ('Logistic Regression', LogisticRegression, {'random_state': 42}),
        ('SVM', SVC, {'random_state': 42}),
        ('MLP', MLPClassifier, {'random_state': 42}),
        ('Decision Tree', DecisionTreeClassifier, {'random_state': 42}),
        ('Naive Bayes', GaussianNB, {}),
    )
}

print("✓ Hyperparameter grids defined")

## 3. Perform GridSearchCV

In [None]:
# Run an exhaustive grid search for every classifier and record, per model:
#   - the best parameters and best mean CV accuracy,
#   - the full cv_results_ table,
#   - accuracy / F1 of the selected model on the held-out test split,
#   - spread statistics (variance, std, range) of the mean CV accuracy across
#     all parameter combinations, used here as the "sensitivity" measure.
grid_results = {}

print("="*80)
print("THYROID DISEASE - HYPERPARAMETER SENSITIVITY ANALYSIS")
print("="*80)

for name, estimator in classifiers.items():
    print(f"\n{'='*80}")
    print(f"GridSearchCV: {name}")
    print(f"{'='*80}")

    grid_search = GridSearchCV(
        estimator,
        param_grids[name],
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=0
    )

    # NOTE(review): X_train_scaled was standardised with a scaler fitted on the
    # whole training split, so each CV validation fold has influenced the
    # scaling statistics. Wrapping scaler + estimator in a Pipeline would avoid
    # that; left as-is to keep the parameter-grid keys unchanged.
    grid_search.fit(X_train_scaled, y_train)

    # Predict the test split once and reuse it for both test metrics
    # (previously predict() was called separately for accuracy and F1).
    y_test_pred = grid_search.predict(X_test_scaled)

    # Mean CV accuracy of every parameter combination that was tried.
    mean_cv_scores = grid_search.cv_results_['mean_test_score']

    grid_results[name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'cv_results': grid_search.cv_results_,
        'test_accuracy': accuracy_score(y_test, y_test_pred),
        'test_f1': f1_score(y_test, y_test_pred),
        'variance': np.var(mean_cv_scores),
        'std': np.std(mean_cv_scores),
        'range': np.ptp(mean_cv_scores)
    }

    summary = grid_results[name]
    print(f"Best CV: {summary['best_score']:.4f}")
    print(f"Test Acc: {summary['test_accuracy']:.4f}")
    print(f"Sensitivity - Var: {summary['variance']:.6f}, Range: {summary['range']:.4f}")

print("\n✓ GridSearchCV completed")

## 4. Summary & Visualizations

In [None]:
# Build one summary row per classifier from the grid-search results.
sensitivity_summary = pd.DataFrame({
    'Classifier': list(grid_results.keys()),
    'Best CV Score': [grid_results[n]['best_score'] for n in grid_results],
    'Test Accuracy': [grid_results[n]['test_accuracy'] for n in grid_results],
    'Test F1': [grid_results[n]['test_f1'] for n in grid_results],
    'Variance': [grid_results[n]['variance'] for n in grid_results],
    'Std Dev': [grid_results[n]['std'] for n in grid_results],
    'Range': [grid_results[n]['range'] for n in grid_results]
})

# Rank on the unrounded variance, then round for presentation. Variances of
# accuracy scores are small, so rounding them to 4 decimals *before* sorting
# could collapse several models to the same value (or to 0) and make the
# ranking arbitrary. Variance keeps 6 decimals, matching the precision used
# when it is printed during the grid search; other columns keep 4.
sensitivity_summary = (
    sensitivity_summary
    .sort_values('Variance', ascending=False)
    .round({
        'Best CV Score': 4,
        'Test Accuracy': 4,
        'Test F1': 4,
        'Variance': 6,
        'Std Dev': 4,
        'Range': 4,
    })
)

print("\nSENSITIVITY RANKING - THYROID DISEASE")
print("="*80)
print(sensitivity_summary.to_string(index=False))

sensitivity_summary.to_csv('outputs/results/thyroid_sensitivity.csv', index=False)

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Hyperparameter Sensitivity - Thyroid Disease', fontsize=16, fontweight='bold')

# Four single-metric bar panels share the same recipe; only the column,
# target axes and colour differ. Each panel is titled with its column name.
bar_panels = [
    ('Variance', axes[0, 0], 'coral'),
    ('Range', axes[0, 1], 'skyblue'),
    ('Std Dev', axes[1, 0], 'mediumpurple'),
    ('Test F1', axes[1, 1], 'gold'),
]
for column, ax, colour in bar_panels:
    sensitivity_summary.plot(x='Classifier', y=column, kind='bar', ax=ax, color=colour, legend=False)
    ax.set_title(column)
    ax.tick_params(axis='x', rotation=45)

# Side-by-side bars: best CV accuracy vs. held-out test accuracy per model.
x_pos = np.arange(len(sensitivity_summary))
axes[0,2].bar(x_pos - 0.2, sensitivity_summary['Best CV Score'], 0.4, label='CV')
axes[0,2].bar(x_pos + 0.2, sensitivity_summary['Test Accuracy'], 0.4, label='Test')
axes[0,2].set_title('Best CV Score vs Test Accuracy')
axes[0,2].set_xticks(x_pos)
axes[0,2].set_xticklabels(sensitivity_summary['Classifier'], rotation=45)
axes[0,2].legend()

# Sensitivity (variance across the grid) against final test accuracy,
# with each point labelled by its classifier.
axes[1,2].scatter(sensitivity_summary['Variance'], sensitivity_summary['Test Accuracy'], s=200)
for _, row in sensitivity_summary.iterrows():
    axes[1,2].annotate(row['Classifier'], (row['Variance'], row['Test Accuracy']), fontsize=8)
axes[1,2].set_title('Variance vs Test Accuracy')
axes[1,2].set_xlabel('Variance')
axes[1,2].set_ylabel('Accuracy')

plt.tight_layout()
plt.savefig('outputs/figures/thyroid_sensitivity_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n✓ Analysis complete")

## 1. Load Data

In [None]:
# Column names for the thyroid file, supplied explicitly via `names=`.
# The first column is the class label, the remaining five are the measurements.
columns = [
    'Class',
    'T3_resin',
    'Total_Serum_thyroxin',
    'Total_serum_triiodothyronine',
    'Basal_TSH',
    'Max_TSH_difference',
]

df = pd.read_csv('thyroid/new-thyroid.data', names=columns)

# Quick sanity report: table size and how many rows fall in each original class.
print(f"Dataset shape: {df.shape}")
print("\nOriginal class distribution:", df['Class'].value_counts(), sep='\n')
print("\nClasses: 1=Normal, 2=Hyperthyroid, 3=Hypothyroid")
df.head()

## 2. Convert to Binary Classification

In [None]:
# Convert multi-class to binary: 1=Normal (class 1), 0=Abnormal (class 2 or 3)
df['Binary_Class'] = (df['Class'] == 1).astype(int)

print(f"Binary class distribution:")
print(df['Binary_Class'].value_counts())
print("\n0 = Abnormal Thyroid (Hyper/Hypo)")
print("1 = Normal Thyroid")

# Visualize the original three-class distribution next to the binary one.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# sort_index() orders the bars by class id (1, 2, 3) so they line up with the
# hard-coded tick labels below.
df['Class'].value_counts().sort_index().plot(kind='bar', ax=axes[0], color=['green', 'orange', 'red'])
axes[0].set_title('Original Multi-class Distribution')
axes[0].set_xlabel('Class')
axes[0].set_xticklabels(['Normal', 'Hyper', 'Hypo'], rotation=0)

# value_counts() alone orders by descending count, which would put the
# majority class first and mislabel the bars whenever class 1 (Normal) is the
# larger group. Sorting by index fixes the order to 0, 1 so the
# ['Abnormal', 'Normal'] labels (and coral/skyblue colours) always match.
df['Binary_Class'].value_counts().sort_index().plot(kind='bar', ax=axes[1], color=['coral', 'skyblue'])
axes[1].set_title('Binary Class Distribution')
axes[1].set_xlabel('Class')
axes[1].set_xticklabels(['Abnormal', 'Normal'], rotation=0)

plt.tight_layout()
plt.savefig('outputs/figures/thyroid_bucketing.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Data Preprocessing

In [None]:
# Target is the binary label; features are every column except the two label columns.
y = df['Binary_Class']
X = df.drop(columns=['Class', 'Binary_Class'])

# Stratified 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Learn the standardisation parameters from the training split only and apply
# the same transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

for split_label, split_features in (('Training set', X_train), ('Test set', X_test)):
    print(f"{split_label}: {split_features.shape}")

## 4. Train 6 Classifiers

In [None]:
# Baseline comparison: fit six classifiers with fixed (untuned) settings on the
# scaled training split, score each on the held-out test split, and also
# collect a 3-fold cross-validation score on the training split.
#
# This cell calls precision_score, recall_score and cross_val_score in addition
# to accuracy_score / f1_score; all five must be imported from sklearn before
# it runs, otherwise it fails with NameError on a fresh kernel.
classifiers = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

# Maps classifier name -> dict of metric name -> value; turned into a table
# by the results cell.
results = {}

print("="*70)
print("THYROID DISEASE - TRAINING 6 CLASSIFIERS")
print("="*70)

for name, clf in classifiers.items():
    print(f"\nTraining: {name}")
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    # Binary-averaged metrics score a single positive class. No pos_label is
    # passed, so sklearn's default (label 1) applies, which in this notebook's
    # encoding is the Normal class.
    # NOTE(review): confirm Normal is the class of interest; pass pos_label=0
    # to score detection of abnormal cases instead.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    # 3-fold CV on the training split only; the test split is not involved.
    cv_scores = cross_val_score(clf, X_train_scaled, y_train, cv=3)

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'CV Mean': cv_scores.mean(),
        'CV Std': cv_scores.std()
    }

    print(f"Accuracy: {accuracy:.4f}, F1: {f1:.4f}, CV: {cv_scores.mean():.4f}")

## 5. Results

In [None]:
# One row per classifier, one column per metric, rounded to 4 decimals.
results_df = pd.DataFrame(results).T.round(4)
print("\nFINAL RESULTS - THYROID DISEASE", results_df, sep='\n')
results_df.to_csv('outputs/results/thyroid_results.csv')

# 2x2 figure: accuracy, F1, precision-vs-recall, and all four metrics together.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_accuracy, ax_f1), (ax_prec_rec, ax_all_metrics) = axes

# Top row: one single-metric bar chart per panel, y-axis limited to 0.7-1.0.
single_metric_panels = (
    ('Accuracy', ax_accuracy, 'skyblue', 'Accuracy Comparison'),
    ('F1-Score', ax_f1, 'coral', 'F1-Score Comparison'),
)
for metric, ax, colour, title in single_metric_panels:
    results_df[metric].plot(kind='bar', ax=ax, color=colour)
    ax.set_title(title)
    ax.set_ylim([0.7, 1.0])

# Bottom-left: precision and recall as paired bars per classifier.
x_pos = np.arange(len(results_df))
width = 0.35
half_width = width / 2
ax_prec_rec.bar(x_pos - half_width, results_df['Precision'], width, label='Precision')
ax_prec_rec.bar(x_pos + half_width, results_df['Recall'], width, label='Recall')
ax_prec_rec.set_title('Precision vs Recall')
ax_prec_rec.set_xticks(x_pos)
ax_prec_rec.set_xticklabels(results_df.index, rotation=45)
ax_prec_rec.legend()

# Bottom-right: the four test metrics grouped per classifier.
test_metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
results_df[test_metrics].plot(kind='bar', ax=ax_all_metrics)
ax_all_metrics.set_title('All Metrics')
ax_all_metrics.set_ylim([0.7, 1.0])
ax_all_metrics.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('outputs/figures/thyroid_results.png', dpi=300, bbox_inches='tight')
plt.show()

# Report the classifier with the highest test accuracy.
best_model = results_df['Accuracy'].idxmax()
best_accuracy = results_df.loc[best_model, 'Accuracy']
print(f"\nBest Model: {best_model} with Accuracy: {best_accuracy:.4f}")