In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from scipy.io import arff
import warnings
warnings.filterwarnings('ignore')
import os

os.makedirs('outputs/figures', exist_ok=True)
os.makedirs('outputs/results', exist_ok=True)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')
import os

os.makedirs('outputs/figures', exist_ok=True)
os.makedirs('outputs/results', exist_ok=True)
os.makedirs('outputs/gridsearch', exist_ok=True)

sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)

print("✓ Libraries loaded successfully")

## 1. Load and Preprocess Data

In [None]:
# Load CKD data
df = pd.read_csv('kidney_disease/chronic_kidney_disease.csv')
df.columns = df.columns.str.strip()

print(f"Dataset shape: {df.shape}")
print(f"\nTarget distribution:")
print(df['classification'].value_counts())
print(f"\nMissing values per column:")
print(df.isnull().sum())

# Encode target variable (Binary: 0 = notckd, 1 = ckd)
df['classification'] = df['classification'].map({'notckd': 0, 'ckd': 1})

# Separate numeric and categorical columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Remove target from numeric cols if present
if 'classification' in numeric_cols:
    numeric_cols.remove('classification')

# Impute missing values
# Numeric: median
if numeric_cols:
    imputer_num = SimpleImputer(strategy='median')
    df[numeric_cols] = imputer_num.fit_transform(df[numeric_cols])

# Categorical: most frequent
if categorical_cols:
    imputer_cat = SimpleImputer(strategy='most_frequent')
    df[categorical_cols] = imputer_cat.fit_transform(df[categorical_cols])

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Split features and target
X = df.drop('classification', axis=1)
y = df['classification']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\n✓ Data preprocessed")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Class distribution - Train: {np.bincount(y_train)}")
print(f"Class distribution - Test: {np.bincount(y_test)}")

## 2. Define Hyperparameter Grids

In [None]:
# Define hyperparameter grids (same as Census for consistency)
param_grids = {
    'KNN': {
        'n_neighbors': [3, 5, 7, 9, 11, 15, 21],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [1000]
    },
    'SVM': {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1]
    },
    'MLP': {
        'hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 100), (150, 100, 50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive'],
        'max_iter': [500]
    },
    'Decision Tree': {
        'max_depth': [3, 5, 7, 10, 15, 20, None],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 4, 8],
        'criterion': ['gini', 'entropy']
    },
    'Naive Bayes': {
        'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
    }
}

classifiers = {
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(random_state=42),
    'SVM': SVC(random_state=42),
    'MLP': MLPClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

print("✓ Hyperparameter grids defined")
print(f"\nTotal classifiers: {len(classifiers)}")
for name, grid in param_grids.items():
    total_combinations = np.prod([len(v) for v in grid.values()])
    print(f"{name}: {len(grid)} hyperparameters, {total_combinations} combinations")

## 3. Perform GridSearchCV

In [None]:
grid_results = {}
best_models = {}

print("="*80)
print("CHRONIC KIDNEY DISEASE - HYPERPARAMETER SENSITIVITY ANALYSIS")
print("="*80)

for name in classifiers.keys():
    print(f"\n{'='*80}")
    print(f"GridSearchCV: {name}")
    print(f"{'='*80}")
    
    clf = classifiers[name]
    param_grid = param_grids[name]
    
    grid_search = GridSearchCV(
        clf, 
        param_grid, 
        cv=5, 
        scoring='accuracy',
        n_jobs=-1,
        verbose=0
    )
    
    grid_search.fit(X_train_scaled, y_train)
    
    grid_results[name] = {
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'cv_results': grid_search.cv_results_,
        'all_scores': grid_search.cv_results_['mean_test_score'],
        'std_scores': grid_search.cv_results_['std_test_score']
    }
    best_models[name] = grid_search.best_estimator_
    
    y_pred = grid_search.best_estimator_.predict(X_test_scaled)
    test_accuracy = accuracy_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)
    
    score_variance = np.var(grid_search.cv_results_['mean_test_score'])
    score_std = np.std(grid_search.cv_results_['mean_test_score'])
    score_range = np.max(grid_search.cv_results_['mean_test_score']) - np.min(grid_search.cv_results_['mean_test_score'])
    
    grid_results[name]['test_accuracy'] = test_accuracy
    grid_results[name]['test_f1'] = test_f1
    grid_results[name]['variance'] = score_variance
    grid_results[name]['std'] = score_std
    grid_results[name]['range'] = score_range
    
    print(f"Best CV Score: {grid_search.best_score_:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test F1-Score: {test_f1:.4f}")
    print(f"\nHyperparameter Sensitivity:")
    print(f"  Variance: {score_variance:.6f}")
    print(f"  Std Dev: {score_std:.6f}")
    print(f"  Range: {score_range:.4f}")
    print(f"\nBest Parameters:")
    for param, value in grid_search.best_params_.items():
        print(f"  {param}: {value}")

print(f"\n{'='*80}")
print("✓ GridSearchCV completed")
print(f"{'='*80}")

## 4. Sensitivity Summary

In [None]:
sensitivity_summary = pd.DataFrame({
    'Classifier': list(grid_results.keys()),
    'Best CV Score': [grid_results[name]['best_score'] for name in grid_results.keys()],
    'Test Accuracy': [grid_results[name]['test_accuracy'] for name in grid_results.keys()],
    'Test F1': [grid_results[name]['test_f1'] for name in grid_results.keys()],
    'Variance': [grid_results[name]['variance'] for name in grid_results.keys()],
    'Std Dev': [grid_results[name]['std'] for name in grid_results.keys()],
    'Range': [grid_results[name]['range'] for name in grid_results.keys()]
})

sensitivity_summary = sensitivity_summary.round(4)
sensitivity_summary = sensitivity_summary.sort_values('Variance', ascending=False)

print("\n" + "="*80)
print("HYPERPARAMETER SENSITIVITY RANKING - CHRONIC KIDNEY DISEASE")
print("="*80)
print(sensitivity_summary.to_string(index=False))

sensitivity_summary.to_csv('outputs/results/ckd_sensitivity.csv', index=False)
print("\n✓ Results saved to: outputs/results/ckd_sensitivity.csv")

most_sensitive = sensitivity_summary.iloc[0]['Classifier']
least_sensitive = sensitivity_summary.iloc[-1]['Classifier']
print(f"\nMost Sensitive: {most_sensitive}")
print(f"Least Sensitive: {least_sensitive}")

## 5. Visualizations

In [None]:
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Hyperparameter Sensitivity Analysis - Chronic Kidney Disease Dataset', fontsize=16, fontweight='bold')

ax = axes[0, 0]
sensitivity_summary.plot(x='Classifier', y='Variance', kind='bar', ax=ax, color='coral', legend=False)
ax.set_title('Hyperparameter Variance', fontweight='bold')
ax.set_ylabel('Variance')
ax.set_xlabel('')
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)

ax = axes[0, 1]
sensitivity_summary.plot(x='Classifier', y='Range', kind='bar', ax=ax, color='skyblue', legend=False)
ax.set_title('Score Range', fontweight='bold')
ax.set_ylabel('Range')
ax.set_xlabel('')
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)

ax = axes[0, 2]
x_pos = np.arange(len(sensitivity_summary))
width = 0.35
ax.bar(x_pos - width/2, sensitivity_summary['Best CV Score'], width, label='CV Score', color='lightgreen')
ax.bar(x_pos + width/2, sensitivity_summary['Test Accuracy'], width, label='Test Accuracy', color='lightcoral')
ax.set_title('CV vs Test Performance', fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels(sensitivity_summary['Classifier'], rotation=45)
ax.legend()
ax.grid(axis='y', alpha=0.3)

ax = axes[1, 0]
sensitivity_summary.plot(x='Classifier', y='Std Dev', kind='bar', ax=ax, color='mediumpurple', legend=False)
ax.set_title('Standard Deviation', fontweight='bold')
ax.set_ylabel('Std Dev')
ax.set_xlabel('')
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)

ax = axes[1, 1]
sensitivity_summary.plot(x='Classifier', y='Test F1', kind='bar', ax=ax, color='gold', legend=False)
ax.set_title('Test F1-Score', fontweight='bold')
ax.set_ylabel('F1-Score')
ax.set_xlabel('')
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)

ax = axes[1, 2]
scatter = ax.scatter(sensitivity_summary['Variance'], sensitivity_summary['Test Accuracy'], 
                     s=200, c=range(len(sensitivity_summary)), cmap='viridis', alpha=0.6, edgecolors='black')
for idx, row in sensitivity_summary.iterrows():
    ax.annotate(row['Classifier'], (row['Variance'], row['Test Accuracy']), fontsize=8, ha='center', va='bottom')
ax.set_title('Sensitivity vs Performance', fontweight='bold')
ax.set_xlabel('Variance')
ax.set_ylabel('Test Accuracy')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('outputs/figures/ckd_sensitivity_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Visualizations saved")

## 6. Save Detailed Results

In [None]:
for name in grid_results.keys():
    cv_results = grid_results[name]['cv_results']
    results_df = pd.DataFrame(cv_results)
    results_df.to_csv(f'outputs/gridsearch/ckd_{name.replace(" ", "_").lower()}_gridsearch.csv', index=False)

print("✓ Detailed GridSearch results saved to outputs/gridsearch/")
print("\n" + "="*80)
print("✓ Analysis complete for Chronic Kidney Disease dataset")
print("="*80)

## 1. Load Data

In [2]:
# Load CKD data from ARFF file
data, meta = arff.loadarff('kidney_disease/chronic_kidney_disease_full.arff')
df = pd.DataFrame(data)

# Decode byte strings
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].str.decode('utf-8')

print(f"Dataset shape: {df.shape}")
print(f"\nTarget distribution:")
print(df['class'].value_counts())
df.head()

ValueError:  yes value not in ('yes', 'no')

## 2. Data Preprocessing

In [None]:
# Handle missing values
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 'unknown', inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)

# Encode target (Binary: 0 = notckd, 1 = ckd)
df['class'] = df['class'].map({'notckd': 0, 'ckd': 1})

# Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Split features and target
X = df.drop('class', axis=1)
y = df['class']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

## 3. Train 6 Classifiers

In [None]:
classifiers = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

results = {}

print("="*70)
print("CHRONIC KIDNEY DISEASE - TRAINING 6 CLASSIFIERS")
print("="*70)

for name, clf in classifiers.items():
    print(f"\nTraining: {name}")
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    cv_scores = cross_val_score(clf, X_train_scaled, y_train, cv=5)
    
    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'CV Mean': cv_scores.mean(),
        'CV Std': cv_scores.std()
    }
    
    print(f"Accuracy: {accuracy:.4f}, F1: {f1:.4f}, CV: {cv_scores.mean():.4f}")

## 4. Results

In [None]:
results_df = pd.DataFrame(results).T.round(4)
print("\nFINAL RESULTS - CHRONIC KIDNEY DISEASE")
print(results_df)
results_df.to_csv('outputs/results/ckd_results.csv')

# Visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
results_df['Accuracy'].plot(kind='bar', ax=axes[0,0], color='skyblue')
axes[0,0].set_title('Accuracy Comparison')
axes[0,0].set_ylim([0.7, 1.0])

results_df['F1-Score'].plot(kind='bar', ax=axes[0,1], color='coral')
axes[0,1].set_title('F1-Score Comparison')
axes[0,1].set_ylim([0.7, 1.0])

x_pos = np.arange(len(results_df))
width = 0.35
axes[1,0].bar(x_pos - width/2, results_df['Precision'], width, label='Precision')
axes[1,0].bar(x_pos + width/2, results_df['Recall'], width, label='Recall')
axes[1,0].set_title('Precision vs Recall')
axes[1,0].set_xticks(x_pos)
axes[1,0].set_xticklabels(results_df.index, rotation=45)
axes[1,0].legend()

results_df[['Accuracy', 'Precision', 'Recall', 'F1-Score']].plot(kind='bar', ax=axes[1,1])
axes[1,1].set_title('All Metrics')
axes[1,1].set_ylim([0.7, 1.0])
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('outputs/figures/ckd_results.png', dpi=300, bbox_inches='tight')
plt.show()

best_model = results_df['Accuracy'].idxmax()
print(f"\nBest Model: {best_model} with Accuracy: {results_df.loc[best_model, 'Accuracy']:.4f}")