In [None]:
from sklearn.neural_network import MLPClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import json
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.pipeline import Pipeline
import joblib
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.base import clone  # Import clone from sklearn

class GlaucomaDataset(Dataset):
    """In-memory dataset pairing feature rows with integer class labels.

    ``X`` is converted to a float tensor and ``y`` to a long tensor once, at
    construction time; indexing afterwards only slices the stored tensors.
    """

    def __init__(self, X, y):
        features, labels = torch.FloatTensor(X), torch.LongTensor(y)
        self.X = features
        self.y = labels

    def __len__(self):
        # Number of samples == size of the leading dimension of the features.
        return self.X.shape[0]

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample

class Autoencoder(nn.Module):
    """Fully connected autoencoder: input -> 256 -> 128 -> encoding_dim and back.

    Args:
        input_size: Number of features per sample.
        encoding_dim: Width of the bottleneck representation (default 64).

    The encoder ends in a ReLU, so encoded features are non-negative. The
    decoder ends in a sigmoid, so reconstructions always lie in (0, 1).
    """

    def __init__(self, input_size, encoding_dim=64):
        super().__init__()

        # Compress the input down to the bottleneck.
        encoder_layers = [
            nn.Linear(input_size, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, encoding_dim),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Mirror image of the encoder; the sigmoid bounds the output.
        decoder_layers = [
            nn.Linear(encoding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, input_size),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.decoder(self.encoder(x))

def train_autoencoder(model, train_loader, optimizer, criterion, device, num_epochs=50):
    """Train ``model`` to reconstruct its own inputs.

    Args:
        model: Module whose output is compared against its input.
        train_loader: Iterable yielding ``(features, label)`` batches; the
            label is ignored.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Reconstruction loss, called as ``criterion(output, input)``.
        device: Device each batch is moved to before the forward pass.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        list[float]: Mean batch loss of each epoch, in order.

    Raises:
        ValueError: If ``train_loader`` yields no batches (previously this
            surfaced as an opaque ``ZeroDivisionError``).
    """
    model.train()
    epoch_losses = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for batch_X, _ in train_loader:
            batch_X = batch_X.to(device)

            # Forward pass: the target is the input itself.
            outputs = model(batch_X)
            loss = criterion(outputs, batch_X)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # Count batches directly so loaders without __len__ also work.
            num_batches += 1

        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; cannot train the autoencoder")

        mean_loss = running_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}], Autoencoder Loss: {mean_loss:.4f}')

    return epoch_losses

def train_and_evaluate_models(X_train, X_val, y_train, y_val, scaler_mean, scaler_scale):
    """Fit every candidate classifier twice and collect validation metrics.

    Each model is trained once on the features as given and once on a PCA
    projection of them (components explaining 95% of the variance, fitted on
    the training split only). Every fitted model is also written to disk.

    Args:
        X_train, X_val: Training / validation feature matrices.
        y_train, y_val: Matching label vectors.
        scaler_mean, scaler_scale: Accepted but not used by this function.

    Returns:
        dict: ``'<name>_original'`` and ``'<name>_pca'`` keys mapped to the
        metric dicts produced by ``evaluate_model``.

    Note:
        Output file names depend only on the model name, so calling this
        function twice in the same working directory overwrites the files
        written by the earlier call.
    """
    # PCA is fitted on the training split and merely applied to validation.
    pca = PCA(n_components=0.95)
    X_train_pca = pca.fit_transform(X_train)
    X_val_pca = pca.transform(X_val)

    estimators = [
        ('Logistic_Regression', LogisticRegression(max_iter=2000)),
        ('Decision_Tree', DecisionTreeClassifier(random_state=42)),
        ('Random_Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('SVM', SVC(probability=True, kernel='rbf', random_state=42)),
        ('KNN', KNeighborsClassifier(n_neighbors=5)),
        ('GBM', GradientBoostingClassifier(random_state=42)),
        ('XGBoost', XGBClassifier(random_state=42,
                                  learning_rate=0.1,
                                  n_estimators=100,
                                  max_depth=5)),
        ('Neural_Network', MLPClassifier(hidden_layer_sizes=(64, 32),
                                         max_iter=1000,
                                         random_state=42)),
    ]
    # Every estimator is wrapped in a single-step pipeline named 'model'.
    models = {label: Pipeline([('model', estimator)]) for label, estimator in estimators}

    results = {}
    for name, model in models.items():
        print(f"\nTraining {name}...")

        # Variant 1: features exactly as passed in.
        model.fit(X_train, y_train)
        results[f'{name}_original'] = evaluate_model(model, X_val, y_val)
        joblib.dump(model, f'glaucoma_model_{name}_original.joblib')

        # Variant 2: an unfitted copy trained on the PCA projection; the
        # fitted PCA is stored next to the model so inputs can be projected.
        model_pca = clone(model)
        model_pca.fit(X_train_pca, y_train)
        results[f'{name}_pca'] = evaluate_model(model_pca, X_val_pca, y_val)
        joblib.dump((model_pca, pca), f'glaucoma_model_{name}_pca.joblib')

    return results

def evaluate_model(model, X, y):
    """Compute binary-classification metrics for a fitted model.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba`` or ``decision_function``.
        X: Feature matrix to score.
        y: True binary labels for ``X``.

    Returns:
        dict: ``accuracy``, ``precision``, ``recall``, ``f1`` and ``auc`` as
        plain Python floats.

    The AUC needs a continuous ranking score. Positive-class probabilities
    are preferred, then ``decision_function`` output; hard labels are only
    the last resort because they collapse the ROC curve to a single point.
    """
    y_pred = model.predict(X)

    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X)
    else:
        y_score = y_pred

    # zero_division=0 yields the same value as the default but without a
    # warning when a model predicts no positives at all.
    return {
        'accuracy': float(accuracy_score(y, y_pred)),
        'precision': float(precision_score(y, y_pred, zero_division=0)),
        'recall': float(recall_score(y, y_pred, zero_division=0)),
        'f1': float(f1_score(y, y_pred, zero_division=0)),
        'auc': float(roc_auc_score(y, y_score))
    }

def preprocess_data(data):
    """Build the numeric feature table from the raw glaucoma DataFrame.

    Args:
        data: DataFrame containing the columns 'Age',
            'Visual Acuity Measurements', 'Intraocular Pressure (IOP)',
            'Cup-to-Disc Ratio (CDR)',
            'Optical Coherence Tomography (OCT) Results', 'Pachymetry' and
            'Visual Field Test Results'.

    Returns:
        pandas.DataFrame: One row per input row with twelve numeric columns.
        Values that cannot be parsed become NaN.
    """
    import re  # local import: the pattern below is compiled once per call

    # The sign must apply to both alternatives. The earlier pattern
    # '[-+]?\d*\.\d+|\d+' dropped the sign of integers ("-3" -> 3.0).
    number_pattern = re.compile(r'[-+]?(?:\d*\.\d+|\d+)')

    def extract_number(value):
        """Return the first number found in ``value`` as a float, else NaN."""
        if pd.isna(value):
            return np.nan
        match = number_pattern.search(str(value))
        return float(match.group()) if match else np.nan

    # Output column name -> source column holding a number or number-in-text.
    numerical_features = {
        'Age': 'Age',
        'Visual Acuity': 'Visual Acuity Measurements',
        'IOP': 'Intraocular Pressure (IOP)',
        'CDR': 'Cup-to-Disc Ratio (CDR)',
        'RNFL': 'Optical Coherence Tomography (OCT) Results',
        'Pachymetry': 'Pachymetry'
    }

    processed_data = pd.DataFrame()
    for new_name, original_name in numerical_features.items():
        processed_data[new_name] = data[original_name].apply(extract_number)

    # Visual Field Test Results carries two values in one string.
    vf_data = data['Visual Field Test Results'].str.extract(r'Sensitivity: ([\d.]+), Specificity: ([\d.]+)')
    processed_data['VF_Sensitivity'] = pd.to_numeric(vf_data[0])
    processed_data['VF_Specificity'] = pd.to_numeric(vf_data[1])

    # OCT results carry four labelled measurements in one string.
    oct_data = data['Optical Coherence Tomography (OCT) Results'].str.extract(
        r'RNFL Thickness: ([\d.]+).*GCC Thickness: ([\d.]+).*Retinal Volume: ([\d.]+).*Macular Thickness: ([\d.]+)'
    )
    processed_data['RNFL_Thickness'] = pd.to_numeric(oct_data[0])
    processed_data['GCC_Thickness'] = pd.to_numeric(oct_data[1])
    processed_data['Retinal_Volume'] = pd.to_numeric(oct_data[2])
    processed_data['Macular_Thickness'] = pd.to_numeric(oct_data[3])

    return processed_data

def main():
    """Run the full experiment: autoencoder, model zoo, metrics report.

    Reads the glaucoma CSV, splits and standardises it, trains the
    autoencoder on the scaled training split, then trains the classical
    models once on the scaled features and once on the encoder output.
    All metrics are printed and written to ``model_metrics.json``.
    """
    # ---- Data preparation ----
    raw = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv", skipinitialspace=True)
    X = preprocess_data(raw).values
    y = (raw['Diagnosis'] == 'Glaucoma').astype(int).values

    # Stratified 80/20 split.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # The scaler is fitted on the training split only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    scaler_mean, scaler_scale = scaler.mean_, scaler.scale_

    # ---- Torch datasets / loaders ----
    train_dataset = GlaucomaDataset(X_train_scaled, y_train)
    val_dataset = GlaucomaDataset(X_val_scaled, y_val)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    # The validation loader is built but not consumed further down.
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    # ---- Autoencoder ----
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    autoencoder = Autoencoder(input_size=X.shape[1]).to(device)
    ae_optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)
    ae_criterion = nn.MSELoss()

    print("\nTraining Autoencoder...")
    train_autoencoder(autoencoder, train_loader, ae_optimizer, ae_criterion, device, num_epochs=50)

    def encode(features):
        """Run ``features`` through the encoder half; return a NumPy array."""
        batch = torch.FloatTensor(features).to(device)
        return autoencoder.encoder(batch).cpu().numpy()

    with torch.no_grad():
        X_train_encoded = encode(X_train_scaled)
        X_val_encoded = encode(X_val_scaled)

    # ---- Classical models ----
    print("\nTraining models on original features...")
    ml_results = train_and_evaluate_models(X_train_scaled, X_val_scaled, y_train, y_val, scaler_mean, scaler_scale)

    print("\nTraining models on autoencoder features...")
    ml_results_encoded = train_and_evaluate_models(X_train_encoded, X_val_encoded, y_train, y_val, scaler_mean, scaler_scale)

    # Encoded-feature runs are stored under the same key plus '_Encoded'.
    for key, value in ml_results_encoded.items():
        ml_results[f"{key}_Encoded"] = value

    # ---- Report ----
    print("\nModel Performance Summary:")
    print("=" * 50)
    for model_name in ml_results:
        print(f"\n{model_name}:")
        for metric, value in ml_results[model_name].items():
            print(f"{metric}: {value:.4f}")

    with open('model_metrics.json', 'w') as f:
        json.dump(ml_results, f, indent=4)

    print("\nMetrics saved to model_metrics.json")

if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
from tabulate import tabulate
import json

def display_metrics_in_terminal(json_path):
    """Print the metrics stored in ``json_path`` as a 'psql'-style table.

    The file is expected to map each model name to a dict holding the keys
    'accuracy', 'precision', 'recall', 'f1' and 'auc'.
    """
    with open(json_path, 'r') as f:
        metrics_by_model = json.load(f)

    metric_keys = ('accuracy', 'precision', 'recall', 'f1', 'auc')
    table_data = [
        [model_name, *(scores[key] for key in metric_keys)]
        for model_name, scores in metrics_by_model.items()
    ]

    headers = ["Model", "Accuracy", "Precision", "Recall", "F1 Score", "AUC"]
    print(tabulate(table_data, headers=headers, tablefmt='psql'))

if __name__ == "__main__":
    display_metrics_in_terminal("model_metrics.json")
    
import json
from tabulate import tabulate
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def load_results(filepath):
    """Read the JSON document at ``filepath`` and return the decoded object."""
    with open(filepath, 'r') as handle:
        parsed = json.load(handle)
    return parsed

def create_performance_table(results):
    """Turn a metrics dict into display rows ranked by AUC (best first).

    Args:
        results: Mapping of model name to a dict with the keys 'accuracy',
            'precision', 'recall', 'f1' and 'auc'.

    Returns:
        tuple: ``(table_data, headers)``. Each row is the upper-cased model
        name followed by the five metrics formatted to four decimals.

    Ranking uses the raw AUC, not the rounded string. Sorting on the
    formatted value would leave models that differ only beyond the fourth
    decimal in insertion order, which can misreport the best model.
    """
    headers = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC']
    metric_keys = ('accuracy', 'precision', 'recall', 'f1', 'auc')

    # sorted() is stable, so exact ties keep their original relative order.
    ranked = sorted(results.items(), key=lambda item: item[1]['auc'], reverse=True)

    table_data = [
        [model_name.upper(), *(f"{model_data[key]:.4f}" for key in metric_keys)]
        for model_name, model_data in ranked
    ]
    return table_data, headers

def plot_comparison(results):
    """Draw stacked bar charts of AUC (top) and F1 (bottom) per model.

    Args:
        results: Mapping of model name to a dict with 'auc' and 'f1' entries.

    Returns:
        The matplotlib figure holding both panels.
    """
    models = list(results.keys())
    auc_scores = [results[name]['auc'] for name in models]
    f1_scores = [results[name]['f1'] for name in models]

    fig, axes = plt.subplots(2, 1, figsize=(12, 10))
    tick_positions = range(len(models))

    # (axis, values, bar kwargs, y label, title, draw legend?)
    panels = (
        (axes[0], auc_scores, {'label': 'AUC', 'color': 'skyblue'},
         'AUC Score', 'AUC Scores by Model', True),
        (axes[1], f1_scores, {'color': 'lightgreen'},
         'F1 Score', 'F1 Scores by Model', False),
    )
    for ax, scores, bar_kwargs, y_label, title, with_legend in panels:
        ax.bar(models, scores, **bar_kwargs)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.set_xticks(tick_positions)
        ax.set_xticklabels([name.upper() for name in models], rotation=45)
        if with_legend:
            ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig

def main():
    """Report the metrics in ``model_metrics.json`` as a table and a chart.

    Prints the AUC-sorted table and a short summary, then saves the
    comparison figure to ``performance_comparison.png``.
    """
    results = load_results('model_metrics.json')

    # Tabular report
    table_data, headers = create_performance_table(results)
    print("\nModel Performance Summary:")
    print(tabulate(table_data, headers=headers, tablefmt='grid'))

    # Summary statistics; column 5 holds the formatted AUC string.
    print("\nSummary Statistics:")
    print("-" * 50)
    auc_scores = [float(row[5]) for row in table_data]
    mean_auc = sum(auc_scores) / len(auc_scores)
    print(f"Average AUC: {mean_auc:.4f}")
    top_auc = max(auc_scores)
    top_model = table_data[0][0]  # rows arrive sorted, best first
    print(f"Best AUC: {top_auc:.4f} ({top_model} model)")

    # Chart
    fig = plot_comparison(results)
    plt.savefig('performance_comparison.png')
    plt.close()

    print("\nVisualization has been saved to 'performance_comparison.png'")

if __name__ == "__main__":
    main()



In [None]:
import optuna
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import joblib
import json
from train import preprocess_data
import pandas as pd
from sklearn.model_selection import train_test_split

def optimize_random_forest(trial, X, y):
    """Optuna objective: mean 5-fold accuracy of a RandomForestClassifier.

    Hyperparameters are drawn from ``trial``; ``random_state`` is fixed at 42.
    """
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    class_weight = trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None])

    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        class_weight=class_weight,
        random_state=42,
    )
    fold_scores = cross_val_score(forest, X, y, cv=5, scoring='accuracy')
    return fold_scores.mean()

def optimize_svm(trial, X, y):
    """Optuna objective: mean 5-fold accuracy of an SVC.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X: Feature matrix.
        y: Label vector.

    Returns:
        Mean cross-validated accuracy for the sampled configuration.
    """
    params = {
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; parameter names and ranges are unchanged.
        'C': trial.suggest_float('C', 1e-3, 1e3, log=True),
        'gamma': trial.suggest_float('gamma', 1e-3, 1e3, log=True),
        'kernel': trial.suggest_categorical('kernel', ['rbf', 'sigmoid', 'poly']),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'random_state': 42,
        'probability': True
    }
    model = SVC(**params)
    return cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

def optimize_xgboost(trial, X, y):
    """Optuna objective: mean 5-fold accuracy of an XGBClassifier.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X: Feature matrix.
        y: Label vector.

    Returns:
        Mean cross-validated accuracy for the sampled configuration.
    """
    params = {
        # suggest_float replaces the deprecated suggest_loguniform /
        # suggest_uniform; parameter names and ranges are unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10, log=True),
        'random_state': 42
    }
    model = XGBClassifier(**params)
    return cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()

def optimize_gbm(trial, X, y):
    """Optuna objective: mean 5-fold ROC AUC of a GradientBoostingClassifier.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X: Feature matrix.
        y: Label vector.

    Returns:
        Mean cross-validated ROC AUC for the sampled configuration.
    """
    params = {
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; the name and range are unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'random_state': 42
    }
    model = GradientBoostingClassifier(**params)
    return cross_val_score(model, X, y, cv=5, scoring='roc_auc').mean()

def optimize_mlp(trial, X, y):
    """Optuna objective: mean 5-fold ROC AUC of an MLPClassifier.

    The trial first samples the number of hidden layers (1-3), then one
    width per layer, then the initial learning rate.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X: Feature matrix.
        y: Label vector.

    Returns:
        Mean cross-validated ROC AUC for the sampled configuration.

    Note:
        The study records 'n_layers' and 'n_units_l<i>' rather than
        'hidden_layer_sizes', so the stored best parameters cannot be passed
        to MLPClassifier as-is.
    """
    layers = trial.suggest_int('n_layers', 1, 3)
    params = {
        'hidden_layer_sizes': tuple(
            trial.suggest_int(f'n_units_l{i}', 32, 256) for i in range(layers)
        ),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform; the name and range are unchanged.
        'learning_rate_init': trial.suggest_float('learning_rate_init', 1e-4, 1e-1, log=True),
        'max_iter': 1000,
        'random_state': 42
    }
    model = MLPClassifier(**params)
    return cross_val_score(model, X, y, cv=5, scoring='roc_auc').mean()

def optimize_knn(trial, X, y):
    """Optuna objective: mean 5-fold ROC AUC of a KNeighborsClassifier."""
    n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    # Minkowski exponent: 1 is Manhattan distance, 2 is Euclidean.
    p = trial.suggest_int('p', 1, 2)

    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p)
    fold_scores = cross_val_score(knn, X, y, cv=5, scoring='roc_auc')
    return fold_scores.mean()

def main():
    """Tune every model family with Optuna and save the best settings.

    The features are standardised once on the full dataset before
    cross-validation. Each family gets its own maximising study of 100
    trials; best parameters and scores are written to
    ``hyperparameter_optimization_results.json``.
    """
    # Load and preprocess data
    raw = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv", skipinitialspace=True)
    X = preprocess_data(raw).values
    y = (raw['Diagnosis'] == 'Glaucoma').astype(int).values

    # Scale the features
    X_scaled = StandardScaler().fit_transform(X)

    # Model family -> objective function
    optimization_funcs = {
        'Random_Forest': optimize_random_forest,
        'SVM': optimize_svm,
        'XGBoost': optimize_xgboost,
        'GBM': optimize_gbm,
        'Neural_Network': optimize_mlp,
        'KNN': optimize_knn
    }

    best_params, best_scores = {}, {}
    n_trials = 100

    for model_name, optimize_func in optimization_funcs.items():
        print(f"\nOptimizing {model_name}...")

        def objective(trial, _optimize=optimize_func):
            # Bind the current family's objective to the shared data.
            return _optimize(trial, X_scaled, y)

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=n_trials)

        best_params[model_name] = study.best_params
        best_scores[model_name] = study.best_value

        print(f"Best {model_name} parameters:", study.best_params)
        print(f"Best {model_name} score:", study.best_value)

    # Save results
    results = {'best_parameters': best_params, 'best_scores': best_scores}
    with open('hyperparameter_optimization_results.json', 'w') as f:
        json.dump(results, f, indent=4)

    print("\nOptimization results saved to hyperparameter_optimization_results.json")

if __name__ == "__main__":
    main()


In [None]:
### logistic regression:

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import joblib
import numpy as np
import json

def load_best_params():
    """Return tuned Logistic Regression hyperparameters, or ``{}`` if none.

    Looks in 'fast_hyperopt_results.json' first, then in
    'hyperparameter_optimization_results.json', and returns the first
    non-empty 'Logistic_Regression' entry under 'best_parameters'.

    A file that exists but has no entry for this model no longer hides an
    entry in the fallback file, and a missing 'best_parameters' key is
    treated as "no parameters" instead of raising ``KeyError``.
    """
    candidate_files = (
        'fast_hyperopt_results.json',
        'hyperparameter_optimization_results.json',
    )
    for path in candidate_files:
        try:
            with open(path, 'r') as f:
                payload = json.load(f)
        except FileNotFoundError:
            continue  # try the next candidate
        params = payload.get('best_parameters', {}).get('Logistic_Regression')
        if params:
            return params
    return {}

def train(X_train, y_train):
    """Fit a SMOTE + LogisticRegression pipeline and save it to disk.

    Tuned hyperparameters from ``load_best_params`` override the default
    ``max_iter=2000``. The fitted pipeline is written to
    'glaucoma_model_Logistic_Regression.joblib' and returned.
    """
    tuned = load_best_params()
    model_params = {'max_iter': 2000}
    model_params.update(tuned)  # tuned values win over the defaults

    # SMOTE oversampling runs before the classifier sees the data.
    steps = [
        ('sampling', SMOTE(random_state=42)),
        ('model', LogisticRegression(**model_params)),
    ]
    pipeline = ImbPipeline(steps)

    print("\nTraining Logistic Regression...")
    pipeline.fit(X_train, y_train)

    model_path = 'glaucoma_model_Logistic_Regression.joblib'
    joblib.dump(pipeline, model_path)
    print(f"Model saved to {model_path}")

    return pipeline

def evaluate(model, X, y):
    """Score ``model`` on ``(X, y)``; return accuracy, precision, recall, F1, AUC.

    With ``predict_proba`` available, the positive-class probability feeds
    the AUC, and predictions are re-derived from it at 0.5 only if some
    predicted value lies outside [0, 1]. Without it, the raw predictions
    serve as scores and are thresholded at 0.5 to obtain labels.
    """
    y_pred = model.predict(X)

    if not hasattr(model, 'predict_proba'):
        y_prob = y_pred
        y_pred = (y_pred >= 0.5).astype(int)
    else:
        y_prob = model.predict_proba(X)[:, 1]
        outside_unit_range = (y_pred > 1) | (y_pred < 0)
        if np.any(outside_unit_range):
            y_pred = (y_prob >= 0.5).astype(int)

    # Label-based metrics share one call shape; AUC uses the scores instead.
    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    metrics = {key: scorer(y, y_pred) for key, scorer in label_scorers}
    metrics['auc'] = roc_auc_score(y, y_prob)

    return metrics

def predict(model, X):
    """Return ``model``'s predictions for ``X``."""
    predictions = model.predict(X)
    return predictions

def predict_proba(model, X):
    """Return class probabilities for ``X``, or plain predictions if the
    model has no ``predict_proba``."""
    if not hasattr(model, 'predict_proba'):
        return model.predict(X)
    return model.predict_proba(X)

def load_model(filepath='glaucoma_model_Logistic_Regression.joblib'):
    """Deserialize and return the object stored at ``filepath`` via joblib."""
    loaded = joblib.load(filepath)
    return loaded

def main():
    """Train and evaluate Logistic Regression on the numeric CSV columns."""
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    import pandas as pd

    # Simplified preprocessing: keep only numeric columns and zero-fill gaps.
    # A real application would reuse the preprocessing from train.py.
    frame = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
    numeric_only = frame.drop('Diagnosis', axis=1).select_dtypes(include=['number'])
    X = numeric_only.fillna(0)
    y = (frame['Diagnosis'] == 'Glaucoma').astype(int)

    # Stratified 80/20 split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Standardise using training-split statistics only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    fitted = train(X_train_scaled, y_train)

    scores = evaluate(fitted, X_val_scaled, y_val)
    print("\nModel Performance:")
    for metric_name in scores:
        print(f"{metric_name}: {scores[metric_name]:.4f}")

if __name__ == "__main__":
    main()



In [None]:
### decision tree:

from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import joblib
import numpy as np
import json

def load_best_params():
    """Return tuned Decision Tree hyperparameters, or ``{}`` if none exist.

    Looks in 'fast_hyperopt_results.json' first, then in
    'hyperparameter_optimization_results.json', and returns the first
    non-empty 'Decision_Tree' entry under 'best_parameters'.

    A file that exists but has no entry for this model no longer hides an
    entry in the fallback file, and a missing 'best_parameters' key is
    treated as "no parameters" instead of raising ``KeyError``.
    """
    candidate_files = (
        'fast_hyperopt_results.json',
        'hyperparameter_optimization_results.json',
    )
    for path in candidate_files:
        try:
            with open(path, 'r') as f:
                payload = json.load(f)
        except FileNotFoundError:
            continue  # try the next candidate
        params = payload.get('best_parameters', {}).get('Decision_Tree')
        if params:
            return params
    return {}

def train(X_train, y_train):
    """Fit a SMOTE + DecisionTreeClassifier pipeline and save it to disk.

    Tuned hyperparameters from ``load_best_params`` override the default
    ``random_state=42``. The fitted pipeline is written to
    'glaucoma_model_Decision_Tree.joblib' and returned.
    """
    tuned = load_best_params()
    model_params = {'random_state': 42}
    model_params.update(tuned)  # tuned values win over the defaults

    # SMOTE oversampling runs before the classifier sees the data.
    steps = [
        ('sampling', SMOTE(random_state=42)),
        ('model', DecisionTreeClassifier(**model_params)),
    ]
    pipeline = ImbPipeline(steps)

    print("\nTraining Decision Tree...")
    pipeline.fit(X_train, y_train)

    model_path = 'glaucoma_model_Decision_Tree.joblib'
    joblib.dump(pipeline, model_path)
    print(f"Model saved to {model_path}")

    return pipeline

def evaluate(model, X, y):
    """Score ``model`` on ``(X, y)``; return accuracy, precision, recall, F1, AUC.

    With ``predict_proba`` available, the positive-class probability feeds
    the AUC, and predictions are re-derived from it at 0.5 only if some
    predicted value lies outside [0, 1]. Without it, the raw predictions
    serve as scores and are thresholded at 0.5 to obtain labels.
    """
    y_pred = model.predict(X)

    if not hasattr(model, 'predict_proba'):
        y_prob = y_pred
        y_pred = (y_pred >= 0.5).astype(int)
    else:
        y_prob = model.predict_proba(X)[:, 1]
        outside_unit_range = (y_pred > 1) | (y_pred < 0)
        if np.any(outside_unit_range):
            y_pred = (y_prob >= 0.5).astype(int)

    # Label-based metrics share one call shape; AUC uses the scores instead.
    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    metrics = {key: scorer(y, y_pred) for key, scorer in label_scorers}
    metrics['auc'] = roc_auc_score(y, y_prob)

    return metrics

def predict(model, X):
    """Return ``model``'s predictions for ``X``."""
    predictions = model.predict(X)
    return predictions

def predict_proba(model, X):
    """Return class probabilities for ``X``, or plain predictions if the
    model has no ``predict_proba``."""
    if not hasattr(model, 'predict_proba'):
        return model.predict(X)
    return model.predict_proba(X)

def load_model(filepath='glaucoma_model_Decision_Tree.joblib'):
    """Deserialize and return the object stored at ``filepath`` via joblib."""
    loaded = joblib.load(filepath)
    return loaded

def main():
    """Train and evaluate the Decision Tree on the numeric CSV columns."""
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    import pandas as pd

    # Simplified preprocessing: keep only numeric columns and zero-fill gaps.
    # A real application would reuse the preprocessing from train.py.
    frame = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
    numeric_only = frame.drop('Diagnosis', axis=1).select_dtypes(include=['number'])
    X = numeric_only.fillna(0)
    y = (frame['Diagnosis'] == 'Glaucoma').astype(int)

    # Stratified 80/20 split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Standardise using training-split statistics only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    fitted = train(X_train_scaled, y_train)

    scores = evaluate(fitted, X_val_scaled, y_val)
    print("\nModel Performance:")
    for metric_name in scores:
        print(f"{metric_name}: {scores[metric_name]:.4f}")

if __name__ == "__main__":
    main()


In [None]:
### random forest model:



In [None]:
### XAI for Random Forest (second best)
import numpy as np
import pandas as pd
import joblib
import shap
import lime.lime_tabular
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# -------------------------------
# Load Data and Preprocess
# -------------------------------
# Explainability report for the saved Random Forest pipeline: impurity
# importances, partial dependence, permutation importance, LIME and SHAP.
data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
X = data.drop('Diagnosis', axis=1).select_dtypes(include=['number']).fillna(0)
y = (data['Diagnosis'] == 'Glaucoma').astype(int)

# Convert DataFrame to NumPy array for model compatibility
X_numpy = X.values

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X_numpy, y, test_size=0.2, random_state=42, stratify=y)

# Scale Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Load Pretrained Random Forest Model
# NOTE: joblib files are pickles; only load artifacts from a trusted source.
rf_model = joblib.load("glaucoma_model_Random_Forest.joblib")

# Select a Sample for Explanation
sample_idx = 10  # Index into the test split
# The sample must come from the scaled matrix: the LIME explainer is built
# on X_train_scaled and every other call here feeds the model scaled
# features. Taking it from the raw X_test would explain a point on a
# different scale.
X_sample = X_test_scaled[sample_idx].reshape(1, -1)

# -------------------------------
# 1. Feature Importance (Random Forest)
# -------------------------------
feature_importance = pd.Series(rf_model.named_steps['model'].feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 5))
sns.barplot(x=feature_importance[:10], y=feature_importance.index[:10], palette="coolwarm")
plt.xlabel("Feature Importance Score")
plt.title("Top 10 Feature Importance (Random Forest)")
plt.show()

# -------------------------------
# 2. Partial Dependence Plot (PDP)
# -------------------------------
# Use only features present in the dataset
valid_features = [f for f in feature_importance.index if f in X.columns][:2]  # Two most important features

if valid_features:
    # from_estimator already draws the plot; calling display.plot() again
    # would render the same curves a second time in a new figure.
    display = PartialDependenceDisplay.from_estimator(rf_model.named_steps['model'], X_train_scaled, features=[X.columns.get_loc(f) for f in valid_features], grid_resolution=50)
    display.figure_.suptitle("Partial Dependence Plots (Top 2 Features)")
    plt.show()
else:
    print("No valid features found for Partial Dependence Plot.")

# -------------------------------
# 3. Permutation Importance
# -------------------------------
perm_importance = permutation_importance(rf_model, X_test_scaled, y_test, scoring='accuracy', n_repeats=10, random_state=42)
perm_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': perm_importance.importances_mean})
perm_importance_df = perm_importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 5))
sns.barplot(x=perm_importance_df["Importance"][:10], y=perm_importance_df["Feature"][:10], palette="coolwarm")
plt.xlabel("Permutation Importance Score")
plt.title("Top 10 Permutation Importance (Random Forest)")
plt.show()

# -------------------------------
# 4. LIME Explanation (for one sample)
# -------------------------------
explainer = lime.lime_tabular.LimeTabularExplainer(X_train_scaled, feature_names=list(X.columns), class_names=['No Glaucoma', 'Glaucoma'], discretize_continuous=True)
exp = explainer.explain_instance(X_sample[0], rf_model.predict_proba, num_features=5)
exp.show_in_notebook()

# -------------------------------
# 5. SHAP Explanation (Limited Visualizations)
# -------------------------------
explainer = shap.Explainer(rf_model.predict_proba, X_train_scaled)
shap_values = explainer(X_test_scaled)

# Summary Plot (Top 10 features only)
shap.summary_plot(shap_values[..., 1], X_test_scaled, feature_names=X.columns, max_display=10)  # Class 1 (Glaucoma)


In [None]:
### svm model:
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import joblib
import numpy as np
import json

def load_best_params():
    """Return tuned SVM hyperparameters, or ``{}`` if none are available.

    Looks in 'fast_hyperopt_results.json' first, then in
    'hyperparameter_optimization_results.json', and returns the first
    non-empty 'SVM' entry under 'best_parameters'.

    A file that exists but has no entry for this model no longer hides an
    entry in the fallback file, and a missing 'best_parameters' key is
    treated as "no parameters" instead of raising ``KeyError``.
    """
    candidate_files = (
        'fast_hyperopt_results.json',
        'hyperparameter_optimization_results.json',
    )
    for path in candidate_files:
        try:
            with open(path, 'r') as f:
                payload = json.load(f)
        except FileNotFoundError:
            continue  # try the next candidate
        params = payload.get('best_parameters', {}).get('SVM')
        if params:
            return params
    return {}

def train(X_train, y_train):
    """Fit a SMOTE + SVC pipeline and save it to disk.

    Tuned hyperparameters from ``load_best_params`` override the defaults
    ``random_state=42`` and ``probability=True``. The fitted pipeline is
    written to 'glaucoma_model_SVM.joblib' and returned.
    """
    tuned = load_best_params()
    model_params = {'random_state': 42, 'probability': True}
    model_params.update(tuned)  # tuned values win over the defaults

    # SMOTE oversampling runs before the classifier sees the data.
    steps = [
        ('sampling', SMOTE(random_state=42)),
        ('model', SVC(**model_params)),
    ]
    pipeline = ImbPipeline(steps)

    print("\nTraining SVM...")
    pipeline.fit(X_train, y_train)

    model_path = 'glaucoma_model_SVM.joblib'
    joblib.dump(pipeline, model_path)
    print(f"Model saved to {model_path}")

    return pipeline

def evaluate(model, X, y):
    """Score ``model`` on ``(X, y)``; return accuracy, precision, recall, F1, AUC.

    With ``predict_proba`` available, the positive-class probability feeds
    the AUC, and predictions are re-derived from it at 0.5 only if some
    predicted value lies outside [0, 1]. Without it, the raw predictions
    serve as scores and are thresholded at 0.5 to obtain labels.
    """
    y_pred = model.predict(X)

    if not hasattr(model, 'predict_proba'):
        y_prob = y_pred
        y_pred = (y_pred >= 0.5).astype(int)
    else:
        y_prob = model.predict_proba(X)[:, 1]
        outside_unit_range = (y_pred > 1) | (y_pred < 0)
        if np.any(outside_unit_range):
            y_pred = (y_prob >= 0.5).astype(int)

    # Label-based metrics share one call shape; AUC uses the scores instead.
    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    metrics = {key: scorer(y, y_pred) for key, scorer in label_scorers}
    metrics['auc'] = roc_auc_score(y, y_prob)

    return metrics

def predict(model, X):
    """Return ``model``'s predictions for ``X``."""
    predictions = model.predict(X)
    return predictions

def predict_proba(model, X):
    """Return class probabilities for ``X``, or plain predictions if the
    model has no ``predict_proba``."""
    if not hasattr(model, 'predict_proba'):
        return model.predict(X)
    return model.predict_proba(X)

def load_model(filepath='glaucoma_model_SVM.joblib'):
    """Deserialize and return the object stored at ``filepath`` via joblib."""
    loaded = joblib.load(filepath)
    return loaded

def main():
    """Train the SVM pipeline on the glaucoma CSV and print validation metrics.

    Uses only the numeric columns (missing values filled with 0), an 80/20
    stratified split, and a ``StandardScaler`` fitted on the training part.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    import pandas as pd

    # Load and preprocess data (simplified example)
    data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
    features = data.drop('Diagnosis', axis=1)
    features = features.select_dtypes(include=['number']).fillna(0)
    target = (data['Diagnosis'] == 'Glaucoma').astype(int)

    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    # Fit the scaler on the training split only, then reuse it for validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = train(X_train_scaled, y_train)

    metrics = evaluate(model, X_val_scaled, y_val)
    print("\nModel Performance:")
    for metric in metrics:
        value = metrics[metric]
        print(f"{metric}: {value:.4f}")


if __name__ == "__main__":
    main()


In [None]:
###XAI on SVM (second best)
import numpy as np
import pandas as pd
import joblib
import shap
import lime.lime_tabular
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# -------------------------------
# 🔹 Load Data and Preprocess
# -------------------------------
data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
X = data.drop('Diagnosis', axis=1).select_dtypes(include=['number']).fillna(0)
y = (data['Diagnosis'] == 'Glaucoma').astype(int)

# Convert DataFrame to NumPy array for model compatibility
X_numpy = X.values

# Split Data (same test_size / random_state / stratify as the SVM training cell)
X_train, X_test, y_train, y_test = train_test_split(X_numpy, y, test_size=0.2, random_state=42, stratify=y)

# Scale Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Load Pretrained SVM Model (the SMOTE + SVC pipeline saved by the training cell)
svm_model = joblib.load("glaucoma_model_SVM.joblib")

# Select a Sample for Explanation
sample_idx = 10  # Fixed test-set row used for the local (LIME) explanation
# The model was fitted on standardized features and the LIME explainer below
# is built from X_train_scaled, so the explained row must be taken from the
# scaled test matrix rather than the raw one.
X_sample = X_test_scaled[sample_idx].reshape(1, -1)  # Single instance as a (1, n_features) array

# -------------------------------
# 🔹 1. Feature Importance via Permutation Importance (SVM has no built-in feature importance)
# -------------------------------
perm_importance = permutation_importance(svm_model, X_test_scaled, y_test, scoring='accuracy', n_repeats=10, random_state=42)
perm_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': perm_importance.importances_mean})
perm_importance_df = perm_importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 5))
sns.barplot(x=perm_importance_df["Importance"][:10], y=perm_importance_df["Feature"][:10], palette="coolwarm")
plt.xlabel("Permutation Importance Score")
plt.title("Top 10 Feature Importance (SVM)")
plt.show()

# -------------------------------
# 🔹 2. Partial Dependence Plot (PDP) - FIXED
# -------------------------------
# Use only features present in the dataset
valid_features = perm_importance_df["Feature"][:2].tolist()  # Pick first 2 valid features

if valid_features:
    # from_estimator() already draws the plot; calling display.plot() again
    # would open a second, duplicate figure.
    display = PartialDependenceDisplay.from_estimator(svm_model, X_train_scaled, features=[X.columns.get_loc(f) for f in valid_features], grid_resolution=50)
    plt.suptitle("Partial Dependence Plots (Top 2 Features)")
    plt.show()
else:
    print("No valid features found for Partial Dependence Plot.")

# -------------------------------
# 🔹 3. Permutation Importance (Direct Interpretation)
# -------------------------------
plt.figure(figsize=(10, 5))
sns.barplot(x=perm_importance_df["Importance"][:10], y=perm_importance_df["Feature"][:10], palette="coolwarm")
plt.xlabel("Permutation Importance Score")
plt.title("Top 10 Permutation Importance (SVM Model)")
plt.show()

# -------------------------------
# 🔹 4. LIME Explanation (for one sample)
# -------------------------------
explainer = lime.lime_tabular.LimeTabularExplainer(X_train_scaled, feature_names=X.columns, class_names=['No Glaucoma', 'Glaucoma'], discretize_continuous=True)
# LIME's classification mode needs per-class probabilities (2-D output);
# decision_function returns 1-D margins, which LIME rejects.  The training
# cell defaults the SVC to probability=True, which is what makes
# predict_proba available here.
exp = explainer.explain_instance(X_sample[0], svm_model.predict_proba, num_features=5)
exp.show_in_notebook()

# -------------------------------
# 🔹 5. SHAP Explanation (Limited Visualizations)
# -------------------------------
# SHAP explains the signed margin here, so values are in decision-function units.
explainer = shap.Explainer(svm_model.decision_function, X_train_scaled)
shap_values = explainer(X_test_scaled)

# Summary Plot (Top 10 features only)
shap.summary_plot(shap_values, X_test_scaled, feature_names=X.columns, max_display=10)


In [None]:
### knn model:

from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import joblib
import numpy as np
import json

def load_best_params():
    """Return tuned KNN hyperparameters from the first results file found.

    ``fast_hyperopt_results.json`` is tried first, then
    ``hyperparameter_optimization_results.json``.  The result is the ``'KNN'``
    entry under ``'best_parameters'`` (``{}`` when that entry is absent), or
    ``{}`` when neither file exists.
    """
    candidates = (
        'fast_hyperopt_results.json',
        'hyperparameter_optimization_results.json',
    )
    for path in candidates:
        try:
            with open(path, 'r') as f:
                return json.load(f)['best_parameters'].get('KNN', {})
        except FileNotFoundError:
            # Missing file: move on to the next candidate.
            continue
    return {}

def train(X_train, y_train):
    """Fit a SMOTE + k-nearest-neighbours pipeline and save it to disk.

    Values from ``load_best_params()`` override the default
    ``n_neighbors=5``.  The fitted pipeline is dumped to
    ``glaucoma_model_KNN.joblib`` and returned.
    """
    # Tuned values are applied after the defaults so they take precedence.
    knn_kwargs = {'n_neighbors': 5}
    knn_kwargs.update(load_best_params())

    sampler = SMOTE(random_state=42)
    classifier = KNeighborsClassifier(**knn_kwargs)
    pipeline = ImbPipeline([('sampling', sampler), ('model', classifier)])

    print("\nTraining KNN...")
    pipeline.fit(X_train, y_train)

    # Persist the whole pipeline, not just the final estimator.
    model_path = 'glaucoma_model_KNN.joblib'
    joblib.dump(pipeline, model_path)
    print(f"Model saved to {model_path}")

    return pipeline

def evaluate(model, X, y):
    """Score a fitted binary classifier on ``(X, y)``.

    Returns a dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``
    and ``auc``.
    """
    labels = model.predict(X)

    if not hasattr(model, 'predict_proba'):
        # No probability output: the raw predictions double as the AUC
        # scores and are thresholded at 0.5 to obtain hard labels.
        scores = labels
        labels = (labels >= 0.5).astype(int)
    else:
        scores = model.predict_proba(X)[:, 1]
        # Predictions outside [0, 1] are replaced by thresholded probabilities.
        out_of_range = (labels > 1) | (labels < 0)
        if np.any(out_of_range):
            labels = (scores >= 0.5).astype(int)

    return {
        'accuracy': accuracy_score(y, labels),
        'precision': precision_score(y, labels),
        'recall': recall_score(y, labels),
        'f1': f1_score(y, labels),
        'auc': roc_auc_score(y, scores),
    }

def predict(model, X):
    """Delegate to ``model.predict`` and return its result for ``X``."""
    predicted = model.predict(X)
    return predicted

def predict_proba(model, X):
    """Return class probabilities for ``X`` when the model provides them.

    Falls back to ``model.predict(X)`` when ``predict_proba`` is missing.
    """
    if not hasattr(model, 'predict_proba'):
        return model.predict(X)
    probabilities = model.predict_proba(X)
    return probabilities

def load_model(filepath='glaucoma_model_KNN.joblib'):
    """Load and return the object stored at ``filepath`` via ``joblib.load``."""
    loaded = joblib.load(filepath)
    return loaded

def main():
    """Train the KNN pipeline on the glaucoma CSV and print validation metrics.

    Uses only the numeric columns (missing values filled with 0), an 80/20
    stratified split, and a ``StandardScaler`` fitted on the training part.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    import pandas as pd

    # Load and preprocess data (simplified example)
    data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
    numeric = data.drop('Diagnosis', axis=1).select_dtypes(include=['number'])
    X = numeric.fillna(0)
    y = (data['Diagnosis'] == 'Glaucoma').astype(int)

    split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    X_train, X_val, y_train, y_val = split

    # Fit the scaler on the training split only, then reuse it for validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = train(X_train_scaled, y_train)

    metrics = evaluate(model, X_val_scaled, y_val)
    print("\nModel Performance:")
    for metric in metrics:
        value = metrics[metric]
        print(f"{metric}: {value:.4f}")


if __name__ == "__main__":
    main()


In [None]:
# Load the pipeline from the working directory, which is where train() writes
# 'glaucoma_model_KNN.joblib'. The previous hard-coded '/content/...' prefix
# only resolves on Colab and fails where that directory does not exist.
knn_model = joblib.load("glaucoma_model_KNN.joblib")


In [None]:
### XAI on knn model(best performing model)
import numpy as np
import pandas as pd
import joblib
import shap

import lime.lime_tabular
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# -------------------------------
# 🔹 Load Data and Preprocess
# -------------------------------
data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
X = data.drop('Diagnosis', axis=1).select_dtypes(include=['number']).fillna(0)
y = (data['Diagnosis'] == 'Glaucoma').astype(int)

# Convert DataFrame to NumPy array for KNN model
X_numpy = X.values

# Split Data (same test_size / random_state / stratify as the KNN training cell)
X_train, X_test, y_train, y_test = train_test_split(X_numpy, y, test_size=0.2, random_state=42, stratify=y)

# Scale Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Load Pretrained KNN Model (the SMOTE + KNN pipeline saved by the training cell)
knn_model = joblib.load("glaucoma_model_KNN.joblib")

# Train a Random Forest for Feature Importance (since KNN lacks built-in feature importance)
# NOTE: sections 1 and 2 therefore describe this surrogate forest, not the KNN model.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Select a Sample for Explanation
sample_idx = 10  # Fixed test-set row used for the local (LIME) explanation
# The model was fitted on standardized features and the LIME explainer below
# is built from X_train_scaled, so the explained row must be taken from the
# scaled test matrix rather than the raw one.
X_sample = X_test_scaled[sample_idx].reshape(1, -1)  # Single instance as a (1, n_features) array

# -------------------------------
# 🔹 1. Feature Importance (Random Forest)
# -------------------------------
feature_importance = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 5))
sns.barplot(x=feature_importance[:10], y=feature_importance.index[:10], palette="coolwarm")
plt.xlabel("Feature Importance Score")
plt.title("Top 10 Feature Importance (Random Forest)")
plt.show()

# -------------------------------
# 🔹 2. Partial Dependence Plot (PDP) - FIXED
# -------------------------------
# Use only features present in the scaled dataset
valid_features = [f for f in feature_importance.index if f in X.columns][:2]  # Pick the first 2 valid features

if valid_features:
    # from_estimator() already draws the plot; calling display.plot() again
    # would open a second, duplicate figure.
    display = PartialDependenceDisplay.from_estimator(rf_model, X_train_scaled, features=[X.columns.get_loc(f) for f in valid_features], grid_resolution=50)
    plt.suptitle("Partial Dependence Plots (Top 2 Features)")
    plt.show()
else:
    print("No valid features found for Partial Dependence Plot.")

# -------------------------------
# 🔹 3. Permutation Importance
# -------------------------------
perm_importance = permutation_importance(knn_model, X_test_scaled, y_test, scoring='accuracy', n_repeats=10, random_state=42)
perm_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': perm_importance.importances_mean})
perm_importance_df = perm_importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 5))
sns.barplot(x=perm_importance_df["Importance"][:10], y=perm_importance_df["Feature"][:10], palette="coolwarm")
plt.xlabel("Permutation Importance Score")
plt.title("Top 10 Permutation Importance (KNN Model)")
plt.show()

# -------------------------------
# 🔹 4. LIME Explanation (for one sample)
# -------------------------------
explainer = lime.lime_tabular.LimeTabularExplainer(X_train_scaled, feature_names=X.columns, class_names=['No Glaucoma', 'Glaucoma'], discretize_continuous=True)
exp = explainer.explain_instance(X_sample[0], knn_model.predict_proba, num_features=5)
exp.show_in_notebook()

# -------------------------------
# 🔹 5. SHAP Explanation (Limited Visualizations)
# -------------------------------
explainer = shap.Explainer(knn_model.predict_proba, X_train_scaled)
shap_values = explainer(X_test_scaled)

# Summary Plot (Top 10 features only)
shap.summary_plot(shap_values[..., 1], X_test_scaled, feature_names=X.columns, max_display=10)  # Class 1 (Glaucoma)

In [None]:
### gbm model:

from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import joblib
import numpy as np
import json

def load_best_params():
    """Return tuned GBM hyperparameters from the first results file found.

    Looks in ``fast_hyperopt_results.json`` and then in
    ``hyperparameter_optimization_results.json``.  The result is the ``'GBM'``
    entry under ``'best_parameters'`` (``{}`` when that entry is absent), or
    ``{}`` when neither file exists.
    """
    for results_file in ('fast_hyperopt_results.json',
                         'hyperparameter_optimization_results.json'):
        try:
            with open(results_file, 'r') as f:
                results = json.load(f)
                return results['best_parameters'].get('GBM', {})
        except FileNotFoundError:
            # Missing file: move on to the next candidate.
            pass
    return {}

def train(X_train, y_train):
    """Fit a SMOTE + gradient-boosting pipeline and save it to disk.

    Values from ``load_best_params()`` override the default
    ``random_state=42``.  The fitted pipeline is dumped to
    ``glaucoma_model_GBM.joblib`` and returned.
    """
    # Tuned values are applied after the defaults so they take precedence.
    gbm_kwargs = dict(random_state=42)
    gbm_kwargs.update(load_best_params())

    pipeline = ImbPipeline(
        [
            ('sampling', SMOTE(random_state=42)),
            ('model', GradientBoostingClassifier(**gbm_kwargs)),
        ]
    )

    print("\nTraining Gradient Boosting...")
    pipeline.fit(X_train, y_train)

    # Persist the whole pipeline, not just the final estimator.
    model_path = 'glaucoma_model_GBM.joblib'
    joblib.dump(pipeline, model_path)
    print(f"Model saved to {model_path}")

    return pipeline

def evaluate(model, X, y):
    """Return accuracy, precision, recall, F1 and AUC for a fitted model.

    The result is a dict keyed by ``accuracy``, ``precision``, ``recall``,
    ``f1`` and ``auc``.
    """
    hard = model.predict(X)
    has_proba = hasattr(model, 'predict_proba')

    if has_proba:
        soft = model.predict_proba(X)[:, 1]
        # Predictions outside [0, 1] are replaced by thresholded probabilities.
        if np.any((hard > 1) | (hard < 0)):
            hard = (soft >= 0.5).astype(int)
    else:
        # Raw predictions serve as scores; hard labels come from a 0.5 cut.
        soft = hard
        hard = (hard >= 0.5).astype(int)

    metrics = {}
    metrics['accuracy'] = accuracy_score(y, hard)
    metrics['precision'] = precision_score(y, hard)
    metrics['recall'] = recall_score(y, hard)
    metrics['f1'] = f1_score(y, hard)
    metrics['auc'] = roc_auc_score(y, soft)
    return metrics

def predict(model, X):
    """Return the predictions ``model.predict`` produces for ``X``."""
    result = model.predict(X)
    return result

def predict_proba(model, X):
    """Return ``model.predict_proba(X)``; use ``model.predict(X)`` if unavailable."""
    supports_proba = hasattr(model, 'predict_proba')
    return model.predict_proba(X) if supports_proba else model.predict(X)

def load_model(filepath='glaucoma_model_GBM.joblib'):
    """Load and return the object stored at ``filepath`` via ``joblib.load``."""
    restored = joblib.load(filepath)
    return restored

def main():
    """Train the gradient-boosting pipeline and print validation metrics.

    Reads the glaucoma CSV, keeps the numeric columns (missing values filled
    with 0), makes an 80/20 stratified split and standardizes with a scaler
    fitted on the training part.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    import pandas as pd

    # Load and preprocess data (simplified example)
    data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
    without_target = data.drop('Diagnosis', axis=1)
    X = without_target.select_dtypes(include=['number']).fillna(0)
    is_glaucoma = data['Diagnosis'] == 'Glaucoma'
    y = is_glaucoma.astype(int)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the scaler on the training split only, then reuse it for validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = train(X_train_scaled, y_train)

    metrics = evaluate(model, X_val_scaled, y_val)
    print("\nModel Performance:")
    for metric in metrics:
        value = metrics[metric]
        print(f"{metric}: {value:.4f}")


if __name__ == "__main__":
    main()


In [None]:
###XAI on GBM (worst performing model)
import numpy as np
import pandas as pd
import joblib
import shap
import lime.lime_tabular
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# -------------------------------
# 🔹 Load Data and Preprocess
# -------------------------------
data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
X = data.drop('Diagnosis', axis=1).select_dtypes(include=['number']).fillna(0)
y = (data['Diagnosis'] == 'Glaucoma').astype(int)

# Convert DataFrame to NumPy array for model compatibility
X_numpy = X.values

# Split Data (same test_size / random_state / stratify as the GBM training cell)
X_train, X_test, y_train, y_test = train_test_split(X_numpy, y, test_size=0.2, random_state=42, stratify=y)

# Scale Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Load Pretrained GBM Model (the SMOTE + GBM pipeline saved by the training cell)
gbm_model = joblib.load("glaucoma_model_GBM.joblib")

# Select a Sample for Explanation
sample_idx = 10  # Fixed test-set row used for the local (LIME) explanation
# The model was fitted on standardized features and the LIME explainer below
# is built from X_train_scaled, so the explained row must be taken from the
# scaled test matrix rather than the raw one.
X_sample = X_test_scaled[sample_idx].reshape(1, -1)  # Single instance as a (1, n_features) array

# -------------------------------
# 🔹 1. Feature Importance (GBM)
# -------------------------------
# Importances live on the final estimator, reached through the pipeline's 'model' step.
feature_importance = pd.Series(gbm_model.named_steps['model'].feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 5))
sns.barplot(x=feature_importance[:10], y=feature_importance.index[:10], palette="coolwarm")
plt.xlabel("Feature Importance Score")
plt.title("Top 10 Feature Importance (GBM)")
plt.show()

# -------------------------------
# 🔹 2. Partial Dependence Plot (PDP) - FIXED
# -------------------------------
# Use only features present in the dataset
valid_features = [f for f in feature_importance.index if f in X.columns][:2]  # Pick first 2 valid features

if valid_features:
    # from_estimator() already draws the plot; calling display.plot() again
    # would open a second, duplicate figure.
    display = PartialDependenceDisplay.from_estimator(gbm_model.named_steps['model'], X_train_scaled, features=[X.columns.get_loc(f) for f in valid_features], grid_resolution=50)
    plt.suptitle("Partial Dependence Plots (Top 2 Features)")
    plt.show()
else:
    print("No valid features found for Partial Dependence Plot.")

# -------------------------------
# 🔹 3. Permutation Importance
# -------------------------------
perm_importance = permutation_importance(gbm_model, X_test_scaled, y_test, scoring='accuracy', n_repeats=10, random_state=42)
perm_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': perm_importance.importances_mean})
perm_importance_df = perm_importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 5))
sns.barplot(x=perm_importance_df["Importance"][:10], y=perm_importance_df["Feature"][:10], palette="coolwarm")
plt.xlabel("Permutation Importance Score")
plt.title("Top 10 Permutation Importance (GBM)")
plt.show()

# -------------------------------
# 🔹 4. LIME Explanation (for one sample)
# -------------------------------
explainer = lime.lime_tabular.LimeTabularExplainer(X_train_scaled, feature_names=X.columns, class_names=['No Glaucoma', 'Glaucoma'], discretize_continuous=True)
exp = explainer.explain_instance(X_sample[0], gbm_model.predict_proba, num_features=5)
exp.show_in_notebook()

# -------------------------------
# 🔹 5. SHAP Explanation (Limited Visualizations)
# -------------------------------
explainer = shap.Explainer(gbm_model.predict_proba, X_train_scaled)
shap_values = explainer(X_test_scaled)

# Summary Plot (Top 10 features only)
shap.summary_plot(shap_values[..., 1], X_test_scaled, feature_names=X.columns, max_display=10)  # Class 1 (Glaucoma)


In [None]:
### xgboost model:

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import joblib
import numpy as np
import json

def load_best_params():
    """Return tuned XGBoost hyperparameters from the first results file found.

    ``fast_hyperopt_results.json`` has priority over
    ``hyperparameter_optimization_results.json``.  The result is the
    ``'XGBoost'`` entry under ``'best_parameters'`` (``{}`` when that entry is
    absent), or ``{}`` when neither file exists.
    """
    search_order = [
        'fast_hyperopt_results.json',
        'hyperparameter_optimization_results.json',
    ]
    for filename in search_order:
        try:
            with open(filename, 'r') as f:
                best = json.load(f)['best_parameters']
                return best.get('XGBoost', {})
        except FileNotFoundError:
            # Missing file: move on to the next candidate.
            continue
    return {}

def train(X_train, y_train):
    """Fit a SMOTE + XGBoost pipeline and save it to disk.

    Values from ``load_best_params()`` override the defaults set below.  The
    fitted pipeline is dumped to ``glaucoma_model_XGBoost.joblib`` and
    returned.
    """
    # Tuned values are applied after the defaults so they take precedence.
    xgb_kwargs = dict(
        random_state=42,
        objective='binary:logistic',
        use_label_encoder=False,
        eval_metric='logloss',
    )
    xgb_kwargs.update(load_best_params())

    sampler = SMOTE(random_state=42)
    booster = XGBClassifier(**xgb_kwargs)
    pipeline = ImbPipeline([('sampling', sampler), ('model', booster)])

    print("\nTraining XGBoost...")
    pipeline.fit(X_train, y_train)

    # Persist the whole pipeline, not just the final estimator.
    model_path = 'glaucoma_model_XGBoost.joblib'
    joblib.dump(pipeline, model_path)
    print(f"Model saved to {model_path}")

    return pipeline

def evaluate(model, X, y):
    """Score a fitted binary classifier on ``(X, y)``.

    Returns a dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``
    and ``auc``.
    """
    predicted = model.predict(X)

    if hasattr(model, 'predict_proba'):
        positive_prob = model.predict_proba(X)[:, 1]
        # Predictions outside [0, 1] are replaced by thresholded probabilities.
        invalid = (predicted > 1) | (predicted < 0)
        if np.any(invalid):
            predicted = (positive_prob >= 0.5).astype(int)
    else:
        # Raw predictions serve as scores; hard labels come from a 0.5 cut.
        positive_prob = predicted
        predicted = (predicted >= 0.5).astype(int)

    accuracy = accuracy_score(y, predicted)
    precision = precision_score(y, predicted)
    recall = recall_score(y, predicted)
    f1 = f1_score(y, predicted)
    auc = roc_auc_score(y, positive_prob)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc,
    }

def predict(model, X):
    """Call ``model.predict`` on ``X`` and return whatever it yields."""
    output = model.predict(X)
    return output

def predict_proba(model, X):
    """Return ``model.predict_proba(X)``, or ``model.predict(X)`` as a fallback.

    The fallback is used when the model does not expose ``predict_proba``.
    """
    chosen = 'predict_proba' if hasattr(model, 'predict_proba') else 'predict'
    scorer = getattr(model, chosen)
    return scorer(X)

def load_model(filepath='glaucoma_model_XGBoost.joblib'):
    """Load and return the object stored at ``filepath`` via ``joblib.load``."""
    pipeline = joblib.load(filepath)
    return pipeline

def main():
    """Train the XGBoost pipeline on the glaucoma CSV and print validation metrics.

    Uses only the numeric columns (missing values filled with 0), an 80/20
    stratified split, and a ``StandardScaler`` fitted on the training part.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    import pandas as pd

    # Load and preprocess data (simplified example)
    data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
    features = data.drop('Diagnosis', axis=1)
    features = features.select_dtypes(include=['number']).fillna(0)
    target = (data['Diagnosis'] == 'Glaucoma').astype(int)

    parts = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )
    X_train, X_val, y_train, y_val = parts

    # Fit the scaler on the training split only, then reuse it for validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = train(X_train_scaled, y_train)

    metrics = evaluate(model, X_val_scaled, y_val)
    print("\nModel Performance:")
    for metric in metrics:
        value = metrics[metric]
        print(f"{metric}: {value:.4f}")


if __name__ == "__main__":
    main()


In [None]:
### XAI on XGBoost model (second worst performing model)
import numpy as np
import pandas as pd
import joblib
import shap
import lime.lime_tabular
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# -------------------------------
# 🔹 Load Data and Preprocess
# -------------------------------
data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
X = data.drop('Diagnosis', axis=1).select_dtypes(include=['number']).fillna(0)
y = (data['Diagnosis'] == 'Glaucoma').astype(int)

# Convert DataFrame to NumPy array for model compatibility
X_numpy = X.values

# Split Data (same test_size / random_state / stratify as the XGBoost training cell)
X_train, X_test, y_train, y_test = train_test_split(X_numpy, y, test_size=0.2, random_state=42, stratify=y)

# Scale Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Load Pretrained XGBoost Model (the SMOTE + XGBoost pipeline saved by the training cell)
xgb_model = joblib.load("glaucoma_model_XGBoost.joblib")

# Select a Sample for Explanation
sample_idx = 10  # Fixed test-set row used for the local (LIME) explanation
# The model was fitted on standardized features and the LIME explainer below
# is built from X_train_scaled, so the explained row must be taken from the
# scaled test matrix rather than the raw one.
X_sample = X_test_scaled[sample_idx].reshape(1, -1)  # Single instance as a (1, n_features) array

# -------------------------------
# 🔹 1. Feature Importance (XGBoost)
# -------------------------------
# Importances live on the final estimator, reached through the pipeline's 'model' step.
feature_importance = pd.Series(xgb_model.named_steps['model'].feature_importances_, index=X.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 5))
sns.barplot(x=feature_importance[:10], y=feature_importance.index[:10], palette="coolwarm")
plt.xlabel("Feature Importance Score")
plt.title("Top 10 Feature Importance (XGBoost)")
plt.show()

# -------------------------------
# 🔹 2. Partial Dependence Plot (PDP) - FIXED
# -------------------------------
# Use only features present in the dataset
valid_features = [f for f in feature_importance.index if f in X.columns][:2]  # Pick first 2 valid features

if valid_features:
    # from_estimator() already draws the plot; calling display.plot() again
    # would open a second, duplicate figure.
    display = PartialDependenceDisplay.from_estimator(xgb_model.named_steps['model'], X_train_scaled, features=[X.columns.get_loc(f) for f in valid_features], grid_resolution=50)
    plt.suptitle("Partial Dependence Plots (Top 2 Features)")
    plt.show()
else:
    print("No valid features found for Partial Dependence Plot.")

# -------------------------------
# 🔹 3. Permutation Importance
# -------------------------------
perm_importance = permutation_importance(xgb_model, X_test_scaled, y_test, scoring='accuracy', n_repeats=10, random_state=42)
perm_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': perm_importance.importances_mean})
perm_importance_df = perm_importance_df.sort_values(by='Importance', ascending=False)

plt.figure(figsize=(10, 5))
sns.barplot(x=perm_importance_df["Importance"][:10], y=perm_importance_df["Feature"][:10], palette="coolwarm")
plt.xlabel("Permutation Importance Score")
plt.title("Top 10 Permutation Importance (XGBoost)")
plt.show()

# -------------------------------
# 🔹 4. LIME Explanation (for one sample)
# -------------------------------
explainer = lime.lime_tabular.LimeTabularExplainer(X_train_scaled, feature_names=X.columns, class_names=['No Glaucoma', 'Glaucoma'], discretize_continuous=True)
exp = explainer.explain_instance(X_sample[0], xgb_model.predict_proba, num_features=5)
exp.show_in_notebook()

# -------------------------------
# 🔹 5. SHAP Explanation (Limited Visualizations)
# -------------------------------
explainer = shap.Explainer(xgb_model.predict_proba, X_train_scaled)
shap_values = explainer(X_test_scaled)

# Summary Plot (Top 10 features only)
shap.summary_plot(shap_values[..., 1], X_test_scaled, feature_names=X.columns, max_display=10)  # Class 1 (Glaucoma)


In [None]:
### neural network model:

from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import joblib
import numpy as np
import json

def load_best_params():
    """Return tuned neural-network hyperparameters from the first results file found.

    ``fast_hyperopt_results.json`` is tried first, then
    ``hyperparameter_optimization_results.json``.  The result is the
    ``'Neural_Network'`` entry under ``'best_parameters'`` (``{}`` when that
    entry is absent), or ``{}`` when neither file exists.
    """
    sources = (
        'fast_hyperopt_results.json',
        'hyperparameter_optimization_results.json',
    )
    for source in sources:
        try:
            with open(source, 'r') as f:
                payload = json.load(f)
                return payload['best_parameters'].get('Neural_Network', {})
        except FileNotFoundError:
            # Missing file: move on to the next candidate.
            continue
    return {}

def train(X_train, y_train):
    """Fit a SMOTE + MLP pipeline and save it to disk.

    Values from ``load_best_params()`` override the defaults set below.  The
    fitted pipeline is dumped to ``glaucoma_model_Neural_Network.joblib`` and
    returned.
    """
    # Tuned values are applied after the defaults so they take precedence.
    mlp_kwargs = dict(
        hidden_layer_sizes=(64, 32),
        max_iter=1000,
        random_state=42,
    )
    mlp_kwargs.update(load_best_params())

    steps = [
        ('sampling', SMOTE(random_state=42)),
        ('model', MLPClassifier(**mlp_kwargs)),
    ]
    pipeline = ImbPipeline(steps)

    print("\nTraining Neural Network...")
    pipeline.fit(X_train, y_train)

    # Persist the whole pipeline, not just the final estimator.
    model_path = 'glaucoma_model_Neural_Network.joblib'
    joblib.dump(pipeline, model_path)
    print(f"Model saved to {model_path}")

    return pipeline

def evaluate(model, X, y):
    """Return accuracy, precision, recall, F1 and AUC for a fitted model.

    The result is a dict keyed by ``accuracy``, ``precision``, ``recall``,
    ``f1`` and ``auc``.
    """
    y_hat = model.predict(X)

    if not hasattr(model, 'predict_proba'):
        # Raw predictions serve as scores; hard labels come from a 0.5 cut.
        y_score = y_hat
        y_hat = (y_hat >= 0.5).astype(int)
    else:
        y_score = model.predict_proba(X)[:, 1]
        # Predictions outside [0, 1] are replaced by thresholded probabilities.
        if np.any((y_hat > 1) | (y_hat < 0)):
            y_hat = (y_score >= 0.5).astype(int)

    return {
        'accuracy': accuracy_score(y, y_hat),
        'precision': precision_score(y, y_hat),
        'recall': recall_score(y, y_hat),
        'f1': f1_score(y, y_hat),
        'auc': roc_auc_score(y, y_score),
    }

def predict(model, X):
    """Return ``model.predict(X)`` for the given samples."""
    y_hat = model.predict(X)
    return y_hat

def predict_proba(model, X):
    """Return class probabilities for ``X`` when the model provides them.

    Falls back to ``model.predict(X)`` when ``predict_proba`` is missing.
    """
    if not hasattr(model, 'predict_proba'):
        return model.predict(X)
    return model.predict_proba(X)

def load_model(filepath='glaucoma_model_Neural_Network.joblib'):
    """Load and return the object stored at ``filepath`` via ``joblib.load``."""
    network = joblib.load(filepath)
    return network

def main():
    """Train the neural-network pipeline and print validation metrics.

    Reads the glaucoma CSV, keeps the numeric columns (missing values filled
    with 0), makes an 80/20 stratified split and standardizes with a scaler
    fitted on the training part.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    import pandas as pd

    # Load and preprocess data (simplified example)
    data = pd.read_csv("/kaggle/input/glaucoma/glaucoma_dataset.csv")
    numeric_columns = data.drop('Diagnosis', axis=1).select_dtypes(include=['number'])
    X = numeric_columns.fillna(0)
    y = (data['Diagnosis'] == 'Glaucoma').astype(int)

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Fit the scaler on the training split only, then reuse it for validation.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    model = train(X_train_scaled, y_train)

    metrics = evaluate(model, X_val_scaled, y_val)
    print("\nModel Performance:")
    for metric in metrics:
        value = metrics[metric]
        print(f"{metric}: {value:.4f}")


if __name__ == "__main__":
    main()
