In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score, roc_auc_score, roc_curve)
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Function to plot confusion matrix with percentages
def plot_confusion_matrix_with_percentages(y_true, y_pred, title="Confusion Matrix", filename="confusion_matrix.png"):
    """Plot a row-normalised confusion matrix as a heatmap, save it, and show it.

    Each cell is the percentage of samples of the true class (row) that were
    assigned to the predicted class (column).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    title : str, optional
        Title drawn above the heatmap.
    filename : str, optional
        Path handed to ``plt.savefig``.
    """
    # Build the label set from BOTH vectors and pass it to confusion_matrix so the
    # tick labels always line up with the matrix rows/columns, even when a class
    # appears only among the predictions.
    labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Row-normalise to percentages; a class with no true samples gets a row of
    # zeros instead of NaN from a 0/0 division.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_percentage = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    ) * 100

    # Create a heatmap
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm_percentage, annot=True, fmt='.2f', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()

    # Save the plot
    plt.savefig(filename)
    plt.show()

In [None]:
# Define the function for plotting feature importances without numbers
def plot_feature_importance(features_df, title, filename):
    """Draw a horizontal bar chart of feature importances, save it, and show it.

    The input frame is only read: nothing is written back to it, so it is safe
    to pass a slice such as ``df.head(20)`` without a SettingWithCopyWarning.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Must contain a ``'Feature'`` column (bar labels) and an ``'Importance'``
        column (bar lengths). Rows are plotted in the order given.
    title : str
        Figure title.
    filename : str
        Path handed to ``plt.savefig`` (written at 300 dpi).
    """
    # One bar per row of features_df
    plt.figure(figsize=(10, 6))  # Adjust size as needed
    sns.barplot(x='Importance', y='Feature', data=features_df, palette='viridis')

    # Customize the plot
    plt.title(title, fontsize=18)
    plt.xlabel('Importance', fontsize=14)
    plt.ylabel('Feature', fontsize=14)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)

    # Save the plot
    plt.tight_layout()
    plt.savefig(filename, dpi=300)  # Save the plot with high resolution

    # Show the plot
    plt.show()

In [None]:
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Read the modelling dataset from the Kaggle input folder
excel_file = '/kaggle/input/paper03data/Python.xlsx'
df = pd.read_excel(io=excel_file)

# Show the first five rows as a sanity check
print(df.head(5))

# Select most relevant features

In [None]:
# Compute the correlation matrix
correlation_matrix = df.corr()

# Visualize the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
plt.title("Feature Correlation Matrix", fontsize=16)

# Save the figure in high quality
save_path = '/kaggle/working/correlation_matrix.png'
plt.savefig(save_path, dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

print(f"Correlation matrix figure saved at: {save_path}")

# Set a correlation threshold
threshold = 0.97

# The target takes no part in the redundancy check: it must never be dropped
# itself, and a feature must not be discarded just because it tracks the target.
target_column = 'f(vij)obj'
feature_corr = correlation_matrix.drop(
    index=target_column, columns=target_column, errors='ignore'
).abs()

# Keep only the strictly-upper triangle so each pair is inspected once. A column
# is flagged when it is highly correlated with any column that precedes it,
# i.e. the first member of every correlated group is the one that survives.
upper_triangle = feature_corr.where(np.triu(np.ones(feature_corr.shape, dtype=bool), k=1))
correlated_features = {
    column for column in upper_triangle.columns
    if (upper_triangle[column] > threshold).any()
}

print(f"Features to be removed due to high correlation: {correlated_features}")

# Drop highly correlated features (listed in frame order for a deterministic call)
df = df.drop(columns=[column for column in df.columns if column in correlated_features])

print(f"Remaining columns after correlation filter: {df.columns}")

# Preview the data
print(df.head())

In [None]:
# Define the variance threshold
var_threshold = 0.01

# Only the predictors are screened; the target column is always carried through,
# otherwise a strongly imbalanced target (variance below the threshold) would be
# removed and the next cell could not find it.
target_column = 'f(vij)obj'
is_feature = df.columns != target_column

# Apply variance thresholding to the predictors
selector = VarianceThreshold(threshold=var_threshold)
selector.fit(df.loc[:, is_feature])

# Boolean mask over ALL columns: the target is kept, predictors follow the selector
keep_mask = ~is_feature
keep_mask[is_feature] = selector.get_support()

# Get the remaining features. Slicing the frame (instead of rebuilding it from a
# NumPy array) preserves the original dtypes, index and column order.
selected_features = df.columns[keep_mask]
df_high_variance = df.loc[:, keep_mask]
df = df_high_variance.copy()

print(f"Remaining columns after variance filter: {df.columns}")

# Preview the data
print(df.head())

# Define the model

In [None]:
# Separate the target column from the predictor columns
y = df['f(vij)obj']
X = df.drop(columns=['f(vij)obj'])

# Hold out 20% of the rows as a test set; the seed fixes the partition
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Run all models

In [None]:
# Shared cross-validation splitter: 10 stratified folds, shuffled with a fixed seed
kfold = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [None]:
# Define models
# Every estimator with a stochastic component gets the same fixed seed so that
# the cross-validation scores, and therefore the "best model" chosen from them,
# are reproducible from one run to the next. KNN has no random component.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),  # probability=True enables predict_proba
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
}

In [None]:
# Cross-validation summary per model, keyed by model name
model_results = {}

# Run every candidate through the shared stratified K-fold splitter
for model_name, model in models.items():
    print(f'Evaluating {model_name} with K-Fold...')

    accuracy_scores, roc_auc_scores = [], []

    for train_idx, test_idx in kfold.split(X, y):
        X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
        X_test_fold, y_test_fold = X.iloc[test_idx], y.iloc[test_idx]

        # Fit on the training part of the fold
        model.fit(X_train_fold, y_train_fold)

        # Hard predictions -> accuracy on the held-out part
        y_pred = model.predict(X_test_fold)
        accuracy_scores.append(accuracy_score(y_test_fold, y_pred))

        # Probabilities of the second class -> ROC AUC, when the model offers them
        if hasattr(model, "predict_proba"):
            y_proba = model.predict_proba(X_test_fold)[:, 1]
            roc_auc_scores.append(roc_auc_score(y_test_fold, y_proba))
        else:
            y_proba = None

    # Average the per-fold metrics
    mean_accuracy = np.mean(accuracy_scores)
    mean_roc_auc = np.mean(roc_auc_scores) if roc_auc_scores else None
    model_results[model_name] = {
        'Mean Accuracy': mean_accuracy,
        'Mean ROC AUC': mean_roc_auc,
    }

    print(f'{model_name} - Accuracy: {mean_accuracy:.2f}, ROC AUC: {mean_roc_auc if roc_auc_scores else "N/A"}')

# One row per model, one column per metric
results_df = pd.DataFrame.from_dict(model_results, orient='index')
print(results_df)

In [None]:
# Keep every entry of `models` except one named 'LDA' (a safeguard: the filter
# only has an effect if such an entry exists in the dictionary)
non_linear_models = {name: model for name, model in models.items() if name != 'LDA'}

# Extract model names, accuracy, and ROC AUC scores
model_names = list(non_linear_models.keys())
# These are the mean accuracies over the held-out cross-validation folds stored
# in model_results -- not accuracies measured on the training data
train_accuracies = [model_results[name]['Mean Accuracy'] for name in model_names]
roc_aucs = [model_results[name]['Mean ROC AUC'] for name in model_names]

# HORIZONTAL BAR PLOT - MEAN CROSS-VALIDATION ACCURACY
plt.figure(figsize=(10, 6))
colors = sns.color_palette("viridis", len(model_names))

sns.barplot(x=train_accuracies, y=model_names, palette=colors)

plt.xlabel('Mean Accuracy')
plt.ylabel('Model')
# The title names what is actually plotted (cross-validated accuracy)
plt.title('Mean Cross-Validation Accuracy for Non-Linear Models')
plt.xlim(0, 1)

# Save the figure
plt.savefig("model_accuracy_barplot.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Train/test ROC curves for every non-linear model on a single figure
excluded_models = ['LDA', 'Logistic Regression']
non_linear_models = {
    name: model for name, model in models.items() if name not in excluded_models
}

plt.figure(figsize=(10, 6))
for model_name, model in non_linear_models.items():
    model.fit(X_train, y_train)

    # Train curve is dashed, test curve uses the default solid line
    roc_splits = (
        ('Train', X_train, y_train, {'linestyle': '--'}),
        ('Test', X_test, y_test, {}),
    )
    for split_name, X_split, y_split, line_style in roc_splits:
        split_proba = model.predict_proba(X_split)[:, 1]
        fpr, tpr, _ = roc_curve(y_split, split_proba)
        plt.plot(fpr, tpr, label=f'{model_name} - {split_name}', **line_style)

# Diagonal reference: performance of a random classifier
plt.plot([0, 1], [0, 1], linestyle='dotted', color='black')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Non-Linear Models')
plt.legend()

# Write the figure to disk, then display it
plt.savefig("roc_curves_non_linear_models.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Pick the model whose cross-validated mean accuracy is highest
best_model_name = max(model_results, key=lambda name: model_results[name]['Mean Accuracy'])
best_model = models[best_model_name]

print(f"\nRetraining the best model: {best_model_name} on full training data...")

# Refit the winner on the whole training split
best_model.fit(X_train, y_train)

# Hard predictions and accuracy on the held-out test split
y_test_pred = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, y_test_pred)

# ROC AUC only when the estimator exposes class probabilities
if hasattr(best_model, "predict_proba"):
    y_test_proba = best_model.predict_proba(X_test)[:, 1]
    final_roc_auc = roc_auc_score(y_test, y_test_proba)
else:
    y_test_proba = None
    final_roc_auc = None

# Report the final figures
print(f"\nFinal Test Set Results for {best_model_name}:")
print(f"Accuracy: {final_accuracy:.2f}")
if final_roc_auc is None:
    print("ROC AUC: N/A")
else:
    print(f"ROC AUC: {final_roc_auc:.2f}")

# 1. LDA

In [None]:
# Baseline LDA with default settings, fitted on the training split
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)

# Predicted labels for both splits
y_train_pred_lda = lda_model.predict(X_train)
y_test_pred_lda = lda_model.predict(X_test)

# Accuracy for both splits
train_accuracy_lda = accuracy_score(y_train, y_train_pred_lda)
test_accuracy_lda = accuracy_score(y_test, y_test_pred_lda)
print(f'LDA - Training Accuracy: {train_accuracy_lda:.2f}')
print(f'LDA - Test Accuracy: {test_accuracy_lda:.2f}')

# Per-class precision / recall / F1, training split first
lda_report_inputs = (
    ("Training", y_train, y_train_pred_lda),
    ("Test", y_test, y_test_pred_lda),
)
for split_label, y_observed, y_predicted in lda_report_inputs:
    print(f"Classification Report for LDA ({split_label} Set):")
    print(classification_report(y_observed, y_predicted))

**Add RandomizedSearchCV**

In [None]:
# Define the search space for RandomizedSearchCV.
# Shrinkage is only supported by the 'lsqr' and 'eigen' solvers, so the space is
# split into two sub-spaces: one for 'svd' (no shrinkage parameter at all) and
# one for the shrinkage-capable solvers. This way no iteration is spent on an
# invalid solver/shrinkage pair whose fit would fail.
param_dist_lda = [
    {'solver': ['svd']},
    {
        'solver': ['lsqr', 'eigen'],
        'shrinkage': [None, 0.0, 0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    },
]

# The two sub-spaces hold 1 + 2 * 8 = 17 distinct configurations; 15 are sampled
random_search_lda = RandomizedSearchCV(
    estimator=lda_model,
    param_distributions=param_dist_lda,
    n_iter=15,  # Reduce iterations to 15 to avoid excessive computation
    cv=3,  # Cross-validation folds
    verbose=2,
    random_state=7,
    n_jobs=-1
)

# Fit the RandomizedSearchCV on the training data
random_search_lda.fit(X_train, y_train)

# Get the best parameters from RandomizedSearchCV.
# Note: when 'svd' wins, the dictionary has no 'shrinkage' key.
best_params_random_lda = random_search_lda.best_params_
print(f"Best parameters from Randomized Search for LDA: {best_params_random_lda}")

**Add GridSearchCV**

In [None]:
# Ensure valid shrinkage values for LDA
def get_valid_shrinkage(solver, shrinkage):
    """Return the shrinkage candidates to grid-search around a previous best value.

    Parameters
    ----------
    solver : str
        LDA solver name. Only ``'lsqr'`` and ``'eigen'`` accept shrinkage.
    shrinkage : float, str or None
        Best shrinkage found so far.

    Returns
    -------
    list
        ``[None]`` when the solver does not support shrinkage or no shrinkage
        was selected; ``[shrinkage]`` for a string setting such as ``'auto'``;
        otherwise the distinct values among ``shrinkage - 0.1``, ``shrinkage``
        and ``shrinkage + 0.1`` after clipping to ``[0, 1]``, in ascending order.
    """
    if solver not in ('lsqr', 'eigen') or shrinkage is None:
        return [None]

    if isinstance(shrinkage, str):
        # A named strategy has no numeric neighbourhood to explore
        return [shrinkage]

    neighbours = (shrinkage - 0.1, shrinkage, shrinkage + 0.1)
    # Clip to the valid range and round away float noise (0.3 - 0.1 -> 0.2)
    clipped = [round(min(1.0, max(0.0, value)), 10) for value in neighbours]
    # dict.fromkeys removes duplicates created by clipping while keeping order,
    # so the grid never fits the same configuration twice
    return list(dict.fromkeys(clipped))

# Start from the configuration that won the randomized search
best_solver_lda = best_params_random_lda['solver']
best_shrinkage_lda = best_params_random_lda.get('shrinkage')

# Narrow grid: the winning solver plus shrinkage values around the winner
param_grid_lda = dict(
    solver=[best_solver_lda],
    shrinkage=get_valid_shrinkage(best_solver_lda, best_shrinkage_lda),
)

# Exhaustive search over that grid with 3-fold cross-validation
grid_search_lda = GridSearchCV(lda_model, param_grid_lda, cv=3, verbose=2, n_jobs=-1)
grid_search_lda.fit(X_train, y_train)

# Report the winning configuration
best_params_grid_lda = grid_search_lda.best_params_
print(f"Best parameters from Grid Search for LDA: {best_params_grid_lda}")

# Score the refitted best estimator on the held-out test split
best_lda_model = grid_search_lda.best_estimator_
y_test_pred_lda = best_lda_model.predict(X_test)
test_accuracy_lda = accuracy_score(y_test, y_test_pred_lda)
print(f'Test Set Accuracy with Best Hyperparameters from Grid Search: {test_accuracy_lda * 100:.2f}%')

**Evaluate the model**

In [None]:
# Tuned LDA estimator (refitted by GridSearchCV on the training split)
best_lda_model = grid_search_lda.best_estimator_

# Predicted labels for the training and test splits
y_train_pred_lda, y_test_pred_lda = (
    best_lda_model.predict(X_train),
    best_lda_model.predict(X_test),
)

In [None]:
# 1-2. Accuracy on the training and test splits
train_accuracy_lda = accuracy_score(y_train, y_train_pred_lda)
print(f'Train Set Accuracy (LDA): {train_accuracy_lda * 100:.2f}%')
test_accuracy_lda = accuracy_score(y_test, y_test_pred_lda)
print(f'Test Set Accuracy (LDA): {test_accuracy_lda * 100:.2f}%')

# Raw counts of correct predictions behind those accuracies
train_correct_lda = (y_train_pred_lda == y_train).sum()
test_correct_lda = (y_test_pred_lda == y_test).sum()
print(f'Training set (LDA): {train_correct_lda} correct out of {len(y_train)}')
print(f'Test set (LDA): {test_correct_lda} correct out of {len(y_test)}')

# Sensitivity / specificity, training split.
# A 2x2 matrix is laid out as [[TN, FP], [FN, TP]]; anything else is treated as
# multiclass, where macro recall stands in for sensitivity.
cm_train_lda = confusion_matrix(y_train, y_train_pred_lda)
if cm_train_lda.shape != (2, 2):
    sensitivity_train_lda = recall_score(y_train, y_train_pred_lda, average='macro')
    specificity_train_lda = np.nan  # Not defined for multiclass
else:
    (TN, FP), (FN, TP) = cm_train_lda
    sensitivity_train_lda = TP / (TP + FN)
    specificity_train_lda = TN / (TN + FP)

# Sensitivity / specificity, test split (same logic)
cm_test_lda = confusion_matrix(y_test, y_test_pred_lda)
if cm_test_lda.shape != (2, 2):
    sensitivity_test_lda = recall_score(y_test, y_test_pred_lda, average='macro')
    specificity_test_lda = np.nan  # Not defined for multiclass
else:
    (TN, FP), (FN, TP) = cm_test_lda
    sensitivity_test_lda = TP / (TP + FN)
    specificity_test_lda = TN / (TN + FP)

# 3. Side-by-side summary (values are fractions in [0, 1])
summary_lda = pd.DataFrame(
    {
        'Metric': ['Accuracy', 'Sensitivity', 'Specificity'],
        'Training Set': [train_accuracy_lda, sensitivity_train_lda, specificity_train_lda],
        'Test Set': [test_accuracy_lda, sensitivity_test_lda, specificity_test_lda],
    }
)

print("\nLDA Performance Summary:")
print(summary_lda)

In [None]:
# Publication-style summary table for LDA (binary classification only)
if cm_train_lda.shape == (2, 2):  # Binary classification
    # A 2x2 confusion matrix flattens to TN, FP, FN, TP
    TN_train, FP_train, FN_train, TP_train = cm_train_lda.ravel()
    TN_test, FP_test, FN_test, TP_test = cm_test_lda.ravel()

    # Sensitivity (Recall) & Specificity, expressed as percentages
    sensitivity_train_lda = TP_train / (TP_train + FN_train) * 100
    specificity_train_lda = TN_train / (TN_train + FP_train) * 100
    sensitivity_test_lda = TP_test / (TP_test + FN_test) * 100
    specificity_test_lda = TN_test / (TN_test + FP_test) * 100

    # Total Predictions
    total_train = len(y_train)
    total_test = len(y_test)

    # Summary Table in Required Format
    summary_lda = pd.DataFrame({
        "Data Set": ["Training", "", "Total", "Test", "", "Total"],
        "Observed Classification": ["f(vij)obs=0", "f(vij)obs=1", "", "f(vij)obs=0", "f(vij)obs=1", ""],
        "Stat. Param.": ["Sp (%)", "Sn (%)", "Ac (%)", "Sp (%)", "Sn (%)", "Ac (%)"],
        # accuracy_score returns a fraction, so it is scaled by 100 here to
        # match the "(%)" unit of the Sp and Sn rows in the same column
        "Pred. Stats.": [
            specificity_train_lda, sensitivity_train_lda, train_accuracy_lda * 100,
            specificity_test_lda, sensitivity_test_lda, test_accuracy_lda * 100
        ],
        "nj": [TN_train + FP_train, TP_train + FN_train, total_train, TN_test + FP_test, TP_test + FN_test, total_test],
        "f(vij)pred=0": [TN_train, FN_train, TN_train + FN_train, TN_test, FN_test, TN_test + FN_test],
        "f(vij)pred=1": [FP_train, TP_train, FP_train + TP_train, FP_test, TP_test, FP_test + TP_test]
    })

    # Print the formatted summary
    print("\nLDA Performance Summary:")
    print(summary_lda.to_string(index=False))

else:
    print("Multiclass classification detected – Specificity not defined.")

In [None]:
# 4-5. Four-decimal classification reports for the tuned LDA model,
# training split first, then test split
lda_report_splits = (
    ("Training", y_train, y_train_pred_lda),
    ("Test", y_test, y_test_pred_lda),
)
for split_label, y_observed, y_predicted in lda_report_splits:
    print(f"Classification Report on {split_label} Set (LDA):")
    print(classification_report(y_observed, y_predicted, digits=4))

In [None]:
# 6-7. Raw-count confusion matrices for the tuned LDA model
lda_cm_splits = (
    ("Training", y_train, y_train_pred_lda),
    ("Test", y_test, y_test_pred_lda),
)
for split_label, y_observed, y_predicted in lda_cm_splits:
    print(f"Confusion Matrix on {split_label} Set (LDA):")
    print(confusion_matrix(y_observed, y_predicted))

# Percentage heatmaps of the same matrices, saved to disk
plot_confusion_matrix_with_percentages(
    y_train, y_train_pred_lda,
    title="Training Set (LDA)", filename="LDA_Train_Confusion_Matrix",
)
plot_confusion_matrix_with_percentages(
    y_test, y_test_pred_lda,
    title="Test Set (LDA)", filename="LDA_Test_Confusion_Matrix",
)

In [None]:
# 8. ROC analysis for the tuned LDA model.
# Column 1 of predict_proba is used as the score of the positive class
# (binary 0/1 target).
y_test_proba_lda = best_lda_model.predict_proba(X_test)[:, 1]
test_auc_lda = roc_auc_score(y_test, y_test_proba_lda)
print(f'Test AUC (LDA): {test_auc_lda:.4f}')

y_train_proba_lda = best_lda_model.predict_proba(X_train)[:, 1]
train_auc_lda = roc_auc_score(y_train, y_train_proba_lda)
print(f'Training AUC (LDA): {train_auc_lda:.4f}')

# Curve coordinates and the area under each curve
fpr_test_lda, tpr_test_lda, _ = roc_curve(y_test, y_test_proba_lda)
fpr_train_lda, tpr_train_lda, _ = roc_curve(y_train, y_train_proba_lda)
roc_auc_test_lda = auc(fpr_test_lda, tpr_test_lda)
roc_auc_train_lda = auc(fpr_train_lda, tpr_train_lda)

# Draw test (blue) and training (green) curves over the chance diagonal
plt.figure(figsize=(10, 8))
lda_roc_series = (
    (fpr_test_lda, tpr_test_lda, 'blue', f'Test ROC curve (AUC = {roc_auc_test_lda:.2f})'),
    (fpr_train_lda, tpr_train_lda, 'green', f'Training ROC curve (AUC = {roc_auc_train_lda:.2f})'),
)
for fpr_values, tpr_values, line_color, legend_label in lda_roc_series:
    plt.plot(fpr_values, tpr_values, color=line_color, lw=2, label=legend_label)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)  # random classifier
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (LDA)')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
# 9. Use the absolute LDA coefficients as a feature-importance proxy
lda_coefficients = best_lda_model.coef_.flatten()

# One row per feature, ordered from most to least important
features_df_lda = (
    pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': np.abs(lda_coefficients),
    })
    .sort_values(by='Importance', ascending=False)
)

# Keep the 20 strongest features and plot them
top_20_features_lda = features_df_lda.head(20)
plot_feature_importance(
    top_20_features_lda,
    'Top 20 Feature Importances (LDA)',
    '/kaggle/working/Top_20_Feature_Importances_LDA.png',
)

# 2. XGBoost

In [None]:
# Baseline XGBoost classifier with default hyperparameters
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Predicted labels for both splits
y_train_pred_xgb = xgb_model.predict(X_train)
y_test_pred_xgb = xgb_model.predict(X_test)

# Accuracy for both splits
train_accuracy_xgb = accuracy_score(y_train, y_train_pred_xgb)
test_accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
print(f'XGBoost - Training Accuracy: {train_accuracy_xgb:.2f}')
print(f'XGBoost - Test Accuracy: {test_accuracy_xgb:.2f}')

# Per-class precision / recall / F1, training split first
xgb_report_inputs = (
    ("Training", y_train, y_train_pred_xgb),
    ("Test", y_test, y_test_pred_xgb),
)
for split_label, y_observed, y_predicted in xgb_report_inputs:
    print(f"Classification Report for XGBoost ({split_label} Set):")
    print(classification_report(y_observed, y_predicted))

**Add RandomizedSearchCV**

In [None]:
# Candidate values for each XGBoost hyperparameter
param_distributions_xgb = dict(
    n_estimators=np.arange(50, 201, 10),
    max_depth=[3, 6, 10, 15],
    learning_rate=np.linspace(0.01, 0.3, 10),
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
    min_child_weight=[1, 5, 10],
)

# Sample 30 configurations, each scored with 3-fold cross-validation
random_search_xgb = RandomizedSearchCV(
    xgb_model,
    param_distributions_xgb,
    n_iter=30,
    cv=3,
    verbose=2,
    random_state=7,
    n_jobs=-1,
)
random_search_xgb.fit(X_train, y_train)

# Report the best configuration found
best_params_random_xgb = random_search_xgb.best_params_
print("Best parameters from Randomized Search for XGBoost:", best_params_random_xgb)

**Add GridSearchCV**

In [None]:
# Build a small grid around the randomized-search winner.
# Every neighbourhood is clamped to the valid range of its parameter and
# de-duplicated, so the grid never contains an invalid value (for example a
# negative learning rate when the winner is 0.01) or the same value twice.
best_n_estimators_xgb = best_params_random_xgb['n_estimators']
best_max_depth_xgb = best_params_random_xgb['max_depth']
best_learning_rate_xgb = best_params_random_xgb['learning_rate']

# At least one tree
n_estimators_values_xgb = sorted({
    max(1, best_n_estimators_xgb - 10),
    best_n_estimators_xgb,
    best_n_estimators_xgb + 10,
})

# Tree depth of at least 1
max_depth_values_xgb = sorted({
    max(1, best_max_depth_xgb - 2),
    best_max_depth_xgb,
    best_max_depth_xgb + 2,
})

# Learning rate never below 0.01, the lower bound of the randomized search space
learning_rate_values_xgb = sorted({
    max(0.01, best_learning_rate_xgb - 0.05),
    best_learning_rate_xgb,
    best_learning_rate_xgb + 0.05,
})

# Create the parameter grid (remaining parameters stay fixed at their best value)
param_grid_xgb = {
    'n_estimators': n_estimators_values_xgb,
    'max_depth': max_depth_values_xgb,
    'learning_rate': learning_rate_values_xgb,
    'subsample': [best_params_random_xgb['subsample']],
    'colsample_bytree': [best_params_random_xgb['colsample_bytree']],
    'gamma': [best_params_random_xgb['gamma']],
    'min_child_weight': [best_params_random_xgb['min_child_weight']]
}

# Set up GridSearchCV
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV on the training data
grid_search_xgb.fit(X_train, y_train)

# Get the best parameters from GridSearchCV
best_params_grid_xgb = grid_search_xgb.best_params_
print("Best parameters from Grid Search for XGBoost:", best_params_grid_xgb)

In [None]:
# Train final XGBoost model with the best hyperparameters
best_xgb_model = xgb.XGBClassifier(**best_params_grid_xgb, use_label_encoder=False, eval_metric='logloss', random_state=42)

# Perform K-Fold Cross-Validation on the final tuned model, reusing the shared
# stratified splitter `kfold` defined earlier in the notebook
final_cv_scores = cross_val_score(best_xgb_model, X, y, cv=kfold, scoring='accuracy')

print(f'Final XGBoost Model - Mean Cross-Validation Accuracy: {final_cv_scores.mean():.4f} ± {final_cv_scores.std():.4f}')

# Train on full dataset and evaluate predictions
# NOTE(review): this fit uses all rows, including the test split, and the
# fitted object is replaced a few lines below -- confirm it is still needed.
best_xgb_model.fit(X, y)

y_pred = best_xgb_model.predict(X)

# From here on `best_xgb_model` is the estimator refitted by GridSearchCV on the
# training split only, so the test-set figures below are not contaminated
best_xgb_model = grid_search_xgb.best_estimator_
y_test_pred_xgb = best_xgb_model.predict(X_test)

# Evaluate the model performance
test_accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
print(f'Test Set Accuracy with Best Hyperparameters from Grid Search: {test_accuracy_xgb * 100:.2f}%')

**Evaluate the model**

In [None]:
# Tuned XGBoost estimator (refitted by GridSearchCV on the training split)
best_xgb_model = grid_search_xgb.best_estimator_

# Predicted labels for the training and test splits
y_train_pred_xgb, y_test_pred_xgb = (
    best_xgb_model.predict(X_train),
    best_xgb_model.predict(X_test),
)

In [None]:
# 1-2. Accuracy on the training and test splits
train_accuracy_xgb = accuracy_score(y_train, y_train_pred_xgb)
print(f'Train Set Accuracy: {train_accuracy_xgb * 100:.2f}%')
test_accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
print(f'Test Set Accuracy: {test_accuracy_xgb * 100:.2f}%')

# Raw counts of correct predictions behind those accuracies
train_correct_xgb = (y_train_pred_xgb == y_train).sum()
test_correct_xgb = (y_test_pred_xgb == y_test).sum()
print(f'Training set: {train_correct_xgb} correct out of {len(y_train)}')
print(f'Test set: {test_correct_xgb} correct out of {len(y_test)}')

# Sensitivity / specificity, training split.
# A 2x2 matrix is laid out as [[TN, FP], [FN, TP]]; anything else is treated as
# multiclass, where macro recall stands in for sensitivity.
cm_train_xgb = confusion_matrix(y_train, y_train_pred_xgb)
if cm_train_xgb.shape != (2, 2):
    sensitivity_train_xgb = recall_score(y_train, y_train_pred_xgb, average='macro')
    specificity_train_xgb = np.nan  # Not defined for multiclass
else:
    (TN, FP), (FN, TP) = cm_train_xgb
    sensitivity_train_xgb = TP / (TP + FN)
    specificity_train_xgb = TN / (TN + FP)

# Sensitivity / specificity, test split (same logic)
cm_test_xgb = confusion_matrix(y_test, y_test_pred_xgb)
if cm_test_xgb.shape != (2, 2):
    sensitivity_test_xgb = recall_score(y_test, y_test_pred_xgb, average='macro')
    specificity_test_xgb = np.nan  # Not defined for multiclass
else:
    (TN, FP), (FN, TP) = cm_test_xgb
    sensitivity_test_xgb = TP / (TP + FN)
    specificity_test_xgb = TN / (TN + FP)

# 3. Side-by-side summary (values are fractions in [0, 1])
summary_xgb = pd.DataFrame(
    {
        'Metric': ['Accuracy', 'Sensitivity', 'Specificity'],
        'Training Set': [train_accuracy_xgb, sensitivity_train_xgb, specificity_train_xgb],
        'Test Set': [test_accuracy_xgb, sensitivity_test_xgb, specificity_test_xgb],
    }
)

print("\nXGBoost Performance Summary:")
print(summary_xgb)

In [None]:
# Publication-style summary table for XGBoost (binary classification only)
if cm_train_xgb.shape == (2, 2):  # Binary classification
    # A 2x2 confusion matrix flattens to TN, FP, FN, TP
    TN_train, FP_train, FN_train, TP_train = cm_train_xgb.ravel()
    TN_test, FP_test, FN_test, TP_test = cm_test_xgb.ravel()

    # Sensitivity (Recall) & Specificity, expressed as percentages
    sensitivity_train_xgb = TP_train / (TP_train + FN_train) * 100
    specificity_train_xgb = TN_train / (TN_train + FP_train) * 100
    sensitivity_test_xgb = TP_test / (TP_test + FN_test) * 100
    specificity_test_xgb = TN_test / (TN_test + FP_test) * 100

    # Total Predictions
    total_train = len(y_train)
    total_test = len(y_test)

    # Summary Table in Required Format
    summary_xgb = pd.DataFrame({
        "Data Set": ["Training", "", "Total", "Test", "", "Total"],
        "Observed Classification": ["f(vij)obs=0", "f(vij)obs=1", "", "f(vij)obs=0", "f(vij)obs=1", ""],
        "Stat. Param.": ["Sp (%)", "Sn (%)", "Ac (%)", "Sp (%)", "Sn (%)", "Ac (%)"],
        # accuracy_score returns a fraction, so it is scaled by 100 here to
        # match the "(%)" unit of the Sp and Sn rows in the same column
        "Pred. Stats.": [
            specificity_train_xgb, sensitivity_train_xgb, train_accuracy_xgb * 100,
            specificity_test_xgb, sensitivity_test_xgb, test_accuracy_xgb * 100
        ],
        "nj": [TN_train + FP_train, TP_train + FN_train, total_train, TN_test + FP_test, TP_test + FN_test, total_test],
        "f(vij)pred=0": [TN_train, FN_train, TN_train + FN_train, TN_test, FN_test, TN_test + FN_test],
        "f(vij)pred=1": [FP_train, TP_train, FP_train + TP_train, FP_test, TP_test, FP_test + TP_test]
    })

    # Print the formatted summary
    print("\nXGBoost Performance Summary:")
    print(summary_xgb.to_string(index=False))

else:
    print("Multiclass classification detected – Specificity not defined.")

In [None]:
# 4-5. Four-decimal classification reports for the tuned XGBoost model,
# training split first, then test split
xgb_report_splits = (
    ("Training", y_train, y_train_pred_xgb),
    ("Test", y_test, y_test_pred_xgb),
)
for split_label, y_observed, y_predicted in xgb_report_splits:
    print(f"Classification Report on {split_label} Set:")
    print(classification_report(y_observed, y_predicted, digits=4))

In [None]:
# 6-7. Raw-count confusion matrices for the tuned XGBoost model
xgb_cm_splits = (
    ("Training", y_train, y_train_pred_xgb),
    ("Test", y_test, y_test_pred_xgb),
)
for split_label, y_observed, y_predicted in xgb_cm_splits:
    print(f"Confusion Matrix on {split_label} Set:")
    print(confusion_matrix(y_observed, y_predicted))

# Percentage heatmaps of the same matrices, saved to disk
plot_confusion_matrix_with_percentages(
    y_train, y_train_pred_xgb,
    title="Training Set (XGB)", filename="XGB_Train_Confusion_Matrix",
)
plot_confusion_matrix_with_percentages(
    y_test, y_test_pred_xgb,
    title="Test Set (XGB)", filename="XGB_Test_Confusion_Matrix",
)

In [None]:
# 8. ROC analysis for the tuned XGBoost model.
# Column 1 of predict_proba is used as the score of the positive class
# (binary 0/1 target).
y_test_proba_xgb = best_xgb_model.predict_proba(X_test)[:, 1]
test_auc_xgb = roc_auc_score(y_test, y_test_proba_xgb)
print(f'Test AUC (XGBoost): {test_auc_xgb:.4f}')

y_train_proba_xgb = best_xgb_model.predict_proba(X_train)[:, 1]
train_auc_xgb = roc_auc_score(y_train, y_train_proba_xgb)
print(f'Training AUC (XGBoost): {train_auc_xgb:.4f}')

# Curve coordinates and the area under each curve
fpr_test_xgb, tpr_test_xgb, _ = roc_curve(y_test, y_test_proba_xgb)
fpr_train_xgb, tpr_train_xgb, _ = roc_curve(y_train, y_train_proba_xgb)
roc_auc_test_xgb = auc(fpr_test_xgb, tpr_test_xgb)
roc_auc_train_xgb = auc(fpr_train_xgb, tpr_train_xgb)

# Draw test (blue) and training (green) curves over the chance diagonal
plt.figure(figsize=(10, 8))
xgb_roc_series = (
    (fpr_test_xgb, tpr_test_xgb, 'blue', f'Test ROC curve (AUC = {roc_auc_test_xgb:.2f})'),
    (fpr_train_xgb, tpr_train_xgb, 'green', f'Training ROC curve (AUC = {roc_auc_train_xgb:.2f})'),
)
for fpr_values, tpr_values, line_color, legend_label in xgb_roc_series:
    plt.plot(fpr_values, tpr_values, color=line_color, lw=2, label=legend_label)
plt.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)  # random classifier
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve (XGBoost)')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

In [None]:
# 9. Feature importances reported by the tuned XGBoost model
feature_importances_xgb = best_xgb_model.feature_importances_

# One row per feature, ordered from most to least important
features_df_xgb = (
    pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': feature_importances_xgb,
    })
    .sort_values(by='Importance', ascending=False)
)

# Keep the 20 strongest features (change the number here if needed) and plot them
top_20_features_xgb = features_df_xgb.head(20)
plot_feature_importance(
    top_20_features_xgb,
    'Top 20 Feature Importances (XGB)',
    '/kaggle/working/Top_20_Feature_Importances_XGB.png',
)

**Make Predictions**

In [None]:
# Read the Excel file with the new samples to be scored
file_path = '/kaggle/input/predictions-python/XGB pred.xlsx'
new_data = pd.read_excel(io=file_path)

# Show the first five rows to confirm the file loaded correctly
print(new_data.head(5))

In [None]:
# Feature names recorded by the fitted booster, in training order
model_columns = best_xgb_model.get_booster().feature_names

# Keep only the columns the model knows about, in the model's order.
# .copy() makes this an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning or touch `new_data`.
available_columns = [col for col in model_columns if col in new_data.columns]
new_data_filtered = new_data[available_columns].copy()

# Check the selected columns
print("Columnas seleccionadas:", new_data_filtered.columns.tolist())

In [None]:
import numpy as np

# Columns the model expects that the new data does not provide
missing_columns = [col for col in model_columns if col not in new_data_filtered.columns]

# reindex() adds every missing column filled with NaN and puts all columns in
# the model's order in a single step. It returns a new frame, so nothing is
# assigned onto a slice of new_data (which triggered SettingWithCopyWarning).
new_data_filtered = new_data_filtered.reindex(columns=model_columns)

# Confirm that every expected column is now present
print("Columnas después de añadir las faltantes:", new_data_filtered.columns.tolist())

In [None]:
# Score the aligned feature frame with the best XGB model
predictions = best_xgb_model.predict(new_data_filtered)

# Print the raw prediction array
print("Predicciones realizadas:", predictions)

In [None]:
# Attach the predictions to the originally loaded frame as a new column
# (this modifies new_data in place)
new_data['Predicciones'] = predictions

# Write the frame, including the new column, to a new Excel file
new_data.to_excel('/kaggle/working/Predicciones_Resultados.xlsx', index=False)

print("Predicciones guardadas en 'Predicciones_Resultados.xlsx'")

**Evaluate The Predictions**

In [None]:
# Bar chart of the 'Pred' column per 'Deriv' value, saved as a PNG.
# NOTE(review): results_df is not created in any cell shown before this one, and a
# later cell rebinds the same name to a model-metrics table — confirm where the
# frame with 'Deriv' and 'Pred' columns is loaded so this cell survives Run All.

# Set the plot style
sns.set(style="whitegrid")

# Create a bar chart
plt.figure(figsize=(8, 6)) 
sns.barplot(x='Deriv', y='Pred', data=results_df, palette='viridis')

# Customize the plot with increased font sizes
plt.title('Comparison of Success Probability in Assays', fontsize=18)
plt.xlabel('Molecules and Derivatives', fontsize=14)
plt.ylabel('Success Probability (Pred)', fontsize=14)
plt.ylim(0, 1)  # fix the y axis to the 0-1 range
plt.xticks(rotation=45, fontsize=14)  # rotate the x labels by 45 degrees
plt.yticks(fontsize=14)

# Save the plot
plt.tight_layout()  # Ensure everything fits without overlapping
plt.savefig('/kaggle/working/Success_Probability_Assays.png', dpi=300)  # Save the figure

# Show the plot
plt.show()

In [None]:
# Mean of 'Pred' per assay-type code, plotted as a bar chart and saved as a PNG.
# NOTE(review): results_df is not created in any cell shown before this one —
# confirm where the frame with 'c10=Assay Type' and 'Pred' columns comes from.
assay_success = results_df.groupby('c10=Assay Type')['Pred'].mean().reset_index()

# Readable names for the single-letter assay-type codes
assay_type_map = {
    'B': 'Binding',
    'F': 'Functional',
    'A': 'ADME',
    'P': 'Physicochemical'
}

# Translate codes to names. A code missing from the map keeps its raw value
# instead of becoming NaN, because a NaN label would silently drop that bar.
assay_success['Assay Type'] = (
    assay_success['c10=Assay Type']
    .map(assay_type_map)
    .fillna(assay_success['c10=Assay Type'])
)

# Set the plot style
sns.set(style="whitegrid")

# Create the bar plot with the 'viridis' palette
plt.figure(figsize=(6, 6))
sns.barplot(x='Assay Type', y='Pred', data=assay_success, palette='viridis')

# Customize the plot with increased font sizes
plt.title('Success by Assay Type', fontsize=18)
plt.xlabel('Assay Type', fontsize=14)
plt.ylabel('Success Average', fontsize=14)
plt.xticks(rotation=45, fontsize=14)
plt.yticks(fontsize=14)

# Save the image
save_path = '/kaggle/working/success_by_assay_type.png'
plt.tight_layout()  # Adjust layout to avoid overlap
plt.savefig(save_path, dpi=300, bbox_inches='tight')
print(f'Image saved at: {save_path}')

# Show the plot
plt.show()

In [None]:
# Heatmap of each derivative's 'Pred' relative to Riluzole for calmodulin-related
# targets (human, binding assays). Proteins without data stay as empty rows.

# Proteins related to Calmodulin, in the row order wanted for the heatmap
calmodulin_related = [
    "Calmodulin",
    "CaM kinase I alpha",
    "CaM kinase I delta",
    "CaM kinase II",
    "CaM kinase II beta",
    "CaM kinase II delta",
    "CaM kinase II gamma",
    "CaM kinase IV",
    "CaM-kinase kinase alpha",
    "CaM-kinase kinase beta",
    "Calcium/calmodulin-dependent protein kinase type 1B"
]

# Step 1: Keep rows whose 'Target Organism' is 'Homo sapiens', whose 'Target Name'
# is one of the calmodulin-related proteins, and whose 'Assay Type' is 'B'
filtered_df = results_df[
    (results_df['c04=Target Organism'] == 'Homo sapiens') &
    (results_df['c01=Target Name'].isin(calmodulin_related)) &
    (results_df['c10=Assay Type'] == 'B')
]

# Step 2: Focus on derivatives 1a-2c and Riluzole
derivatives = ['1a', '1b', '1c', '1d', '2a', '2b', '2c']
riluzole_data = results_df[results_df['Deriv'] == 'Riluzole']

# .copy() makes this an independent frame, so the column added in Step 3 is not
# written onto a slice of results_df (which raises SettingWithCopyWarning).
filtered_derivatives = filtered_df[filtered_df['Deriv'].isin(derivatives)].copy()

# Step 3: Relative performance of each derivative compared to Riluzole, per target.
# NOTE(review): riluzole_data is not restricted by organism or assay type, and if a
# target name occurs more than once the dict keeps only the last 'Pred' — confirm.
riluzole_preds = riluzole_data[riluzole_data['c01=Target Name'].isin(calmodulin_related)]
riluzole_pred_map = riluzole_preds.set_index('c01=Target Name')['Pred'].to_dict()

# Targets without a Riluzole entry are compared against 0. A column-wise map is
# used instead of a row-wise apply so an empty selection still yields a Series.
riluzole_baseline = filtered_derivatives['c01=Target Name'].map(
    lambda target_name: riluzole_pred_map.get(target_name, 0)
)
filtered_derivatives['Relative Performance'] = filtered_derivatives['Pred'] - riluzole_baseline

# Step 4: Pivot to a Target Name x Deriv matrix of mean relative performance
heatmap_data = filtered_derivatives.pivot_table(
    index='c01=Target Name', columns='Deriv', values='Relative Performance', aggfunc='mean'
)

# Reorder the rows to match the protein list (proteins with no data become NaN rows)
heatmap_data = heatmap_data.reindex(calmodulin_related)

# Step 5: Rescale to [-1, 1] for the color gradient from red (worse) to green (better)
min_val, max_val = heatmap_data.min().min(), heatmap_data.max().max()
value_range = max_val - min_val
if value_range == 0:
    # Every populated cell has the same value: place them at the midpoint (0)
    # instead of dividing by zero, which would blank the whole heatmap.
    normalized_data = heatmap_data * 0.0
else:
    normalized_data = (heatmap_data - min_val) / value_range * 2 - 1

# Step 6: Plot the heatmap with gradient colors
plt.figure(figsize=(12, 8))
cmap = sns.diverging_palette(0, 120, as_cmap=True)

sns.heatmap(normalized_data, annot=True, cmap=cmap, fmt='.2f', cbar_kws={'label': 'Relative Performance (vs Riluzole)'})

# Add title and labels
plt.title('Comparison of Riluzole Derivatives vs Calmodulin-Related Proteins (Binding Assays)', fontsize=16)
plt.xlabel('Riluzole Derivatives', fontsize=14)
plt.ylabel('Calmodulin-Related Proteins', fontsize=14)

plt.show()

In [None]:
# Heatmap of each derivative's 'Pred' relative to Riluzole for calmodulin-related
# targets (human, binding assays). Proteins without data are left out; the figure
# is saved as a PNG.

# Proteins related to Calmodulin (Calmodulin first), in the wanted row order
calmodulin_related = [
    "Calmodulin",
    "CaM kinase I alpha",
    "CaM kinase I delta",
    "CaM kinase II",
    "CaM kinase II beta",
    "CaM kinase II delta",
    "CaM kinase II gamma",
    "CaM kinase IV",
    "CaM-kinase kinase alpha",
    "CaM-kinase kinase beta",
    "Calcium/calmodulin-dependent protein kinase type 1B"
]

# Step 1: Filter the dataframe for Homo sapiens, calmodulin-related proteins, and binding assays (B)
filtered_df = results_df[
    (results_df['c04=Target Organism'] == 'Homo sapiens') &
    (results_df['c01=Target Name'].isin(calmodulin_related)) &
    (results_df['c10=Assay Type'] == 'B')
]

# Step 2: Focus on derivatives 1a-2c and Riluzole
derivatives = ['1a', '1b', '1c', '1d', '2a', '2b', '2c']
riluzole_data = results_df[results_df['Deriv'] == 'Riluzole']

# .copy() makes this an independent frame, so the column added in Step 3 is not
# written onto a slice of results_df (which raises SettingWithCopyWarning).
filtered_derivatives = filtered_df[filtered_df['Deriv'].isin(derivatives)].copy()

# Step 3: Relative performance of each derivative compared to Riluzole, per target
riluzole_preds = riluzole_data[riluzole_data['c01=Target Name'].isin(calmodulin_related)]
riluzole_pred_map = riluzole_preds.set_index('c01=Target Name')['Pred'].to_dict()

# Targets without a Riluzole entry are compared against 0. A column-wise map is
# used instead of a row-wise apply so an empty selection still yields a Series.
riluzole_baseline = filtered_derivatives['c01=Target Name'].map(
    lambda target_name: riluzole_pred_map.get(target_name, 0)
)
filtered_derivatives['Relative Performance'] = filtered_derivatives['Pred'] - riluzole_baseline

# Step 4: Pivot to a Target Name x Deriv matrix of mean relative performance
heatmap_data = filtered_derivatives.pivot_table(
    index='c01=Target Name', columns='Deriv', values='Relative Performance', aggfunc='mean'
)

# Step 5: Put the proteins in the specified order FIRST, then drop the ones without
# any data. Doing it the other way round re-creates the all-NaN rows, because
# reindex() adds a NaN row for every listed protein that is missing.
heatmap_data = heatmap_data.reindex(calmodulin_related).dropna(how='all')

# Step 6: Rescale to [-1, 1] for the color gradient from red (worse) to green (better)
min_val, max_val = heatmap_data.min().min(), heatmap_data.max().max()
value_range = max_val - min_val
if value_range == 0:
    # Every populated cell has the same value: place them at the midpoint (0)
    # instead of dividing by zero, which would blank the whole heatmap.
    normalized_data = heatmap_data * 0.0
else:
    normalized_data = (heatmap_data - min_val) / value_range * 2 - 1

# Step 7: Plot the heatmap with the red-green gradient
plt.figure(figsize=(12, 6))
cmap = sns.diverging_palette(15, 150, as_cmap=True)  # red for low values, green for high values

sns.heatmap(normalized_data, annot=True, cmap=cmap, fmt='.2f',
            cbar_kws={'label': 'Relative Performance (vs Riluzole)'})

# Add titles and labels; pad=15 separates the title from the heatmap
plt.title('Comparison of Riluzole Derivatives vs Calmodulin-Related Proteins (Binding Assays)',
          fontsize=18, pad=15)

plt.xlabel('Riluzole Derivatives', fontsize=14)
plt.ylabel('Calmodulin-Related Proteins', fontsize=14)

# Save the plot as a PNG image
save_path = '/kaggle/working/heatmap_riluzole_derivatives_vs_calmodulin.png'
plt.tight_layout()  # Adjust layout to prevent clipping
plt.savefig(save_path, dpi=300, bbox_inches='tight')  # Save the image

print(f"Image saved at: {save_path}")  # Notify the user

# Show the plot
plt.show()

# MODEL COMPARISON

In [None]:
# Fit six classifiers on the training split and record accuracy and ROC AUC on
# both the training and the test split.
# NOTE(review): the next cell repeats this cell verbatim; one of the two can go.

# Fixed seed so the stochastic learners give the same numbers on every run
comparison_seed = 42

# Metrics per model name, filled in by the loop below
model_results = {}

# Define the models (K-Nearest Neighbors has no random component to seed)
models = {
    'Random Forest': RandomForestClassifier(random_state=comparison_seed),
    'Support Vector Machine': SVC(probability=True, random_state=comparison_seed),
    'Decision Tree': DecisionTreeClassifier(random_state=comparison_seed),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=comparison_seed),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=comparison_seed)
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f'Training {model_name}...')

    # Train the model
    model.fit(X_train, y_train)

    # Hard class predictions for the accuracy scores
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Accuracy
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # ROC AUC on the test split, from column 1 of predict_proba
    y_test_proba = model.predict_proba(X_test)[:, 1]
    test_roc_auc = roc_auc_score(y_test, y_test_proba)

    # ROC AUC on the training split
    y_train_proba = model.predict_proba(X_train)[:, 1]
    train_roc_auc = roc_auc_score(y_train, y_train_proba)

    # Save results
    model_results[model_name] = {
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Train ROC AUC': train_roc_auc,
        'Test ROC AUC': test_roc_auc
    }

    print(f'{model_name} - Train Accuracy: {train_accuracy:.2f}, Test Accuracy: {test_accuracy:.2f}, Train ROC AUC: {train_roc_auc:.2f}, Test ROC AUC: {test_roc_auc:.2f}')

In [None]:
# Fit six classifiers on the training split and record accuracy and ROC AUC on
# both the training and the test split.
# NOTE(review): this cell repeats the preceding cell verbatim; one of the two can go.

# Fixed seed so the stochastic learners give the same numbers on every run
comparison_seed = 42

# Metrics per model name, filled in by the loop below
model_results = {}

# Define the models (K-Nearest Neighbors has no random component to seed)
models = {
    'Random Forest': RandomForestClassifier(random_state=comparison_seed),
    'Support Vector Machine': SVC(probability=True, random_state=comparison_seed),
    'Decision Tree': DecisionTreeClassifier(random_state=comparison_seed),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=comparison_seed),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=comparison_seed)
}

# Train and evaluate each model
for model_name, model in models.items():
    print(f'Training {model_name}...')

    # Train the model
    model.fit(X_train, y_train)

    # Hard class predictions for the accuracy scores
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Accuracy
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # ROC AUC on the test split, from column 1 of predict_proba
    y_test_proba = model.predict_proba(X_test)[:, 1]
    test_roc_auc = roc_auc_score(y_test, y_test_proba)

    # ROC AUC on the training split
    y_train_proba = model.predict_proba(X_train)[:, 1]
    train_roc_auc = roc_auc_score(y_train, y_train_proba)

    # Save results
    model_results[model_name] = {
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        'Train ROC AUC': train_roc_auc,
        'Test ROC AUC': test_roc_auc
    }

    print(f'{model_name} - Train Accuracy: {train_accuracy:.2f}, Test Accuracy: {test_accuracy:.2f}, Train ROC AUC: {train_roc_auc:.2f}, Test ROC AUC: {test_roc_auc:.2f}')

In [None]:
# Overlay the test-split ROC curve of every fitted model on a single figure.
plt.figure(figsize=(8, 6))

for model_name, model in models.items():
    # Column 1 of predict_proba on the test split feeds the ROC computation
    y_test_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_test_proba)

    # The legend entry reuses the AUC stored in model_results
    test_auc = model_results[model_name]["Test ROC AUC"]
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {test_auc:.2f})')

# Dashed diagonal as the reference line
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Title, axis labels, axis limits and legend
plt.title('ROC Curves Comparison (Test)', fontsize=14)
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')

# Write the figure to disk before showing it
save_path = '/kaggle/working/roc_curves_comparison.png'
plt.savefig(save_path, dpi=300)

plt.show()

print(f"ROC curve figure saved at: {save_path}")

In [None]:
# Overlay the training-split ROC curve of every fitted model on a single figure.
plt.figure(figsize=(8, 6))

for model_name, model in models.items():
    # Column 1 of predict_proba on the training split feeds the ROC computation
    y_train_proba = model.predict_proba(X_train)[:, 1]
    fpr, tpr, _ = roc_curve(y_train, y_train_proba)

    # The legend entry reuses the AUC stored in model_results
    train_auc = model_results[model_name]["Train ROC AUC"]
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {train_auc:.2f})')

# Dashed diagonal as the reference line
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Title, axis labels, axis limits and legend
plt.title('ROC Curves Comparison (Training)', fontsize=14)
plt.xlabel('False Positive Rate (FPR)', fontsize=12)
plt.ylabel('True Positive Rate (TPR)', fontsize=12)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')

# Write the figure to disk before showing it
save_path = '/kaggle/working/roc_curves_train_comparison.png'
plt.savefig(save_path, dpi=300)

plt.show()

print(f"ROC curve figure saved at: {save_path}")

In [None]:
# Turn model_results into a table with one row per model.
# NOTE(review): this rebinds results_df, the name earlier cells use for a frame
# with 'Deriv'/'Pred' columns — re-running those cells after this one will fail.
metric_columns = ['Train Accuracy', 'Test Accuracy', 'Train ROC AUC', 'Test ROC AUC']

results_df = pd.DataFrame([
    {'Model': name, **{column: scores[column] for column in metric_columns}}
    for name, scores in model_results.items()
])

# Print the table without the index, model name first and then the four metrics
print(results_df[['Model'] + metric_columns].to_string(index=False))

In [None]:
# Compare seven classifiers by 10-fold cross-validated accuracy and ROC AUC.
# NOTE(review): the "Test CV" figures come from cross-validating on X_test itself
# (new models are fitted on folds of the test split), not from scoring models
# trained on X_train — confirm this is the intended meaning.

# Fixed seed so the stochastic learners give the same numbers on every run
comparison_seed = 42

# Metrics per model name, filled in by the loop below
model_results = {}

# Define the models (K-Nearest Neighbors and LDA have no random component to seed).
# cross_validate fits clones, so these instances remain unfitted afterwards.
models = {
    'Random Forest': RandomForestClassifier(random_state=comparison_seed),
    'Support Vector Machine': SVC(probability=True, random_state=comparison_seed),
    'Decision Tree': DecisionTreeClassifier(random_state=comparison_seed),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=comparison_seed),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=comparison_seed),
    'LDA': LinearDiscriminantAnalysis()
}

# Both metrics are computed from the same fitted folds, so each model is
# cross-validated once per data split instead of once per metric.
cv_scoring = ['accuracy', 'roc_auc']

# Evaluate each model using 10-fold cross-validation
for model_name, model in models.items():
    print(f'Evaluating {model_name} with 10-fold cross validation...')

    train_cv_scores = cross_validate(model, X_train, y_train, cv=10, scoring=cv_scoring)
    test_cv_scores = cross_validate(model, X_test, y_test, cv=10, scoring=cv_scoring)

    # cross_validate reports per-fold validation scores under 'test_<metric>'
    train_cv_accuracy = train_cv_scores['test_accuracy'].mean()
    test_cv_accuracy = test_cv_scores['test_accuracy'].mean()

    train_cv_roc_auc = train_cv_scores['test_roc_auc'].mean()
    test_cv_roc_auc = test_cv_scores['test_roc_auc'].mean()

    # Save results
    model_results[model_name] = {
        'Train CV Accuracy': train_cv_accuracy,
        'Test CV Accuracy': test_cv_accuracy,
        'Train CV ROC AUC': train_cv_roc_auc,
        'Test CV ROC AUC': test_cv_roc_auc
    }

    print(f'{model_name} - Train CV Accuracy: {train_cv_accuracy:.2f}, Test CV Accuracy: {test_cv_accuracy:.2f}, Train CV ROC AUC: {train_cv_roc_auc:.2f}, Test CV ROC AUC: {test_cv_roc_auc:.2f}')

# Create DataFrame from model_results, one row per model
results_df = pd.DataFrame([
    {
        'Model': model_name,
        'Train CV Accuracy': metrics['Train CV Accuracy'],
        'Test CV Accuracy': metrics['Test CV Accuracy'],
        'Train CV ROC AUC': metrics['Train CV ROC AUC'],
        'Test CV ROC AUC': metrics['Test CV ROC AUC'],
    }
    for model_name, metrics in model_results.items()
])

# Print the final DataFrame with the desired columns
print(results_df[['Model', 'Train CV Accuracy', 'Test CV Accuracy', 'Train CV ROC AUC', 'Test CV ROC AUC']].to_string(index=False))