In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score, roc_auc_score, roc_curve)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Print the full path of every file found under the Kaggle input directory,
# so the dataset path used when loading can be checked against what is mounted.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Read the EGS workbook from the attached Kaggle dataset into a DataFrame
excel_file = '/kaggle/input/paper03dataegs/Python EGS.xlsx'
df = pd.read_excel(io=excel_file)

# Print the leading rows as a quick sanity check of the parsed columns
print(df.head(n=5))

# 0. Select the Most Relevant Features

In [None]:
# Label column. It is excluded from the redundancy filter below so that it can
# never be dropped itself, and so that a feature is never dropped merely for
# being strongly correlated with the label.
target_column = 'f(vij)obj'

# Pairwise correlation of every column (label included, for the heatmap)
correlation_matrix = df.corr()

# Visualize the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
plt.title("Feature Correlation Matrix", fontsize=16)

# Save the figure in high quality (before show(), while the figure is current)
save_path = '/kaggle/working/correlation_matrix.png'
plt.savefig(save_path, dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

print(f"Correlation matrix figure saved at: {save_path}")

# Absolute correlation above which two features are treated as redundant
threshold = 0.97

# Feature-to-feature correlations only (label row/column removed if present)
feature_corr = correlation_matrix.drop(
    index=target_column, columns=target_column, errors='ignore'
).abs()

# Keep the strictly-upper triangle so each pair is inspected once; for a
# redundant pair the later column is the one flagged for removal.
upper_mask = np.triu(np.ones(feature_corr.shape, dtype=bool), k=1)
upper_triangle = feature_corr.where(upper_mask)
correlated_features = [
    column
    for column in upper_triangle.columns
    if (upper_triangle[column] > threshold).any()
]

print(f"Features to be removed due to high correlation: {correlated_features}")

# Drop highly correlated features
df = df.drop(columns=correlated_features)

print(f"Remaining columns after correlation filter: {df.columns}")

# Preview the data
print(df.head())

In [None]:
# Label column: it is not a feature, so it is kept out of the variance filter
# (a low-variance label must not be removed) and carried through unchanged.
target_column = 'f(vij)obj'

# Features whose variance is at or below this value are discarded
var_threshold = 0.01

# Fit the selector on the feature columns only
feature_frame = df.drop(columns=[target_column])
selector = VarianceThreshold(threshold=var_threshold)
selector.fit(feature_frame)

# Names of the features that passed the filter
selected_features = feature_frame.columns[selector.get_support()]

# Select by column name instead of rebuilding the frame from the transformed
# ndarray, so the original column order, index and dtypes are preserved.
kept_names = set(selected_features)
df = df[[column for column in df.columns
         if column in kept_names or column == target_column]]

print(f"Remaining columns after variance filter: {df.columns}")

# Preview the data
print(df.head())

In [None]:
# Separate the label column from the predictors
target_column = 'f(vij)obj'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the seed fixes the partition
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# 1. Random Forest

In [None]:
# Baseline Random Forest with default hyperparameters. The seed is fixed so
# the fitted forest, and every score printed below, is the same on each run.
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on both splits; the gap between them indicates over-fitting
y_train_pred_rf = rf_model.predict(X_train)
y_test_pred_rf = rf_model.predict(X_test)

# Accuracy (fraction of correctly classified rows)
train_accuracy_rf = accuracy_score(y_train, y_train_pred_rf)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)

print(f'Random Forest - Training Accuracy: {train_accuracy_rf:.2f}')
print(f'Random Forest - Test Accuracy: {test_accuracy_rf:.2f}')

# Per-class precision / recall / F1 for each split
print("Classification Report for Random Forest (Training Set):")
print(classification_report(y_train, y_train_pred_rf))
print("Classification Report for Random Forest (Test Set):")
print(classification_report(y_test, y_test_pred_rf))

**1.1 Add RandomizedSearchCV**

In [None]:
# Candidate values sampled by the randomized search.
# (A single definition: an earlier, immediately-overwritten dictionary whose
# n_estimators range collapsed to the single value 10 has been removed.)
param_distributions = {
    'n_estimators': np.arange(50, 201, 10),          # 50, 60, ..., 200
    'max_depth': [None, 10, 20, 30, 40, 50, 60, 70],  # None = unlimited depth
    'min_samples_split': [2, 3, 4, 5, 10],
    'min_samples_leaf': [1, 2, 3, 4, 5, 10],
    'bootstrap': [True, False]
}

# Set up RandomizedSearchCV: 30 sampled configurations, 3-fold CV each.
# Both the sampler and the forest are seeded so the search is repeatable.
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_distributions,
    n_iter=30,
    cv=3,
    verbose=2,
    random_state=7,
    n_jobs=-1
)

# Fit RandomizedSearchCV on the training data only (test set stays untouched)
random_search.fit(X_train, y_train)

# Best configuration found; it seeds the finer grid search that follows
best_params_random = random_search.best_params_
print("Best parameters from Randomized Search:", best_params_random)

**1.2 Add GridSearchCV**

In [None]:
# Build a small grid around the best randomized-search configuration.
# Every neighbour is clamped to the smallest value the estimator accepts and
# duplicates produced by clamping are removed, so no grid point is invalid.
best_n_estimators = int(best_params_random['n_estimators'])
best_max_depth = best_params_random['max_depth']
best_min_split = int(best_params_random['min_samples_split'])
best_min_leaf = int(best_params_random['min_samples_leaf'])

# n_estimators must be at least 1
n_estimators_values = sorted({
    max(1, best_n_estimators - 10),
    best_n_estimators,
    best_n_estimators + 10
})

# max_depth may be None (unlimited); arithmetic on None would raise, and an
# unlimited depth has no numeric neighbours, so keep it as the only candidate.
if best_max_depth is None:
    max_depth_values = [None]
else:
    best_max_depth = int(best_max_depth)
    max_depth_values = sorted({
        max(1, best_max_depth - 5),
        best_max_depth,
        best_max_depth + 5
    })

# An integer min_samples_split must be >= 2
min_samples_split_values = sorted({
    max(2, best_min_split - 1),
    best_min_split,
    best_min_split + 1
})

# min_samples_leaf must be >= 1
min_samples_leaf_values = sorted({
    max(1, best_min_leaf - 1),
    best_min_leaf,
    best_min_leaf + 1
})

# Create the parameter grid
param_grid = {
    'n_estimators': n_estimators_values,
    'max_depth': max_depth_values,
    'min_samples_split': min_samples_split_values,
    'min_samples_leaf': min_samples_leaf_values,
    'bootstrap': [best_params_random['bootstrap']]
}

# Exhaustive 3-fold search over the grid with a seeded forest
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from GridSearchCV
best_params_grid = grid_search.best_params_
print("Best parameters from Grid Search:", best_params_grid)

# Predict using the best model (refit on the full training set by GridSearchCV)
best_rf_model = grid_search.best_estimator_
y_test_pred = best_rf_model.predict(X_test)

# Evaluate the model performance
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Set Accuracy with Best Hyperparameters from Grid Search: {test_accuracy * 100:.2f}%')

**1.3 Evaluate the model**

In [None]:
# Tuned Random Forest selected by the grid search
best_rf_model = grid_search.best_estimator_

# Predict the training split first, then the test split
y_train_pred, y_test_pred = (
    best_rf_model.predict(split) for split in (X_train, X_test)
)

In [None]:
# Accuracy of the tuned Random Forest on each split
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Report both as percentages
print(f'Train Set Accuracy: {train_accuracy * 100:.2f}%')
print(f'Test Set Accuracy: {test_accuracy * 100:.2f}%')

In [None]:
# Absolute number of correct predictions in each split
train_correct = int((y_train_pred == y_train).sum())
test_correct = int((y_test_pred == y_test).sum())

print(f'Training set: {train_correct} correct out of {len(y_train)}')
print(f'Test set: {test_correct} correct out of {len(y_test)}')

In [None]:
# Per-class precision / recall / F1 (4 decimals), training split then test split
for heading, truth, predicted in (
    ("Classification Report on Training Set:", y_train, y_train_pred),
    ("Classification Report on Test Set:", y_test, y_test_pred),
):
    print(heading)
    print(classification_report(truth, predicted, digits=4))

In [None]:
# Raw confusion-matrix counts, training split then test split
for heading, truth, predicted in (
    ("Confusion Matrix on Training Set:", y_train, y_train_pred),
    ("Confusion Matrix on Test Set:", y_test, y_test_pred),
):
    print(heading)
    print(confusion_matrix(truth, predicted))

In [None]:
# Function to plot confusion matrix with percentages
def plot_confusion_matrix_with_percentages(y_true, y_pred, title, filename):
    """Draw a row-normalised confusion matrix as a heatmap and save it as a PNG.

    Each cell shows the percentage of the samples of one expected class (row)
    that were assigned to one predicted class (column).

    Parameters
    ----------
    y_true : array-like
        Expected labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    title : str
        Text placed before ``" CM (%)"`` in the figure title.
    filename : str
        Base name of the output; the figure is written to
        ``/kaggle/working/3{filename}.png`` at 300 DPI.

    Returns
    -------
    None
        The figure is left open on the current pyplot state.
    """
    # Generate confusion matrix (raw counts)
    cm = confusion_matrix(y_true, y_pred)

    # Convert counts to row percentages. A row whose total is zero (a label
    # that only occurs among the predictions) is left at 0 instead of dividing
    # by zero and filling the row with NaN.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm_percentage = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    ) * 100

    # Plot confusion matrix
    plt.figure(figsize=(4, 3))
    sns.heatmap(cm_percentage, annot=True, fmt=".2f", cmap="Blues", cbar=False)

    # Add titles and labels
    plt.title(f'{title} CM (%)', fontsize=11)
    plt.xlabel('Predicted', fontsize=10)
    plt.ylabel('Expected', fontsize=10)

    # Adjust layout to prevent cropping
    plt.tight_layout()

    # Save the figure in high resolution (300 DPI)
    plt.savefig(f'/kaggle/working/3{filename}.png', dpi=300)

In [None]:
# Labels present in each split (a label missing from one split changes the
# shape of its confusion matrix)
unique_classes_rf = np.unique(y_train)
print("Unique classes in training set (RF):", unique_classes_rf)
unique_classes_test_rf = np.unique(y_test)
print("Unique classes in test set (RF):", unique_classes_test_rf)

# Draw and save the percentage confusion matrix for each split
for truth, predicted, plot_title, plot_file in (
    (y_train, y_train_pred, "Training Set (RF)", "RF_Train_Confusion_Matrix"),
    (y_test, y_test_pred, "Test Set (RF)", "RF_Test_Confusion_Matrix"),
):
    plot_confusion_matrix_with_percentages(
        truth, predicted, title=plot_title, filename=plot_file
    )

In [None]:
# Importance score of every feature according to the tuned Random Forest
feature_importances = best_rf_model.feature_importances_

# Pair each feature name with its score and rank from most to least important
features_df = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Keep the 20 highest-ranked features
top_20_features = features_df.head(20)

# Horizontal bar chart, most important feature at the top
plt.figure(figsize=(6, 4))
plt.barh(
    top_20_features['Feature'],
    top_20_features['Importance'],
    color='skyblue',
)
plt.xlabel('Importance', fontsize=14)
plt.title('Top 20 Feature Importances (RF)', fontsize=16)
plt.gca().invert_yaxis()

# Tighten the layout, then write the figure to disk at 300 DPI
plt.tight_layout()
plt.savefig('/kaggle/working/Top_20_Feature_Importances_RF.png', dpi=300)

# Display the figure
plt.show()

# 2. Support Vector Machine (SVM)

In [None]:
# Baseline Support Vector Machine with default hyperparameters, fitted on the
# training split (fit() returns the estimator itself)
svm_model = SVC().fit(X_train, y_train)

# Predict both splits
y_train_pred_svm = svm_model.predict(X_train)
y_test_pred_svm = svm_model.predict(X_test)

# Accuracy on each split
train_accuracy_svm = accuracy_score(y_train, y_train_pred_svm)
test_accuracy_svm = accuracy_score(y_test, y_test_pred_svm)
print(f'Support Vector Machine - Training Accuracy: {train_accuracy_svm:.2f}')
print(f'Support Vector Machine - Test Accuracy: {test_accuracy_svm:.2f}')

# Per-class report on each split
for heading, truth, predicted in (
    ("Classification Report for SVM (Training Set):", y_train, y_train_pred_svm),
    ("Classification Report for SVM (Test Set):", y_test, y_test_pred_svm),
):
    print(heading)
    print(classification_report(truth, predicted))

# 3. Decision Tree Classifier

In [None]:
# Baseline Decision Tree with default hyperparameters. The seed is fixed so
# the fitted tree, and every score printed below, is the same on each run.
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train, y_train)

# Predict on both splits; the gap between them indicates over-fitting
y_train_pred_dt = dt_model.predict(X_train)
y_test_pred_dt = dt_model.predict(X_test)

# Accuracy (fraction of correctly classified rows)
train_accuracy_dt = accuracy_score(y_train, y_train_pred_dt)
test_accuracy_dt = accuracy_score(y_test, y_test_pred_dt)

print(f'Decision Tree - Training Accuracy: {train_accuracy_dt:.2f}')
print(f'Decision Tree - Test Accuracy: {test_accuracy_dt:.2f}')

# Per-class precision / recall / F1 for each split
print("Classification Report for Decision Tree (Training Set):")
print(classification_report(y_train, y_train_pred_dt))
print("Classification Report for Decision Tree (Test Set):")
print(classification_report(y_test, y_test_pred_dt))

# 4. K-Nearest Neighbors (KNN)

In [None]:
# Baseline K-Nearest Neighbors classifier voting over the 5 nearest training
# points, fitted on the training split (fit() returns the estimator itself)
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict both splits
y_train_pred_knn = knn_model.predict(X_train)
y_test_pred_knn = knn_model.predict(X_test)

# Accuracy on each split
train_accuracy_knn = accuracy_score(y_train, y_train_pred_knn)
test_accuracy_knn = accuracy_score(y_test, y_test_pred_knn)
print(f'K-Nearest Neighbors - Training Accuracy: {train_accuracy_knn:.2f}')
print(f'K-Nearest Neighbors - Test Accuracy: {test_accuracy_knn:.2f}')

# Per-class report on each split
for heading, truth, predicted in (
    ("Classification Report for KNN (Training Set):", y_train, y_train_pred_knn),
    ("Classification Report for KNN (Test Set):", y_test, y_test_pred_knn),
):
    print(heading)
    print(classification_report(truth, predicted))

# 5. Gradient Boosting Classifier

In [None]:
# Baseline Gradient Boosting classifier with default hyperparameters. The seed
# is fixed so the fitted ensemble, and every score printed below, is the same
# on each run.
gb_model = GradientBoostingClassifier(random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Predict on both splits; the gap between them indicates over-fitting
y_train_pred_gb = gb_model.predict(X_train)
y_test_pred_gb = gb_model.predict(X_test)

# Accuracy (fraction of correctly classified rows)
train_accuracy_gb = accuracy_score(y_train, y_train_pred_gb)
test_accuracy_gb = accuracy_score(y_test, y_test_pred_gb)

print(f'Gradient Boosting Classifier - Training Accuracy: {train_accuracy_gb:.2f}')
print(f'Gradient Boosting Classifier - Test Accuracy: {test_accuracy_gb:.2f}')

# Per-class precision / recall / F1 for each split
print("Classification Report for Gradient Boosting (Training Set):")
print(classification_report(y_train, y_train_pred_gb))
print("Classification Report for Gradient Boosting (Test Set):")
print(classification_report(y_test, y_test_pred_gb))

# 6. XGBoost

In [None]:
# Baseline XGBoost classifier evaluated with log-loss
xgb_model = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Fit on the training split
xgb_model.fit(X_train, y_train)

# Predict both splits
y_train_pred_xgb = xgb_model.predict(X_train)
y_test_pred_xgb = xgb_model.predict(X_test)

# Accuracy on each split
train_accuracy_xgb = accuracy_score(y_train, y_train_pred_xgb)
test_accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
print(f'XGBoost - Training Accuracy: {train_accuracy_xgb:.2f}')
print(f'XGBoost - Test Accuracy: {test_accuracy_xgb:.2f}')

# Per-class report on each split
for heading, truth, predicted in (
    ("Classification Report for XGBoost (Training Set):", y_train, y_train_pred_xgb),
    ("Classification Report for XGBoost (Test Set):", y_test, y_test_pred_xgb),
):
    print(heading)
    print(classification_report(truth, predicted))

**6.1 Add RandomizedSearchCV**

In [None]:
# Candidate values the randomized search samples from
param_distributions_xgb = dict(
    n_estimators=np.arange(50, 201, 10),
    max_depth=[3, 6, 10, 15],
    learning_rate=np.linspace(0.01, 0.3, 10),
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.2, 0.3],
    min_child_weight=[1, 5, 10],
)

# 30 sampled configurations, each scored with 3-fold cross-validation;
# the sampler is seeded and all cores are used
random_search_xgb = RandomizedSearchCV(
    xgb_model,
    param_distributions_xgb,
    n_iter=30,
    cv=3,
    n_jobs=-1,
    random_state=7,
    verbose=2,
)

# Run the search on the training split
random_search_xgb.fit(X_train, y_train)

# Best sampled configuration
best_params_random_xgb = random_search_xgb.best_params_
print("Best parameters from Randomized Search for XGBoost:", best_params_random_xgb)

**6.2 Add GridSearchCV**

In [None]:
# Build a small grid around the best randomized-search configuration.
# Neighbours are clamped so no grid point falls outside the valid range, and
# duplicates produced by clamping are removed.
best_n_estimators_xgb = int(best_params_random_xgb['n_estimators'])
best_max_depth_xgb = int(best_params_random_xgb['max_depth'])
best_learning_rate_xgb = float(best_params_random_xgb['learning_rate'])

# n_estimators must be at least 1
n_estimators_values_xgb = sorted({
    max(1, best_n_estimators_xgb - 10),
    best_n_estimators_xgb,
    best_n_estimators_xgb + 10
})

# Keep tree depth at 1 or more
max_depth_values_xgb = sorted({
    max(1, best_max_depth_xgb - 2),
    best_max_depth_xgb,
    best_max_depth_xgb + 2
})

# The learning rate must stay positive. The smallest sampled rates are below
# the 0.05 step, so subtracting the step would give zero or a negative value;
# in that case the lower neighbour is half the best rate instead.
learning_rate_step = 0.05
if best_learning_rate_xgb > learning_rate_step:
    lower_learning_rate = best_learning_rate_xgb - learning_rate_step
else:
    lower_learning_rate = best_learning_rate_xgb / 2
learning_rate_values_xgb = sorted({
    lower_learning_rate,
    best_learning_rate_xgb,
    best_learning_rate_xgb + learning_rate_step
})

# Create the parameter grid (remaining parameters are pinned to their best value)
param_grid_xgb = {
    'n_estimators': n_estimators_values_xgb,
    'max_depth': max_depth_values_xgb,
    'learning_rate': learning_rate_values_xgb,
    'subsample': [best_params_random_xgb['subsample']],
    'colsample_bytree': [best_params_random_xgb['colsample_bytree']],
    'gamma': [best_params_random_xgb['gamma']],
    'min_child_weight': [best_params_random_xgb['min_child_weight']]
}

# Set up GridSearchCV: exhaustive 3-fold search over the grid
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    cv=3,
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV on the training data
grid_search_xgb.fit(X_train, y_train)

# Get the best parameters from GridSearchCV
best_params_grid_xgb = grid_search_xgb.best_params_
print("Best parameters from Grid Search for XGBoost:", best_params_grid_xgb)

# Predict using the best model
best_xgb_model = grid_search_xgb.best_estimator_
y_test_pred_xgb = best_xgb_model.predict(X_test)

# Evaluate the model performance
test_accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
print(f'Test Set Accuracy with Best Hyperparameters from Grid Search: {test_accuracy_xgb * 100:.2f}%')

**6.3 Evaluate the model**

In [None]:
# Tuned XGBoost model selected by the grid search
best_xgb_model = grid_search_xgb.best_estimator_

# Predict the training split first, then the test split
y_train_pred_xgb, y_test_pred_xgb = (
    best_xgb_model.predict(split) for split in (X_train, X_test)
)

In [None]:
# Accuracy of the tuned XGBoost model, reported as a percentage per split
train_accuracy_xgb = accuracy_score(y_train, y_train_pred_xgb)
print(f'Train Set Accuracy: {train_accuracy_xgb * 100:.2f}%')
test_accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
print(f'Test Set Accuracy: {test_accuracy_xgb * 100:.2f}%')

# Absolute number of correct predictions in each split
train_correct_xgb = int((y_train_pred_xgb == y_train).sum())
test_correct_xgb = int((y_test_pred_xgb == y_test).sum())
print(f'Training set: {train_correct_xgb} correct out of {len(y_train)}')
print(f'Test set: {test_correct_xgb} correct out of {len(y_test)}')

In [None]:
# Per-class precision / recall / F1 (4 decimals), training split then test split
for heading, truth, predicted in (
    ("Classification Report on Training Set:", y_train, y_train_pred_xgb),
    ("Classification Report on Test Set:", y_test, y_test_pred_xgb),
):
    print(heading)
    print(classification_report(truth, predicted, digits=4))

In [None]:
# Raw confusion-matrix counts, training split then test split
for heading, truth, predicted in (
    ("Confusion Matrix on Training Set:", y_train, y_train_pred_xgb),
    ("Confusion Matrix on Test Set:", y_test, y_test_pred_xgb),
):
    print(heading)
    print(confusion_matrix(truth, predicted))

In [None]:
# Labels present in each split (a label missing from one split changes the
# shape of its confusion matrix)
unique_classes_xgb = np.unique(y_train)
print("Unique classes in training set (XGB):", unique_classes_xgb)
unique_classes_test_xgb = np.unique(y_test)
print("Unique classes in test set (XGB):", unique_classes_test_xgb)

# Draw and save the percentage confusion matrix for each split
for truth, predicted, plot_title, plot_file in (
    (y_train, y_train_pred_xgb, "Training Set (XGB)", "XGB_Train_Confusion_Matrix"),
    (y_test, y_test_pred_xgb, "Test Set (XGB)", "XGB_Test_Confusion_Matrix"),
):
    plot_confusion_matrix_with_percentages(
        truth, predicted, title=plot_title, filename=plot_file
    )

In [None]:
# Importance score of every feature according to the tuned XGBoost model
feature_importances_xgb = best_xgb_model.feature_importances_

# Create a DataFrame to hold feature names and their importances
features_df_xgb = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances_xgb
})

# Sort the DataFrame by importance, most important first
features_df_xgb = features_df_xgb.sort_values(by='Importance', ascending=False)

# Number of features shown in the chart (sized for the compact 5x3 figure).
# The title is derived from the rows actually plotted, so it can no longer
# claim a different count than the chart displays.
top_n_xgb = 12
top_features_xgb = features_df_xgb.head(top_n_xgb)

# Horizontal bar chart, most important feature at the top
plt.figure(figsize=(5, 3))
plt.barh(top_features_xgb['Feature'], top_features_xgb['Importance'], color='skyblue')
plt.xlabel('Importance', fontsize=10)
plt.title(f'Top {len(top_features_xgb)} Feature Importances (XGB)', fontsize=10)
plt.gca().invert_yaxis()
plt.yticks(fontsize=8)
plt.xticks(fontsize=8)

# Save the plot (output path kept unchanged so existing references still resolve)
plt.tight_layout()
plt.savefig('/kaggle/working/Top_20_Feature_Importances_XGB.png', dpi=300)

# Show the plot
plt.show()

# 7. Model Comparison

In [None]:
# Per-model metrics, keyed by display name
model_results = {}

# Seed shared by every stochastic model so the comparison is repeatable
comparison_seed = 42

# Default-hyperparameter models to compare.
# SVC needs probability=True to expose predict_proba for the ROC AUC below.
models = {
    'Random Forest': RandomForestClassifier(random_state=comparison_seed),
    'Support Vector Machine': SVC(probability=True, random_state=comparison_seed),
    'Decision Tree': DecisionTreeClassifier(random_state=comparison_seed),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=comparison_seed),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                                 random_state=comparison_seed)
}

# Train and evaluate each model.
# Loop-local names carry a cmp_ prefix so this cell does not overwrite the
# y_train_pred / y_test_pred / train_accuracy / test_accuracy values that the
# tuned Random Forest evaluation cells produce and plot.
for model_name, model in models.items():
    print(f'Training {model_name}...')

    # Train the model
    model.fit(X_train, y_train)

    # Accuracy from hard class predictions
    cmp_train_accuracy = accuracy_score(y_train, model.predict(X_train))
    cmp_test_accuracy = accuracy_score(y_test, model.predict(X_test))

    # ROC AUC from the probability of the second class column.
    # NOTE(review): taking column 1 assumes a binary target - confirm.
    cmp_train_roc_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
    cmp_test_roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    # Save results
    model_results[model_name] = {
        'Train Accuracy': cmp_train_accuracy,
        'Test Accuracy': cmp_test_accuracy,
        'Train ROC AUC': cmp_train_roc_auc,
        'Test ROC AUC': cmp_test_roc_auc
    }

    print(f'{model_name} - Train Accuracy: {cmp_train_accuracy:.2f}, Test Accuracy: {cmp_test_accuracy:.2f}, Train ROC AUC: {cmp_train_roc_auc:.2f}, Test ROC AUC: {cmp_test_roc_auc:.2f}')

**7.1 Bar Plot**

In [None]:
# One row per model, ranked by test accuracy, with both accuracies in percent
results_df = (
    pd.DataFrame(model_results)
    .T
    .reset_index()
    .rename(columns={'index': 'Model'})
    .sort_values(by='Test Accuracy', ascending=False)
    .assign(**{
        'Train Accuracy': lambda frame: frame['Train Accuracy'] * 100,
        'Test Accuracy': lambda frame: frame['Test Accuracy'] * 100,
    })
)

# Long format: one row per (model, accuracy type) so both bars share an axis
results_melted = pd.melt(
    results_df,
    id_vars='Model',
    value_vars=['Train Accuracy', 'Test Accuracy'],
    var_name='Accuracy Type',
    value_name='Accuracy',
)

# Grouped horizontal bars: train vs test accuracy for every model
plt.figure(figsize=(12, 8))
ax = sns.barplot(
    data=results_melted,
    x='Accuracy',
    y='Model',
    hue='Accuracy Type',
    palette='magma',
)

# Title and axis labels
ax.set_title('Model Train vs Test Accuracy Comparison', fontsize=20)
ax.set_xlabel('Accuracy (%)', fontsize=18)
ax.set_ylabel('Model', fontsize=18)

# Same tick label size on both axes
ax.tick_params(axis='both', labelsize=14)

# Legend centred below the axes, two entries side by side, no frame
legend = ax.legend(
    title='Accuracy Type',
    title_fontsize=16,
    fontsize=14,
    loc='upper center',
    bbox_to_anchor=(0.5, -0.15),
    ncol=2,
    frameon=False,
)

# Annotate each bar with its value
for bar_group in ax.containers:
    ax.bar_label(bar_group, fmt='%.1f%%', label_type='edge', fontsize=14, padding=5)

# Fit everything inside the canvas
plt.tight_layout()

# Write the figure to disk, then display it
save_path = '/kaggle/working/model_accuracy_comparison.png'
plt.savefig(save_path, dpi=300)
plt.show()

print(f"Plot saved at {save_path}")

**7.2 Plot ROC Curves for All Models**

In [None]:
# Test-set ROC curve of every fitted comparison model on one figure
plt.figure(figsize=(8, 6))

for model_name, model in models.items():
    # Probability of the second class column for each test row
    y_test_proba = model.predict_proba(X_test)[:, 1]

    # False/true positive rates across thresholds (thresholds not needed)
    fpr, tpr, _ = roc_curve(y_test, y_test_proba)

    # Curve labelled with the AUC stored during the comparison run
    test_auc = model_results[model_name]["Test ROC AUC"]
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {test_auc:.2f})')

# Chance-level diagonal for reference
plt.plot([0, 1], [0, 1], 'k--', linewidth=2)

# Axis limits, labels, title and legend
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves Comparison (Test)', fontsize=14)
plt.legend(loc='lower right')

# Write the figure to disk, then display it
save_path = '/kaggle/working/roc_curves_comparison.png'
plt.savefig(save_path, dpi=300)
plt.show()

print(f"ROC curve figure saved at: {save_path}")

In [None]:
# Training-set ROC curve of every fitted comparison model on one figure
plt.figure(figsize=(8, 6))

for model_name, model in models.items():
    # Probability of the second class column for each training row
    y_train_proba = model.predict_proba(X_train)[:, 1]

    # False/true positive rates across thresholds (thresholds not needed)
    fpr, tpr, _ = roc_curve(y_train, y_train_proba)

    # Curve labelled with the AUC stored during the comparison run
    train_auc = model_results[model_name]["Train ROC AUC"]
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {train_auc:.2f})')

# Chance-level diagonal for reference
plt.plot([0, 1], [0, 1], 'k--', linewidth=2)

# Axis limits, labels, title and legend
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate (FPR)', fontsize=12)
plt.ylabel('True Positive Rate (TPR)', fontsize=12)
plt.title('ROC Curves Comparison (Training)', fontsize=14)
plt.legend(loc='lower right')

# Write the figure to disk, then display it
save_path = '/kaggle/working/roc_curves_train_comparison.png'
plt.savefig(save_path, dpi=300)
plt.show()

print(f"ROC curve figure saved at: {save_path}")

**7.3 Comparative Table**

In [None]:
# Columns of the comparison table, in display order
metric_columns = ['Train Accuracy', 'Test Accuracy', 'Train ROC AUC', 'Test ROC AUC']

# One record per model: its name followed by the four stored metrics
table_rows = []
for model_name, metrics in model_results.items():
    row = {'Model': model_name}
    for metric_name in metric_columns:
        row[metric_name] = metrics[metric_name]
    table_rows.append(row)
results_df = pd.DataFrame(table_rows)

# Print the table as plain text without the row index
print(results_df[['Model'] + metric_columns].to_string(index=False))