# Import the Dataset

In [1]:
# Placeholder bindings for X and y; presumably replaced by the real dataset
# when the dataset notebook is executed in the next cell - TODO confirm
X = 0
y = 0

In [None]:
# This code is downloading the notebook from GitHub and running it
import requests
from pathlib import Path
url = "https://raw.githubusercontent.com/nbakas/MachineLearning/refs/heads/main/08-BankingDataset.ipynb"
filename = url.split("/")[-1]
local_path = Path.cwd() / filename
if not local_path.exists():
    response = requests.get(url)
    response.raise_for_status()
    local_path.write_bytes(response.content)
%run 08-BankingDataset.ipynb

In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

In [4]:
# Select a random subset of X,y, to speed up the computation. X,y are pandas DataFrames
# Cap the sample size at the number of available rows: DataFrame.sample raises a ValueError
# when asked for more rows than exist (it samples without replacement by default)
n_subset = min(10_000, len(X))
X_subset = X.sample(n=n_subset, random_state=42)
# Pick the matching targets by index label; this relies on X and y sharing the same index
y_subset = y.loc[X_subset.index]

In [5]:
# Hold out 20% of the sampled rows as a test set.
# Stratifying on the target keeps the class proportions the same in both parts;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_subset,
    y_subset,
    test_size=0.2,
    stratify=y_subset,
    random_state=42,
)

# Random Forests

In [6]:
# Candidate hyperparameter values for the random forest; the grid search
# tries every combination (2 x 2 x 2 x 2 = 16 candidates)
param_grid = dict(
    n_estimators=[100, 500],     # how many trees the forest grows
    max_depth=[5, 10],           # depth limit of each tree
    min_samples_split=[10, 20],  # smallest node size that may still be split
    min_samples_leaf=[10, 20],   # smallest number of samples allowed in a leaf
)

In [7]:
# Base estimator and cross-validation splitter used by the grid search.
# Both receive a fixed random_state so that the forests and the fold assignment are repeatable.
rf = RandomForestClassifier(random_state=42)
# Five shuffled folds, stratified so that each fold keeps the class proportions of the data it splits
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [8]:
# Exhaustive search over param_grid for the random forest, ranked by cross-validated ROC AUC
grid_search = GridSearchCV(
    rf,                       # estimator that is cloned and fitted for every candidate
    param_grid,               # hyperparameter values to combine
    scoring='roc_auc',        # metric used to rank the candidates
    cv=skf,                   # stratified 5-fold splitter defined above
    return_train_score=True,  # also record scores on the training folds (plotted later)
    n_jobs=-1,                # use all available CPU cores
    verbose=4,                # detailed progress output while fitting
)

In [None]:
# Fit the model
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target and otherwise emits a
# DataConversionWarning for every fit of the search, so flatten it to shape (n_samples,) first.
# Only the argument is flattened - the y_train variable itself is left unchanged.
grid_search.fit(X_train, np.ravel(y_train))

In [10]:
# Estimator selected by the grid search (ranked with the 'roc_auc' scoring configured above)
best_rf = grid_search.best_estimator_

In [None]:
# Rank the features of the tuned random forest by importance and draw them as horizontal bars
importances = best_rf.feature_importances_
sorted_indices = importances.argsort()[::-1]  # feature positions, most important first
sorted_importances = importances[sorted_indices]
sorted_features = X_train.columns[sorted_indices]  # column names in the same order

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(sorted_features, sorted_importances)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance')
plt.show()

In [None]:
# Evaluate the tuned random forest on the test set: accuracy, per-class report and confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns # Import seaborn
y_pred = best_rf.predict(X_test) # Predict the test set
test_accuracy = accuracy_score(y_test, y_pred) # Calculate the accuracy of the test set
classification_report_str = classification_report(y_test, y_pred) # Calculate the classification report
print("Best hyperparameters:", grid_search.best_params_) # Print the best hyperparameters
print("Test set accuracy: {:.4f}".format(test_accuracy)) # Print the accuracy of the test set
print("\nClassification Report:\n", classification_report_str) # Print the classification report

# Use ONE explicit label order for both the matrices and the axis ticks.
# Taking the ticks from y_test...unique() (order of first appearance) while confusion_matrix sorts
# its labels could swap the tick labels relative to the rows/columns they describe.
class_labels = best_rf.classes_

# Row-normalised percentages (each actual class sums to 100) and raw counts
confusion_matrix_result = confusion_matrix(y_test, y_pred, labels=class_labels, normalize='true') * 100
confusion_matrix_raw = confusion_matrix(y_test, y_pred, labels=class_labels)

# Combine percentage and raw values into one text annotation per cell, e.g. "93.10%(812)"
annot = np.empty_like(confusion_matrix_result, dtype=object) # Object array so it can hold strings
for i in range(confusion_matrix_result.shape[0]): # Rows = actual classes
    for j in range(confusion_matrix_result.shape[1]): # Columns = predicted classes
        annot[i, j] = f"{confusion_matrix_result[i, j]:.2f}%({confusion_matrix_raw[i, j]})"

# Plot the confusion matrix; fmt='' because the annotations are already formatted strings
plt.figure(figsize=(10, 8)) # Create a figure
sns.heatmap(confusion_matrix_result, annot=annot, fmt='', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted') # Label the x-axis
plt.ylabel('Actual') # Label the y-axis
plt.title('Confusion Matrix') # Title the plot
plt.show() # Show the plot

In [None]:
# Plot mean training and cross-validation scores for every hyperparameter combination.
# The grid search above is scored with 'roc_auc', so the curves are ROC AUC values (not accuracy)
# and are labelled accordingly.
results = grid_search.cv_results_ # Get the results from the grid search
mean_train_scores = results['mean_train_score'] # Mean score on the training folds
mean_test_scores = results['mean_test_score'] # Mean score on the validation folds
iso = np.argsort(mean_test_scores) # Order the combinations by ascending cross-validation score
plt.figure(figsize=(14, 6)) # Create a figure
plt.plot(mean_train_scores[iso], label='Train ROC AUC', marker='o') # Plot the training score
plt.plot(mean_test_scores[iso], label='CV ROC AUC', marker='s') # Plot the cross-validation score
plt.xlabel('Hyperparameter Combination Index') # Label the x-axis
plt.ylabel('ROC AUC') # Label the y-axis
plt.ylim(0.85, 0.95)  # Zoom the y-axis to the 0.85-0.95 band; widen it if a curve is clipped
plt.title('Train vs Cross-Validation ROC AUC') # Title the plot
plt.legend() # Show the legend
plt.grid(True) # Show the grid
plt.tight_layout() # Adjust the layout
plt.show() # Show the plot

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Summary metrics for the tuned random forest on the test set.
# Hard class predictions feed accuracy / precision / recall / F1;
# the second probability column feeds the threshold-free ROC AUC.
y_test_pred = best_rf.predict(X_test)
y_test_prob = best_rf.predict_proba(X_test)[:, 1]

test_accuracy = accuracy_score(y_test, y_test_pred)
# Precision, recall and F1 are reported for the 'yes' class
test_precision = precision_score(y_test, y_test_pred, pos_label='yes')
test_recall = recall_score(y_test, y_test_pred, pos_label='yes')
test_f1 = f1_score(y_test, y_test_pred, pos_label='yes')
test_roc_auc = roc_auc_score(y_test, y_test_prob)

# Report everything together, rounded to four decimals
print(f"Test Set Accuracy: {test_accuracy:.4f}")
print(f"Test Set Precision: {test_precision:.4f}")
print(f"Test Set Recall: {test_recall:.4f}")
print(f"Test Set F1 Score: {test_f1:.4f}")
print(f"Test Set ROC AUC: {test_roc_auc:.4f}")

# XGBoost

In [None]:
# Simple binary mapping: encode the string class labels as integers for XGBoost
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# np.ravel flattens the single-column target to 1-D, the shape LabelEncoder expects
# (it also accepts an already-encoded 1-D array, so re-running this cell still works)
y_train = le.fit_transform(np.ravel(y_train))
# Encode the test labels with the mapping learned from the TRAINING labels.
# Re-fitting on the test labels (fit_transform) builds a second, independent mapping that
# is not guaranteed to assign the same integers to the same classes.
y_test = le.transform(np.ravel(y_test))

In [16]:
# Candidate hyperparameter values for XGBoost; the grid search
# tries every combination (2 x 3 x 2 x 2 = 24 candidates)
param_grid = dict(
    n_estimators=[100, 200],     # number of boosted trees
    max_depth=[3, 6, 10],        # depth limit of each tree
    learning_rate=[0.01, 0.1],   # learning rate
    subsample=[0.8, 1.0],        # subsample ratio
)

In [17]:
# XGBoost base estimator and the cross-validation splitter for its grid search
from xgboost import XGBClassifier
xgb = XGBClassifier(
    random_state=42,          # fixed seed for repeatable models
    eval_metric='logloss',    # evaluation metric passed to XGBoost
    use_label_encoder=False,  # labels are already integer-encoded in the cell above
)
# Same splitter configuration as for the random forest: five shuffled, stratified folds with a fixed seed
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [18]:
# Exhaustive search over the XGBoost param_grid, ranked by cross-validated accuracy
grid_search = GridSearchCV(
    xgb,                      # estimator that is cloned and fitted for every candidate
    param_grid,               # hyperparameter values to combine
    scoring='accuracy',       # metric used to rank the candidates
    cv=skf,                   # stratified 5-fold splitter defined above
    return_train_score=True,  # also record scores on the training folds (plotted later)
    n_jobs=-1,                # use all available CPU cores
    verbose=1,                # brief progress output while fitting
)

In [None]:
# Run the XGBoost grid search on the training data (y_train holds the integer-encoded labels)
grid_search.fit(X=X_train, y=y_train)

In [20]:
# Keep the estimator selected by the grid search; it is evaluated on the test set in the cells below
best_xgb = grid_search.best_estimator_

In [None]:
# Show the hyperparameter combination that the XGBoost grid search selected
print("Best hyperparameters:", grid_search.best_params_)

In [None]:
# Rank the features of the tuned XGBoost model by importance and draw them as horizontal bars
importances = best_xgb.feature_importances_
sorted_indices = importances.argsort()[::-1]  # feature positions, most important first
sorted_importances = importances[sorted_indices]
sorted_features = X_train.columns[sorted_indices]  # column names in the same order

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(sorted_features, sorted_importances)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importance')
plt.show()

In [None]:
# Score the tuned XGBoost model on the test set: accuracy, per-class report and confusion matrix
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

y_pred = best_xgb.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

print("Best hyperparameters:", grid_search.best_params_)
print("Test set accuracy: {:.4f}".format(test_accuracy))
print("\nClassification Report:\n", classification_report_str)

# Two views of the same matrix: row-normalised percentages (each actual class sums to 100) and raw counts
confusion_matrix_result = 100 * confusion_matrix(y_test, y_pred, normalize='true')
confusion_matrix_raw = confusion_matrix(y_test, y_pred)

# One text annotation per cell that shows the percentage followed by the raw count
n_rows, n_cols = confusion_matrix_result.shape
annot = np.empty_like(confusion_matrix_result, dtype=object)  # object dtype so the cells can hold strings
for i in range(n_rows):
    for j in range(n_cols):
        annot[i, j] = f"{confusion_matrix_result[i, j]:.2f}%({confusion_matrix_raw[i, j]})"

# Heatmap coloured by percentage; fmt='' because the annotations are already formatted strings
tick_labels = np.unique(y_test)  # sorted encoded class labels, used on both axes
plt.figure(figsize=(10, 8))
sns.heatmap(
    confusion_matrix_result,
    annot=annot,
    fmt='',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Compare mean training and cross-validation accuracy across the searched hyperparameter combinations
results = grid_search.cv_results_
mean_train_scores = results['mean_train_score']  # mean accuracy on the training folds
mean_test_scores = results['mean_test_score']    # mean accuracy on the validation folds
iso = np.argsort(mean_test_scores)               # combinations ordered by ascending CV accuracy

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(mean_train_scores[iso], label='Train Accuracy', marker='o')
ax.plot(mean_test_scores[iso], label='CV Accuracy', marker='s')
ax.set_xlabel('Hyperparameter Combination Index')
ax.set_ylabel('Accuracy')
ax.set_ylim(0.85, 1.05)  # zoom the y-axis to the 0.85-1.05 band
ax.set_title('Train vs Cross-Validation Accuracy')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Summary metrics for the tuned XGBoost model on the test set.
# Hard class predictions feed accuracy / precision / recall / F1;
# the second probability column feeds the threshold-free ROC AUC.
y_test_pred = best_xgb.predict(X_test)
y_test_prob = best_xgb.predict_proba(X_test)[:, 1]

test_accuracy = accuracy_score(y_test, y_test_pred)
# Labels are integer-encoded here, so the metrics use their default positive label
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_prob)

# Report everything together, rounded to four decimals
print(f"Test Set Accuracy: {test_accuracy:.4f}")
print(f"Test Set Precision: {test_precision:.4f}")
print(f"Test Set Recall: {test_recall:.4f}")
print(f"Test Set F1 Score: {test_f1:.4f}")
print(f"Test Set ROC AUC: {test_roc_auc:.4f}")