# Bank Marketing Classification: Comparing Class-Imbalance Strategies

In [None]:
import time
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from ucimlrepo import fetch_ucirepo

## Import and pre-process data

In [None]:
# Load the features (X) and target (y), preferring local CSV copies and
# falling back to the UCI ML repository (dataset id 222) when they cannot be read.
path = './'

try:
    X = pd.read_csv(path+'X.csv')
    y = pd.read_csv(path+'y.csv')
    print('Data loaded from directory:', path)
except (OSError, ValueError) as load_error:
    # OSError covers missing/unreadable files; pandas' empty-data and parser
    # errors are ValueError subclasses. Anything else (e.g. KeyboardInterrupt)
    # is allowed to propagate instead of being silently swallowed.
    print(f'Local CSV files could not be read ({load_error}); fetching from UCIML instead')
    bank_marketing = fetch_ucirepo(id=222)
    X = bank_marketing.data.features
    y = bank_marketing.data.targets
    print('Data loaded from UCIML')

# Build the working frame as a NEW object: `df = X` followed by `df['y'] = ...`
# would add the target column to X itself (both names point at the same frame).
df = X.assign(y=y['y'])

In [None]:
# Share of missing values per column (as formatted percentages), followed by
# the number of rows in the working frame.
missing_counts = df.isna().sum()
percents = missing_counts / len(df) * 100
formatted_percents = percents.map('{:.2f}%'.format)
print(formatted_percents, '\n')
print('Length of DataFrame: {}'.format(len(df)), '\n\n')

In [None]:
# Remove the 'poutcome' and 'contact' columns before rows with missing values
# are filtered out (presumably because of their missing-value share — confirm
# against the percentages printed by the previous cell).
columns_to_drop = ['poutcome', 'contact']
df_clean = df.drop(columns_to_drop, axis=1)

In [None]:
# Keep only rows that have no missing value in any remaining column.
df_clean = df_clean.dropna(axis=0, how='any')

In [None]:
# Sanity check after cleaning: object type, row count before and after the
# drop, and the total number of NaNs left (expected to be 0 after dropna).
cleaning_summary = (
    type(df_clean),
    len(df),
    len(df_clean),
    df_clean.isna().sum().sum(),
)
for item in cleaning_summary:
    print(item)

In [None]:
# Overview table: one row per column of df_clean with its dtype and, for
# object-typed columns only, the number of distinct values.
def _object_cardinality(col):
    """Return the distinct-value count of df_clean[col] for object columns, else None."""
    series = df_clean[col]
    if series.dtype == 'object':
        return series.nunique()
    return None


dtype_info = pd.DataFrame(dict(Column=df_clean.columns, Dtype=df_clean.dtypes))
dtype_info['Unique Values'] = dtype_info['Column'].apply(_object_cardinality)
dtype_info

In [None]:
# Print the sorted distinct values of every object-typed column, plus
# 'day_of_week', which is listed regardless of its dtype.
columns_to_list = [
    col for col in df_clean.columns
    if col in ['day_of_week'] or df_clean[col].dtype == 'object'
]
for col in columns_to_list:
    distinct_values = sorted(df_clean[col].unique())
    print(f'{col}: \n{distinct_values}', '\n')

In [None]:
# Rename 'day_of_week' to 'day_of_month' (presumably the values are calendar
# days rather than weekdays — see the unique values listed by the previous cell).
df_clean = df_clean.rename(columns={'day_of_week': 'day_of_month'})

In [None]:
# Integer codes for a text-valued target. They follow the labelling this
# notebook uses when it prints the class distribution (0 -> 'yes', 1 -> 'no').
# NOTE(review): this is the reverse of the usual 1 = positive convention —
# confirm it matches how any pre-encoded y.csv was produced.
TARGET_ENCODING = {'yes': 0, 'no': 1}

# Create a binary column for pdays = -1 (rebinding instead of mutating in
# place keeps this cell safe to re-run).
df_clean = df_clean.assign(
    not_previously_contacted=(df_clean['pdays'] == -1).astype(int)
)

# The target must survive encoding as a single column named 'y': the next cell
# reads df_encoded['y']. If 'y' holds text labels it would otherwise be picked
# up as an object column below and replaced by a dummy column, so encode it
# explicitly first. A target that is already numeric is left untouched.
if not pd.api.types.is_numeric_dtype(df_clean['y']):
    encoded_target = df_clean['y'].map(TARGET_ENCODING)
    if encoded_target.isna().any():
        unexpected_labels = sorted(df_clean.loc[encoded_target.isna(), 'y'].astype(str).unique())
        raise ValueError(f"Unexpected target labels in 'y': {unexpected_labels}")
    df_clean = df_clean.assign(y=encoded_target.astype(int))

# Identify object-type feature columns (typically categorical); never the target
object_columns = (
    df_clean.select_dtypes(include=['object']).columns.drop('y', errors='ignore')
)

# One-hot encode only the object columns, dropping the first category
df_encoded = pd.get_dummies(df_clean, columns=object_columns, drop_first=True, dtype=float)

# Preview the encoded frame
df_encoded.head()

In [None]:
# Separate the target column from the encoded feature matrix.
target_column = 'y'
y = df_encoded[target_column]
X = df_encoded.drop(target_column, axis=1)

In [None]:
# Dimensions of the feature matrix and of the target vector.
for shape in (X.shape, y.shape):
    print(shape)

In [None]:
# Human-readable names for the encoded target classes.
# NOTE(review): 0 -> 'yes' / 1 -> 'no' is the reverse of the usual encoding —
# confirm against how the target column was encoded.
class_mapping = {
    0: 'yes',
    1: 'no'
}

# Relative class frequencies in percent. `y` is the complete target column at
# this point (no train/test split has happened yet), so the output is labelled
# as the full dataset rather than as training data.
class_distribution = y.value_counts(normalize=True) * 100

print('Class distribution of the full dataset:')
for label, value in class_distribution.items():
    # Fall back to the raw label instead of raising KeyError on an unmapped class.
    print(f"{class_mapping.get(label, label)}: {value:.2f}%")

print(f'\nLength of the full dataset: {len(y)}')

In [None]:
def split_scale_pca(X, y, test_size=0.3, random_state=42, n_components=None, verbose=True):
    """Split the data, standardise it and optionally project it with PCA.

    The scaler and the PCA are fitted on the training split only and then
    applied to the test split, so no test information leaks into the transforms.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split``.
    y : target vector; also used to stratify the split.
    test_size : passed to ``train_test_split`` (default 0.3).
    random_state : seed for the split and for the PCA solver (default 42).
    n_components : number of PCA components; ``None`` skips the PCA step.
    verbose : when True, print the resulting dimensions.

    Returns
    -------
    (X_train_final, X_test_final, y_train, y_test)
        The transformed train/test features and the matching targets.

    Notes
    -----
    The fitted scaler/PCA are not returned. A model trained on the output of
    one call must therefore be evaluated on the test split of that same call
    (or of a call with identical inputs and ``random_state``).
    """
    # Split data. test_size and random_state are forwarded; they used to be
    # hard-coded to 0.3 / 42, which made every caller-supplied seed a no-op.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Scale data (fit on train only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # PCA Decomposition
    if n_components is not None:
        # Seeding the PCA keeps repeated calls identical even when scikit-learn
        # selects a randomized solver.
        pca = PCA(n_components=n_components, random_state=random_state)

        X_train_final = pca.fit_transform(X_train_scaled)
        X_test_final = pca.transform(X_test_scaled)

        if verbose:
            print(f"Reduced dimensions: {X_train_final.shape[1]}")
    else:
        X_train_final = X_train_scaled
        X_test_final = X_test_scaled

    if verbose:
        print(f"Train Data Shape: {X_train_final.shape}")
        print(f"Test Data Shape: {X_test_final.shape}")

    return X_train_final, X_test_final, y_train, y_test

In [None]:
# Stratified 70/30 split of the full dataset, standardised and reduced to two
# principal components; these arrays feed the first training run below.
X_train, X_test, y_train, y_test = split_scale_pca(
    X,
    y,
    test_size=0.3,
    random_state=42,
    n_components=2,
    verbose=True,
)

## Training and Evaluation - Full Dataset

In [None]:
# Candidate estimators and the hyper-parameter grid searched for each of them.
param_grids = {
    'Logistic Regression': {
        'model': LogisticRegression(class_weight='balanced', max_iter=1000),
        'params': {'C': [0.1, 1], 'solver': ['liblinear', 'lbfgs']}
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {'max_depth': [5, 10], 'criterion': ['gini', 'entropy']}
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [3, 5], 'weights': ['uniform', 'distance']}
    },
    'SVC': {
        'model': SVC(class_weight='balanced'),
        'params': {'kernel': ['linear', 'rbf'], 'C': [0.1, 1]}
    }
}

# Per-model results, keyed by the model name used in param_grids.
train_times = {}
train_preds = {}
train_accuracies = {}
test_preds = {}
test_accuracies = {}
best_models = {}

for name, config in param_grids.items():
    print(f'Training {name} with GridSearchCV')

    # A single 80/20 shuffle split is used as the validation scheme.
    grid_search = GridSearchCV(
        estimator=config['model'],
        param_grid=config['params'],
        cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=42),
        n_jobs=-1,
        verbose=1
    )

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump when the system clock changes.
    start_time = time.perf_counter()
    grid_search.fit(X_train, y_train)
    elapsed_time = time.perf_counter() - start_time

    # Total search time (including the final refit) divided by the number of
    # parameter combinations tried.
    avg_train_time = elapsed_time / len(grid_search.cv_results_['params'])
    train_times[name] = avg_train_time

    best_model = grid_search.best_estimator_
    best_models[name] = best_model

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    train_preds[name] = y_train_pred
    train_accuracies[name] = accuracy_score(y_train, y_train_pred)
    test_preds[name] = y_test_pred
    test_accuracies[name] = accuracy_score(y_test, y_test_pred)

    print(f'{name} - Best Params: {grid_search.best_params_}')
    print(f'{name} - Train Accuracy: {train_accuracies[name]:.4f}')
    print(f'{name} - Test Accuracy: {test_accuracies[name]:.4f}')
    print(f'{name} - Average Train Time: {avg_train_time:.4f} seconds\n')

print('Training times for each model:')
for name, train_time in train_times.items():
    print(f'{name}: {train_time:.4f} seconds')

In [None]:
# Keys of the result dictionaries filled by the training cell above.
model_names = ['Logistic Regression', 'Decision Tree', 'KNN', 'SVC']

# One confusion matrix per model, built from the stored test predictions.
confusion_matrices = {
    name: confusion_matrix(y_test, test_preds[name]) for name in model_names
}

# Largest cell count over all matrices: the shared upper bound of the colour
# scale, so the four panels are directly comparable.
max_value = max([0] + [matrix.max() for matrix in confusion_matrices.values()])
half_scale = max_value / 2

# 2x2 grid, flattened so each model gets one axes object.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.flatten()

for ax, name in zip(axes, model_names):
    y_test_pred = test_preds[name]
    cm = confusion_matrices[name]

    # Text report for this model.
    print(f'{name} Model Performance:')
    print(f"Training Accuracy: {train_accuracies[name]:.2f}")
    print(f"Testing Accuracy: {test_accuracies[name]:.2f}")
    print(classification_report(y_test, y_test_pred))
    print(f'Confusion Matrix:\n{cm}\n')

    # Heat map drawn on the shared colour scale.
    im = ax.imshow(cm, interpolation='nearest', cmap='Blues', vmin=0, vmax=max_value)
    ax.set_title(f'Confusion Matrix: {name}')

    ax.set_xticks([0, 1])
    ax.set_yticks([0, 1])
    ax.set_xticklabels(['0', '1'])
    ax.set_yticklabels(['0', '1'])
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

    # Write the count into each cell; white text on the darker half of the scale.
    for i, row in enumerate(cm):
        for j, count in enumerate(row):
            text_color = "white" if count > half_scale else "black"
            ax.text(j, i, f'{count:d}', ha="center", va="center", color=text_color)

    # Colour bar attached to the right edge of this panel.
    cax = make_axes_locatable(ax).append_axes("right", size="5%", pad=0.1)
    fig.colorbar(im, cax=cax)

plt.tight_layout()
plt.show()


## Training and Evaluation - Resampled Dataset

In [None]:
# # Parameter grids for GridSearchCV
# param_grids = {
#     'Logistic Regression - Balanced': {
#         'model': LogisticRegression(class_weight='balanced', max_iter=1000),
#         'params': {'C': [0.1, 1], 'solver': ['liblinear', 'lbfgs']}
#     },
#     'Decision Tree - Balanced': { 
#         'model': DecisionTreeClassifier(random_state=42),
#         'params': {'max_depth': [5, 10], 'criterion': ['gini', 'entropy']}
#     },
#     'KNN - Balanced': {
#         'model': KNeighborsClassifier(),
#         'params': {'n_neighbors': [3, 5], 'weights': ['uniform', 'distance']}
#     },
#     'SVC - Balanced': {
#         'model': SVC(class_weight='balanced'),
#         'params': {'kernel': ['linear', 'rbf'], 'C': [0.1, 1]}
#     }
# }

# # Define seeds for each fold
# seeds = [43, 44, 45, 46]

# # Initialize dictionaries to store results
# balanced_train_times = {}
# balanced_best_params = {}
# balanced_train_accuracies = {name: [] for name in param_grids}
# balanced_test_accuracies = {name: [] for name in param_grids}
# balanced_best_params_all_folds = {name: [] for name in param_grids}
# balanced_best_models = {}
# balanced_test_preds = {}  

# # Inside the training loop for each seed and model, store the best model
# for seed in seeds:
#     print(f"\n=== Starting Fold with Seed {seed} ===")
    
#     # Random under-sampling with the current seed
#     undersampler = RandomUnderSampler(random_state=seed)
#     X_resampled, y_resampled = undersampler.fit_resample(X, y)

#     X_train, X_test, y_train, y_test = split_scale_pca(X_resampled, y_resampled, test_size=0.3, random_state=seed, n_components=2, verbose=False)

#     # Iterate over models in param_grids and perform training
#     for name, config in param_grids.items():
#         print(f'Training {name} with GridSearchCV for seed {seed}')

#         # Setup and fit GridSearchCV
#         grid_search = GridSearchCV(
#             estimator=config['model'],
#             param_grid=config['params'],
#             cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=seed),
#             n_jobs=-1,
#             verbose=1
#         )

#         # Train the model
#         start_time = time.time()
#         grid_search.fit(X_train, y_train)
#         elapsed_time = time.time() - start_time

#         # Calculate average train time
#         avg_train_time = elapsed_time / len(grid_search.cv_results_['params'])
#         balanced_train_times[f'{name} (Seed {seed})'] = avg_train_time

#         # Get the best model from this fold and store it
#         best_model = grid_search.best_estimator_
#         balanced_best_models[name] = best_model  # Store best model for each algorithm

#         # Store best params for this fold
#         balanced_best_params_all_folds[name].append(grid_search.best_params_)

#         # Train and test predictions
#         y_train_pred = best_model.predict(X_train)
#         y_test_pred = best_model.predict(X_test)

#         # Store accuracies and predictions
#         balanced_train_accuracies[name].append(accuracy_score(y_train, y_train_pred))
#         balanced_test_accuracies[name].append(accuracy_score(y_test, y_test_pred))
        
#         # NEW: Store test predictions for evaluation later
#         balanced_test_preds[name] = y_test_pred

#         # Print results
#         print(f'{name} (Seed {seed}) - Best Params: {grid_search.best_params_}')
#         print(f'{name} (Seed {seed}) - Train Accuracy: {balanced_train_accuracies[name][-1]:.4f}')
#         print(f'{name} (Seed {seed}) - Test Accuracy: {balanced_test_accuracies[name][-1]:.4f}')
#         print(f'{name} (Seed {seed}) - Average Train Time: {avg_train_time:.4f} seconds\n')


# # Calculate average accuracies and find the most common best params
# for name in param_grids:
#     avg_train_acc = sum(balanced_train_accuracies[name]) / len(balanced_train_accuracies[name])
#     avg_test_acc = sum(balanced_test_accuracies[name]) / len(balanced_test_accuracies[name])

#     # Find the most common best params
#     best_params_counter = Counter([str(params) for params in balanced_best_params_all_folds[name]])
#     most_common_params = best_params_counter.most_common(1)

#     # If there's a tie or no clear winner, use the best params from the highest test accuracy fold
#     if most_common_params[0][1] == 1:
#         best_idx = balanced_test_accuracies[name].index(max(balanced_test_accuracies[name]))
#         best_params_for_model = balanced_best_params_all_folds[name][best_idx]
#     else:
#         best_params_for_model = eval(most_common_params[0][0])

#     print(f'\n=== Summary for {name} ===')
#     print(f'Average Train Accuracy: {avg_train_acc:.4f}')
#     print(f'Average Test Accuracy: {avg_test_acc:.4f}')
#     print(f'Most Common Best Params: {best_params_for_model}')


In [None]:
# Parameter grids for GridSearchCV
param_grids = {
    'Logistic Regression - Resampled': {
        'model': LogisticRegression(class_weight='balanced', max_iter=1000),
        'params': {'C': [0.1, 1], 'solver': ['liblinear', 'lbfgs']}
    },
    'Decision Tree - Resampled': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {'max_depth': [5, 10], 'criterion': ['gini', 'entropy']}
    },
    'KNN - Resampled': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [3, 5], 'weights': ['uniform', 'distance']}
    },
    'SVC - Resampled': {
        'model': SVC(class_weight='balanced'),
        'params': {'kernel': ['linear', 'rbf'], 'C': [0.1, 1]}
    }
}

# Define seeds for each fold
seeds = [43, 44, 45, 46]

# Initialize dictionaries to store results
resampled_train_times = {}
resampled_best_params = {}
resampled_train_accuracies = {name: [] for name in param_grids}
resampled_test_accuracies = {name: [] for name in param_grids}
resampled_best_params_all_folds = {name: [] for name in param_grids}
resampled_best_models = {}
resampled_test_preds = {}

for seed in seeds:
    print(f"\n=== Starting Fold with Seed {seed} ===")

    # Split (and scale / project) the ORIGINAL data first, so the test split
    # keeps the real class balance and the scaler/PCA are fitted on training
    # rows only. Under-sampling the whole dataset before splitting resamples
    # the test split as well.
    X_train, X_test, y_train, y_test = split_scale_pca(X, y, test_size=0.3, random_state=seed, n_components=2, verbose=False)

    # Random under-sampling of the training split only, with the current seed
    undersampler = RandomUnderSampler(random_state=seed)
    X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

    # Iterate over models in param_grids and perform training
    for name, config in param_grids.items():
        print(f'Training {name} with GridSearchCV for seed {seed}')

        # Setup and fit GridSearchCV (single 80/20 validation split)
        grid_search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=seed),
            n_jobs=-1,
            verbose=1
        )

        # Train on the under-sampled training data; perf_counter() is the
        # monotonic clock meant for measuring intervals.
        start_time = time.perf_counter()
        grid_search.fit(X_train_resampled, y_train_resampled)
        elapsed_time = time.perf_counter() - start_time

        # Calculate average train time per parameter combination
        avg_train_time = elapsed_time / len(grid_search.cv_results_['params'])
        resampled_train_times[f'{name} (Seed {seed})'] = avg_train_time

        # Store the best model; later seeds overwrite earlier ones, so after
        # the loop this holds the model from the LAST seed.
        best_model = grid_search.best_estimator_
        resampled_best_models[name] = best_model

        # Store best params for this fold
        resampled_best_params_all_folds[name].append(grid_search.best_params_)

        # Predictions on the un-resampled train and test splits, so both
        # accuracies refer to the original class distribution.
        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)

        # Store accuracies and predictions
        resampled_train_accuracies[name].append(accuracy_score(y_train, y_train_pred))
        resampled_test_accuracies[name].append(accuracy_score(y_test, y_test_pred))

        # Store test predictions for evaluation later (last seed wins)
        resampled_test_preds[name] = y_test_pred

        # Print results
        print(f'{name} (Seed {seed}) - Best Params: {grid_search.best_params_}')
        print(f'{name} (Seed {seed}) - Train Accuracy: {resampled_train_accuracies[name][-1]:.4f}')
        print(f'{name} (Seed {seed}) - Test Accuracy: {resampled_test_accuracies[name][-1]:.4f}')
        print(f'{name} (Seed {seed}) - Average Train Time: {avg_train_time:.4f} seconds\n')

# Calculate average accuracies and find the most common best params
for name in param_grids:
    avg_train_acc = sum(resampled_train_accuracies[name]) / len(resampled_train_accuracies[name])
    avg_test_acc = sum(resampled_test_accuracies[name]) / len(resampled_test_accuracies[name])

    # Count identical parameter sets using a hashable key (sorted item tuples);
    # this avoids round-tripping the dict through str() and eval().
    best_params_counter = Counter(
        tuple(sorted(params.items())) for params in resampled_best_params_all_folds[name]
    )
    most_common_key, most_common_count = best_params_counter.most_common(1)[0]

    # If no parameter set won more than one fold, use the params from the fold
    # with the highest test accuracy
    if most_common_count == 1:
        best_idx = resampled_test_accuracies[name].index(max(resampled_test_accuracies[name]))
        best_params_for_model = resampled_best_params_all_folds[name][best_idx]
    else:
        best_params_for_model = dict(most_common_key)

    print(f'\n=== Summary for {name} ===')
    print(f'Average Train Accuracy: {avg_train_acc:.4f}')
    print(f'Average Test Accuracy: {avg_test_acc:.4f}')
    print(f'Most Common Best Params: {best_params_for_model}')

In [None]:
# # Model names with adjusted formatting
# model_names = ['Logistic Regression\n(Balanced)', 'Decision Tree\n(Balanced)', 'KNN\n(Balanced)', 'SVC\n(Balanced)']

# # Re-splitting and scaling the dataset
# X_train, X_test, y_train, y_test = split_scale_pca(X, y, test_size=0.3, random_state=42, n_components=2, verbose=False)

# # Step 1: Store confusion matrices for each model
# confusion_matrices = {}
# test_accuracies = {}

# # Calculate the maximum value across all confusion matrices to set consistent bounds
# max_value = 0

# for name in model_names:
#     # Fetch the best model from the dictionary
#     best_model = balanced_best_models[name.replace('\n(Balanced)', ' - Balanced')]

#     # Predict on the test set and store the predictions
#     y_test_pred = best_model.predict(X_test)
    
#     # Store the confusion matrix
#     cm = confusion_matrix(y_test, y_test_pred)
#     confusion_matrices[name] = cm

#     # Store the test accuracy
#     test_accuracies[name] = accuracy_score(y_test, y_test_pred)

#     # Update max_value with the highest value found in the confusion matrix
#     max_value = max(max_value, cm.max())

# # Step 2: Plot the stored confusion matrices using consistent scaling
# fig, axes = plt.subplots(2, 2, figsize=(10, 10))
# axes = axes.flatten()  # Flatten the axes array to easily access each subplot

# # Iterate through each model, print performance, and plot the confusion matrix
# for idx, name in enumerate(model_names):
#     cm = confusion_matrices[name]  # Use the stored confusion matrix
#     test_accuracy = test_accuracies[name]  # Use the stored test accuracy

#     # Clean the title for printing by removing the newline character
#     name_cleaned = name.replace('\n', ' ')

#     # Print the accuracy and classification report
#     print(f'{name_cleaned} Model Performance:')
#     print(f"Testing Accuracy on Full Test Set: {test_accuracy:.2f}")
#     print(f'Confusion Matrix:\n{cm}\n')

#     # Manually plot the confusion matrix using consistent color scaling
#     im = axes[idx].imshow(cm, interpolation='nearest', cmap='Blues', vmin=0, vmax=max_value)  # Set consistent color scaling

#     # Annotate the confusion matrix cells with the text (count values)
#     for i in range(cm.shape[0]):
#         for j in range(cm.shape[1]):
#             axes[idx].text(j, i, format(cm[i, j], 'd'),
#                            ha="center", va="center",
#                            color="white" if cm[i, j] > max_value / 2 else "black")  # Dynamic text color

#     # Set the title and labels
#     axes[idx].set_title(f'Confusion Matrix: {name}')
#     axes[idx].set_xticks([0, 1])
#     axes[idx].set_yticks([0, 1])
#     axes[idx].set_xticklabels(['0', '1'])
#     axes[idx].set_yticklabels(['0', '1'])
#     axes[idx].set_xlabel('Predicted label')
#     axes[idx].set_ylabel('True label')

#     # Use make_axes_locatable to create a color bar that aligns with the matrix
#     divider = make_axes_locatable(axes[idx])
#     cax = divider.append_axes("right", size="5%", pad=0.1)
#     fig.colorbar(im, cax=cax)

# # Adjust layout for better spacing
# plt.tight_layout()
# plt.subplots_adjust(top=0.9, hspace=0.4)  # Adjust 'top' for more space at the top, and increase 'hspace'
# plt.show()

In [None]:
# Model names with adjusted formatting
model_names = ['Logistic Regression\n(Resampled)', 'Decision Tree\n(Resampled)', 'KNN\n(Resampled)', 'SVC\n(Resampled)']

# The stored models come from the LAST seed of the training loop. They must be
# scored on that run's own test split (`y_test` and the predictions stored by
# the loop). Calling split_scale_pca again here would refit the scaler/PCA and
# draw a split unrelated to the data the models were trained on.

# Step 1: Store confusion matrices for each model
confusion_matrices = {}
test_accuracies = {}

# Calculate the maximum value across all confusion matrices to set consistent bounds
max_value = 0

for name in model_names:
    # Predictions the training loop stored for this model
    y_test_pred = resampled_test_preds[name.replace('\n(Resampled)', ' - Resampled')]

    # Store the confusion matrix
    cm = confusion_matrix(y_test, y_test_pred)
    confusion_matrices[name] = cm

    # Store the test accuracy
    test_accuracies[name] = accuracy_score(y_test, y_test_pred)

    # Update max_value with the highest value found in the confusion matrix
    max_value = max(max_value, cm.max())

# Step 2: Plot the stored confusion matrices using consistent scaling
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.flatten()  # Flatten the axes array to easily access each subplot

# Iterate through each model, print performance, and plot the confusion matrix
for idx, name in enumerate(model_names):
    cm = confusion_matrices[name]  # Use the stored confusion matrix
    test_accuracy = test_accuracies[name]  # Use the stored test accuracy

    # Clean the title for printing by removing the newline character
    name_cleaned = name.replace('\n', ' ')

    # Print the accuracy and confusion matrix
    print(f'{name_cleaned} Model Performance:')
    print(f"Testing Accuracy on Held-out Test Set (last seed): {test_accuracy:.2f}")
    print(f'Confusion Matrix:\n{cm}\n')

    # Manually plot the confusion matrix using consistent color scaling
    im = axes[idx].imshow(cm, interpolation='nearest', cmap='Blues', vmin=0, vmax=max_value)

    # Annotate the confusion matrix cells with the text (count values)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            axes[idx].text(j, i, format(cm[i, j], 'd'),
                           ha="center", va="center",
                           color="white" if cm[i, j] > max_value / 2 else "black")  # Dynamic text color

    # Set the title and labels
    axes[idx].set_title(f'Confusion Matrix: {name}')
    axes[idx].set_xticks([0, 1])
    axes[idx].set_yticks([0, 1])
    axes[idx].set_xticklabels(['0', '1'])
    axes[idx].set_yticklabels(['0', '1'])
    axes[idx].set_xlabel('Predicted label')
    axes[idx].set_ylabel('True label')

    # Use make_axes_locatable to create a color bar that aligns with the matrix
    divider = make_axes_locatable(axes[idx])
    cax = divider.append_axes("right", size="5%", pad=0.1)
    fig.colorbar(im, cax=cax)

# Adjust layout for better spacing
plt.tight_layout()
plt.subplots_adjust(top=0.9, hspace=0.4)  # Adjust 'top' for more space at the top, and increase 'hspace'
plt.show()


## SMOTE

In [None]:
# Parameter grids for GridSearchCV
param_grids = {
    'Logistic Regression - SMOTE': {
        'model': LogisticRegression(class_weight='balanced', max_iter=1000),
        'params': {'C': [0.1, 1], 'solver': ['liblinear', 'lbfgs']}
    },
    'Decision Tree - SMOTE': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {'max_depth': [5, 10], 'criterion': ['gini', 'entropy']}
    },
    'KNN - SMOTE': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [3, 5], 'weights': ['uniform', 'distance']}
    },
    'SVC - SMOTE': {
        'model': SVC(class_weight='balanced'),
        'params': {'kernel': ['linear', 'rbf'], 'C': [0.1, 1]}
    }
}

# Define seeds for each fold
seeds = [43, 44, 45, 46]

# Initialize dictionaries to store results
smote_train_times = {}
smote_best_params = {}
smote_train_accuracies = {name: [] for name in param_grids}
smote_test_accuracies = {name: [] for name in param_grids}
smote_best_params_all_folds = {name: [] for name in param_grids}
smote_best_models = {}
smote_test_preds = {}


for seed in seeds:
    print(f"\n=== Starting Fold with Seed {seed} ===")

    # Single stratified 80/20 validation split. StratifiedKFold cannot be used
    # with n_splits=1 (k-fold requires at least 2 splits and raises ValueError),
    # so the stratified counterpart of ShuffleSplit is used instead.
    stratified_cv = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

    # Split original data into train and test
    X_train, X_test, y_train, y_test = split_scale_pca(X, y, test_size=0.3, random_state=seed, n_components=2, verbose=False)

    # Apply SMOTE only to the training set
    smote = SMOTE(random_state=seed)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    for name, config in param_grids.items():
        print(f'Training {name} with GridSearchCV for seed {seed}')

        # Setup and fit GridSearchCV with the stratified validation split.
        # NOTE(review): the validation split is drawn after SMOTE, so it
        # contains synthetic samples; wrapping SMOTE and the estimator in an
        # imblearn Pipeline would resample inside each training fold only.
        grid_search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            cv=stratified_cv,
            n_jobs=-1,
            verbose=1
        )

        # Train the model on the resampled training data; perf_counter() is
        # the monotonic clock meant for measuring intervals.
        start_time = time.perf_counter()
        grid_search.fit(X_train_resampled, y_train_resampled)
        elapsed_time = time.perf_counter() - start_time

        # Calculate average train time per parameter combination
        avg_train_time = elapsed_time / len(grid_search.cv_results_['params'])
        smote_train_times[f'{name} (Seed {seed})'] = avg_train_time

        # Store the best model; later seeds overwrite earlier ones, so after
        # the loop this holds the model from the LAST seed.
        best_model = grid_search.best_estimator_
        smote_best_models[name] = best_model

        # Store best params for this fold
        smote_best_params_all_folds[name].append(grid_search.best_params_)

        # Train and test predictions (on the original, non-synthetic samples)
        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)

        # Store accuracies and predictions
        smote_train_accuracies[name].append(accuracy_score(y_train, y_train_pred))
        smote_test_accuracies[name].append(accuracy_score(y_test, y_test_pred))

        # Store test predictions for evaluation later (last seed wins)
        smote_test_preds[name] = y_test_pred

        # Print results
        print(f'{name} (Seed {seed}) - Best Params: {grid_search.best_params_}')
        print(f'{name} (Seed {seed}) - Train Accuracy: {smote_train_accuracies[name][-1]:.4f}')
        print(f'{name} (Seed {seed}) - Test Accuracy: {smote_test_accuracies[name][-1]:.4f}')
        print(f'{name} (Seed {seed}) - Average Train Time: {avg_train_time:.4f} seconds\n')


# Calculate average accuracies and find the most common best params
for name in param_grids:
    avg_train_acc = sum(smote_train_accuracies[name]) / len(smote_train_accuracies[name])
    avg_test_acc = sum(smote_test_accuracies[name]) / len(smote_test_accuracies[name])

    # Count identical parameter sets using a hashable key (sorted item tuples);
    # this avoids round-tripping the dict through str() and eval().
    best_params_counter = Counter(
        tuple(sorted(params.items())) for params in smote_best_params_all_folds[name]
    )
    most_common_key, most_common_count = best_params_counter.most_common(1)[0]

    # If no parameter set won more than one fold, use the params from the fold
    # with the highest test accuracy
    if most_common_count == 1:
        best_idx = smote_test_accuracies[name].index(max(smote_test_accuracies[name]))
        best_params_for_model = smote_best_params_all_folds[name][best_idx]
    else:
        best_params_for_model = dict(most_common_key)

    print(f'\n=== Summary for {name} ===')
    print(f'Average Train Accuracy: {avg_train_acc:.4f}')
    print(f'Average Test Accuracy: {avg_test_acc:.4f}')
    print(f'Most Common Best Params: {best_params_for_model}')


In [None]:
# All names used here (confusion_matrix, accuracy_score, plt,
# make_axes_locatable) are imported in the notebook's import cell, so the
# repeated mid-notebook imports were dropped.

# Model names with adjusted formatting for SMOTE
model_names = ['Logistic Regression\n(SMOTE)', 'Decision Tree\n(SMOTE)', 'KNN\n(SMOTE)', 'SVC\n(SMOTE)']

# The stored models come from the LAST seed of the SMOTE training loop. They
# are scored on that run's own untouched test split (`y_test` and the
# predictions stored by the loop). Re-splitting with a hard-coded seed only
# matches the data the models were trained on by accident.

# Step 1: Store confusion matrices for each model
confusion_matrices = {}
test_accuracies = {}

# Calculate the maximum value across all confusion matrices to set consistent bounds
max_value = 0

for name in model_names:
    # Predictions the training loop stored for this model
    y_test_pred = smote_test_preds[name.replace('\n(SMOTE)', ' - SMOTE')]

    # Store the confusion matrix
    cm = confusion_matrix(y_test, y_test_pred)
    confusion_matrices[name] = cm

    # Store the test accuracy
    test_accuracies[name] = accuracy_score(y_test, y_test_pred)

    # Update max_value with the highest value found in the confusion matrix
    max_value = max(max_value, cm.max())

# Step 2: Plot the stored confusion matrices using consistent scaling
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.flatten()  # Flatten the axes array to easily access each subplot

# Iterate through each model, print performance, and plot the confusion matrix
for idx, name in enumerate(model_names):
    cm = confusion_matrices[name]  # Use the stored confusion matrix
    test_accuracy = test_accuracies[name]  # Use the stored test accuracy

    # Clean the title for printing by removing the newline character
    name_cleaned = name.replace('\n', ' ')

    # Print the accuracy and confusion matrix
    print(f'{name_cleaned} Model Performance:')
    print(f"Testing Accuracy on Full Test Set: {test_accuracy:.2f}")
    print(f'Confusion Matrix:\n{cm}\n')

    # Manually plot the confusion matrix using consistent color scaling
    im = axes[idx].imshow(cm, interpolation='nearest', cmap='Blues', vmin=0, vmax=max_value)

    # Annotate the confusion matrix cells with the text (count values)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            axes[idx].text(j, i, format(cm[i, j], 'd'),
                           ha="center", va="center",
                           color="white" if cm[i, j] > max_value / 2 else "black")  # Dynamic text color

    # Set the title and labels
    axes[idx].set_title(f'Confusion Matrix: {name}')
    axes[idx].set_xticks([0, 1])
    axes[idx].set_yticks([0, 1])
    axes[idx].set_xticklabels(['0', '1'])
    axes[idx].set_yticklabels(['0', '1'])
    axes[idx].set_xlabel('Predicted label')
    axes[idx].set_ylabel('True label')

    # Use make_axes_locatable to create a color bar that aligns with the matrix
    divider = make_axes_locatable(axes[idx])
    cax = divider.append_axes("right", size="5%", pad=0.1)
    fig.colorbar(im, cax=cax)

# Adjust layout for better spacing
plt.tight_layout()
plt.subplots_adjust(top=0.9, hspace=0.4)  # Adjust 'top' for more space at the top, and increase 'hspace'
plt.show()
