# Bank Marketing Prediction - Google Colab Notebook

This notebook replicates the bank marketing prediction project in a Google Colab environment.

**Workflow:**
1. **Setup Environment**: Mount Google Drive and install necessary Python libraries.
2. **Prepare Directories & Imports**: Import all libraries and create output directories.
3. **Define Preprocessing Functions**: Define functions to load and prepare the data.
4. **Define Model Training Function**: Define the main function for training the models, tuning the KNN hyperparameters, and evaluating all of them.
5. **Run Main Pipeline**: Execute the entire workflow.

### 1. Setup Environment

Mount Google Drive to access the dataset and install the required libraries.

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

# Install the third-party libraries used by the cells below.
# The leading "!" is IPython/Colab shell-escape syntax, not plain Python.
!pip install pandas scikit-learn matplotlib seaborn

### 2. Prepare Directories & Imports

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Create output directories in the Colab environment
output_dir = 'outputs'
cm_dir = os.path.join(output_dir, 'confusion-matrix')
scorer_dir = os.path.join(output_dir, 'scorer')

os.makedirs(cm_dir, exist_ok=True)
os.makedirs(scorer_dir, exist_ok=True)

split_ratios_for_dir = [0.7, 0.8, 0.9]
for ratio in split_ratios_for_dir:
    split_str = str(ratio).replace('.', '_')
    os.makedirs(os.path.join(scorer_dir, f'splits_{int(ratio*100)}_{100-int(ratio*100)}'), exist_ok=True)

print("Directories created successfully in the Colab environment.")

### 3. Define Preprocessing Functions

In [None]:
def load_data(file_path):
    """Read the semicolon-delimited CSV at *file_path* into a DataFrame."""
    frame = pd.read_csv(file_path, delimiter=';')
    return frame

def preprocess_data(df):
    """Label-encode categorical columns and standardize the feature columns.

    Args:
        df: Input DataFrame. It is not modified; a copy is processed.

    Returns:
        A new DataFrame in which every ``object``-dtype column has been
        replaced by integer codes from ``LabelEncoder`` and every numeric
        column except ``'y'`` has been standardized with ``StandardScaler``.

    Note:
        The numeric columns are selected *after* encoding, so the encoded
        categorical features are standardized as well.
        NOTE(review): the scaler is fitted on the whole frame passed in; if
        the caller splits into train/test afterwards, test-set statistics
        influence the scaling -- confirm that this is acceptable.
    """
    # Work on a copy so the caller's frame is never mutated.
    processed = df.copy()

    # Encode categorical features; each column gets its own encoder fit.
    categorical_features = processed.select_dtypes(include=['object']).columns
    for col in categorical_features:
        processed[col] = LabelEncoder().fit_transform(processed[col])

    # Scale every numeric column except the target 'y'.
    # errors='ignore' keeps this working for frames that have no 'y' column.
    numerical_features = (
        processed.select_dtypes(include=np.number).columns.drop('y', errors='ignore')
    )

    # StandardScaler rejects input with zero rows or zero columns, so skip
    # scaling when there is nothing to scale.
    if len(numerical_features) > 0 and len(processed) > 0:
        processed[numerical_features] = StandardScaler().fit_transform(
            processed[numerical_features]
        )

    return processed

### 4. Define Model Training Function

In [None]:
def _save_confusion_matrix(cm, model_name, split_ratio, k_fold_cv):
    """Render *cm* as a heatmap and save it as a PNG under ``cm_dir``."""
    os.makedirs(cm_dir, exist_ok=True)
    split_str = str(split_ratio).replace('.', '_')
    # The k-fold value is part of the file name so that runs with the same
    # split but a different k do not overwrite each other's image.
    cm_filename = f'{model_name}_split_{split_str}_cv_{k_fold_cv}.png'

    fig = plt.figure(figsize=(8, 6))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix: {model_name.replace("_", " ").title()} (Split {split_ratio})')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.savefig(os.path.join(cm_dir, cm_filename))
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)


def train_and_evaluate(X_train, X_test, y_train, y_test, split_ratio, k_fold_cv):
    """Train and evaluate KNN, Decision Tree, and Naive Bayes classifiers.

    KNN's ``n_neighbors`` is tuned over 1..20 with ``GridSearchCV`` (accuracy
    scoring, ``k_fold_cv`` folds); the Decision Tree (``random_state=42``) and
    Gaussian Naive Bayes models are fitted directly on the training data.

    Args:
        X_train: Training features.
        X_test: Held-out test features.
        y_train: Training labels.
        y_test: Held-out test labels.
        split_ratio: Train fraction of the split; used only in log messages,
            the plot title, and the confusion-matrix file name.
        k_fold_cv: Number of folds for the grid search and cross-validation.

    Returns:
        A dict mapping model name (``'knn'``, ``'decision_tree'``,
        ``'naive_bayes'``) to a dict with the keys ``'Split Accuracy'``,
        ``'Precision'``, ``'Recall'``, ``'F1-Score'`` (all computed on the
        test set) and ``'Mean CV Accuracy'`` (mean cross-validated accuracy
        on the training set).

    Side effects:
        Writes one confusion-matrix PNG per model into ``cm_dir``.
    """
    models = {
        'knn': KNeighborsClassifier(),
        'decision_tree': DecisionTreeClassifier(random_state=42),
        'naive_bayes': GaussianNB()
    }

    # Parameter grid for the KNN search.
    param_grid_knn = {'n_neighbors': list(range(1, 21))}

    results = {}

    for model_name, model in models.items():
        print(f"--- Running {model_name} with split {split_ratio} and k-fold {k_fold_cv} ---")

        # Only KNN is tuned; the other models are used as configured above.
        if model_name == 'knn':
            grid_search = GridSearchCV(model, param_grid_knn, cv=k_fold_cv, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            print(f"Best KNN params: {grid_search.best_params_}")
        else:
            model.fit(X_train, y_train)
            best_model = model

        # Hold-out evaluation on the test set.
        y_pred = best_model.predict(X_test)

        # precision/recall/F1 use scikit-learn's default averaging;
        # zero_division=0 reports 0 instead of warning when a denominator is 0.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)

        # K-fold cross-validated accuracy on the training data.
        cv_scores = cross_val_score(best_model, X_train, y_train, cv=k_fold_cv, scoring='accuracy')
        mean_cv_accuracy = np.mean(cv_scores)

        _save_confusion_matrix(confusion_matrix(y_test, y_pred), model_name, split_ratio, k_fold_cv)

        results[model_name] = {
            'Split Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1,
            'Mean CV Accuracy': mean_cv_accuracy
        }

    return results

### 5. Run Main Pipeline

In [None]:
def main_colab():
    # Define the dataset path in Google Drive
    file_path = '/content/drive/My Drive/Colab Notebooks/bank-marketing-ml/datasets/bank-full.csv'
    
    # Check if file exists
    if not os.path.exists(file_path):
        print(f"ERROR: Dataset not found at {file_path}")
        print("Please make sure the dataset is uploaded to the correct Google Drive path.")
        return

    print("Loading and preprocessing data...")
    df = load_data(file_path)
    df_processed = preprocess_data(df.copy())

    X = df_processed.drop('y', axis=1)
    y = df_processed['y']

    split_ratios = [0.7, 0.8, 0.9]
    k_folds = [5, 10]
    summary_results = []

    for ratio in split_ratios:
        print(f"\n{'='*20} PROCESSING SPLIT RATIO: {int(ratio*100)}/{100-int(ratio*100)} {'='*20}")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-ratio, random_state=42, stratify=y)
        
        for k_fold in k_folds:
            results = train_and_evaluate(X_train, X_test, y_train, y_test, ratio, k_fold)
            
            for model_name, metrics in results.items():
                summary_results.append({
                    'Model': model_name,
                    'Split Ratio': ratio,
                    'K-Fold': k_fold,
                    'Split Accuracy': metrics['Split Accuracy'],
                    'Mean CV Accuracy': metrics['Mean CV Accuracy']
                })
                
                # Save individual scorer metrics
                split_str_dir = f"splits_{int(ratio*100)}_{100-int(ratio*100)}"
                scorer_path = os.path.join(scorer_dir, split_str_dir)
                metric_df = pd.DataFrame([metrics], index=[model_name])
                metric_df.to_csv(os.path.join(scorer_path, f'{model_name}_metrics.csv'))


    # Create and save summary dataframe
    summary_df = pd.DataFrame(summary_results)
    summary_df.to_csv(os.path.join(scorer_dir, 'summary_results.csv'), index=False)
    print("\nSummary of all runs:")
    print(summary_df)

    # Plotting summary results
    plt.figure(figsize=(12, 7))
    sns.barplot(data=summary_df, x='Model', y='Split Accuracy', hue='Split Ratio')
    plt.title('Split Accuracy for all Models and Split Ratios')
    plt.savefig(os.path.join(scorer_dir, 'split_accuracy_comparison.png'))
    plt.show()

    plt.figure(figsize=(12, 7))
    sns.barplot(data=summary_df, x='Model', y='Mean CV Accuracy', hue='K-Fold')
    plt.title('Mean CV Accuracy for all Models and K-Folds')
    plt.savefig(os.path.join(scorer_dir, 'mean_cv_accuracy_comparison.png'))
    plt.show()
    
    print("\nProcessing complete. All outputs saved in the 'outputs' directory in the Colab environment.")
    print("You can download the 'outputs' folder from the file browser on the left.")

# Entry point: executing this cell runs the whole pipeline defined by main_colab().
main_colab()