<a href="https://colab.research.google.com/github/micah-shull/pipelines/blob/main/pipelines_16_ensemble_048_optimal_thresholds_ROC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# ROC Threshold Tuning Notebook

# 1. Load and Preprocess Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import warnings
from loan_data_utils import load_and_preprocess_data

# Keep FutureWarning messages out of the notebook output
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data location, the columns to treat as categorical, and the label column
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
categorical_columns = ['sex', 'education', 'marriage']
target = 'default_payment_next_month'

# Project helper from loan_data_utils; it hands back the features and the target
X, y = load_and_preprocess_data(url, categorical_columns, target)

# Hold out 20% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Group the feature columns by dtype so each group gets its own preprocessing
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['category']).columns.tolist()

# Numeric columns: fill gaps with the median, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the mode, then one-hot encode
# (the first level of each column is dropped)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])

# Combined transformer used as the first step of every model pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# 2. Load Optimal Parameters and Thresholds
import json

# Read the tuning output (parameters and, per the file name, thresholds)
# produced by an earlier run into a nested dict.
filename = 'optimal_model_params_thresholds_roc.json'
with open(filename, mode='r') as file:
    best_params = json.loads(file.read())

# 3. Define Models
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import HistGradientBoostingClassifier

def define_models(best_params):
    """Build the tuned, unfitted pipelines used as ensemble base estimators.

    Every pipeline starts with the module-level ``preprocessor`` (the same
    object is shared by all of them), optionally followed by a resampling
    step, and ends with a classifier configured from ``best_params``.

    All resamplers are seeded with the same ``random_state`` as the
    classifiers so that repeated runs resample the training data identically;
    previously the resamplers were unseeded and results varied between runs.

    Args:
        best_params: Nested mapping indexed as
            ``best_params[objective][model_name]['best_params']``, where the
            innermost value is a dict of keyword arguments for the classifier.
            Objectives read here are 'Class 1 Recall', 'Class 1 Precision',
            'Class 0 Recall' and 'Class 0 Precision'.

    Returns:
        dict mapping a display name to a pipeline, in a fixed insertion order.

    Raises:
        KeyError: If an expected objective or model entry is missing.
        TypeError: If a tuned parameter dict repeats a keyword that is also
            fixed here (for example ``random_state``).
    """
    seed = 42

    def tuned(objective, model_name):
        # Keyword arguments selected for this model under this objective.
        return best_params[objective][model_name]['best_params']

    def logreg(objective, model_name):
        return LogisticRegression(random_state=seed, **tuned(objective, model_name))

    def lgbm(objective, model_name):
        return LGBMClassifier(random_state=seed, **tuned(objective, model_name), force_row_wise=True)

    def balanced_forest(objective, model_name):
        return RandomForestClassifier(
            random_state=seed, class_weight='balanced', **tuned(objective, model_name)
        )

    def plain(classifier):
        # Preprocess, then classify, with no resampling.
        return Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', classifier)
        ])

    def resampled(resampler, classifier):
        # imblearn pipeline so the resampling step can sit between
        # preprocessing and the classifier.
        return ImbPipeline(steps=[
            ('preprocessor', preprocessor),
            ('resampler', resampler),
            ('classifier', classifier)
        ])

    models = {
        # Tuned for class 1 recall
        "Logistic Regression (ADASYN)": resampled(
            ADASYN(random_state=seed),
            logreg('Class 1 Recall', 'Logistic Regression (ADASYN)'),
        ),
        "Logistic Regression (SMOTE)": resampled(
            SMOTE(random_state=seed),
            logreg('Class 1 Recall', 'Logistic Regression (SMOTE)'),
        ),
        "LGBM (SMOTE)": resampled(
            SMOTE(random_state=seed),
            lgbm('Class 1 Recall', 'LGBM (SMOTE)'),
        ),
        # Tuned for class 1 precision
        "Logistic Regression (baseline)": plain(
            logreg('Class 1 Precision', 'Logistic Regression (baseline)'),
        ),
        "LGBM (baseline)": plain(
            lgbm('Class 1 Precision', 'LGBM (baseline)'),
        ),
        "Random Forest (class_weight_balanced)": plain(
            balanced_forest('Class 1 Precision', 'Random Forest (class_weight_balanced)'),
        ),
        # Tuned for class 0 recall
        "Logistic Regression (baseline for Class 0 Recall)": plain(
            logreg('Class 0 Recall', 'Logistic Regression (baseline)'),
        ),
        "LGBM (baseline for Class 0 Recall)": plain(
            lgbm('Class 0 Recall', 'LGBM (baseline)'),
        ),
        "Random Forest (class_weight_balanced for Class 0 Recall)": plain(
            balanced_forest('Class 0 Recall', 'Random Forest (class_weight_balanced)'),
        ),
        # Tuned for class 0 precision
        "LGBM (RandomUnderSampler)": resampled(
            RandomUnderSampler(random_state=seed),
            lgbm('Class 0 Precision', 'LGBM (RandomUnderSampler)'),
        ),
        "HistGradientBoosting (RandomUnderSampler)": resampled(
            RandomUnderSampler(random_state=seed),
            HistGradientBoostingClassifier(
                random_state=seed,
                **tuned('Class 0 Precision', 'HistGradientBoosting (RandomUnderSampler)')
            ),
        ),
        "Random Forest (RandomUnderSampler)": resampled(
            RandomUnderSampler(random_state=seed),
            RandomForestClassifier(
                random_state=seed,
                **tuned('Class 0 Precision', 'Random Forest (RandomUnderSampler)')
            ),
        ),
    }
    return models

# Instantiate every pipeline from the tuned parameters loaded above.
models = define_models(best_params)

# 4. Train and Evaluate Voting and Stacking Classifiers
from sklearn.metrics import classification_report

def save_results(results, filename):
    """Write ``results`` as indented JSON to a file named after the tuning run.

    The output name is ``voting_stacking_results_<tag>.json``, where ``<tag>``
    is ``filename`` with every occurrence of
    'optimal_model_params_thresholds_' and '.json' removed. The file is
    created relative to the current working directory.

    Args:
        results: JSON-serializable object to store.
        filename: Name of the parameter file the results were derived from.
    """
    # Reduce the parameter filename to its method tag (e.g. 'roc').
    method_tag = filename
    for fragment in ('optimal_model_params_thresholds_', '.json'):
        method_tag = method_tag.replace(fragment, '')

    target_name = 'voting_stacking_results_' + method_tag + '.json'

    with open(target_name, 'w') as out_handle:
        json.dump(results, out_handle, indent=4)

def train_and_evaluate_voting_stacking(models, X_train, y_train, X_test, y_test, filename):
    """Fit a voting and a stacking ensemble, print their reports, and save them.

    Both ensembles use every entry of ``models`` as a base estimator. The
    voting ensemble uses soft voting; the stacking ensemble uses a default
    ``LogisticRegression`` as its final estimator. Each is fitted on the
    training split and scored on the test split with ``classification_report``.

    Args:
        models: Mapping of estimator name to estimator/pipeline.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        filename: Parameter filename, passed on to ``save_results`` to derive
            the output file name.

    Returns:
        None. The reports are printed and written to disk by ``save_results``
        under the keys 'voting' and 'stacking'.
    """
    # (result key, printed heading, ensemble) in evaluation order
    ensembles = (
        (
            'voting',
            'Voting Classifier Performance:',
            VotingClassifier(estimators=list(models.items()), voting='soft'),
        ),
        (
            'stacking',
            'Stacking Classifier Performance:',
            StackingClassifier(estimators=list(models.items()), final_estimator=LogisticRegression()),
        ),
    )

    results = {}
    for result_key, heading, ensemble in ensembles:
        ensemble.fit(X_train, y_train)
        predictions = ensemble.predict(X_test)

        # Human-readable report for the notebook output
        print(heading)
        print(classification_report(y_test, predictions))

        # Same report as a dict so it can be serialized
        results[result_key] = classification_report(y_test, predictions, output_dict=True)

    save_results(results, filename)

# Build the base pipelines from the tuned parameters
models = define_models(best_params)

# Fit both ensembles, print their reports, and write the results file
train_and_evaluate_voting_stacking(
    models=models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    filename=filename,
)


[LightGBM] [Info] Number of positive: 18691, number of negative: 18691
[LightGBM] [Info] Total Bins 6544
[LightGBM] [Info] Number of data points in the train set: 37382, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Total Bins 3276
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221208 -> initscore=-1.258639
[LightGBM] [Info] Start training from score -1.258639
[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Total Bins 3276
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221208 -> initscore=-1.258639
[LightGBM] [Info] Start training from score -1.258639
[LightGBM] [Info] Number of positive: 5309, number of negativ