<a href="https://colab.research.google.com/github/micah-shull/pipelines/blob/main/pipelines_16_ensemble_041_resampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import sklearn
print(sklearn.__version__)

1.3.2


### Train Model with Balanced Class Weights

In [8]:
# Define the experiment name
experiment_name = 'class_weight_balanced'

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
import logging
from loan_data_utils import load_and_preprocess_data, evaluate_model, compile_metrics
import json

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Define your URL, categorical columns, and target
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
categorical_columns = ['sex', 'education', 'marriage']
target = 'default_payment_next_month'

# load and preprocess
X, y = load_and_preprocess_data(url, categorical_columns, target)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identify numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['category']).columns.tolist()

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(drop='first'))
        ]), categorical_features)
    ])

# Define the models to evaluate with class_weight='balanced'
models = {
    'Logistic Regression': LogisticRegression(solver='liblinear', class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'HistGradientBoosting': HistGradientBoostingClassifier(random_state=42),  # Does not support class_weight
    'LGBM': LGBMClassifier(class_weight='balanced', random_state=42)
}

# List to store metrics for each model
metrics_list = []

# Evaluate each model and capture metrics
for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    metrics = evaluate_model(pipeline, X_train, X_test, y_train, y_test, model_name, experiment_name)
    metrics_list.append(metrics)

# Compile metrics into a DataFrame and save to CSV with experiment name
metrics_df = compile_metrics(metrics_list, experiment_name=experiment_name)

# Save the new metrics to a CSV file
metrics_df.to_csv(f'{experiment_name}_metrics.csv', index=False)
logger.info(f"{experiment_name} metrics saved to {experiment_name}_metrics.csv")

# Import the previous CSV of metrics
baseline_metrics_df = pd.read_csv('baseline_metrics.csv')

# Concatenate the previous metrics with the new metrics
combined_metrics_df = pd.concat([baseline_metrics_df, metrics_df], ignore_index=True)

# Save the combined metrics DataFrame to a new CSV file
combined_metrics_df.to_csv('combined_model_metrics.csv', index=False)
logger.info("Combined metrics saved to combined_model_metrics.csv")

# Optionally, save the best performing model
best_model_name = combined_metrics_df.loc[combined_metrics_df['F1_Macro'].idxmax(), 'Model']
best_model = models[best_model_name]

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
        ('classifier', best_model)
])

pipeline.fit(X_train, y_train)
joblib.dump(pipeline, f'best_model_{best_model_name}.pkl')
logger.info(f"Best model saved as best_model_{best_model_name}.pkl")

# Display the metrics DataFrame
metrics_df


[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002234 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3276
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3276
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 30
[LightGBM] [Info] [bin

Unnamed: 0,Model,Experiment,Recall_0,Precision_0,F1_0,Recall_1,Precision_1,F1_1,F1_Macro,Accuracy
0,Logistic Regression,class_weight_balanced,0.691419,0.868782,0.770019,0.632253,0.367821,0.465078,0.617548,0.678333
1,Random Forest,class_weight_balanced,0.947143,0.835094,0.887597,0.341372,0.647143,0.446966,0.667281,0.813167
2,HistGradientBoosting,class_weight_balanced,0.942863,0.840198,0.888575,0.3685,0.646825,0.469515,0.679045,0.815833
3,LGBM,class_weight_balanced,0.7997,0.880122,0.837986,0.616428,0.466363,0.530996,0.684491,0.759167


### Analyze Results

In [9]:
combined_metrics_df

Unnamed: 0,Model,Experiment,Recall_0,Precision_0,F1_0,Recall_1,Precision_1,F1_1,F1_Macro,Accuracy
0,Logistic Regression,baseline,0.969399,0.818724,0.887713,0.24416,0.69379,0.361204,0.624459,0.809
1,Random Forest,baseline,0.939225,0.839679,0.886667,0.3685,0.6326,0.465714,0.67619,0.813
2,HistGradientBoosting,baseline,0.942863,0.840198,0.888575,0.3685,0.646825,0.469515,0.679045,0.815833
3,LGBM,baseline,0.947143,0.840486,0.890633,0.366993,0.663488,0.472586,0.681609,0.818833
4,Logistic Regression,class_weight_balanced,0.691419,0.868782,0.770019,0.632253,0.367821,0.465078,0.617548,0.678333
5,Random Forest,class_weight_balanced,0.947143,0.835094,0.887597,0.341372,0.647143,0.446966,0.667281,0.813167
6,HistGradientBoosting,class_weight_balanced,0.942863,0.840198,0.888575,0.3685,0.646825,0.469515,0.679045,0.815833
7,LGBM,class_weight_balanced,0.7997,0.880122,0.837986,0.616428,0.466363,0.530996,0.684491,0.759167


#### Write Data Utils Script

In [None]:
script_content=r'''
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import joblib
import json
import logging
# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

#--------   Load and Preprocess Data   --------#

def load_data_from_url(url):
    try:
        df = pd.read_excel(url, header=1)
        logging.info("Data loaded successfully from URL.")
    except Exception as e:
        logging.error(f"Error loading data from URL: {e}")
        return None
    return df

def clean_column_names(df):
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]
    return df

def remove_id_column(df):
    if 'id' in df.columns:
        df = df.drop(columns=['id'])
    return df

def rename_columns(df):
    rename_dict = {'pay_0': 'pay_1'}
    df = df.rename(columns=rename_dict)
    return df

def convert_categorical(df, categorical_columns):
    df[categorical_columns] = df[categorical_columns].astype('category')
    return df

def split_features_target(df, target):
    X = df.drop(columns=[target])
    y = df[target]
    return X, y

def load_and_preprocess_data(url, categorical_columns, target):
    df = load_data_from_url(url)
    if df is not None:
        df = clean_column_names(df)
        df = remove_id_column(df)
        df = rename_columns(df)
        df = convert_categorical(df, categorical_columns)
        X, y = split_features_target(df, target)
        return X, y
    return None, None


#--------   Evaluate and Capture Metrics   --------#

def evaluate_model(pipeline, X_train, X_test, y_train, y_test, model_name, experiment_name):
    logger.info(f"Training and evaluating model: {model_name} ({experiment_name})")

    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Capture classification report
    report = classification_report(y_test, y_pred, output_dict=True)

    # Extract relevant metrics
    metrics = {
        'Model': model_name,
        'Experiment': experiment_name,
        'Recall_0': report['0']['recall'],
        'Precision_0': report['0']['precision'],
        'F1_0': report['0']['f1-score'],
        'Recall_1': report['1']['recall'],
        'Precision_1': report['1']['precision'],
        'F1_1': report['1']['f1-score'],
        'F1_Macro': report['macro avg']['f1-score'],
        'Accuracy': report['accuracy']
    }

    logger.info(f"Completed evaluation for model: {model_name} ({experiment_name})")
    return metrics

def compile_metrics(metrics_list, experiment_name='experiment_name'):
    metrics_df = pd.DataFrame(metrics_list)
    csv_filename = f'{experiment_name}_metrics.csv'
    metrics_df.to_csv(csv_filename, index=False)
    logger.info(f"Metrics saved to {csv_filename}")
    return metrics_df


def plot_report(metrics_df, palette='viridis'):
    # Generate a summary report using matplotlib or seaborn
    metrics_to_plot = ['Precision_0', 'Recall_0', 'F1_0', 'Precision_1', 'Recall_1', 'F1_1', 'F1_Macro']
    titles = ['Precision Class 0', 'Recall Class 0', 'F1 Score Class 0', 'Precision Class 1', 'Recall Class 1', 'F1 Score Class 1', 'F1 Macro Score']

    fig, axes = plt.subplots(len(metrics_to_plot), 1, figsize=(10, len(metrics_to_plot) * 6))
    fig.suptitle('Model Performance Metrics', fontsize=20)

    for metric, title, ax in zip(metrics_to_plot, titles, axes):
        sns.barplot(x='Model', y=metric, hue=metric, data=metrics_df, palette='viridis', ax=ax, legend=False)
        ax.set_title(title)
        ax.set_ylabel(metric)
        ax.set_xlabel('Model')
        # ax.legend(title='Model Type')

        # Add average line for each metric
        mean_value = metrics_df[metric].mean()
        ax.axhline(mean_value, color='red', linestyle='--')
        ax.text(0.02, mean_value, f'Avg: {mean_value:.2f}', color='red', ha='left', va='center', transform=ax.get_yaxis_transform())

        # Correctly set tick labels
        ax.set_xticks(ax.get_xticks())
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

    plt.tight_layout(rect=[0, 0.03, 1, 0.97])
    plt.savefig('model_performance_metrics_report.png')
    plt.show()
    logger.info("Generated performance report")


'''

# Write the script to a file
with open("loan_data_utils.py", "w") as file:
    file.write(script_content)

print("Script successfully written to loan_data_utils.py")
# Reload script to make functions available for use
import importlib
import loan_data_utils
importlib.reload(loan_data_utils)

from loan_data_utils import *