<a href="https://colab.research.google.com/github/micah-shull/pipelines/blob/main/pipelines_07_pytorch_pipeline_07_feature_engineering_resampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Initial Setup

In [5]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from model_pipeline import (
    load_data_from_url, clean_column_names, rename_columns, remove_id_column,
    convert_categorical, split_data, define_preprocessor, preprocess_data,
    calculate_class_weights, convert_to_tensors, SklearnSimpleNN, train_model, evaluate_model
)
from feature_engineering import (
    create_interaction_features, create_payment_to_bill_ratios,
    create_payment_to_limit_ratios, create_bill_to_limit_ratios,
    create_lagged_payment_differences, create_debt_ratio_features,
    create_average_payment_and_bill, create_payment_timeliness_features,
    create_total_payment_and_bill, create_bill_difference_features,
    bin_features, target_encode, rename_columns
)

# --- Global parameters --------------------------------------------------------
# Read as module-level globals by run_resampling_pipeline further down.
best_class_weight = 3.0
best_lower_threshold = 0.10

# --- Load and clean the raw data ----------------------------------------------
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
data = load_data_from_url(url)

# NOTE(review): `rename_columns` is imported from both model_pipeline and
# feature_engineering; the later import rebinds the name, so the
# feature_engineering version is the one applied here. Confirm that is intended.
for cleaning_step in (clean_column_names, rename_columns, remove_id_column):
    data = cleaning_step(data)

categorical_columns = ['sex', 'education', 'marriage']  # columns handed to convert_categorical
data = convert_categorical(data, categorical_columns=categorical_columns)
target = 'default_payment_next_month'  # name of the label column


### Feature Engineering

In [3]:
# Feature builders applied in order: each is called with the current `data`
# and its return value replaces `data`.
# NOTE(review): because `data` is overwritten, re-running this cell applies the
# builders again to the already-transformed frame.
feature_builders = [
    create_interaction_features,
    create_payment_to_bill_ratios,
    create_payment_to_limit_ratios,
    create_bill_to_limit_ratios,
    create_lagged_payment_differences,
    create_debt_ratio_features,
    create_average_payment_and_bill,
    create_payment_timeliness_features,
    create_total_payment_and_bill,
    create_bill_difference_features,
]
for build_features in feature_builders:
    data = build_features(data)

# NOTE(review): target_encode runs on the full dataset, before split_data is
# called inside run_resampling_pipeline. If it derives encodings from the
# target column, test labels leak into the features -- TODO confirm.
data = target_encode(data, target, categorical_columns)
data = bin_features(data, 'age', 5)

### Resampling Methods

In [6]:
# Define a function to run the pipeline with different resampling methods
def run_resampling_pipeline(data, target, resampling_method=None):
    """Train and evaluate the neural-network estimator with optional resampling.

    The data is split with ``split_data`` and a preprocessor is built from the
    training features. When a resampler is supplied, the training split is
    preprocessed and resampled through an imblearn pipeline; the test split is
    only transformed, never resampled. Without a resampler, both splits go
    through ``preprocess_data``.

    Parameters
    ----------
    data :
        Dataset passed unchanged to ``split_data``.
    target :
        Name of the target column, forwarded to ``split_data``.
    resampling_method : optional
        Sampler object used as the final step of the imblearn pipeline
        (the notebook passes SMOTE, RandomOverSampler and RandomUnderSampler).
        ``None`` (the default) disables resampling.

    Returns
    -------
    The value returned by ``evaluate_model`` for the trained estimator.

    Notes
    -----
    Reads the module-level ``best_class_weight`` and ``best_lower_threshold``
    at call time.
    """
    X_train, X_test, y_train, y_test = split_data(data, target=target)
    preprocessor = define_preprocessor(X_train)

    # Compare against None explicitly: the truthiness of an arbitrary sampler
    # object is not a reliable "was one supplied" signal (an object defining
    # __len__ or __bool__ can be falsy).
    if resampling_method is not None:
        resampling_pipeline = ImbPipeline(steps=[
            ('preprocessor', preprocessor),
            ('resampler', resampling_method)
        ])
        X_train_processed, y_train = resampling_pipeline.fit_resample(X_train, y_train)
        # Transform the test split with the preprocessor the pipeline actually
        # fitted, instead of assuming the local `preprocessor` object was
        # fitted in place (a pipeline may fit a clone of its steps).
        fitted_preprocessor = resampling_pipeline.named_steps['preprocessor']
        X_test_processed = fitted_preprocessor.transform(X_test)
    else:
        X_train_processed, X_test_processed = preprocess_data(preprocessor, X_train, X_test)

    X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = convert_to_tensors(
        X_train_processed, y_train, X_test_processed, y_test)

    nn_estimator = SklearnSimpleNN(input_dim=X_train_tensor.shape[1], pos_weight=best_class_weight, threshold=best_lower_threshold)
    nn_estimator = train_model(nn_estimator, X_train_tensor, y_train_tensor)
    # The label is the sampler's string form, which is "None" when no resampler is used.
    report = evaluate_model(nn_estimator, X_test_tensor, y_test_tensor, label=str(resampling_method))
    return report

# Compare different resampling methods
# Sampler seeds are fixed here.
# NOTE(review): no torch seed is set in this part of the notebook, so training
# may still vary between runs unless train_model seeds internally -- confirm.
smote = SMOTE(random_state=42)
oversample = RandomOverSampler(random_state=42)
undersample = RandomUnderSampler(random_state=42)

# Report label -> sampler; None means "run without resampling".
# Dict order fixes both the execution order and the key order of `reports`.
resamplers = {
    'SMOTE': smote,
    'RandomOverSampler': oversample,
    'RandomUnderSampler': undersample,
    'No Resampling': None,
}

reports = {
    label: run_resampling_pipeline(data, target, sampler)
    for label, sampler in resamplers.items()
}


Classification Report (SMOTE(random_state=42)):
              precision    recall  f1-score   support

         0.0       0.95      0.04      0.07      4673
         1.0       0.23      0.99      0.37      1327

    accuracy                           0.25      6000
   macro avg       0.59      0.51      0.22      6000
weighted avg       0.79      0.25      0.13      6000

Classification Report (RandomOverSampler(random_state=42)):
              precision    recall  f1-score   support

         0.0       0.96      0.03      0.05      4673
         1.0       0.23      1.00      0.37      1327

    accuracy                           0.24      6000
   macro avg       0.59      0.51      0.21      6000
weighted avg       0.80      0.24      0.12      6000

Classification Report (RandomUnderSampler(random_state=42)):
              precision    recall  f1-score   support

         0.0       0.95      0.00      0.01      4673
         1.0       0.22      1.00      0.36      1327

    accuracy 

In [7]:
import pandas as pd

# Convert reports to DataFrame for analysis
def reports_to_dataframe(reports):
    """Flatten a mapping of nested report dicts into one DataFrame row each.

    Parameters
    ----------
    reports : dict
        Maps a method label to a report dict. Values inside a report may be
        dicts (flattened to ``"<key>_<subkey>"`` columns) or plain values
        (kept under ``"<key>"``).

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``reports``, with a leading ``method`` column.
    """
    def _flatten(method, report):
        # One flat record per method; nested sections become "<section>_<metric>".
        row = {'method': method}
        for section, content in report.items():
            if isinstance(content, dict):
                row.update({f"{section}_{metric}": score for metric, score in content.items()})
            else:
                row[section] = content
        return row

    return pd.DataFrame([_flatten(method, report) for method, report in reports.items()])

reports_df = reports_to_dataframe(reports)

# Positive-class recall/F1 plus macro F1, one row per resampling method.
summary_columns = ['method', '1.0_recall', '1.0_f1-score', 'macro avg_f1-score']
print(reports_df[summary_columns])

# Optional: persist the full flattened table (all columns, no index) to CSV
reports_df.to_csv("resampling_results.csv", index=False)


               method  1.0_recall  1.0_f1-score  macro avg_f1-score
0               SMOTE    0.993218      0.368465            0.218075
1   RandomOverSampler    0.996232      0.367733            0.211309
2  RandomUnderSampler    0.999246      0.362989            0.185755
3       No Resampling    0.987943      0.377375            0.260540


#### Write Resampling Script

In [None]:
# Source text for a standalone helper module. The string is not executed here;
# the statements after it write it verbatim to resampling_utils.py and import it.
# NOTE(review): the embedded run_resampling_report() passes split_data output
# straight to the resamplers and convert_to_tensors, without the
# rename_columns / define_preprocessor steps used by the notebook cells above --
# confirm that is intended.
script_content = """
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import ADASYN
from sklearn.metrics import classification_report
from model_pipeline import load_data_from_url, clean_column_names, remove_id_column, convert_categorical, split_data
from model_pipeline import SklearnSimpleNN, train_model
import torch

# Function to convert DataFrames to tensors
def convert_to_tensors(X, y):
    X_tensor = torch.tensor(X.values, dtype=torch.float32)
    y_tensor = torch.tensor(y.values, dtype=torch.float32).unsqueeze(1)
    return X_tensor, y_tensor

# Function to apply SMOTE
def apply_smote(X_train, y_train):
    smote = SMOTE(random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    return X_train_res, y_train_res

# Function to apply SMOTEENN
def apply_smoteenn(X_train, y_train):
    smoteenn = SMOTEENN(random_state=42)
    X_train_res, y_train_res = smoteenn.fit_resample(X_train, y_train)
    return X_train_res, y_train_res

# Function to apply ADASYN
def apply_adasyn(X_train, y_train):
    adasyn = ADASYN(random_state=42)
    X_train_res, y_train_res = adasyn.fit_resample(X_train, y_train)
    return X_train_res, y_train_res

# Function to apply undersampling
def apply_undersampling(X_train, y_train):
    undersampler = RandomUnderSampler(random_state=42)
    X_train_res, y_train_res = undersampler.fit_resample(X_train, y_train)
    return X_train_res, y_train_res

# Generalized function to plot class distribution after sampling
def plot_class_distribution_after_sampling(y_train_res, sampling_type):
    plt.figure(figsize=(8, 4))
    sns.countplot(x=y_train_res, hue=y_train_res, palette='viridis', dodge=False,
    order=y_train_res.value_counts().index, legend=False)
    plt.title(f'Class Distribution After Applying {sampling_type}')
    plt.xlabel('Class')
    plt.ylabel('Count')

    # Calculate the percentage for each class
    total = len(y_train_res)
    class_counts = y_train_res.value_counts()
    for i, count in enumerate(class_counts):
        percentage = 100 * count / total
        plt.text(i, count, f'{percentage:.1f}%', ha='center', va='bottom')

    plt.show()

# Function to train and evaluate the model
def train_and_evaluate_model(X_train, y_train, X_test, y_test, class_weights=None):
    input_dim = X_train.shape[1]
    if class_weights is None:
        class_weights = [1.0, 1.0]

    nn_estimator = SklearnSimpleNN(input_dim=input_dim, pos_weight=class_weights[1])
    nn_estimator = train_model(nn_estimator, X_train, y_train)
    y_pred = nn_estimator.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)
    return report, y_pred

# Function to get classification report
def get_classification_report(y_test_tensor, y_prob, threshold, resampling_method):
    y_pred = (y_prob > threshold).astype(int)
    report = classification_report(y_test_tensor.numpy(), y_pred, output_dict=True)
    df_report = pd.DataFrame(report).transpose().reset_index()
    df_report = df_report[df_report['index'].isin(['0.0', '1.0'])]
    df_report.rename(columns={'index': 'class'}, inplace=True)
    df_report['resampling'] = resampling_method
    return df_report

# Function to convert report to DataFrame
def convert_report_to_dataframe(report, resampling_method):
    df_report = pd.DataFrame(report).transpose().reset_index()
    df_report['resampling'] = resampling_method
    df_report = df_report[df_report['index'].isin(['0.0', '1.0'])]
    df_report.rename(columns={'index': 'class'}, inplace=True)
    return df_report

# Function to filter and rename columns
def filter_and_rename_columns(df, metrics=['precision', 'recall', 'f1-score']):
    df = df[metrics + ['class', 'resampling']]
    return df

# Function to combine results
def combine_results(*dfs):
    df_combined = pd.concat(dfs, ignore_index=True)
    df_combined = filter_and_rename_columns(df_combined)
    df_combined.reset_index(drop=True, inplace=True)
    return df_combined

# Function to plot comparison with a descriptive name
def plot_resampling_comparison(df, metrics=['precision', 'recall', 'f1-score']):
    for metric in metrics:
        plt.figure(figsize=(8, 4))
        sns.barplot(data=df, x='class', y=metric, hue='resampling', palette='viridis')
        plt.title(f'Comparison of {metric.capitalize()} by Resampling Technique')
        plt.xlabel('Class')
        plt.ylabel(metric.capitalize())
        plt.legend(title='Resampling', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.show()

# Function to apply resampling and plot class distribution
def apply_resampling_and_plot(X_train, y_train):
    # No Resampling
    plot_class_distribution_after_sampling(y_train, "No Resampling")

    # Apply SMOTE
    X_train_smote, y_train_smote = apply_smote(X_train, y_train)
    plot_class_distribution_after_sampling(y_train_smote, "SMOTE")

    # Apply SMOTEENN
    X_train_smoteenn, y_train_smoteenn = apply_smoteenn(X_train, y_train)
    plot_class_distribution_after_sampling(y_train_smoteenn, "SMOTEENN")

    # Apply ADASYN
    X_train_adasyn, y_train_adasyn = apply_adasyn(X_train, y_train)
    plot_class_distribution_after_sampling(y_train_adasyn, "ADASYN")

    # Apply Undersampling
    X_train_under, y_train_under = apply_undersampling(X_train, y_train)
    plot_class_distribution_after_sampling(y_train_under, "Undersampling")

    return (X_train, y_train), (X_train_smote, y_train_smote), (X_train_smoteenn, y_train_smoteenn), (X_train_adasyn, y_train_adasyn), (X_train_under, y_train_under)

# Function to train and evaluate models
def train_and_evaluate_models(X_train_sets, y_train_sets, X_test, y_test):
    reports = []
    y_preds = []

    for X_train, y_train in zip(X_train_sets, y_train_sets):
        X_train_tensor, y_train_tensor = convert_to_tensors(X_train, y_train)
        X_test_tensor, y_test_tensor = convert_to_tensors(X_test, y_test)
        report, y_pred = train_and_evaluate_model(X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)
        reports.append(report)
        y_preds.append(y_pred)

    return reports, y_preds

# Function to gather reports and predictions into DataFrame
def gather_reports_and_predictions(reports, y_trues, y_preds, model_names):
    combined_report = pd.concat(
        [pd.DataFrame(report).transpose().drop(['accuracy', 'macro avg', 'weighted avg'], errors='ignore').assign(resampling=model_name) for report, model_name in zip(reports, model_names)]
    ).reset_index().rename(columns={'index': 'class'})

    return combined_report

# Main function to load data and call other functions
def run_resampling_report(url, categorical_columns, target):
    # Load and preprocess data
    data = load_data_from_url(url)
    data = clean_column_names(data)
    data = remove_id_column(data)
    data = convert_categorical(data, categorical_columns=categorical_columns)
    X_train, X_test, y_train, y_test = split_data(data, target=target)

    # Apply resampling and plot class distribution
    (X_train, y_train), (X_train_smote, y_train_smote), (X_train_smoteenn, y_train_smoteenn), (X_train_adasyn, y_train_adasyn), (X_train_under, y_train_under) = apply_resampling_and_plot(X_train, y_train)

    # Train and evaluate models
    X_train_sets = [X_train, X_train_smote, X_train_smoteenn, X_train_adasyn, X_train_under]
    y_train_sets = [y_train, y_train_smote, y_train_smoteenn, y_train_adasyn, y_train_under]
    reports, y_preds = train_and_evaluate_models(X_train_sets, y_train_sets, X_test, y_test)

    # Gather reports and predictions
    model_names = ['No Resampling', 'SMOTE', 'SMOTEENN', 'ADASYN', 'Undersampling']
    combined_report = gather_reports_and_predictions(reports, [y_test]*5, y_preds, model_names)

    # Plot multiple classification reports
    plot_resampling_comparison(combined_report)

    return combined_report

"""

# Write the module source held in `script_content` to resampling_utils.py.
# The encoding is explicit so the file does not depend on the platform default
# and matches Python's default source encoding.
with open("resampling_utils.py", "w", encoding="utf-8") as file:
    file.write(script_content)

print("Functions successfully written to resampling_utils.py")

# Import the freshly written module and make its functions available for use.
import importlib
# The file may have been created after the import system cached the directory
# contents; invalidate the caches so the new module is found.
importlib.invalidate_caches()
import resampling_utils
# Reload so a copy imported earlier in this kernel picks up the new source.
importlib.reload(resampling_utils)

