<a href="https://colab.research.google.com/github/micah-shull/pipelines/blob/main/pipelines_16_ensemble_03_model_tuning_class_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Print the installed scikit-learn version so the run's environment is visible
# next to the results below.
import sklearn
print(sklearn.__version__)

1.3.2


### Hyperparameter Tuning for Class 1 Recall and Precision

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, precision_score, make_scorer
from loan_data_utils import load_and_preprocess_data
import joblib
import json

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load and preprocess data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
categorical_columns = ['sex', 'education', 'marriage']
target = 'default_payment_next_month'

# `load_and_preprocess_data` is imported from the project-local `loan_data_utils`
# module. NOTE(review): it presumably returns a feature DataFrame and a target
# Series; `X` must at least support `select_dtypes` (used below).
X, y = load_and_preprocess_data(url, categorical_columns, target)

# Hold out 20% for the final evaluation; stratify so both splits keep the
# same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Route columns to the matching preprocessing branch by dtype
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['category']).columns.tolist()

# Numeric columns: median-impute then standardize.
# Categorical columns: mode-impute then one-hot encode (first level dropped).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(drop='first'))
        ]), categorical_features)
    ])

# Base LGBMClassifier. `subsample_freq=1` is required for the `subsample`
# values searched below to have any effect: LightGBM only performs bagging
# when the bagging frequency is non-zero (its default is 0, i.e. disabled).
# `class_weight='balanced'` is only a starting value; every grid candidate
# overrides it.
lgbm_clf = LGBMClassifier(
    random_state=42,
    class_weight='balanced',
    subsample_freq=1,
    force_row_wise=True,
)

# Full pipeline: preprocessing followed by the classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', lgbm_clf)])

# Hyperparameter grid. The positive-class weight is searched through
# `class_weight` only: `scale_pos_weight` rescales the same class and its
# effect multiplies with `class_weight`, so searching both would inflate the
# grid five-fold with overlapping candidates.
param_grid = {
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__n_estimators': [100, 200],
    'classifier__num_leaves': [31, 50, 100],
    'classifier__min_child_samples': [20, 30, 40],
    'classifier__subsample': [0.7, 0.8, 0.9],
    'classifier__class_weight': [{0: 1, 1: w} for w in np.linspace(1, 10, 10)],
}

# Scorers for class 1 recall and precision. `zero_division=0` matches the
# held-out evaluation and scores a candidate that predicts no positives as 0
# without emitting an undefined-metric warning for every fold.
scorers = {
    'recall_class_1': make_scorer(recall_score, pos_label=1),
    'precision_class_1': make_scorer(precision_score, pos_label=1, zero_division=0)
}

# Function to tune and evaluate LGBMClassifier for class 1
def tune_and_evaluate_lgbm(pipeline, param_grid, X_train, y_train, X_test, y_test, scorers):
    """Grid-search the pipeline once per scorer and score each winner on the test split.

    Args:
        pipeline: Estimator handed to ``GridSearchCV``.
        param_grid: Parameter grid searched for every scorer.
        X_train, y_train: Data used for the 5-fold cross-validated search.
        X_test, y_test: Held-out data used to score each best estimator.
        scorers: Mapping of metric name -> scorer passed as ``scoring``.

    Returns:
        A tuple ``(best_models, best_params, results_df)``: the best estimator
        and the best parameter set per metric name, plus a DataFrame with one
        row per metric holding class 1 recall and precision on the test split.
    """
    best_models, best_params, rows = {}, {}, []

    for metric_name in scorers:
        search = GridSearchCV(pipeline, param_grid, cv=5, scoring=scorers[metric_name])
        search.fit(X_train, y_train)

        # Keep the winning estimator and its settings under the metric's name
        winner = search.best_estimator_
        best_models[metric_name] = winner
        best_params[metric_name] = search.best_params_

        # Score the winner on the held-out split
        test_predictions = winner.predict(X_test)
        rows.append({
            'Metric': metric_name,
            'Recall Class 1': recall_score(y_test, test_predictions, pos_label=1),
            'Precision Class 1': precision_score(
                y_test, test_predictions, pos_label=1, zero_division=0
            ),
        })

    return best_models, best_params, pd.DataFrame(rows)

# Run one grid search per class 1 scorer and collect the winners
best_lgbm_models, best_lgbm_params, evaluation_results_lgbm = tune_and_evaluate_lgbm(
    pipeline=pipeline,
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    scorers=scorers,
)

# Persist the fitted pipelines (pickle) and their parameters (JSON)
joblib.dump(best_lgbm_models, 'best_lgbm_models_class_1.pkl')
with open('best_lgbm_params_class_1.json', 'w') as json_file:
    json_file.write(json.dumps(best_lgbm_params, indent=4))

# Show the held-out recall/precision for each tuned model
print(evaluation_results_lgbm)


Streaming output truncated to the last 5000 lines.
[LightGBM] [Info] Number of positive: 4247, number of negative: 14953
[LightGBM] [Info] Total Bins 3271
[LightGBM] [Info] Number of data points in the train set: 19200, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221198 -> initscore=-1.258699
[LightGBM] [Info] Start training from score -1.258699
[LightGBM] [Info] Number of positive: 4247, number of negative: 14953
[LightGBM] [Info] Total Bins 3273
[LightGBM] [Info] Number of data points in the train set: 19200, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221198 -> initscore=-1.258699
[LightGBM] [Info] Start training from score -1.258699
[LightGBM] [Info] Number of positive: 4248, number of negative: 14952
[LightGBM] [Info] Total Bins 3276
[LightGBM] [Info] Number of data points in the train set: 19200, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221250 -> initscore=-1.258

### Load the Saved Models and Parameters

To integrate the tuned LGBM models for class 1 recall and precision into the VotingClassifier and StackingClassifier while keeping the other two models for class 0, we'll follow these steps:

1. Load the previously saved best models for class 0 recall and precision.
2. Load the tuned LGBM models for class 1 recall and precision.
3. Integrate all four models into the VotingClassifier and StackingClassifier.




In [None]:
import joblib
import json

# Reload the class 0 winners (fitted pipelines + their parameters)
best_models = joblib.load('best_models.pkl')
with open('best_params.json', 'r') as json_file:
    best_params = json.load(json_file)

# Reload the class 1 tuned LGBM pipelines and their parameters
best_lgbm_models = joblib.load('best_lgbm_models_class_1.pkl')
with open('best_lgbm_params_class_1.json', 'r') as json_file:
    best_lgbm_params = json.load(json_file)

# Report header
print("Best models and their parameters for each metric:\n")

# Class 0 section: classifier type and parameters per metric
for metric in best_models:
    model = best_models[metric]
    model_name = type(model.named_steps['classifier']).__name__
    print(f"\nBest model for {metric}: {model_name}")
    print("Best parameters:")
    for param, value in best_params[metric].items():
        print(f"  {param}: {value}")

# Class 1 section: same layout for the tuned LGBM pipelines
for metric in best_lgbm_models:
    model = best_lgbm_models[metric]
    model_name = type(model.named_steps['classifier']).__name__
    print(f"\nTuned LGBM model for {metric}: {model_name}")
    print("Tuned parameters:")
    for param, value in best_lgbm_params[metric].items():
        print(f"  {param}: {value}")

### Integrate Optimized LGBM Classifier into Voting and Stacking Classifiers

In [None]:
import joblib
import json
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Reload the class 0 winners and their parameters
best_models = joblib.load('best_models.pkl')
with open('best_params.json', 'r') as json_file:
    best_params = json.load(json_file)

# Reload the class 1 tuned LGBM pipelines and their parameters
best_lgbm_models = joblib.load('best_lgbm_models_class_1.pkl')
with open('best_lgbm_params_class_1.json', 'r') as json_file:
    best_lgbm_params = json.load(json_file)

# Pull out the four pipelines that make up the ensembles
optimized_lgbm_model_recall = best_lgbm_models['recall_class_1']
optimized_lgbm_model_precision = best_lgbm_models['precision_class_1']
best_model_recall_class_0 = best_models['recall_class_0']
best_model_precision_class_0 = best_models['precision_class_0']

# Both ensembles use the same named members; each gets its own list copy
ensemble_members = [
    ('lgbm_recall', optimized_lgbm_model_recall),
    ('lgbm_precision', optimized_lgbm_model_precision),
    ('best_recall_class_0', best_model_recall_class_0),
    ('best_precision_class_0', best_model_precision_class_0),
]

# Soft-voting ensemble over the four pipelines
voting_clf_optimized = VotingClassifier(
    estimators=list(ensemble_members),
    voting='soft',
)

# Stacked ensemble with a logistic-regression final estimator
stacking_clf_optimized = StackingClassifier(
    estimators=list(ensemble_members),
    final_estimator=LogisticRegression(),
)

# Train both ensembles on the training split
voting_clf_optimized.fit(X_train, y_train)
stacking_clf_optimized.fit(X_train, y_train)

# Held-out predictions from each ensemble
y_pred_voting_optimized = voting_clf_optimized.predict(X_test)
y_pred_stacking_optimized = stacking_clf_optimized.predict(X_test)

def evaluate_and_print_performance(y_test, y_pred, classifier_name):
    """Print and return per-class recall/precision, macro F1 and accuracy.

    Args:
        y_test: True labels.
        y_pred: Predicted labels to score against ``y_test``.
        classifier_name: Label used in the printed headings.

    Returns:
        Dict mapping each metric's display name to its value.
    """
    # Per-class scores, positive class first
    recall_1 = recall_score(y_test, y_pred, pos_label=1)
    precision_1 = precision_score(y_test, y_pred, pos_label=1, zero_division=0)
    recall_0 = recall_score(y_test, y_pred, pos_label=0)
    precision_0 = precision_score(y_test, y_pred, pos_label=0, zero_division=0)

    # Overall scores
    f1_macro = f1_score(y_test, y_pred, average='macro')
    accuracy = accuracy_score(y_test, y_pred)

    # Assemble the whole report, then emit it in a single write
    report_lines = [
        f"\n{classifier_name} Performance:",
        f'Recall Class 1: {recall_1:.4f}',
        f'Precision Class 1: {precision_1:.4f}',
        f'Recall Class 0: {recall_0:.4f}',
        f'Precision Class 0: {precision_0:.4f}',
        f'F1 Macro: {f1_macro:.4f}',
        f'Accuracy: {accuracy:.4f}',
        f"\nClassification Report for {classifier_name}:\n",
        classification_report(y_test, y_pred),
    ]
    print("\n".join(report_lines))

    return {
        'Recall Class 1': recall_1,
        'Precision Class 1': precision_1,
        'Recall Class 0': recall_0,
        'Precision Class 0': precision_0,
        'F1 Macro': f1_macro,
        'Accuracy': accuracy,
    }

# Score the soft-voting ensemble on the held-out predictions
voting_results = evaluate_and_print_performance(
    y_test=y_test,
    y_pred=y_pred_voting_optimized,
    classifier_name="Optimized VotingClassifier",
)

# Score the stacked ensemble on the held-out predictions
stacking_results = evaluate_and_print_performance(
    y_test=y_test,
    y_pred=y_pred_stacking_optimized,
    classifier_name="Optimized StackingClassifier",
)

# Combine results and plot
def plot_results(results):
    """Draw a grouped bar chart comparing each model's score on every metric.

    Args:
        results: Mapping of model name -> mapping of metric name -> score,
            e.g. ``{'VotingClassifier': {'Accuracy': 0.8, ...}, ...}``.

    Raises:
        ValueError: If ``results`` is empty.
    """
    if not results:
        raise ValueError("results must contain at least one model to plot")

    # A dict of dicts already builds a metric-by-model frame (outer keys become
    # columns, inner keys the index). Transposing it first would leave one
    # column per metric and make a fixed three-name column assignment fail, so
    # the index is simply named and promoted to a 'Metric' column.
    results_df = pd.DataFrame(results).rename_axis('Metric').reset_index()

    # Long format: one row per (metric, model) pair for seaborn's hue grouping
    long_df = results_df.melt(id_vars='Metric', var_name='Model', value_name='Score')

    # Plot the results
    plt.figure(figsize=(14, 10))
    sns.barplot(data=long_df, x='Metric', y='Score', hue='Model', palette='viridis')

    plt.title('Performance Comparison: VotingClassifier vs. StackingClassifier', fontsize=16)
    plt.ylabel('Score', fontsize=14)
    plt.xlabel('Metric', fontsize=14)
    plt.xticks(rotation=45, fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(loc='upper right', fontsize=12)
    plt.tight_layout()
    plt.show()

# Gather both ensembles' metric dicts under their display names and chart them
results = {
    'VotingClassifier': voting_results,
    'StackingClassifier': stacking_results,
}
plot_results(results)

# Persist the fitted ensembles, voting first, then stacking
for fitted_ensemble, pickle_path in (
    (voting_clf_optimized, 'voting_classifier_optimized.pkl'),
    (stacking_clf_optimized, 'stacking_classifier_optimized.pkl'),
):
    joblib.dump(fitted_ensemble, pickle_path)
