<a href="https://colab.research.google.com/github/micah-shull/pipelines/blob/main/pipelines_16_ensemble_02_stacking_08_class_specific_thresholds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Load Best Performing Model and Evaluate

In [2]:
import joblib
import json
import pandas as pd
import numpy as np

from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report
from loan_data_utils import load_and_preprocess_data
from sklearn.pipeline import Pipeline

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# `train_test_split` is called below but was missing from this cell's imports,
# so running the cell in a fresh kernel raised NameError. Import it here.
from sklearn.model_selection import train_test_split

# Load and preprocess data (assuming this function is defined in loan_data_utils)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
categorical_columns = ['sex', 'education', 'marriage']
target = 'default_payment_next_month'

# X holds the features and y the target column named by `target`
# (exact preprocessing is defined in loan_data_utils — not visible here).
X, y = load_and_preprocess_data(url, categorical_columns, target)

# Split the data into training and testing sets using the same random state.
# stratify=y keeps the class ratio of y in both splits; random_state=42 makes
# the split reproducible so X_test/y_test match the split used elsewhere.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Load the best models and parameters
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a trusted source.
# NOTE(review): best_models and best_params are not used in the rest of this
# cell — presumably kept for later cells; confirm before removing.
best_models = joblib.load('best_models.pkl')
with open('best_params.json', 'r') as json_file:
    best_params = json.load(json_file)

# Load the optimal thresholds
with open('optimal_thresholds.json', 'r') as json_file:
    optimal_thresholds = json.load(json_file)

# Pull out the two per-class cut-offs that are passed to
# predict_with_class_specific_thresholds below.
threshold_class_1 = optimal_thresholds['threshold_class_1']
threshold_class_0 = optimal_thresholds['threshold_class_0']

# Function to apply class-specific thresholds
def predict_with_class_specific_thresholds(model, X_test, y_test, threshold_class_1, threshold_class_0):
    """Label samples using a cut-off chosen by each sample's TRUE class.

    WARNING: this version reads ``y_test`` to decide which threshold applies
    to each row, so the ground truth leaks into the predictions. Metrics
    computed from its output are optimistically biased, and the function
    cannot be used on data whose labels are unknown. It is kept only as the
    "before" example of this notebook.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_test: Feature matrix passed to ``model.predict_proba``.
        y_test: True labels (0/1), aligned row-for-row with ``X_test``.
        threshold_class_1: Cut-off on the class-1 probability for rows whose
            true label is 1.
        threshold_class_0: Cut-off on the class-1 probability (not the
            class-0 probability) for rows whose true label is 0.

    Returns:
        1-D array with the dtype of the probabilities, holding 1 where the
        applicable cut-off is reached and 0 elsewhere.
    """
    positive_proba = model.predict_proba(X_test)[:, 1]

    # One mask per true class; both masks map to the same output label (1),
    # so they can be merged before the single assignment.
    true_pos_rows = (positive_proba >= threshold_class_1) & (y_test == 1)
    true_neg_rows = (positive_proba >= threshold_class_0) & (y_test == 0)

    labels = np.zeros_like(positive_proba)
    labels[true_pos_rows | true_neg_rows] = 1
    return labels

# Load the VotingClassifier model with optimal thresholds
# NOTE: joblib.load unpickles the file; only load trusted artifacts.
voting_clf = joblib.load('voting_classifier_optimal_threshold.pkl')

# Predict using the VotingClassifier with optimal thresholds
# NOTE: the prediction function defined in this cell takes y_test and uses it
# to choose the threshold per row, so the true labels leak into y_pred_optimal.
y_pred_optimal = predict_with_class_specific_thresholds(voting_clf, X_test, y_test, threshold_class_1, threshold_class_0)

# Evaluate the model
# Because of the leak noted above, every metric below is optimistically biased
# and should not be read as real test-set performance.
# zero_division=0 makes precision report 0 instead of warning when a class is
# never predicted.
recall_1 = recall_score(y_test, y_pred_optimal, pos_label=1)
precision_1 = precision_score(y_test, y_pred_optimal, pos_label=1, zero_division=0)
recall_0 = recall_score(y_test, y_pred_optimal, pos_label=0)
precision_0 = precision_score(y_test, y_pred_optimal, pos_label=0, zero_division=0)
f1_macro = f1_score(y_test, y_pred_optimal, average='macro')
accuracy = accuracy_score(y_test, y_pred_optimal)

# Print the evaluation metrics
print(f'Recall Class 1: {recall_1:.4f}')
print(f'Precision Class 1: {precision_1:.4f}')
print(f'Recall Class 0: {recall_0:.4f}')
print(f'Precision Class 0: {precision_0:.4f}')
print(f'F1 Macro: {f1_macro:.4f}')
print(f'Accuracy: {accuracy:.4f}')

# Print the classification report
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_optimal))

# Save the new VotingClassifier model
# NOTE(review): voting_clf is not modified after being loaded above, so this
# writes the same object back to the file it was read from.
joblib.dump(voting_clf, 'voting_classifier_optimal_threshold.pkl')

print("New VotingClassifier model saved to 'voting_classifier_optimal_threshold.pkl'")


Recall Class 1: 0.9540
Precision Class 1: 0.6825
Recall Class 0: 0.8740
Precision Class 0: 0.9853
F1 Macro: 0.8610
Accuracy: 0.8917

Classification Report:

              precision    recall  f1-score   support

           0       0.99      0.87      0.93      4673
           1       0.68      0.95      0.80      1327

    accuracy                           0.89      6000
   macro avg       0.83      0.91      0.86      6000
weighted avg       0.92      0.89      0.90      6000

New VotingClassifier model saved to 'voting_classifier_optimal_threshold.pkl'


### Updated Class Specific Threshold Prediction Function

In [7]:
import joblib
import json
import pandas as pd
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report
from loan_data_utils import load_and_preprocess_data
from sklearn.model_selection import train_test_split

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load and preprocess data (assuming this function is defined in loan_data_utils)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
categorical_columns = ['sex', 'education', 'marriage']
target = 'default_payment_next_month'
X, y = load_and_preprocess_data(url, categorical_columns, target)

# Split the data into training and testing sets
# stratify=y keeps the class ratio of y in both splits; random_state=42 makes
# the 80/20 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Load the best VotingClassifier model with optimal thresholds
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts from a trusted source.
voting_clf = joblib.load('voting_classifier_optimal_threshold.pkl')

# Load the optimal thresholds from the file
with open('optimal_thresholds.json', 'r') as json_file:
    optimal_thresholds = json.load(json_file)

# Per-class cut-offs passed to predict_with_class_specific_thresholds below.
threshold_class_1 = optimal_thresholds['threshold_class_1']
threshold_class_0 = optimal_thresholds['threshold_class_0']

# Function to apply class-specific thresholds
def predict_with_class_specific_thresholds(model, X, threshold_class_1, threshold_class_0):
    """Turn predicted probabilities into 0/1 labels using one cut-off per class.

    Only the model's probabilities are consulted, so the function works on
    unlabeled data.

    Decision rule:
        * a row is labelled 1 when P(class 1) >= ``threshold_class_1`` and
          P(class 0) < ``threshold_class_0``;
        * when both cut-offs are reached, class 0 takes precedence;
        * when neither cut-off is reached, the row defaults to 0.

    Args:
        model: Fitted estimator exposing ``predict_proba``; column 0 is read
            as the class-0 probability and column 1 as the class-1 probability.
        X: Feature matrix passed to ``model.predict_proba``.
        threshold_class_1: Minimum class-1 probability to vote for class 1.
        threshold_class_0: Minimum class-0 probability to force class 0.

    Returns:
        1-D float array of 0.0/1.0 labels, one per row of ``X``.
    """
    class_proba = model.predict_proba(X)

    votes_for_one = class_proba[:, 1] >= threshold_class_1
    forced_to_zero = class_proba[:, 0] >= threshold_class_0

    # Class 0 overrides class 1 wherever both cut-offs are satisfied.
    return np.where(votes_for_one & ~forced_to_zero, 1.0, 0.0)

# Predict with the best VotingClassifier model using the optimal thresholds
# y_test is not passed here: predictions depend only on the model's
# probabilities for X_test and the two thresholds.
y_pred_optimal = predict_with_class_specific_thresholds(voting_clf, X_test, threshold_class_1, threshold_class_0)

# Evaluate the performance of the VotingClassifier
# zero_division=0 makes precision report 0 instead of warning when a class is
# never predicted.
recall_1 = recall_score(y_test, y_pred_optimal, pos_label=1)
precision_1 = precision_score(y_test, y_pred_optimal, pos_label=1, zero_division=0)
recall_0 = recall_score(y_test, y_pred_optimal, pos_label=0)
precision_0 = precision_score(y_test, y_pred_optimal, pos_label=0, zero_division=0)
f1_macro = f1_score(y_test, y_pred_optimal, average='macro')
accuracy = accuracy_score(y_test, y_pred_optimal)

# Print the evaluation metrics
print(f'Recall Class 1: {recall_1:.4f}')
print(f'Precision Class 1: {precision_1:.4f}')
print(f'Recall Class 0: {recall_0:.4f}')
print(f'Precision Class 0: {precision_0:.4f}')
print(f'F1 Macro: {f1_macro:.4f}')
print(f'Accuracy: {accuracy:.4f}')

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_optimal))

# Save the new VotingClassifier model
# NOTE(review): voting_clf is not modified after being loaded in this cell, so
# this writes the same object back to the file it was read from.
joblib.dump(voting_clf, 'voting_classifier_optimal_threshold.pkl')

print("New VotingClassifier model saved to 'voting_classifier_optimal_threshold.pkl'")


Recall Class 1: 0.6398
Precision Class 1: 0.4424
Recall Class 0: 0.7710
Precision Class 0: 0.8829
F1 Macro: 0.6731
Accuracy: 0.7420

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.77      0.82      4673
           1       0.44      0.64      0.52      1327

    accuracy                           0.74      6000
   macro avg       0.66      0.71      0.67      6000
weighted avg       0.79      0.74      0.76      6000

New VotingClassifier model saved to 'voting_classifier_optimal_threshold.pkl'


### Summary Explanation

**Problem:**

When using a `VotingClassifier` to predict class labels, the first evaluation applied the class-specific thresholds incorrectly. The thresholds were intended to improve precision and recall for each class, but the prediction step consulted the true labels, so the metrics reported for that version (accuracy 0.8917, F1 macro 0.8610) were inflated and are not a valid estimate of the model's performance.

1. **Initial Incorrect Implementation**:
   - The first version of the threshold application code incorrectly referenced the true class labels (`y_test`) when applying thresholds, which is not how prediction should work. Predictions should be based solely on the model's output probabilities and not the actual class labels.

   ```python
   def predict_with_class_specific_thresholds(model, X_test, y_test, threshold_class_1, threshold_class_0):
       y_proba = model.predict_proba(X_test)[:, 1]
       y_pred = np.zeros_like(y_proba)

       # Incorrectly uses y_test
       y_pred[(y_proba >= threshold_class_1) & (y_test == 1)] = 1
       y_pred[(y_proba >= threshold_class_0) & (y_test == 0)] = 1

       return y_pred
   ```

2. **Correct Implementation**:
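   - The correct implementation involves using only the model's predicted probabilities for each class to apply the class-specific thresholds. This ensures that predictions are based solely on the output of the model. Note the precedence this code implies: a sample is labelled class 1 only when its class-1 probability reaches `threshold_class_1` and its class-0 probability stays below `threshold_class_0`. If both thresholds are met, the class-0 assignment runs last and wins; if neither is met, the sample keeps the default label 0.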

   ```python
   def predict_with_class_specific_thresholds(model, X, threshold_class_1, threshold_class_0):
       y_proba = model.predict_proba(X)
       y_pred = np.zeros(y_proba.shape[0])

       # Apply thresholds to obtain predictions
       y_pred[y_proba[:, 1] >= threshold_class_1] = 1  # Predict class 1 for probabilities above threshold_class_1
       y_pred[y_proba[:, 0] >= threshold_class_0] = 0  # Predict class 0 for probabilities above threshold_class_0

       return y_pred
   ```

**Resolution:**

1. **Load and Preprocess Data**:
   - Ensure consistent data loading and preprocessing.

   ```python
   X, y = load_and_preprocess_data(url, categorical_columns, target)
   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
   ```

2. **Apply Class-Specific Thresholds Correctly**:
   - Use the correct function to apply thresholds based solely on predicted probabilities.

   ```python
   def predict_with_class_specific_thresholds(model, X, threshold_class_1, threshold_class_0):
       y_proba = model.predict_proba(X)
       y_pred = np.zeros(y_proba.shape[0])

       # Apply thresholds to obtain predictions
       y_pred[y_proba[:, 1] >= threshold_class_1] = 1  # Predict class 1 for probabilities above threshold_class_1
       y_pred[y_proba[:, 0] >= threshold_class_0] = 0  # Predict class 0 for probabilities above threshold_class_0

       return y_pred
   ```


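By applying the class-specific thresholds using only the predicted probabilities, we removed the label leakage from the evaluation. The corrected metrics are lower than those of the first version (accuracy 0.7420 vs. 0.8917, F1 macro 0.6731 vs. 0.8610), but they are the valid estimate of how the model performs on unseen data. They show the balance between precision and recall that these thresholds actually achieve for each class: recall 0.6398 and precision 0.4424 for class 1, and recall 0.7710 and precision 0.8829 for class 0. Any further threshold tuning should be judged against these numbers, so that the predictions reflect the underlying distribution and importance of each class.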