<a href="https://colab.research.google.com/github/wandb/edu/blob/main/decision-opt-course/1_profit_curves.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
<!--- @wandbcode{decisionopt-nb1} -->

# Lesson 1 - Profit Curves

## A Basic Model As A Starting Point

Model building isn't our focus, so I won't go into great depth on it. The data comes from [this dataset](https://www.kaggle.com/datasets/blastchar/telco-customer-churn), which we use to predict customer churn at a telecom company.

In [None]:
%pip install wandb xgboost

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import wandb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.compose import make_column_selector as selector
from wandb.xgboost import WandbCallback
from xgboost import XGBClassifier
from pathlib import Path

plt.style.use('fivethirtyeight')

In [None]:
# Let's log into W&B to load data and track our experiments
wandb.login()

In [None]:
# We will load dataset from wandb Artifact
# The run is used as a context manager, so it is scoped to this download step only.
with wandb.init(project="profit_curves") as run:
    artifact = run.use_artifact('wandb_course/decision_opt/telco-customer-churn:latest', type='dataset')
    artifact_dir = artifact.download()
    # Keep the download location as a Path so later cells can join file names onto it
    path = Path(artifact_dir)

In [None]:
# Read the churn CSV from the downloaded artifact directory and preview the first rows
data = pd.read_csv(path/"WA_Fn-UseC_-Telco-Customer-Churn.csv")
data.head()

In [None]:
# First experiment: baseline model trained on all selected features
run1 = wandb.init(project="profit_curves")

# The label is True when the customer churned ("Yes") and False otherwise
target = "Churn"
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(target, axis=1), data[target] == "Yes", test_size=0.2, random_state=0
)
cols_to_use = [
    "tenure",
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "MonthlyCharges",
]

# One-hot encode the string columns. handle_unknown="ignore" keeps prediction
# from failing if the test split contains a category that never appeared in
# the training split (it is encoded as all zeros instead of raising).
preprocessor = ColumnTransformer(
    transformers=[
        (
            "one_hot",
            OneHotEncoder(handle_unknown="ignore"),
            selector(dtype_include="object"),
        )
    ],
    remainder="passthrough",  # Leave numerical variables unchanged
)

# Create pipeline
pipeline = Pipeline([("preprocessor", preprocessor), ("classifier", XGBClassifier())])
pipeline.fit(X_train[cols_to_use], y_train)

# Column 1 of predict_proba is the predicted probability of the positive (churn) class
y_pred = pipeline.predict_proba(X_test[cols_to_use])[:, 1]
roc_auc = roc_auc_score(y_test, y_pred)
log_loss_val = log_loss(y_test, y_pred)

# Log metrics to W&B
wandb.log({"roc_auc": roc_auc, "log_loss": log_loss_val})

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Turn the churn probabilities into hard 0/1 predictions at a chosen cutoff
threshold = 0.5  # Set your custom threshold here
y_pred_binary = (y_pred >= threshold).astype(int)
cm = confusion_matrix(y_test, y_pred_binary)

# The matrix is laid out as
# [[TN, FP],
#  [FN, TP]]

# Send the confusion matrix to W&B as a chart
wandb.log(
    {
        "confusion_matrix": wandb.plot.confusion_matrix(
            probs=None,
            y_true=y_test.tolist(),
            preds=y_pred_binary.tolist(),
            class_names=["Not Churn", "Churn"],
        )
    }
)


print("Confusion Matrix:", cm, sep="\n")

In [None]:
def profit_curve(y_true, y_pred, payoff_matrix, n_points=101):
    """Calculate profit curve for a binary classifier.

    For each of ``n_points`` evenly spaced thresholds in [0, 1], predictions
    with ``y_pred >= threshold`` are treated as positive, the 2x2 confusion
    matrix ``[[TN, FP], [FN, TP]]`` is counted, and the profit is the sum of
    the element-wise product of that matrix with ``payoff_matrix``.

    The confusion matrix is counted directly so it is always 2x2, even when
    ``y_true`` or the thresholded predictions contain only one class (for
    example inside a small customer segment).

    Args:
        y_true (array-like): True labels, boolean or 0/1.
        y_pred (array-like): Predicted probabilities, same length as y_true.
        payoff_matrix (array-like): 2x2 payoff matrix laid out as
            [[TN, FP], [FN, TP]].
        n_points (int): Number of points to calculate.

    Returns:
        tuple: x and y values for the profit curve (an array of thresholds
        and a list of profits of the same length).

    Raises:
        ValueError: If payoff_matrix is not 2x2.
    """
    actual = np.asarray(y_true).astype(bool)
    scores = np.asarray(y_pred)
    payoffs = np.asarray(payoff_matrix)
    if payoffs.shape != (2, 2):
        raise ValueError(
            f"payoff_matrix must have shape (2, 2), got {payoffs.shape}"
        )

    # Calculate profit for each threshold
    thresholds = np.linspace(0, 1, n_points)
    profits = []
    for threshold in thresholds:
        flagged = scores >= threshold
        # Rows are the true class, columns the predicted class
        cm = np.array(
            [
                [np.sum(~actual & ~flagged), np.sum(~actual & flagged)],
                [np.sum(actual & ~flagged), np.sum(actual & flagged)],
            ]
        )
        profits.append((cm * payoffs).sum())
    # profits stays a plain list so callers can keep using list methods on it
    return thresholds, profits


# Costs $80 to give discount to a customer who will not churn
# Worth $200 to give discount to a customer who will otherwise churn
# Entries follow the confusion-matrix layout [[TN, FP], [FN, TP]]; the TN and FN
# payoffs are 0 here.
payoff_matrix = np.array([[0, -80], [0, 200]])
# Profit at each candidate threshold for the first model's test-set predictions
first_thresholds, first_profits = profit_curve(y_test, y_pred, payoff_matrix)
print(first_thresholds)
print(first_profits)

In [None]:
# Log your data as a wandb.Table
def log_profit_curve(thresholds, profits):
    """Log a profit curve to W&B as a table and as a line plot.

    Args:
        thresholds (iterable): Threshold values (x axis).
        profits (iterable): Profit at each threshold (y axis).
    """
    # One [threshold, profit] row per point on the curve
    rows = [list(point) for point in zip(thresholds, profits)]
    profit_curve_table = wandb.Table(data=rows, columns=["Threshold", "Profit"])
    line_plot = wandb.plot.line(
        profit_curve_table,
        x="Threshold",
        y="Profit",
        title="Value of Offering Discounts to Prevent Churn",
    )
    wandb.log(
        {
            "profit_curve_table": profit_curve_table,
            "profit_curve": line_plot,
        }
    )

log_profit_curve(first_thresholds, first_profits)

In [None]:
def find_best_threshold_and_profit(thresholds, profits):
    """Find the best threshold and profit.

    Works with any array-like inputs (lists, tuples, NumPy arrays, pandas
    Series). When several thresholds share the maximum profit, the first one
    is returned.

    Args:
        thresholds (array-like): Threshold values.
        profits (array-like): Profit values, aligned position-by-position
            with thresholds.

    Returns:
        tuple: Best threshold and profit.

    Raises:
        ValueError: If profits is empty.
    """
    threshold_values = np.asarray(thresholds)
    profit_values = np.asarray(profits)
    if profit_values.size == 0:
        raise ValueError("profits must contain at least one value")
    # argmax returns the first position of the maximum
    best_index = int(np.argmax(profit_values))
    return threshold_values[best_index], profit_values[best_index]
    
# Pick the profit-maximising threshold on the first model's profit curve
best_thresh, best_profits = find_best_threshold_and_profit(first_thresholds, first_profits)

# Record the optimum alongside the other metrics in W&B
wandb.log({"best_threshold": best_thresh, "best_profit": best_profits})

print(f"Ideal threshold is {best_thresh:.2f} which yields profits of ${best_profits:.0f}")


In [None]:
# We'll finish our first experiment (run) here. 
wandb.finish()

## Prioritizing Work

In [None]:
# start a new W&B run
run2 = wandb.init(project="profit_curves")

field_lost_in_broken_pipeline = "Contract"

# refit model and get new predictions without this field
new_cols_to_use = [col for col in cols_to_use if col != field_lost_in_broken_pipeline]
# Attach the W&B callback to the classifier itself rather than passing it to
# fit(): recent XGBoost releases no longer accept `callbacks` as a fit() keyword,
# while setting it as an estimator parameter works on both old and new versions.
pipeline.set_params(classifier__callbacks=[WandbCallback()])
pipeline.fit(X_train[new_cols_to_use], y_train)
# Column 1 of predict_proba is the predicted probability of the positive (churn) class
new_y_pred = pipeline.predict_proba(X_test[new_cols_to_use])[:, 1]

roc_auc = roc_auc_score(y_test, new_y_pred)
log_loss_val = log_loss(y_test, new_y_pred)

# Log metrics to W&B
wandb.log({"roc_auc": roc_auc, "log_loss": log_loss_val})

# Rebuild the profit curve with the same payoff matrix so the optimum is
# directly comparable with the first run
new_thresholds, new_profits = profit_curve(y_test, new_y_pred, payoff_matrix)
log_profit_curve(new_thresholds, new_profits)
new_best_thresh, new_best_profits = find_best_threshold_and_profit(new_thresholds, new_profits)
wandb.log({"best_threshold": new_best_thresh, "best_profit": new_best_profits})
print(f"Ideal threshold is {new_best_thresh:.2f} which yields profits of ${new_best_profits:.0f}")

In [None]:
# Let's finish our second run here
wandb.finish()

In [None]:
print(f"Change in profit due to lost field: {new_best_profits - best_profits:.0f}")

The most important field is `tenure`. It is an interesting exercise to modify this code slightly and see how the results change when you remove `tenure`.

## Model Improvements

In [None]:
# Let's start our third run here. Because it uses a synthetic dataset, we'll use tags to indicate that
run3 = wandb.init(project="profit_curves", tags=["synthetic_data"])

In [None]:
def make_synth_accurate_data(y_pred, y_true, closeness_factor=0.1):
    """Build synthetic predictions that sit closer to the true labels.

    Each prediction is moved toward its true label by ``closeness_factor``
    times its error (``y_pred - y_true``). The AUC of the result is printed
    as a side effect.

    Args:
        y_pred (array-like): Predicted probabilities.
        y_true (array-like): True labels.
        closeness_factor (float): Fraction of each prediction's error to
            remove; 0 leaves the predictions unchanged.

    Returns:
        array: Synthetic data.
    """
    residuals = y_pred - y_true
    adjusted = y_pred - residuals * closeness_factor
    adjusted_auc = roc_auc_score(y_true, adjusted)
    message = f"closeness_factor: {closeness_factor:.2f}.   AUC of synthetic data: {adjusted_auc:.2f}"
    print(message)
    return adjusted


for i in np.linspace(0, 0.2, 11):
    make_synth_accurate_data(y_pred, y_test, i)

A closeness factor of 0.14 creates synthetic data that's a proxy for how good we expect predictions to be after this modeling improvement (recall that we hypothesized the modeling improvement would yield an AUC of 0.9)

In [None]:
synth_preds = make_synth_accurate_data(y_pred, y_test, 0.14)
# Printed in order: mean synthetic prediction, mean model prediction, and the
# mean of the boolean test labels (the observed churn rate in the test set)
print(
    f"""
Sanity check that means make sense:
{synth_preds.mean():.3f}
{y_pred.mean():.3f}
{y_test.mean():.3f}
"""
)

Now let's look at the profit curve we expect after this modeling improvement. We'll compare it to the profit curve from the best model built so far.

In [None]:
# Synthetic predictions standing in for the hypothesized improved model
synth_preds = make_synth_accurate_data(y_pred, y_test, 0.14)
roc_auc = roc_auc_score(y_test, synth_preds)
log_loss_val = log_loss(y_test, synth_preds)

# Log metrics to W&B
wandb.log({"roc_auc": roc_auc, "log_loss": log_loss_val})
# Same payoff matrix as the earlier runs, so the best profit is comparable
synth_pred_thresholds, synth_pred_profits = profit_curve(y_test, synth_preds, payoff_matrix)
log_profit_curve(synth_pred_thresholds, synth_pred_profits)
synth_best_thresh, synth_best_profits = find_best_threshold_and_profit(synth_pred_thresholds, synth_pred_profits)
wandb.log({"best_threshold": synth_best_thresh, "best_profit": synth_best_profits})
print(f"Ideal threshold is {synth_best_thresh:.2f} which yields profits of ${synth_best_profits:.0f}")

In [None]:
wandb.finish()

## Beyond Simple-Number Decision Thresholds

In [None]:
data.MonthlyCharges.hist()

In [None]:
# X_test came out of train_test_split; take an explicit copy before adding
# columns so pandas does not emit SettingWithCopyWarning.
X_test = X_test.copy()

# Previously assumed this was 200 for all customers.
# Mean MonthlyCharge is 68
# Use the frame's own column so the values line up row-for-row without relying
# on index alignment with the full dataset.
X_test["value_of_promoting_to_churning_customer"] = X_test.MonthlyCharges * 3

# Previously assumed this was -80 for all customers.
X_test["cost_of_promoting_to_non_churning_customer"] = -80

In [None]:
def group_specific_profit_curve(
    X_test,
    y_test,
    y_pred,
    low_thresh,
    high_thresh,
    cost_of_false_positive=-80,
    value_multiplier=3,
):
    """Find the best decision threshold for one MonthlyCharges segment.

    Selects the customers whose MonthlyCharges fall in
    ``(low_thresh, high_thresh]``, builds a payoff matrix from the segment's
    mean monthly charge, and returns the profit-maximising threshold for that
    segment.

    Args:
        X_test (DataFrame): Test features; must contain a MonthlyCharges column.
        y_test (array-like): True labels, aligned row-for-row with X_test.
        y_pred (array-like): Predicted probabilities, aligned row-for-row
            with X_test.
        low_thresh (float): Exclusive lower bound on MonthlyCharges.
        high_thresh (float): Inclusive upper bound on MonthlyCharges.
        cost_of_false_positive (float): Payoff of a false positive
            (default -80).
        value_multiplier (float): Payoff of a true positive, expressed as a
            multiple of the segment's mean monthly charge (default 3).

    Returns:
        tuple: Best threshold and the profit it yields for this segment.

    Raises:
        ValueError: If no customer falls inside the requested range.
    """
    charges = X_test.MonthlyCharges
    # Plain boolean array so rows are selected by position in both y_test and
    # y_pred, whatever their index labels are
    in_group = ((charges > low_thresh) & (charges <= high_thresh)).to_numpy()
    if not in_group.any():
        # An empty segment has no mean charge, so the payoff would be NaN
        raise ValueError(
            f"No customers with MonthlyCharges in ({low_thresh}, {high_thresh}]"
        )
    y_test_in_group = y_test[in_group]
    y_pred_in_group = y_pred[in_group]
    mean_monthly_charge = charges[in_group].mean()
    payoff_matrix = np.array(
        [[0, cost_of_false_positive], [0, value_multiplier * mean_monthly_charge]]
    )
    thresholds, profits = profit_curve(y_test_in_group, y_pred_in_group, payoff_matrix)
    best_thresh, best_profits = find_best_threshold_and_profit(thresholds, profits)
    return best_thresh, best_profits

Divide customers into four groups by monthly charge and find a separate threshold for each

In [None]:
# Best threshold for customers paying up to $20/month
thresh_low, profits_low = group_specific_profit_curve(
    X_test, y_test, y_pred, low_thresh=0, high_thresh=20
)
# Best threshold for customers paying more than $20 and up to $60/month
thresh_medium, profits_medium = group_specific_profit_curve(
    X_test, y_test, y_pred, low_thresh=20, high_thresh=60
)
# Best threshold for customers paying more than $60 and up to $100/month
thresh_high, profits_high = group_specific_profit_curve(
    X_test, y_test, y_pred, low_thresh=60, high_thresh=100
)
# Best threshold for customers paying more than $100/month
thresh_very_high, profits_very_high = group_specific_profit_curve(
    X_test, y_test, y_pred, low_thresh=100, high_thresh=200
)

# Compare the summed per-group optimum with the single-threshold optimum
print(
    f"""
With flexible thresholds, the profit is {(profits_low + profits_medium + profits_high + profits_very_high):.0f}.
Previous profit was {best_profits}"""
)