In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.frozen import FrozenEstimator
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import fbeta_score
from skrub import ToCategorical, MinHashEncoder, TableVectorizer

from churn_classification_engine.config import settings

In [2]:
DATA_PATH = ".." / settings.data_dir / "train.csv"

TARGET: str = "CHURN"

In [3]:
df = pd.read_csv(DATA_PATH, index_col="CUSTOMER_ID")

In [4]:
df.head()

Unnamed: 0_level_0,COUNTRY_CODE,PLAN_AGE,ACTIVITY_DAY_COUNT,SUPPORT_TICKET_COUNT,DELINQUENCY_DAY_COUNT,PLAN_NAME,BILLING_PERIOD,CURRENCY,MONTHLY_PAYMENT_AMOUNT,DAYS_SINCE_LAST_LOGIN,CHURN
CUSTOMER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
114383,ES,11,,,,Starter,month,USD,70,,0
116777,GB,7,,,,Pro,month,USD,160,,0
106515,US,21,,,,Pro,year,USD,110,,0
148924,DE,3,,,,Starter,month,EUR,75,,1
193586,FR,1,13.0,,1.0,Starter,month,EUR,75,0.0,0


In [5]:
X = df.drop(columns=TARGET)
y = df[TARGET]

## Threshold tuning

By selecting the right threshold based on our application needs, we can significantly improve the model's real-world performance.

In [12]:
scores = []
y_pred = pd.Series()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    pipeline = Pipeline(
        steps=[
            (
                "tablevectorizer",
                TableVectorizer(
                    high_cardinality=MinHashEncoder(n_components=35),
                    low_cardinality=ToCategorical(),
                ),
            ),
            (
                "histgradientboostingclassifier",
                HistGradientBoostingClassifier(
                    class_weight="balanced",
                    learning_rate=0.024803608992165237,
                    max_iter=146,
                    max_depth=12,
                    min_samples_leaf=14,
                    max_bins=137,
                    random_state=42,
                ),
            ),
        ]
    )
    pipeline.fit(X.iloc[train_index], y.iloc[train_index])

    calibrated_pipeline = CalibratedClassifierCV(
        FrozenEstimator(pipeline), method="sigmoid"
    )
    calibrated_pipeline.fit(X.iloc[train_index], y.iloc[train_index])

    fold_y_pred = calibrated_pipeline.predict_proba(X.iloc[test_index])[:, 1]
    y_pred = pd.concat([y_pred, pd.Series(fold_y_pred, index=y.iloc[test_index].index)])

    fold_scores = {}

    for t in np.arange(0, 1, 0.05):
        fold_scores[t] = fbeta_score(y.iloc[test_index], fold_y_pred >= t, beta=2)
    scores.append(fold_scores)

  y_pred = pd.concat([y_pred, pd.Series(fold_y_pred, index=y.iloc[test_index].index)])


In [14]:
pd.DataFrame(scores).mean()

0.00    0.484430
0.05    0.555506
0.10    0.574853
0.15    0.563493
0.20    0.531698
0.25    0.470920
0.30    0.407856
0.35    0.278476
0.40    0.142206
0.45    0.060388
0.50    0.030160
0.55    0.025837
0.60    0.023891
0.65    0.015869
0.70    0.009697
0.75    0.003957
0.80    0.002250
0.85    0.000450
0.90    0.000000
0.95    0.000000
dtype: float64

To maximize our $F_2$**Score**, which  is our north star metric, we have to use a threshold of **.1**.

In [17]:
(y_pred >= 0.1).mean()

0.5806289881494986

Consider more than the half of the dataset as customer at-risk is certainly too big for the Retention team.  
Then we can consider several levels of risk, to priorize the retention tasks.  

In [37]:
for i in np.arange(0, 1, 0.05):
    print(f"Threshold: {i}")
    mask = y_pred >= i
    print(f"Part of churners above this threshold: {df.loc[mask, 'CHURN'].mean()}")
    print(f"# of customers above the threshold: {y_pred[mask].shape[0]}")
    print("======")

Threshold: 0.0
Part of churners above this threshold: 0.15819279854147675
# of customers above the threshold: 87760
Threshold: 0.05
Part of churners above this threshold: 0.20448319181461805
# of customers above the threshold: 66069
Threshold: 0.1
Part of churners above this threshold: 0.2402661119397127
# of customers above the threshold: 50956
Threshold: 0.15000000000000002
Part of churners above this threshold: 0.27511483663353486
# of customers above the threshold: 38533
Threshold: 0.2
Part of churners above this threshold: 0.30301415487094085
# of customers above the threshold: 30025
Threshold: 0.25
Part of churners above this threshold: 0.3293768545994065
# of customers above the threshold: 22242
Threshold: 0.30000000000000004
Part of churners above this threshold: 0.35438997952547274
# of customers above the threshold: 16606
Threshold: 0.35000000000000003
Part of churners above this threshold: 0.3940288713910761
# of customers above the threshold: 9144
Threshold: 0.4
Part of chu

**High-risk customers: >=.75**  
**Risky customers: >=.5**  
**Moderate-risk customers: >= .3**  
**Low-risk customers: >= .1**  
**No-risk customers: < .1**  

If we suppose that Retention Retention team can only handle 20% of the customer base.

In [19]:
print(
    f"Then, we have to take a threshold of {y_pred.sort_values(ascending=False).iloc[int(df.shape[0] / 5)]}"
)

Then, we have to take a threshold of 0.29217426861851425


To simplify, the Retention team have to consider High-risk, risky and moderate-risk customers