In [43]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.frozen import FrozenEstimator
from sklearn.metrics import log_loss, brier_score_loss, fbeta_score
from skrub import ToCategorical, MinHashEncoder, TableVectorizer

from churn_classification_engine.config import settings

In [45]:
DATA_PATH = ".." / settings.data_dir / "train.csv"

In [46]:
train_df = pd.read_csv(DATA_PATH, index_col="CUSTOMER_ID")

In [47]:
train_df.head()

Unnamed: 0_level_0,COUNTRY_CODE,PLAN_AGE,ACTIVITY_DAY_COUNT,SUPPORT_TICKET_COUNT,DELINQUENCY_DAY_COUNT,PLAN_NAME,BILLING_PERIOD,CURRENCY,MONTHLY_PAYMENT_AMOUNT,DAYS_SINCE_LAST_LOGIN,CHURN
CUSTOMER_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
114383,ES,11,,,,Starter,month,USD,70,,0
116777,GB,7,,,,Pro,month,USD,160,,0
106515,US,21,,,,Pro,year,USD,110,,0
148924,DE,3,,,,Starter,month,EUR,75,,1
193586,FR,1,13.0,,1.0,Starter,month,EUR,75,0.0,0


In [48]:
X = train_df.drop(columns=["CHURN", "COUNTRY_CODE"])
y = train_df["CHURN"]

### Calibration

Calibration refers to the process of adjusting the predicted probabilities of a model to better match the actual likelihood of an event.  
A well-calibrated model provides probability estimates that reflect the true underlying probabilities of the outcomes.  Then, it improves probability estimates, which can be useful for tasks where decision-making depends on confidence levels, and enhanced model evaluation for metrics like log loss, which rely on accurate probability estimates.

In [53]:
uncalibrated_log_loss = []
uncalibrated_brier_score = []
uncalibrated_fbeta_score = []
calibrated_log_loss = []
calibrated_brier_score = []
calibrated_fbeta_score = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    pipeline = Pipeline(
        steps=[
            (
                "tablevectorizer",
                TableVectorizer(
                    # high_cardinality=MinHashEncoder(n_components=35),
                    low_cardinality=ToCategorical(),
                ),
            ),
            (
                "histgradientboostingclassifier",
                HistGradientBoostingClassifier(
                    class_weight="balanced",
                    learning_rate=0.03239405091048051,
                    max_iter=108,
                    max_depth=5,
                    min_samples_leaf=48,
                    max_bins=155,
                    random_state=42,
                ),
            ),
        ]
    )
    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    pipeline.fit(X_train, y_train)

    calibrated_pipeline = CalibratedClassifierCV(
        FrozenEstimator(pipeline), method="sigmoid"
    )
    calibrated_pipeline.fit(X_train, y_train)

    uncalibrated_y_pred = pipeline.predict_proba(X_test)[:, 1]
    calibrated_y_pred = calibrated_pipeline.predict_proba(X_test)[:, 1]

    uncalibrated_log_loss.append(log_loss(y_test, uncalibrated_y_pred))
    uncalibrated_brier_score.append(
        brier_score_loss(y_test, uncalibrated_y_pred)
    )
    calibrated_log_loss.append(log_loss(y_test, calibrated_y_pred))
    calibrated_brier_score.append(
        brier_score_loss(y_test, calibrated_y_pred)
    )

In [54]:
pd.DataFrame(
    {
        "uncalibrated_log_loss": uncalibrated_log_loss,
        "calibrated_log_loss": calibrated_log_loss,
        "uncalibrated_brier_score": uncalibrated_brier_score,
        "calibrated_brier_score": calibrated_brier_score,
    }
).mean()

uncalibrated_log_loss       0.579701
calibrated_log_loss         0.373535
uncalibrated_brier_score    0.206585
calibrated_brier_score      0.117826
dtype: float64

Thanks to calibration, we can consider that our probability estimate is closer to the reality.