In [70]:
import warnings
warnings.filterwarnings("ignore")

import time
from tqdm import tqdm

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import KFold, StratifiedKFold

from typing import List, Optional, Tuple
from functools import partial

import catboost
from catboost import CatBoostClassifier, Pool

In [5]:
def create_numerical_aggs(data: pd.DataFrame,
                          groupby_id: str,
                          aggs: dict,
                          prefix: str = None,
                          suffix: str = None,
                          ) -> pd.DataFrame:
    if not prefix:
        prefix = ""
    if not suffix:
        suffix = ""

    data_grouped = data.groupby(groupby_id)
    stats = data_grouped.agg(aggs)
    stats.columns = [f"{prefix}{feature}_{stat}{suffix}".lower() for feature, stat in stats]
    stats = stats.reset_index()

    return stats

def find_max_consequtive(x):
    if isinstance(x, int):
        return x
    max_c, run_m = 1, 1
    for i in range(1, len(x)):
        if x[i] - x[i-1] == 1:
            run_m += 1
        else:
            if max_c < run_m:
                max_c = run_m
            run_m = 1
    return max_c

def find_min_delta(x):
    min_delta = 365
    length = len(x)
    if length < 2:
        return min_delta
    
    for i in range(1, len(x)):
        delta = x[i] - x[i-1]
        if delta < min_delta:
            min_delta = delta
    return min_delta

def find_mean_delta(x):
    mean_delta = 365
    length = len(x)
    if length < 2:
        return mean_delta
    
    deltas = []
    for i in range(1, len(x)):
        deltas.append(x[i] - x[i-1])
    return np.mean(deltas)

def find_max_delta(x):
    max_delta = 0
    length = len(x)
    if length < 2:
        return max_delta
    
    for i in range(1, len(x)):
        delta = x[i] - x[i-1]
        if delta > max_delta:
            max_delta = delta
    return max_delta

In [6]:
users = pd.read_csv("/kaggle/input/onlineretail/OnlineRetail.csv", parse_dates=["InvoiceDate"], encoding='unicode_escape')
users["InvoiceDate"] = pd.to_datetime(users["InvoiceDate"].dt.date)
users.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01,3.39,17850.0,United Kingdom


In [7]:
# taking only UK
users_uk = users[users["Country"] == "United Kingdom"]
users_uk.drop("Country", axis=1, inplace=True)

# split into target and working datasets: better select more months in target, e.g. 3 months!
users_nov = users_uk[(users_uk["InvoiceDate"] > "2011-08-31") & (users_uk["InvoiceDate"] < "2011-12-01")]
users_prev = users_uk[users_uk["InvoiceDate"] < "2011-09-01"]

# remove unavailable customer ids
users_prev = users_prev[users_prev["CustomerID"].notna()]
users_nov = users_nov[users_nov["CustomerID"].notna()]

# compute churn target: users who never order anything in november
users_prev["target"] = 1 - users_prev["CustomerID"].isin(users_nov["CustomerID"]).astype(int)

# lower description
users_prev["Description"] = users_prev["Description"].str.lower()

# revenue
users_prev["Revenue"] = users_prev["UnitPrice"] * users_prev["Quantity"]

# add date features
users_prev["DayMonth"] = users_prev["InvoiceDate"].dt.day.astype("str")
users_prev["DayWeek"] = users_prev["InvoiceDate"].dt.dayofweek.astype("str")
users_prev["Month"] = users_prev["InvoiceDate"].dt.month.astype("str")
users_prev["WeekOfYear"] = users_prev["InvoiceDate"].dt.week
users_prev["DayOfYear"] = users_prev["InvoiceDate"].dt.dayofyear

# calculate total customer's statistics
aggs = {"Quantity": ["sum", "max", "min", "count"], "UnitPrice": ["mean", "max", "sum", "min"]}
stats = create_numerical_aggs(users_prev, groupby_id="CustomerID", aggs=aggs, suffix="_by_id")
users_prev = pd.merge(users_prev, stats, on="CustomerID", how="left")

# number of returned orders
qneg_users = users_prev[users_prev["Quantity"] < 0]
qneg_users = qneg_users.groupby("CustomerID", as_index=False)["Quantity"].count() # ["Quantity"].transform("count")
qneg_users.columns = ["CustomerID", "return_cnt"]

users_prev = users_prev.merge(qneg_users, on="CustomerID", how="left")
users_prev.fillna(0, inplace=True)

# number of max consequtive days of ordering
tmp = pd.concat([users_prev["CustomerID"], users_prev["DayOfYear"]], axis=1).sort_values(["CustomerID", "DayOfYear"]).sort_values(["CustomerID", "DayOfYear"])
tmp.drop_duplicates(inplace=True)
tmp = tmp.groupby("CustomerID", as_index=True).agg({"DayOfYear": lambda x: (x - x.min() + 1)})
tmp = tmp["DayOfYear"].apply(lambda x: find_max_consequtive(x.tolist()))
tmp = tmp.reset_index()
tmp = tmp.rename(columns={"DayOfYear": "conseq"})
users_prev = users_prev.merge(tmp, on="CustomerID", how="left")

# min, max, mean days between orders
tmp = users_prev.groupby(["CustomerID"], as_index=False).agg({"DayOfYear": lambda x: sorted(list(set(x)))}).rename(columns={"DayOfYear": "DaysList"})
tmp["min_delta"] = tmp["DaysList"].apply(find_min_delta)
tmp["max_delta"] = tmp["DaysList"].apply(find_max_delta)
tmp["mean_delta"] = tmp["DaysList"].apply(find_mean_delta)
tmp.drop("DaysList", axis=1, inplace=True)

users_prev = users_prev.merge(tmp, on="CustomerID", how="left")

# compute monthly quantity and unit prices
aggs = {"Quantity": ["sum",],
        "Revenue": ["sum",]}

for month in users_prev["Month"].unique():
    stats = create_numerical_aggs(users_prev[users_prev["Month"] == month], groupby_id=["CustomerID", "Month"], aggs=aggs, suffix=f"_by_id_month_{month}")
    stats.drop("Month", axis=1, inplace=True)
    users_prev = users_prev.merge(stats, on=["CustomerID"], how="left")
    
users_prev.fillna(0, inplace=True)  # assuming no orders in current month

users_prev["mean_upr_mothly"] = users_prev[['revenue_sum_by_id_month_12',
                                             'revenue_sum_by_id_month_1',
                                             'revenue_sum_by_id_month_2',
                                             'revenue_sum_by_id_month_3',
                                             'revenue_sum_by_id_month_4',
                                             'revenue_sum_by_id_month_5',
                                             'revenue_sum_by_id_month_6',
                                             'revenue_sum_by_id_month_7',
                                             'revenue_sum_by_id_month_8',
                                            # 'unitprice_sum_by_id_month_9',
                                            # 'unitprice_sum_by_id_month_10'
                                           ]].mean(axis=1)

users_prev["stdv_upr_mothly"] = users_prev[['revenue_sum_by_id_month_12',
                                             'revenue_sum_by_id_month_1',
                                             'revenue_sum_by_id_month_2',
                                             'revenue_sum_by_id_month_3',
                                             'revenue_sum_by_id_month_4',
                                             'revenue_sum_by_id_month_5',
                                             'revenue_sum_by_id_month_6',
                                             'revenue_sum_by_id_month_7',
                                             'revenue_sum_by_id_month_8',
                                            # 'unitprice_sum_by_id_month_9',
                                            # 'unitprice_sum_by_id_month_10'
                                           ]].std(axis=1)

users_prev["mean_qnt_mothly"] = users_prev[['quantity_sum_by_id_month_12',
                                            'quantity_sum_by_id_month_1',
                                            'quantity_sum_by_id_month_2',
                                            'quantity_sum_by_id_month_3',
                                            'quantity_sum_by_id_month_4',
                                            'quantity_sum_by_id_month_5',
                                            'quantity_sum_by_id_month_6',
                                            'quantity_sum_by_id_month_7',
                                            'quantity_sum_by_id_month_8',
                                           # 'quantity_sum_by_id_month_9',
                                           # 'quantity_sum_by_id_month_10'
                                           ]].mean(axis=1)

users_prev["stdv_qnt_mothly"] = users_prev[['quantity_sum_by_id_month_12',
                                            'quantity_sum_by_id_month_1',
                                            'quantity_sum_by_id_month_2',
                                            'quantity_sum_by_id_month_3',
                                            'quantity_sum_by_id_month_4',
                                            'quantity_sum_by_id_month_5',
                                            'quantity_sum_by_id_month_6',
                                            'quantity_sum_by_id_month_7',
                                            'quantity_sum_by_id_month_8',
                                           # 'quantity_sum_by_id_month_9',
                                           # 'quantity_sum_by_id_month_10'
                                           ]].std(axis=1)

# lifetimes style recency: last date - first date 
tmp = users_prev.groupby("CustomerID", as_index=False).agg({"InvoiceDate": lambda x: (x.max() - x.min()).days}).rename(columns={"InvoiceDate": "LTRecency"})
users_prev = users_prev.merge(tmp, on="CustomerID", how="left")

# compute frequency
tmp = users_prev.groupby("CustomerID", as_index=False)["InvoiceNo"].count().rename(columns={"InvoiceNo": "frequency"})
users_prev = users_prev.merge(tmp, on="CustomerID", how="left")

# compute recency
df_recency = users_prev.groupby("CustomerID", as_index=False)["InvoiceDate"].max()
df_recency.columns = ['CustomerID', 'LastPurchaseDate']
recent_date = df_recency['LastPurchaseDate'].max()

df_recency['Recency'] = df_recency['LastPurchaseDate'].apply(lambda x: (recent_date - x).days)
df_recency.drop("LastPurchaseDate", axis=1, inplace=True)

users_prev = users_prev.merge(df_recency, on="CustomerID", how="left")

users_prev['R_rank'] = users_prev['Recency'].rank(ascending=False)
users_prev['F_rank'] = users_prev['frequency'].rank(ascending=True)
users_prev['M_rank'] = users_prev['Revenue'].rank(ascending=True)

# normalizing rank
users_prev['R_rank'] = (users_prev['R_rank']/users_prev['R_rank'].max())*100
users_prev['F_rank'] = (users_prev['F_rank']/users_prev['F_rank'].max())*100
users_prev['M_rank'] = (users_prev['F_rank']/users_prev['M_rank'].max())*100

# calculating RFM-score
alpha = beta = gamma = 1
mult = 0.05

users_prev['RFM_Score'] = alpha * users_prev['R_rank'] + beta * users_prev['F_rank'] + gamma * users_prev['M_rank']
users_prev['RFM_Score'] *= mult

# text features: cut on most recent 1000 tokens
tmp = users_prev.sort_values("InvoiceDate").groupby(["CustomerID"], as_index=False)["Description"].sum().rename(columns={"Description": "Text"})
tmp["Text"] = tmp["Text"].apply(lambda x: " ".join(x.split()[-1000:]))
users_prev = users_prev.merge(tmp, on="CustomerID", how="left").drop("Description", axis=1)

In [8]:
users_prev.head(2)

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerID,target,Revenue,DayMonth,DayWeek,...,mean_qnt_mothly,stdv_qnt_mothly,LTRecency,frequency,Recency,R_rank,F_rank,M_rank,RFM_Score,Text
0,536365,85123A,6,2010-12-01,2.55,17850.0,1,15.3,1,2,...,188.111111,579.484349,71,312,202,4.205177,76.582104,0.037329,4.04123,antique white wooden picture frame white finis...
1,536365,71053,6,2010-12-01,3.39,17850.0,1,20.34,1,2,...,188.111111,579.484349,71,312,202,4.205177,76.582104,0.037329,4.04123,antique white wooden picture frame white finis...


In [10]:
users_prev["CustomerID"].nunique()  # small amount of unique users

3027

In [11]:
# aggregate everything
users = users_prev.groupby("CustomerID").agg({
    "StockCode": ["last", lambda x: x.nunique()],
    "target": "last",
    "Revenue": "last",
    "DayWeek": lambda x: x.value_counts().index[0],
    "Month": "last",
    "quantity_sum_by_id": "last",
    'quantity_max_by_id': "last",
    'quantity_min_by_id': "last", 
    'quantity_count_by_id': "last",
    'unitprice_mean_by_id': "last",
    'unitprice_max_by_id': "last",
    'unitprice_sum_by_id': "last",
    'unitprice_min_by_id': "last",
    'return_cnt': "last",
    'conseq': "last",
    'min_delta': "last", 
    'max_delta': "last", 
    'mean_delta': "last",
    'quantity_sum_by_id_month_12': "last", 
    'revenue_sum_by_id_month_12': "last",
    'quantity_sum_by_id_month_1': "last", 
    'revenue_sum_by_id_month_1': "last",
    'quantity_sum_by_id_month_2': "last", 
    'revenue_sum_by_id_month_2': "last",
    'quantity_sum_by_id_month_3': "last", 
    'revenue_sum_by_id_month_3': "last",
    'quantity_sum_by_id_month_4': "last", 
    'revenue_sum_by_id_month_4': "last",
    'quantity_sum_by_id_month_5': "last", 
    'revenue_sum_by_id_month_5': "last",
    'quantity_sum_by_id_month_6': "last", 
    'revenue_sum_by_id_month_6': "last",
    'quantity_sum_by_id_month_7': "last", 
    'revenue_sum_by_id_month_7': "last",
    'quantity_sum_by_id_month_8': "last", 
    'revenue_sum_by_id_month_8': "last",
    'mean_upr_mothly': "last", 
    'stdv_upr_mothly': "last", 
    'mean_qnt_mothly': "last",
    'stdv_qnt_mothly': "last", 
    'LTRecency': "last", 
    'frequency': "last", 
    'Recency': "last", 
    'R_rank': "last",
    'F_rank': "last", 
    'M_rank': "last", 
    'RFM_Score': "last", 
    'Text': "last"
})
users.columns = [col[0] if col[0] != "StockCode" else "StockCode" + str(i+1) for i, col in enumerate(users.columns)]

In [17]:
X, y = users.drop("target", axis=1), users["target"]
X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

In [36]:
def print_scores(folds_scores, train_scores):
    print(f"Train score by each fold: {train_scores}")
    print(f"Valid score by each fold: {folds_scores}")
    print(f"Train mean score by each fold:{np.mean(train_scores):.5f} +/- {np.std(train_scores):.5f}")
    print(f"Valid mean score by each fold:{np.mean(folds_scores):.5f} +/- {np.std(folds_scores):.5f}")
    print("*" * 50)
    
def create_bootstrap_samples(data: np.array, n_samples: int = 1000) -> np.array:
    bootstrap_idx = np.random.randint(
        low=0, high=len(data), size=(n_samples, len(data))
    )
    return bootstrap_idx


def create_bootstrap_metrics(y_true: np.array,
                             y_pred: np.array,
                             metric: callable,
                             n_samlpes: int = 1000) -> list:
    scores = []

    if isinstance(y_true, pd.Series):
        y_true = y_true.values

    bootstrap_idx = create_bootstrap_samples(y_true)
    for idx in bootstrap_idx:
        y_true_bootstrap = y_true[idx]
        y_pred_bootstrap = y_pred[idx]

        score = metric(y_true_bootstrap, y_pred_bootstrap)
        scores.append(score)

    return scores


def calculate_confidence_interval(scores: list, conf_interval: float = 0.95) -> tuple:
    left_bound = np.percentile(
        scores, ((1 - conf_interval) / 2) * 100
    )
    right_bound = np.percentile(
        scores, (conf_interval + ((1 - conf_interval) / 2)) * 100
    )

    return left_bound, right_bound

def _predict(estimator, x_valid, probas=True):
    if hasattr(estimator, "predict_proba") and probas:
        y_pred = estimator.predict_proba(x_valid)[:, 1]
    else:
        y_pred = estimator.predict(x_valid)

    return y_pred

def calculate_permutation_importance(estimator,
                                     metric: callable,
                                     x_valid: pd.DataFrame,
                                     y_valid: pd.DataFrame,
                                     maximize: bool = True,
                                     probas: bool = False
                                     ) -> pd.Series:
    y_pred = _predict(estimator, x_valid, probas)
    base_score = metric(y_valid, y_pred)
    scores, delta = {}, {}

    for feature in tqdm(x_valid.columns):
        x_valid_ = x_valid.copy(deep=True)
        np.random.seed(42)
        x_valid_[feature] = np.random.permutation(x_valid_[feature])

        y_pred = _predict(estimator, x_valid_, probas)
        feature_score = metric(y_valid, y_pred)

        if maximize:
            delta[feature] = base_score - feature_score
        else:
            delta[feature] = feature_score - base_score

        scores[feature] = feature_score

    scores, delta = pd.Series(scores), pd.Series(delta)
    scores = scores.sort_values(ascending=False)
    delta = delta.sort_values(ascending=False)

    return scores, delta
    
def catboost_cross_validation(X: pd.DataFrame,
                              y: pd.Series,
                              params: dict = None,
                              cv=None,
                              categorical: list = None,
                              textual: list = None,
                              rounds: int = 50,
                              verbose: bool = True,
                              preprocess: object = None,
                              score_fn: callable = roc_auc_score,
                              calculate_ci: bool = False,
                              n_samples: int = 1000,
                              confidence: float = 0.95,
                              seed: int = 42):

    minor_class_counts = y.value_counts(normalize=True).values[-1]

    if cv is None:
        if minor_class_counts >= 0.05:
            cv = KFold(n_splits=5, shuffle=True, random_state=seed)
        else:
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    if params is None:
        if len(X) <= 50_000:
            sub_params = {
                "grow_policy": "SymmetricTree",
                "boosting_type": "Ordered",
                "score_function": "Cosine",
                "depth": 6,
            }
        else:
            sub_params = {
                "grow_policy": "Lossguide",
                "boosting_type": "Plain",
                "score_function": "L2",
                "depth": 16,
                "min_data_in_leaf": 200,
                "max_leaves": 2**16 // 8,
            }
        params = {
            "iterations": 1000,
            "learning_rate": 0.01,
            "loss_function": "Logloss",
            "eval_metric": "AUC",
            "task_type": "CPU",
            "use_best_model": True,
            "thread_count": -1,
            "silent": True,
            "random_seed": seed,
            "allow_writing_files": False,
            "auto_class_weights": "SqrtBalanced" if minor_class_counts < 0.05 else None,
            "bagging_temperature": 1,
            "max_bin": 255,
            "l2_leaf_reg": 10,
            "subsample": 0.9,
            "bootstrap_type": "MVS",
            "colsample_bylevel": 0.9,
        }
        params.update(sub_params)

    prediction_type = "Probability" if score_fn.__name__ == "roc_auc_score" else "Class"

    estimators, folds_scores, train_scores = [], [], []

    oof_preds = np.zeros(X.shape[0])

    if verbose:
        print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
        print("Estimating best number of trees.")

    best_iterations = []

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        if preprocess is not None:
            x_train = preprocess.fit_transform(x_train, y_train)
            x_valid = preprocess.transform(x_valid)

        train_pool = Pool(x_train, y_train, cat_features=categorical, text_features=textual)
        valid_pool = Pool(x_valid, y_valid, cat_features=categorical, text_features=textual)

        model = CatBoostClassifier(**params).fit(
            train_pool,
            eval_set=valid_pool,
            early_stopping_rounds=rounds
            )

        best_iterations.append(model.get_best_iteration())

    best_iteration = int(np.median(best_iterations))  # int(np.mean(best_iterations))
    params["iterations"] = best_iteration

    cv.random_state = seed % 3
    if verbose:
        print(f"Evaluating cross validation with {best_iteration} trees.")

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        if preprocess is not None:
            x_train = preprocess.fit_transform(x_train, y_train)
            x_valid = preprocess.transform(x_valid)

        train_pool = Pool(x_train, y_train, cat_features=categorical, text_features=textual)
        valid_pool = Pool(x_valid, y_valid, cat_features=categorical, text_features=textual)

        model = CatBoostClassifier(**params).fit(
            train_pool,
            eval_set=valid_pool,
            )

        train_score = catboost.CatBoost.predict(model, train_pool, prediction_type=prediction_type)
        if prediction_type == "Probability":
            train_score = train_score[:, 1]
        train_score = score_fn(y_train, train_score)

        valid_scores = catboost.CatBoost.predict(model, valid_pool, prediction_type=prediction_type)
        if prediction_type == "Probability":
            valid_scores = valid_scores[:, 1]

        oof_preds[valid_idx] = valid_scores
        score = score_fn(y_valid, oof_preds[valid_idx])

        folds_scores.append(round(score, 5))
        train_scores.append(round(train_score, 5))

        if verbose:
            print(f"Fold {fold + 1}, Train score = {train_score:.5f}, Valid score = {score:.5f}")
        estimators.append(model)

    if verbose:
        oof_scores = score_fn(y, oof_preds)
        print_scores(folds_scores, train_scores)
        print(f"OOF-score {score_fn.__name__}: {oof_scores:.5f}")
        if calculate_ci:
            bootstrap_scores = create_bootstrap_metrics(y, oof_preds, score_fn, n_samlpes=n_samples)
            left_bound, right_bound = calculate_confidence_interval(bootstrap_scores, conf_interval=confidence)
            print(f"Expected metric value lies between: {left_bound:.5f} and {right_bound:.5f}",
                  f"with confidence of {confidence*100}%")

    return estimators, oof_preds, np.mean(folds_scores)

In [19]:
y.value_counts(normalize=True)

0    0.564255
1    0.435745
Name: target, dtype: float64

In [22]:
cb_params = {
            "grow_policy": "SymmetricTree",
            "boosting_type": "Ordered",
            "score_function": "Cosine",
            "depth": 3,
            "iterations": 1000,
            "learning_rate": 0.1,
            "loss_function": "Logloss",
            "eval_metric": "AUC",
            "task_type": "CPU",
            "use_best_model": True,
            "thread_count": -1,
            "silent": True,
            "random_seed": 42,
            "allow_writing_files": False,
            "auto_class_weights": None,
            "bagging_temperature": 1,
            "max_bin": 16,  # tune
            "l2_leaf_reg": 69,
            "subsample": 0.8,
            "bootstrap_type": "MVS",
            "colsample_bylevel": 0.6,
            # "max_ctr_complexity": 4,
            # "random_strength": 0.8,
}

categorical = [col for col in users.columns if users[col].dtype == "object" and col != "Text"]
textual = ["Text",]
cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)

_, _, _ = catboost_cross_validation(X, y, 
                                    params=cb_params, 
                                    rounds=50, 
                                    cv=cv, 
                                    calculate_ci=True, 
                                    categorical=categorical,
                                    textual=textual,
                                    preprocess=None,
                                    seed=42)

Sat Jan 28 23:06:26 2023, Cross-Validation, 3027 rows, 48 cols
Estimating best number of trees.
Evaluating cross validation with 29 trees.
Fold 1, Train score = 0.75795, Valid score = 0.74624
Fold 2, Train score = 0.75646, Valid score = 0.76321
Fold 3, Train score = 0.76219, Valid score = 0.73045
Fold 4, Train score = 0.74973, Valid score = 0.78420
Fold 5, Train score = 0.75877, Valid score = 0.73804
Fold 6, Train score = 0.76271, Valid score = 0.71709
Train score by each fold: [0.75795, 0.75646, 0.76219, 0.74973, 0.75877, 0.76271]
Valid score by each fold: [0.74624, 0.76321, 0.73045, 0.7842, 0.73804, 0.71709]
Train mean score by each fold:0.75797 +/- 0.00430
Valid mean score by each fold:0.74654 +/- 0.02196
**************************************************
OOF-score roc_auc_score: 0.74420
Expected metric value lies between: 0.72813 and 0.76060 with confidence of 95.0%


In [28]:
for fold, (train_idx, valid_idx) in enumerate(StratifiedKFold(n_splits=6, shuffle=True, random_state=42).split(X, y)):
    if fold == 3:
        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

In [37]:
scores, deltas = calculate_permutation_importance(CatBoostClassifier(**cb_params).fit(X.loc[train_idx], 
                                                                                      y[train_idx],
                                                                                      eval_set=[(X.loc[valid_idx], y[valid_idx]),],
                                                                                      cat_features=categorical, 
                                                                                      text_features=textual),
                                                 roc_auc_score,
                                                 X.loc[valid_idx],
                                                 y[valid_idx],
                                                 maximize=True,
                                                 probas=True
                                                 )

100%|██████████| 48/48 [00:01<00:00, 27.58it/s]


In [39]:
deltas = deltas[deltas>0].index.tolist()

In [73]:
cb_params = {
            "grow_policy": "SymmetricTree",
            "boosting_type": "Ordered",
            "score_function": "Cosine",
            "depth": 3,
            "iterations": 1000,
            "learning_rate": 0.01,
            "loss_function": "Logloss",
            "eval_metric": "AUC",
            "task_type": "CPU",
            "use_best_model": True,
            "thread_count": -1,
            "silent": True,
            "random_seed": 42,
            "allow_writing_files": False,
            #"auto_class_weights": None,
            #"bagging_temperature": 0,
            "max_bin": 16,  # tune
            "l2_leaf_reg": 100,
            "subsample": 1,
            "bootstrap_type": "MVS",
            "colsample_bylevel": 0.9,
            # "max_ctr_complexity": 4,
            "random_strength": 8,
}

categorical = [col for col in deltas if users[col].dtype == "object" and col != "Text"]

cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)

_, _, _ = catboost_cross_validation(X[deltas], y, 
                                    params=cb_params, 
                                    rounds=50, 
                                    cv=cv, 
                                    score_fn=accuracy_score,  # fine balance
                                    calculate_ci=True, 
                                    categorical=categorical,
                                    textual=None,
                                    preprocess=None,
                                    seed=42)

Sat Jan 28 23:26:41 2023, Cross-Validation, 3027 rows, 13 cols
Estimating best number of trees.
Evaluating cross validation with 40 trees.
Fold 1, Train score = 0.68953, Valid score = 0.69109
Fold 2, Train score = 0.68795, Valid score = 0.68317
Fold 3, Train score = 0.69112, Valid score = 0.67525
Fold 4, Train score = 0.68569, Valid score = 0.70040
Fold 5, Train score = 0.68648, Valid score = 0.68849
Fold 6, Train score = 0.68847, Valid score = 0.67262
Train score by each fold: [0.68953, 0.68795, 0.69112, 0.68569, 0.68648, 0.68847]
Valid score by each fold: [0.69109, 0.68317, 0.67525, 0.7004, 0.68849, 0.67262]
Train mean score by each fold:0.68821 +/- 0.00181
Valid mean score by each fold:0.68517 +/- 0.00947
**************************************************
OOF-score accuracy_score: 0.68517
Expected metric value lies between: 0.66798 and 0.70268 with confidence of 95.0%
