In [1]:
import datetime
import gc
import pathlib
import random
from typing import Dict, List, Tuple, Union
import sys

from catboost import CatBoostClassifier, Pool
from joblib import dump
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.model_selection import StratifiedGroupKFold, TimeSeriesSplit
from sklearn.preprocessing import OrdinalEncoder

sys.path.append('..')

from scripts.evaluate import Evaluator
from scripts.get_depth_paths import get_depth_paths
from scripts.get_logger import get_logger
from scripts.merge_dataset import merge_dataset


gc.enable()

In [2]:
class PathHandler:
    """Filesystem locations used throughout the notebook.

    All attributes are class-level and are evaluated once, when the class body
    runs. Running the class body also creates `output_dir` on disk.
    """

    # Input locations (relative to the notebook's working directory).
    competition_dir = pathlib.Path('../../inputs')
    parquet_files_dir = competition_dir.joinpath('parquet_files')
    # Directory holding the feature-list CSV files read by the notebook.
    feature_dir = pathlib.Path('../../outputs/features')
    # Timestamp captured at definition time; it names the run's output directory.
    now_time = datetime.datetime.now()
    # Directory name is `model_outputs_<YYYY-MM-DD>-<HH>-<MM>` (minute resolution).
    # `train` derives the model-file suffix from the last four '-'-separated
    # parts of this name, so keep the format stable.
    output_dir = pathlib.Path(
        f'../../outputs/output_catboost/model_outputs_{now_time.date()}-{now_time.hour:02}-{now_time.minute:02}'
    )
    # exist_ok=True makes creation idempotent and avoids the check-then-create
    # race of a separate is_dir() test.
    output_dir.mkdir(parents=True, exist_ok=True)


paths = PathHandler()

### Load the feature lists and their cast types
- Boolean features are cast to `pl.Int8` because they can contain `None` and NaN values.

In [3]:
def _read_feature_list(file_name, cast_dtype):
    """Read one feature-list CSV from `paths.feature_dir` and tag it with `cast_dtype`.

    The dtype is stored in a new `cast_dtype` column (same value on every row).
    """
    table = pd.read_csv(paths.feature_dir.joinpath(file_name))
    table['cast_dtype'] = cast_dtype
    return table


# One table per source dtype, each paired with the polars dtype it is cast to.
bool_features = _read_feature_list('bool_features.csv', pl.Int8)
float64_features = _read_feature_list('float64_features.csv', pl.Float32)
string_features = _read_feature_list('string_features.csv', pl.String)
date_features = _read_feature_list('date_features.csv', pl.Date)

# The useful-feature list is optional: fall back to None when the file is absent.
useful_features_path = paths.feature_dir.joinpath('useful_features.csv')
useful_features = pd.read_csv(useful_features_path) if useful_features_path.is_file() else None
display(useful_features)

None

In [4]:
# Collect the parquet paths of the 'train' split; the helper returns a mapping
# whose values are sized collections of paths.
train_depth_paths = get_depth_paths(paths.parquet_files_dir, 'train')
# These are the *train* paths, so the message must say so (it previously said "test").
print(f'number of train paths: {sum(len(v1) for v1 in train_depth_paths.values())}')

number of test paths: 31


In [5]:
# Read the base table and pin the dtype of every column in a single chained expression.
train_base_data = (
    pl.read_parquet(paths.parquet_files_dir.joinpath('train/train_base.parquet'))
    .cast(
        {
            'case_id': pl.Int64,
            'date_decision': pl.String,
            'MONTH': pl.Int64,
            'WEEK_NUM': pl.Int64,
            'target': pl.Int64,
        }
    )
)
display(train_base_data)

case_id,date_decision,MONTH,WEEK_NUM,target
i64,str,i64,i64,i64
0,"""2019-01-03""",201901,0,0
1,"""2019-01-03""",201901,0,0
2,"""2019-01-04""",201901,0,0
3,"""2019-01-03""",201901,0,0
4,"""2019-01-04""",201901,0,1
…,…,…,…,…
2703450,"""2020-10-05""",202010,91,0
2703451,"""2020-10-05""",202010,91,0
2703452,"""2020-10-05""",202010,91,0
2703453,"""2020-10-05""",202010,91,0


In [6]:
%%time


# Build the modelling table from the base table, the depth paths and the feature lists.
# NOTE(review): '012' presumably selects which depth levels are merged - confirm
# against scripts.merge_dataset.
depth_data = merge_dataset(
    train_base_data, train_depth_paths,
    bool_features, float64_features, string_features, date_features,
    useful_features, '012',
)
# Show the merged frame followed by its dtype distribution.
display(depth_data, depth_data.dtypes.value_counts())

loading `static_0`
	(1003757, 168)
	(522902, 168)
loading `static_cb_0`
	(1500476, 53)
loading `applprev_1`
	(782997, 92)
	(438525, 92)
loading `other_1`
	(51109, 21)
loading `tax_registry_a_1`
	(457934, 7)
loading `tax_registry_b_1`
	(150732, 7)
loading `tax_registry_c_1`
	(482265, 7)
loading `credit_bureau_a_1`
	(335275, 240)
	(549263, 240)
	(325127, 240)
	(176608, 240)
loading `credit_bureau_b_1`
	(36500, 134)
loading `deposit_1`
	(105111, 7)
loading `person_1`
	(1526659, 46)
loading `debitcard_1`
	(111772, 14)
loading `applprev_2`
	(1221522, 4)
loading `person_2`
	(1435105, 9)
loading `credit_bureau_a_2`
	(98303, 127)
	(118481, 127)
	(23734, 127)
	(156749, 127)
	(190486, 127)
	(190313, 127)
	(231250, 127)
	(150426, 127)
	(45056, 127)
	(77457, 127)
	(103033, 127)
loading `credit_bureau_b_2`
	(36447, 26)


Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,day_first_contractmaturitydate_151D_1,day_first_lastupdate_260D_1,day_first_contractenddate_991D_1,day_first_openingdate_313D_1,day_first_birth_259D_1,day_first_birthdate_87D_1,day_first_empl_employedfrom_271D_1,day_first_openingdate_857D_1,day_first_first_empls_employedfrom_796D_2,day_first_first_pmts_date_1107D_2
0,0,2019-01-03,201901,0,0,,,1917.599976,0.000000,0.0,...,,,,,1.0,,15.0,,,
1,1,2019-01-03,201901,0,0,,,3134.000000,0.000000,0.0,...,,,,,1.0,,29.0,,,
2,2,2019-01-04,201901,0,0,,,4937.000000,0.000000,0.0,...,,,,,1.0,,15.0,,,
3,3,2019-01-03,201901,0,0,,,4643.600098,0.000000,0.0,...,,,,,1.0,,15.0,,,
4,4,2019-01-04,201901,0,1,,,3390.199951,0.000000,0.0,...,,,,,1.0,,15.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,2020-10-05,202010,91,0,0.0,176561.359375,3675.400146,0.000000,0.0,...,,,,,1.0,,,,,
1526655,2703451,2020-10-05,202010,91,0,0.0,301276.468750,7088.600098,6191.600098,0.0,...,,,,,1.0,,,,,
1526656,2703452,2020-10-05,202010,91,0,0.0,14232.400391,7788.800293,0.000000,0.0,...,,,,,1.0,,,,,
1526657,2703453,2020-10-05,202010,91,0,0.0,197371.578125,1195.400024,2827.199951,0.0,...,,,28.0,29.0,1.0,,,29.0,,


float32    955
object     100
int8         6
int64        4
Name: count, dtype: int64

CPU times: user 17min 41s, sys: 42.2 s, total: 18min 23s
Wall time: 1min 10s


### Drop columns with many NaNs (NaN ratio of 0.8 or more)

In [7]:
# Share of missing values in every column.
ratio_nan = depth_data.isna().sum().div(len(depth_data))
display(ratio_nan)

# Keep only the columns whose missing share is strictly below 0.8.
kept_columns = ratio_nan.index[ratio_nan < 0.8]
depth_data = depth_data[kept_columns]
display(depth_data)

case_id                                      0.000000
date_decision                                0.000000
MONTH                                        0.000000
WEEK_NUM                                     0.000000
target                                       0.000000
                                               ...   
day_first_birthdate_87D_1                    0.991840
day_first_empl_employedfrom_271D_1           0.628797
day_first_openingdate_857D_1                 0.932508
day_first_first_empls_employedfrom_796D_2    0.998203
day_first_first_pmts_date_1107D_2            0.976126
Length: 1065, dtype: float64

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target,actualdpdtolerance_344P_0,amtinstpaidbefduel24m_4187115A_0,annuity_780A_0,annuitynextmonth_57A_0,applicationcnt_361L_0,...,day_first_dateofrealrepmt_138D_1,day_first_lastupdate_1112D_1,day_first_lastupdate_388D_1,day_first_numberofoverdueinstlmaxdat_148D_1,day_first_numberofoverdueinstlmaxdat_641D_1,day_first_overdueamountmax2date_1002D_1,day_first_overdueamountmax2date_1142D_1,day_first_refreshdate_3813885D_1,day_first_birth_259D_1,day_first_empl_employedfrom_271D_1
0,0,2019-01-03,201901,0,0,,,1917.599976,0.000000,0.0,...,,,,,,,,,1.0,15.0
1,1,2019-01-03,201901,0,0,,,3134.000000,0.000000,0.0,...,,,,,,,,,1.0,29.0
2,2,2019-01-04,201901,0,0,,,4937.000000,0.000000,0.0,...,,,,,,,,,1.0,15.0
3,3,2019-01-03,201901,0,0,,,4643.600098,0.000000,0.0,...,,,,,,,,,1.0,15.0
4,4,2019-01-04,201901,0,1,,,3390.199951,0.000000,0.0,...,,,,,,,,,1.0,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,2020-10-05,202010,91,0,0.0,176561.359375,3675.400146,0.000000,0.0,...,27.0,8.0,25.0,5.0,,,,17.0,1.0,
1526655,2703451,2020-10-05,202010,91,0,0.0,301276.468750,7088.600098,6191.600098,0.0,...,18.0,8.0,18.0,,,,,17.0,1.0,
1526656,2703452,2020-10-05,202010,91,0,0.0,14232.400391,7788.800293,0.000000,0.0,...,23.0,30.0,9.0,,,,,17.0,1.0,
1526657,2703453,2020-10-05,202010,91,0,0.0,197371.578125,1195.400024,2827.199951,0.0,...,10.0,8.0,1.0,,,,,17.0,1.0,


### select features

In [8]:
# if paths.feature_dir.joinpath('selected_features.csv').is_file():
#     selected_features = pd.read_csv(paths.feature_dir.joinpath('selected_features.csv'))
#     selected_features = selected_features['processed_Variable'].to_list()
# else:
#     selected_features = []

# depth_data = depth_data[['case_id', 'date_decision', 'MONTH', 'WEEK_NUM', 'target'] + selected_features]
# display(depth_data)

### training

In [9]:
class CFG:
    """Run settings and CatBoost hyper-parameters for one training run.

    Every constructor argument is stored unchanged as an instance attribute of
    the same name; `train` reads these attributes to build the model parameters.
    """

    def __init__(
            self,
            outputs_dir: pathlib.Path,
            n_splits: int = 5,
            lr: float = 0.1,
            depth: int = 6,
            iterations: int = 1000,
            loss_function: str = 'Logloss',
            eval_metric: str = 'Logloss',
            subsample: float = 0.66,
            rsm: float = 1.0,
            l2_leaf_reg: float = 3.0,
            min_data_in_leaf: int = 1,
            bootstrap_type: str = 'MVS',
            seed: int = 42,
            task_type: str = 'cpu',
            debag: bool = False,
        ):
        """Store the settings.

        Args:
            outputs_dir: Directory that receives models and other run artefacts.
            n_splits: Expected number of folds.
            lr: Passed to CatBoost as `learning_rate`.
            depth: Passed to CatBoost as `depth`.
            iterations: Passed to CatBoost as `iterations`.
            loss_function: Passed to CatBoost as `loss_function`.
            eval_metric: Passed to CatBoost as `eval_metric`.
            subsample: Passed to CatBoost as `subsample` (a fraction, hence float).
            rsm: Passed to CatBoost as `rsm`.
            l2_leaf_reg: Passed to CatBoost as `l2_leaf_reg`.
            min_data_in_leaf: Passed to CatBoost as `min_data_in_leaf`.
            bootstrap_type: Passed to CatBoost as `bootstrap_type`.
            seed: Seed for `seed_everything`; also passed to CatBoost as `random_seed`.
            task_type: Passed to CatBoost as `task_type`.
                NOTE(review): the notebook passes 'CPU'; confirm that the
                lowercase default is accepted by CatBoost.
            debag: Debug switch (name kept as-is for compatibility); the
                notebook trains on the first 50000 rows when it is true.
        """
        self.outputs_dir = outputs_dir
        self.n_splits = n_splits
        self.lr = lr
        self.depth = depth
        self.iterations = iterations
        self.loss_function = loss_function
        self.eval_metric = eval_metric
        self.subsample = subsample
        self.rsm = rsm
        self.l2_leaf_reg = l2_leaf_reg
        self.min_data_in_leaf = min_data_in_leaf
        self.bootstrap_type = bootstrap_type
        self.seed = seed
        self.task_type = task_type
        self.debag = debag

    def seed_everything(self):
        """Seed Python's `random` module and NumPy's global RNG with `self.seed`."""
        random.seed(self.seed)
        np.random.seed(self.seed)

In [10]:
def train(config, logger, X: pd.DataFrame, step: int = 16) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Train one CatBoost model per rolling WEEK_NUM window and collect out-of-fold predictions.

    Fold `k` trains on the weeks `[start, start + step)` and validates on the
    following `step` weeks, where `start` advances by `step` from the smallest
    WEEK_NUM. Each fitted model is saved to `config.outputs_dir`.

    Args:
        config: Settings object exposing the attributes read below
            (`outputs_dir`, `n_splits`, `depth`, `iterations`, `loss_function`,
            `eval_metric`, `lr`, `seed`, `subsample`, `rsm`, `l2_leaf_reg`,
            `min_data_in_leaf`, `bootstrap_type`, `task_type`).
        logger: Object with an `info(message)` method.
        X: Frame containing the columns `case_id`, `date_decision`, `MONTH`,
            `WEEK_NUM`, `target` plus the feature columns. It is NOT modified.
        step: Width, in weeks, of the training and of the validation window.

    Returns:
        `(oof, fimps)`. `oof` holds the five meta columns plus `probability`
        and `fold`; rows never used for validation keep -1 in both. `fimps`
        is indexed by feature name with one `fold_<k>_fimp` column per fold.

    Raises:
        ValueError: If `step` is not positive.
    """
    if step <= 0:
        raise ValueError(f'step must be a positive number of weeks, got {step}')

    meta_columns = ['case_id', 'date_decision', 'MONTH', 'WEEK_NUM', 'target']

    y = X['target'].copy()
    week_values = X['WEEK_NUM'].to_numpy()
    oof = X[meta_columns].copy()
    oof_probas = -1 * np.ones(len(oof))
    folds = -1 * np.ones(len(oof))

    # Work on new frames instead of dropping/filling in place: the caller's
    # frame stays intact, so the training cell can be re-run. The price is one
    # extra copy of the feature matrix while this function runs.
    features = X.drop(columns=meta_columns)
    categorical_features = features.columns[features.dtypes == object].to_list()
    # Missing categorical values become the literal category 'none'.
    features = features.fillna({col: 'none' for col in categorical_features})
    print(categorical_features)
    display(features.dtypes.value_counts())
    display(oof)

    logger.info(f'number of features: {features.shape}')

    # Persist the feature order used for training.
    np.save(config.outputs_dir.joinpath('training_features.npy'), features.columns.to_numpy())

    model_params = {
        'depth': config.depth,
        'iterations': config.iterations,
        'loss_function': config.loss_function,
        'eval_metric': config.eval_metric,
        'learning_rate': config.lr,
        'random_seed': config.seed,
        'subsample': config.subsample,
        'rsm': config.rsm,
        'l2_leaf_reg': config.l2_leaf_reg,
        'min_data_in_leaf': config.min_data_in_leaf,
        'bootstrap_type': config.bootstrap_type,
        'task_type': config.task_type,
    }
    if config.bootstrap_type == 'Bayesian':
        # These two settings are not sent to CatBoost with Bayesian bootstrap.
        # NOTE(review): `subsample` is documented as unsupported with Bayesian
        # bootstrap; confirm that `rsm` needs to be removed as well.
        del model_params['subsample']
        del model_params['rsm']

    min_weeks = week_values.min().item()
    max_weeks = week_values.max().item()

    # A window starting at `s` has validation weeks iff `s + step <= max_weeks`;
    # the `+ 1` keeps the boundary case so the last week is not silently skipped.
    fold_starts = np.arange(min_weeks, max_weeks - step + 1, step)
    n_folds = len(fold_starts)
    if n_folds != config.n_splits:
        logger.info(
            f'n_splits={config.n_splits} differs from the {n_folds} folds '
            f'produced by the week windows (step={step})'
        )

    # One importance column per fold that is actually trained; float dtype so
    # that later mean/std aggregations operate on numbers, not objects.
    fimps = pd.DataFrame(
        index=features.columns,
        columns=[f'fold_{k+1}_fimp' for k in range(n_folds)],
        dtype=float,
    )

    # Suffix for the model files: the last four '-'-separated parts of the
    # output directory name (loop-invariant, so computed once).
    date = ''.join(config.outputs_dir.stem.split('-')[-4:])

    for k, train_start_week in enumerate(fold_starts):

        train_end_week = train_start_week + step
        valid_start_week = train_end_week
        valid_end_week = valid_start_week + step

        # Positional row numbers, so `.iloc` and the NumPy buffers below stay
        # correct even when `X` does not carry a default 0..n-1 index.
        train_indices = np.flatnonzero(
            (week_values >= train_start_week) & (week_values < train_end_week)
        )
        valid_indices = np.flatnonzero(
            (week_values >= valid_start_week) & (week_values < valid_end_week)
        )

        logger.info(f'fold: {k+1}')
        logger.info(f'# of train: {len(train_indices)}')
        logger.info(f'# of valid: {len(valid_indices)}')

        logger.info(f'train WEEK_NUM: {pd.unique(week_values[train_indices])}')
        logger.info(f'valid WEEK_NUM: {pd.unique(week_values[valid_indices])}')

        train_X = features.iloc[train_indices]
        valid_X = features.iloc[valid_indices]

        train_y = y.iloc[train_indices]
        valid_y = y.iloc[valid_indices]

        train_pool = Pool(data=train_X, label=train_y, cat_features=categorical_features)
        valid_pool = Pool(data=valid_X, label=valid_y, cat_features=categorical_features)

        model = CatBoostClassifier(**model_params)
        model.fit(
            train_pool,
            eval_set=valid_pool,
            use_best_model=True,
            early_stopping_rounds=100,
            verbose=100,
        )

        model.save_model(
            config.outputs_dir.joinpath(f'catboost_fold{k+1}_{date}.cbm')
        )

        fimps.loc[train_X.columns, f'fold_{k+1}_fimp'] = model.get_feature_importance(
            valid_pool, type='PredictionValuesChange'
        )

        # Column 1 of predict_proba is kept as the row's probability.
        oof_probas[valid_indices] = model.predict_proba(valid_X)[:, 1]
        folds[valid_indices] = k

        # Release the fold's data before building the next one.
        del train_X, train_y, valid_X, valid_y
        del train_pool, valid_pool
        del model
        gc.collect()

    oof['probability'] = oof_probas
    oof['fold'] = folds

    del oof_probas, folds
    gc.collect()

    logger.info('Done!\n')

    return oof, fimps

In [11]:
%%time


# Settings for this run; every keyword matches a CFG constructor argument.
config = CFG(
    outputs_dir=paths.output_dir,
    task_type='CPU',
    seed=42,
    debag=False,
    n_splits=5,
    iterations=10000,
    lr=0.05,
    loss_function='Logloss',
    eval_metric='NormalizedGini',
    bootstrap_type='Bayesian',
    subsample=1.0,
    rsm=1.0,
    min_data_in_leaf=100,
)
config.seed_everything()

logger = get_logger(paths.output_dir.joinpath('output.log'))

# Record every setting, one `name = value` line each, in attribute order.
log = [f'{name} = {value}' for name, value in vars(config).items() if not name.startswith('__')]
logger.info('\n'.join(log))
logger.info('\n')

# In debug mode only the first 50000 rows are used.
oof, fimps = train(config, logger, depth_data.iloc[:50000] if config.debag else depth_data)

2024-04-26 11:15:14,407 scripts.get_logger:21 <module> [INFO]:
outputs_dir = ../../outputs/output_catboost/model_outputs_2024-04-26-11-14
n_splits = 5
lr = 0.05
depth = 6
iterations = 10000
loss_function = Logloss
eval_metric = NormalizedGini
subsample = 1.0
rsm = 1.0
l2_leaf_reg = 3.0
min_data_in_leaf = 100
bootstrap_type = Bayesian
seed = 42
task_type = CPU
debag = False
2024-04-26 11:15:14,407 scripts.get_logger:22 <module> [INFO]:




['bankacctype_710L_0', 'cardtype_51L_0', 'credtype_322L_0', 'disbursementtype_67L_0', 'inittransactioncode_186L_0', 'lastapprcommoditycat_1041M_0', 'lastapprcommoditytypec_5251766M_0', 'lastcancelreason_561M_0', 'lastrejectcommoditycat_161M_0', 'lastrejectcommodtypec_5251769M_0', 'lastrejectreason_759M_0', 'lastrejectreasonclient_4145040M_0', 'lastst_736L_0', 'paytype1st_925L_0', 'paytype_783L_0', 'previouscontdistrict_112M_0', 'twobodfilling_608L_0', 'typesuite_864L_0', 'description_5085714M_0', 'education_1103M_0', 'education_88M_0', 'maritalst_385M_0', 'maritalst_893M_0', 'requesttype_4525192L_0', 'riskassesment_302T_0', 'first_cancelreason_3545846M_1', 'first_credacc_status_367L_1', 'first_credtype_587L_1', 'first_district_544M_1', 'first_education_1138M_1', 'first_familystate_726L_1', 'first_inittransactioncode_279L_1', 'first_postype_4733339M_1', 'first_profession_152M_1', 'first_rejectreason_755M_1', 'first_rejectreasonclient_4145042M_1', 'first_status_219L_1', 'first_name_45272

float32    611
object      88
int8         6
Name: count, dtype: int64

Unnamed: 0,case_id,date_decision,MONTH,WEEK_NUM,target
0,0,2019-01-03,201901,0,0
1,1,2019-01-03,201901,0,0
2,2,2019-01-04,201901,0,0
3,3,2019-01-03,201901,0,0
4,4,2019-01-04,201901,0,1
...,...,...,...,...,...
1526654,2703450,2020-10-05,202010,91,0
1526655,2703451,2020-10-05,202010,91,0
1526656,2703452,2020-10-05,202010,91,0
1526657,2703453,2020-10-05,202010,91,0


2024-04-26 11:15:19,440 scripts.get_logger:21 train [INFO]:
number of features: (1526659, 705)
2024-04-26 11:15:19,462 scripts.get_logger:62 train [INFO]:
fold: 1
2024-04-26 11:15:19,463 scripts.get_logger:63 train [INFO]:
# of train: 261113
2024-04-26 11:15:19,463 scripts.get_logger:64 train [INFO]:
# of valid: 311125
2024-04-26 11:15:19,468 scripts.get_logger:66 train [INFO]:
train WEEK_NUM: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
2024-04-26 11:15:19,473 scripts.get_logger:67 train [INFO]:
valid WEEK_NUM: [16 17 18 19 20 21 22 23 24 25 26 27 29 28 30 31]


0:	test: 0.0891859	best: 0.0891859 (0)	total: 619ms	remaining: 1h 43m 9s
100:	test: 0.6127121	best: 0.6127121 (100)	total: 44.5s	remaining: 1h 12m 45s
200:	test: 0.6363511	best: 0.6363511 (200)	total: 1m 29s	remaining: 1h 13m 3s
300:	test: 0.6472117	best: 0.6472117 (300)	total: 2m 14s	remaining: 1h 12m 23s
400:	test: 0.6524382	best: 0.6524628 (399)	total: 2m 59s	remaining: 1h 11m 32s
500:	test: 0.6565722	best: 0.6565722 (500)	total: 3m 44s	remaining: 1h 10m 47s
600:	test: 0.6590887	best: 0.6591129 (598)	total: 4m 30s	remaining: 1h 10m 23s
700:	test: 0.6617039	best: 0.6617052 (699)	total: 5m 16s	remaining: 1h 10m 4s
800:	test: 0.6632484	best: 0.6632484 (800)	total: 6m 3s	remaining: 1h 9m 34s
900:	test: 0.6653972	best: 0.6653972 (900)	total: 6m 49s	remaining: 1h 8m 57s
1000:	test: 0.6666628	best: 0.6666978 (998)	total: 7m 36s	remaining: 1h 8m 24s


In [None]:
display(oof)

# Rows that were never part of a validation window keep the -1 placeholder
# written by `train`; exclude them from the summary and the plot.
scored_oof = oof.query('probability != -1')
# Fixed: this line referenced the undefined name `oo` and raised NameError.
print(scored_oof['probability'].describe())

# Histogram of the predicted probabilities per target class, log-scaled counts.
_, ax = plt.subplots()
sns.histplot(data=scored_oof, x='probability', hue='target', bins=50, ax=ax)
ax.set_yscale('log')
plt.show()

In [None]:
# Aggregate over the per-fold columns only. Selecting them explicitly keeps
# `fimp_mean` out of the std computation (it used to be included) and makes the
# cell safe to re-run after the summary columns exist.
fold_columns = [col for col in fimps.columns if col.startswith('fold_')]
# The per-fold columns may have object dtype; convert before aggregating.
fold_importances = fimps[fold_columns].astype(float)
fimps['fimp_mean'] = fold_importances.mean(axis=1)
fimps['fimp_std'] = fold_importances.std(axis=1)
fimps = fimps.sort_values('fimp_mean')
display(fimps)

# 16 inches of height per 100 features, and never less than one unit so the
# figure height cannot become zero for fewer than 100 features.
_, ax = plt.subplots(figsize=(8, 16 * max(1, len(fimps) // 100)))
ax.barh(y=fimps.index, width=fimps['fimp_mean'], xerr=fimps['fimp_std'], capsize=3)
plt.tight_layout()
plt.show()

In [None]:
# Write the out-of-fold table without its row index.
oof.to_csv(paths.output_dir / 'oof.csv', index=False)

# Move the feature names from the index into a `processed_Variable` column, then export.
fimps = fimps.rename_axis(index='processed_Variable').reset_index()
fimps.to_csv(paths.output_dir / 'feature_importances.csv', index=False)

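$$
\mathrm{stability\ metric} = \mathrm{mean}(\mathrm{gini}) + 88.0 \cdot \min(0, a) - 0.5 \cdot \mathrm{std}(\mathrm{residuals})
$$

where $a$ is the slope of a linear fit to the weekly Gini scores and the residuals are those of that fit.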

In [None]:
# Evaluate only the rows that received a prediction (-1 marks unscored rows).
evaluater = Evaluator(oof.query('probability!=-1'))
gini_per_week, outcome = evaluater.plot_gini()
display(gini_per_week)
print(outcome)

# Log the three summary values, one `name: value` line each.
summary_lines = [
    f'{key}: {outcome[key].item()}\n'
    for key in ('stability', 'slope', 'intercept')
]
logger.info(''.join(summary_lines))

outcome.to_csv(paths.output_dir / 'outcome.csv', index=False)