In [None]:
import datetime
import gc
import os
import pathlib
import random
import sys
sys.path.append('..')
from typing import Any, Dict, List, Tuple, Union

from catboost import CatBoostClassifier, Pool
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import StratifiedGroupKFold

from scripts.evaluate import Evaluator
from scripts.get_logger import get_logger


# Make sure automatic garbage collection is switched on; the training loop
# below additionally calls gc.collect() explicitly after each fold.
gc.enable()

In [None]:
class PathHandler:
    """Filesystem locations used throughout this notebook.

    Every attribute is evaluated once, when the class body executes. The
    run-specific ``output_dir`` is stamped with the current date, hour and
    minute and is created on disk at that point if it does not exist yet.
    """

    # Input locations.
    competition_dir = pathlib.Path('../../inputs')
    parquet_files_dir = competition_dir / 'parquet_files'
    feature_dir = pathlib.Path('../../outputs/features')
    dataset_dir = pathlib.Path('../../dataset')

    # Timestamp captured at class-definition time; it names the output folder.
    now_time = datetime.datetime.now()
    output_dir = pathlib.Path('../../outputs/output_catboost/') / (
        f'model_outputs_{now_time.date()}-{now_time.hour:02}-{now_time.minute:02}'
    )
    # Create the folder (and any missing parents); a no-op when it already exists.
    output_dir.mkdir(parents=True, exist_ok=True)


paths = PathHandler()

### Load `depth_data`

In [None]:
# Load the prepared dataset and keep only the rows flagged as non-test.
# Chaining the filter and the column drop yields a fresh frame, so the column
# assignment further down does not operate on a slice of another frame (which
# is what makes pandas emit SettingWithCopyWarning).
depth_data = (
    pd.read_parquet(paths.dataset_dir.joinpath('depth_data.parquet'))
    .query('is_test==0')
    .drop(columns=['is_test'])
)

# NOTE(review): allow_pickle=True unpickles the file contents, which can run
# arbitrary code. Only load this file if it was produced by a trusted step.
categorical_features = list(np.load(paths.dataset_dir.joinpath('categorical_features.npy'), allow_pickle=True))

# Missing categorical values become the explicit level 'none' before the
# columns are converted to pandas' category dtype.
depth_data[categorical_features] = (
    depth_data[categorical_features]
    .fillna('none')
    .astype('category')
)

display(depth_data)
print(len(categorical_features))

display(depth_data.dtypes.value_counts())

In [None]:
class CFG:
    """Container for the cross-validation and CatBoost settings of one run.

    Args:
        outputs_dir: Directory where models and run artefacts are written.
        n_splits: Number of cross-validation folds.
        lr: Learning rate handed to CatBoost as ``learning_rate``.
        depth: Tree depth.
        iterations: Maximum number of boosting iterations.
        loss_function: CatBoost loss function name.
        eval_metric: CatBoost evaluation metric name.
        subsample: Row sampling rate (a fraction, hence a float).
        rsm: Value handed to CatBoost as ``rsm``.
        l2_leaf_reg: L2 regularisation coefficient.
        min_data_in_leaf: Minimum number of samples per leaf.
        seed: Seed used for the fold split, CatBoost and ``seed_everything``.
        task_type: Value handed to CatBoost as ``task_type``.
        thread_count: Number of threads handed to CatBoost.
        debag: Debug switch (the spelling is kept because callers pass it by
            keyword).
    """

    def __init__(
            self,
            outputs_dir: pathlib.Path,
            n_splits: int = 5,
            lr: float = 0.1,
            depth: int = 6,
            iterations: int = 1000,
            loss_function: str = 'Logloss',
            eval_metric: str = 'Logloss',
            subsample: float = 0.66,
            rsm: float = 1.0,
            l2_leaf_reg: float = 3.0,
            min_data_in_leaf: int = 1,
            seed: int = 42,
            # NOTE(review): the notebook passes 'CPU' explicitly; confirm that
            # CatBoost accepts this lowercase default before relying on it.
            task_type: str = 'cpu',
            thread_count: int = -1,
            debag: bool = False,
        ):

        self.outputs_dir = outputs_dir
        self.n_splits = n_splits
        self.lr = lr
        self.depth = depth
        self.iterations = iterations
        self.loss_function = loss_function
        self.eval_metric = eval_metric
        self.subsample = subsample
        self.rsm = rsm
        self.l2_leaf_reg = l2_leaf_reg
        self.min_data_in_leaf = min_data_in_leaf
        self.seed = seed
        self.task_type = task_type
        self.thread_count = thread_count
        self.debag = debag

    def seed_everything(self):
        """Seed Python's ``random`` module and NumPy's global RNG with ``self.seed``."""
        random.seed(self.seed)
        np.random.seed(self.seed)

In [None]:
def train(
        config: 'CFG',
        logger: object,
        X: pd.DataFrame,
        categorical_features: List[str],
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Train one CatBoost classifier per fold and collect out-of-fold predictions.

    Folds come from ``StratifiedGroupKFold``: stratified on ``target`` and
    grouped by ``WEEK_NUM``. Each fold's model is saved under
    ``config.outputs_dir``, together with the list of training features and
    the list of categorical features.

    Args:
        config: Run settings; read through attribute access (``config.n_splits``,
            ``config.seed``, ``config.outputs_dir`` and the CatBoost parameters).
        logger: Object exposing ``info(str)``.
        X: Frame containing the feature columns plus ``case_id``,
            ``date_decision``, ``MONTH``, ``WEEK_NUM`` and ``target``.
            NOTE: these five columns are dropped from ``X`` **in place**, so the
            caller's frame is modified and cannot be passed to ``train`` again.
        categorical_features: Names of the categorical feature columns.

    Returns:
        A tuple ``(oof, fimps)``. ``oof`` holds the five identifier/target
        columns plus ``probability`` (positive-class probability from the fold
        in which the row was validated) and ``fold`` (0-based fold number,
        stored as float). ``fimps`` is indexed by feature name with one
        ``fold_{k}_fimp`` column per fold.
    """
    meta_columns = ['case_id', 'date_decision', 'MONTH', 'WEEK_NUM', 'target']

    kfold = StratifiedGroupKFold(
        n_splits=config.n_splits,
        shuffle=True,
        random_state=config.seed,
    )

    y = X['target'].copy()
    # scikit-learn documents `groups` as array-like of shape (n_samples,), so
    # pass a 1-D Series rather than a one-column DataFrame.
    groups = X['WEEK_NUM'].copy()
    oof = X[meta_columns].copy()
    # -1 marks rows that have not been scored by any fold yet.
    oof_probas = np.full(len(oof), -1.0)
    folds = np.full(len(oof), -1.0)
    # Dropped in place on purpose: this avoids holding a second copy of the
    # feature matrix, at the cost of mutating the caller's frame.
    X.drop(columns=meta_columns, inplace=True)

    display(X.dtypes.value_counts())
    display(oof)

    logger.info(f'number of features: {X.shape}')

    np.save(config.outputs_dir.joinpath('training_features.npy'), X.columns.to_numpy())
    np.save(config.outputs_dir.joinpath('categorical_features.npy'), categorical_features)

    # Float dtype so that later numeric aggregation does not run on an
    # object-typed frame.
    fimps = pd.DataFrame(
        index=X.columns,
        columns=[f'fold_{k+1}_fimp' for k in range(config.n_splits)],
        dtype=float,
    )

    model_params = {
        'depth': config.depth,
        'iterations': config.iterations,
        'loss_function': config.loss_function,
        'eval_metric': config.eval_metric,
        'learning_rate': config.lr,
        'random_seed': config.seed,
        'subsample': config.subsample,
        'rsm': config.rsm,
        'l2_leaf_reg': config.l2_leaf_reg,
        'min_data_in_leaf': config.min_data_in_leaf,
        'task_type': config.task_type,
        'thread_count': config.thread_count,
    }

    # Suffix for model file names: the last four '-'-separated parts of the
    # output directory name, joined together. It is the same for every fold.
    date = ''.join(config.outputs_dir.stem.split('-')[-4:])

    for k, (train_indices, valid_indices) in enumerate(kfold.split(X=X, y=y, groups=groups)):

        logger.info(f'fold: {k+1}')
        logger.info(f'# of train: {len(train_indices)}')
        logger.info(f'# of valid: {len(valid_indices)}')

        logger.info(f'train WEEK_NUM: {oof.iloc[train_indices]["WEEK_NUM"].unique()}')
        logger.info(f'valid WEEK_NUM: {oof.iloc[valid_indices]["WEEK_NUM"].unique()}')

        train_X = X.iloc[train_indices]
        valid_X = X.iloc[valid_indices]

        train_y = y.iloc[train_indices]
        valid_y = y.iloc[valid_indices]

        train_pool = Pool(
            data=train_X,
            label=train_y,
            cat_features=categorical_features
        )
        valid_pool = Pool(
            data=valid_X,
            label=valid_y,
            cat_features=categorical_features
        )

        model = CatBoostClassifier(**model_params)
        # The validation pool drives both early stopping and best-iteration
        # selection.
        model.fit(
            train_pool,
            eval_set=valid_pool,
            use_best_model=True,
            early_stopping_rounds=100,
            verbose=100,
        )

        model.save_model(
            config.outputs_dir.joinpath(f'catboost_fold{k+1}_{date}.cbm')
        )

        fimps.loc[X.columns, f'fold_{k+1}_fimp'] = model.get_feature_importance(
            valid_pool,
            type='PredictionValuesChange'
        )

        # Column 1 of predict_proba is the positive-class probability.
        oof_probas[valid_indices] = model.predict_proba(valid_X)[:, 1]
        folds[valid_indices] = k

        # Release per-fold objects before the next fold allocates its own.
        del train_X, train_y, valid_X, valid_y
        del train_pool, valid_pool
        del model
        gc.collect()

    oof.loc[:, 'probability'] = oof_probas
    oof.loc[:, 'fold'] = folds

    del oof_probas, folds
    gc.collect()

    logger.info('Done!\n')

    return oof, fimps

In [None]:
%%time


# Use roughly 80% of the available cores. os.cpu_count() may return None when
# the count cannot be determined, and small machines would otherwise round
# down to 0, so fall back to / clamp at one thread.
n_threads = max(1, int((os.cpu_count() or 1) * 0.8))

config = CFG(
    outputs_dir=paths.output_dir,
    n_splits=5,
    lr=5e-02,
    iterations=10000,
    loss_function='Logloss',
    eval_metric='NormalizedGini',
    min_data_in_leaf=100,
    subsample=0.8,
    rsm=0.8,
    seed=42,
    l2_leaf_reg=10,
    task_type='CPU',
    thread_count=n_threads,
    debag=False,
)
config.seed_everything()

logger = get_logger(paths.output_dir.joinpath('output.log'))

# Record every configuration value at the top of the run log.
log = [
    f'{k} = {v}'
    for k, v in vars(config).items()
    if not k.startswith('__')
]
logger.info('\n'.join(log))
logger.info('\n')

if config.debag:
    # Debug mode trains on a small random subset; cap the sample size so that
    # frames with fewer than 10000 rows do not make sample() raise.
    depth_data = depth_data.sample(
        n=min(10000, len(depth_data)),
        random_state=config.seed,
    )
    depth_data = depth_data.reset_index(drop=True)

oof, fimps = train(config, logger, depth_data, categorical_features)

In [None]:
display(oof)

# Ignore rows whose probability is still the -1 placeholder value.
scored_oof = oof.query('probability != -1')
print(scored_oof['probability'].describe())

# Histogram of the predicted probabilities, split by target, on a log-scaled
# count axis.
_, ax = plt.subplots()
sns.histplot(data=scored_oof, x='probability', hue='target', bins=50, ax=ax)
ax.set_yscale('log')
plt.show()

In [None]:
# Aggregate over the per-fold columns only. Selecting them explicitly keeps the
# mean column out of the std computation and makes the cell safe to re-run.
fold_columns = [
    col for col in fimps.columns
    if col.startswith('fold_') and col.endswith('_fimp')
]
fold_importances = fimps[fold_columns].astype(float)

fimps['fimp_mean'] = fold_importances.mean(axis=1)
fimps['fimp_std'] = fold_importances.std(axis=1)
fimps = fimps.sort_values('fimp_mean')
display(fimps)

# 16 inches of height per full hundred features; at least one unit so that a
# model with fewer than 100 features does not request a zero-height figure.
fig_height = 16 * max(1, len(fimps) // 100)
_, ax = plt.subplots(figsize=(8, fig_height))
ax.barh(y=fimps.index, width=fimps['fimp_mean'], xerr=fimps['fimp_std'], capsize=3)
plt.tight_layout()
plt.show()

In [None]:
# Persist the out-of-fold predictions.
oof.to_csv(paths.output_dir / 'oof.csv', index=False)

# Move the index into a regular 'processed_Variable' column so that it is
# written to the CSV even though index=False.
fimps = fimps.reset_index(names=['processed_Variable'])
fimps.to_csv(paths.output_dir / 'feature_importances.csv', index=False)

In [None]:
# Evaluate only rows whose probability is not the -1 placeholder.
evaluater = Evaluator(oof.query('probability!=-1'))
gini_per_week, outcome = evaluater.plot_gini()
display(gini_per_week)
print(outcome)

# .item() requires each of these columns to hold exactly one value.
stability = outcome["stability"].item()
slope = outcome["slope"].item()
intercept = outcome["intercept"].item()
logger.info(f'stability: {stability}\nslope: {slope}\nintercept: {intercept}\n')

outcome.to_csv(paths.output_dir.joinpath('outcome.csv'), index=False)