In [1]:
import datetime
import gc
import os
import pathlib
import random
import sys
sys.path.append('..')
from typing import Dict, List, Tuple, Union

from joblib import load
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold, TimeSeriesSplit
from sklearn.preprocessing import OrdinalEncoder

from scripts.evaluate import Evaluator
from scripts.get_logger import get_logger


gc.enable()

In [2]:
class PathHandler:
    """Namespace holding the directories this notebook reads from and writes to.

    All paths are relative to the notebook's working directory. The class
    body runs once, at definition time, so ``now_time`` (and therefore
    ``output_dir``) is fixed when this cell executes, and the output
    directory is created as a side effect of defining the class.
    """

    # Input locations.
    competition_dir = pathlib.Path('../../inputs')
    parquet_files_dir = competition_dir.joinpath('parquet_files')
    feature_dir = pathlib.Path('../../outputs/features')
    # Timestamp used to give every run its own output directory.
    now_time = datetime.datetime.now()
    dataset_dir = pathlib.Path('../../dataset')
    # e.g. ../../outputs/output_lgb/model_outputs_<YYYY-MM-DD>-<HH>-<MM>
    output_dir = pathlib.Path(
        '../../outputs/output_lgb/'
        + f'model_outputs_{now_time.date()}-{now_time.hour:02}-{now_time.minute:02}'
    )
    # exist_ok=True replaces the separate is_dir() check: re-running the cell
    # within the same minute is a no-op and there is no check-then-create race.
    output_dir.mkdir(parents=True, exist_ok=True)


paths = PathHandler()

### load depth_data

In [3]:
# Read the prepared modelling table, keep only the rows flagged is_test == 0,
# and discard the flag column once it has been used for filtering.
depth_data = (
    pd.read_parquet(paths.dataset_dir / 'depth_data.parquet')
    .query('is_test==0')
    .drop(columns=['is_test'])
)

# Names of the columns that must be handled as categoricals.
# NOTE(review): allow_pickle=True unpickles the file's contents; only load
# files produced by this project's own pipeline.
categorical_features = list(
    np.load(paths.dataset_dir / 'categorical_features.npy', allow_pickle=True)
)

# Cast the categorical columns to pandas 'category' dtype.
depth_data[categorical_features] = depth_data[categorical_features].astype('category')

display(depth_data)
print(len(categorical_features))

# Overview of how many columns ended up with each dtype.
display(depth_data.dtypes.value_counts())

KeyboardInterrupt: 

In [None]:
class CFG:
    """Hyper-parameter bundle for one cross-validated LightGBM run.

    Each constructor argument is stored unchanged on the instance under the
    same name. Attributes are assigned in signature order, so iterating over
    ``__dict__`` lists them in that order.
    """

    def __init__(
            self,
            outputs_dir: pathlib.Path,
            n_splits: int = 5,
            boosting: str = 'gbdt',
            lr: float = 0.1,
            max_depth: int = -1,
            num_leaves: int = 31,
            min_data_in_leaf: int = 20,
            min_sum_hessian_in_leaf: float = 1e-03,
            bagging_fraction: float = 1.0,
            feature_fraction: float = 1.0,
            feature_fraction_bynode: float = 1.0,
            num_boost_round: int = 1000,
            lambda_l1: float = 0.0,
            lambda_l2: float = 0.0,
            seed: int = 42,
            device: str = 'cpu',
            num_threads: int = 0,
            is_unbalance: bool = False,
            drop_rate: float = 0.1,
            max_drop: int = 50,
            skip_drop: float = 0.5,
            extra_trees: bool = False,
            debag: bool = False,
        ):
        # Where artefacts go and how many CV folds to run.
        self.outputs_dir, self.n_splits = outputs_dir, n_splits
        # Booster type and learning rate.
        self.boosting, self.lr = boosting, lr
        # Tree-shape constraints.
        self.max_depth, self.num_leaves = max_depth, num_leaves
        self.min_data_in_leaf, self.min_sum_hessian_in_leaf = (
            min_data_in_leaf, min_sum_hessian_in_leaf
        )
        # Row / column subsampling fractions.
        self.bagging_fraction, self.feature_fraction, self.feature_fraction_bynode = (
            bagging_fraction, feature_fraction, feature_fraction_bynode
        )
        # Upper bound on boosting iterations.
        self.num_boost_round = num_boost_round
        # L1 / L2 regularisation strengths.
        self.lambda_l1, self.lambda_l2 = lambda_l1, lambda_l2
        # Reproducibility and execution settings.
        self.seed, self.device, self.num_threads = seed, device, num_threads
        self.is_unbalance = is_unbalance
        # Settings that are only forwarded when boosting == 'dart'.
        self.drop_rate, self.max_drop, self.skip_drop = drop_rate, max_drop, skip_drop
        self.extra_trees = extra_trees
        # Flag read by the run cell to train on a small sample.
        self.debag = debag

    def seed_everything(self):
        """Seed Python's ``random`` module and NumPy's global RNG with ``self.seed``."""
        for seed_fn in (random.seed, np.random.seed):
            seed_fn(self.seed)

In [None]:
def _build_lgb_params(config):
    """Translate ``config`` into the parameter dict handed to ``lgb.train``."""
    params = {
        'task': 'train',
        'boosting': config.boosting,
        'objective': 'binary',
        # The first metric listed is the one early stopping monitors
        # (the callback is created with first_metric_only=True).
        'metric': ['auc', 'average_precision', 'binary_logloss'],
        'learning_rate': config.lr,
        'seed': config.seed,
        'max_depth': config.max_depth,
        'num_leaves': config.num_leaves,
        'min_data_in_leaf': config.min_data_in_leaf,
        'min_sum_hessian_in_leaf': config.min_sum_hessian_in_leaf,
        # NOTE(review): per the LightGBM parameter docs, bagging_fraction only
        # takes effect when bagging_freq is non-zero; bagging_freq is not set
        # here, so this value is presumably inactive -- confirm intent.
        'bagging_fraction': config.bagging_fraction,
        'feature_fraction': config.feature_fraction,
        'feature_fraction_bynode': config.feature_fraction_bynode,
        'lambda_l1': config.lambda_l1,
        'lambda_l2': config.lambda_l2,
        'device': config.device,
        #'max_bin': 100,
        'num_threads': config.num_threads,
        'extra_trees': config.extra_trees,
        'is_unbalance': config.is_unbalance,
    }
    # Dropout settings are only forwarded for the DART booster.
    if config.boosting == 'dart':
        params['drop_rate'] = config.drop_rate
        params['max_drop'] = config.max_drop
        params['skip_drop'] = config.skip_drop
    return params


def train(config, logger, X, categorical_features):
    """Train one LightGBM model per CV fold and collect out-of-fold predictions.

    Folds come from ``StratifiedGroupKFold`` stratified on ``target`` and
    grouped by ``WEEK_NUM``, so all rows of a week land in the same fold.
    For every fold the model is saved to ``config.outputs_dir`` (truncated to
    its best iteration), gain importances are recorded and the validation
    rows are scored.

    Args:
        config: object exposing the attributes of ``CFG`` (``outputs_dir``,
            ``n_splits``, ``seed``, ``num_boost_round`` and the LightGBM
            hyper-parameters).
        logger: logger used for progress messages; also registered as the
            LightGBM logger.
        X: DataFrame containing the feature columns plus ``case_id``,
            ``date_decision``, ``MONTH``, ``WEEK_NUM`` and ``target``.
            It is NOT modified; the feature matrix is built as a new frame,
            so the calling cell can be re-run.
        categorical_features: names of the columns LightGBM should treat as
            categorical.

    Returns:
        ``(oof, fimps)``: ``oof`` holds the identifier/target columns plus
        ``probability`` and ``fold`` (both stay -1 for rows that were never
        scored); ``fimps`` is a float frame indexed by feature name with one
        ``fold_<k>_fimp`` gain-importance column per fold.

    Side effects:
        Writes ``training_features.npy``, ``categorical_features.npy`` and one
        ``lgb_fold<k>_<date>.txt`` model file per fold into
        ``config.outputs_dir``.
    """
    meta_columns = ['case_id', 'date_decision', 'WEEK_NUM', 'MONTH', 'target']

    kfold = StratifiedGroupKFold(
        n_splits=config.n_splits,
        shuffle=True,
        random_state=config.seed,
    )

    y = X['target'].copy()
    # 1-D group labels: the splitter documents groups as shape (n_samples,).
    weeks = X['WEEK_NUM'].copy()
    oof = X[meta_columns].copy()
    oof_probas = -1 * np.ones(len(oof))
    folds = -1 * np.ones(len(oof))

    # Build the feature matrix as a new frame instead of dropping in place,
    # so the caller's DataFrame keeps its columns. This keeps a second copy
    # of the feature data alive while training runs.
    features = X.drop(columns=meta_columns)
    display(features.dtypes.value_counts())
    display(oof)

    logger.info(f'number of features: {features.shape}')

    np.save(config.outputs_dir.joinpath('training_features.npy'), features.columns.to_numpy())
    np.save(config.outputs_dir.joinpath('categorical_features.npy'), categorical_features)

    # Float dtype up front so downstream mean/std work on numeric columns.
    fimps = pd.DataFrame(
        index=features.columns,
        columns=[f'fold_{k+1}_fimp' for k in range(config.n_splits)],
        dtype=float,
    )

    model_params = _build_lgb_params(config)

    lgb.register_logger(logger)

    # Suffix for model file names: the last four '-'-separated pieces of the
    # output directory name, concatenated. Identical for every fold.
    date = ''.join(config.outputs_dir.stem.split('-')[-4:])

    for k, (train_indices, valid_indices) in enumerate(
        kfold.split(X=features, y=y, groups=weeks)
    ):

        logger.info(f'fold: {k+1}')
        logger.info(f'# of train: {len(train_indices)}')
        logger.info(f'# of valid: {len(valid_indices)}')

        logger.info(f'train WEEK_NUM: {oof.iloc[train_indices]["WEEK_NUM"].unique()}')
        logger.info(f'valid WEEK_NUM: {oof.iloc[valid_indices]["WEEK_NUM"].unique()}')

        train_X = features.iloc[train_indices]
        valid_X = features.iloc[valid_indices]

        train_y = y.iloc[train_indices]
        valid_y = y.iloc[valid_indices]

        train_dataset = lgb.Dataset(
            data=train_X,
            label=train_y,
            categorical_feature=categorical_features,
        )
        valid_dataset = lgb.Dataset(
            data=valid_X,
            label=valid_y,
            categorical_feature=categorical_features,
            reference=train_dataset,
        )

        callbacks = [
            lgb.early_stopping(stopping_rounds=100, first_metric_only=True, verbose=True),
            lgb.log_evaluation(100),
        ]

        model = lgb.train(
            params=model_params,
            train_set=train_dataset,
            valid_sets=[train_dataset, valid_dataset],
            valid_names=['train', 'valid'],
            callbacks=callbacks,
            num_boost_round=config.num_boost_round,
        )

        model.save_model(
            config.outputs_dir.joinpath(f'lgb_fold{k+1}_{date}.txt'),
            num_iteration=model.best_iteration
        )

        fimps.loc[features.columns, f'fold_{k+1}_fimp'] = model.feature_importance(
            importance_type='gain'
        )

        # valid_indices are positional, matching the plain NumPy buffers.
        oof_probas[valid_indices] = model.predict(
            valid_X,
            num_iteration=model.best_iteration
        )
        folds[valid_indices] = k

        # Release per-fold objects before the next fold is materialised.
        del train_X, train_y, valid_X, valid_y
        del train_dataset, valid_dataset
        del model
        gc.collect()

    oof.loc[:, 'probability'] = oof_probas
    oof.loc[:, 'fold'] = folds

    del oof_probas, folds
    gc.collect()

    logger.info('Done!\n')

    return oof, fimps

In [None]:
%%time


# Hyper-parameters for this run.
config = CFG(
    outputs_dir=paths.output_dir,
    boosting='gbdt',
    n_splits=5,
    lr=5e-02,
    max_depth=10,
    num_leaves=64,
    min_data_in_leaf=100,
    min_sum_hessian_in_leaf=1.0,
    bagging_fraction=0.8,
    feature_fraction=0.8,
    feature_fraction_bynode=0.8,
    num_boost_round=10000,
    lambda_l1=0.1,
    lambda_l2=10,
    seed=42,
    device='CPU',
    # Use roughly 80% of the cores. os.cpu_count() may return None, and the
    # product may truncate to 0 on a single-core machine, so fall back to 1.
    num_threads=max(1, int((os.cpu_count() or 1) * 0.8)),
    # drop_rate=0.3,
    # max_drop=50,
    # skip_drop=0.5,
    extra_trees=True,
    debag=False,
)
config.seed_everything()

logger = get_logger(paths.output_dir.joinpath('output.log'))

# Record every configuration value at the top of the run log.
log = [
    f'{k} = {v}'
    for k, v in vars(config).items()
    if not k.startswith('__')
]
logger.info('\n'.join(log))
logger.info('\n')

if config.debag:
    # Debug mode: train on at most 10,000 rows (fewer if the table is smaller).
    depth_data = depth_data.sample(n=min(10000, len(depth_data)), random_state=config.seed)
    depth_data = depth_data.reset_index(drop=True)

oof, fimps = train(config, logger, depth_data, categorical_features)

In [None]:
# Store the categorical feature names alongside the model outputs.
# train() already writes this same file, so this cell rewrites it.
np.save(
    file=config.outputs_dir / 'categorical_features.npy',
    arr=categorical_features,
)

In [None]:
# Show the out-of-fold table, then summarise the predicted probabilities.
# Rows still holding the -1 placeholder (never scored) are excluded.
display(oof)
print(oof.loc[oof['probability'] != -1, 'probability'].describe())

# Histogram of predicted probabilities split by target, on a log-scaled
# count axis.
_, ax = plt.subplots()
sns.histplot(
    oof[oof['probability'] != -1],
    x='probability',
    hue='target',
    bins=50,
    ax=ax,
)
ax.set_yscale('log')
plt.show()

In [None]:
# Aggregate gain importances across folds.
# Only the per-fold columns feed the statistics: computing the std after
# 'fimp_mean' has been added would otherwise fold the mean into the std, and
# re-running the cell would fold earlier aggregates into both.
fold_columns = [col for col in fimps.columns if str(col).startswith('fold_')]
fold_fimps = fimps[fold_columns].astype(float)
fimps['fimp_mean'] = fold_fimps.mean(axis=1)
fimps['fimp_std'] = fold_fimps.std(axis=1)
fimps = fimps.sort_values('fimp_mean')
display(fimps)

# 16 inches of height per 100 features, with a floor of one unit so that
# fewer than 100 features does not produce a zero-height figure.
_, ax = plt.subplots(figsize=(8, 16 * max(1, len(fimps) // 100)))
ax.barh(y=fimps.index, width=fimps['fimp_mean'], xerr=fimps['fimp_std'], capsize=3)
plt.tight_layout()
plt.show()

In [None]:
# Persist the out-of-fold predictions.
oof.to_csv(paths.output_dir / 'oof.csv', index=False)

# Turn the index into a regular 'processed_Variable' column before writing,
# since the CSV is saved without its index.
fimps = fimps.reset_index(names=['processed_Variable'])
fimps.to_csv(paths.output_dir / 'feature_importances.csv', index=False)

In [None]:
# Evaluate the scored out-of-fold rows (the -1 placeholder rows are excluded).
# NOTE(review): plot_gini presumably returns a per-week gini table and a
# summary frame with stability / slope / intercept -- confirm against
# scripts.evaluate.
evaluater = Evaluator(oof.query('probability!=-1'))
gini_per_week, outcome = evaluater.plot_gini()
display(gini_per_week)
print(outcome)

# One "<name>: <value>" line per summary statistic in the run log.
logger.info(
    ''.join(
        f'{key}: {outcome[key].item()}\n'
        for key in ('stability', 'slope', 'intercept')
    )
)

outcome.to_csv(paths.output_dir / 'outcome.csv', index=False)

print(f'std(residuals) = {gini_per_week["residuals"].std():.4f}')

In [None]:
# ROC AUC of the out-of-fold predictions for each WEEK_NUM.
# Rows still holding the -1 placeholder are excluded, matching how the other
# cells consume `oof`. A week containing a single target class has no defined
# AUC (roc_auc_score raises for it), so it is reported as NaN.
oof_ = oof.query('probability != -1').copy()
display(
    oof_.groupby('WEEK_NUM')[['probability', 'target']].apply(
        lambda week: roc_auc_score(week['target'], week['probability'])
        if week['target'].nunique() > 1
        else np.nan
    )
)