In [None]:
import datetime
import gc
import pathlib
import random
from typing import Dict, List, Tuple, Union
import sys

from joblib import dump
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.model_selection import StratifiedGroupKFold, TimeSeriesSplit
from sklearn.preprocessing import OrdinalEncoder

sys.path.append('..')

from scripts.evaluate import Evaluator
from scripts.get_depth_paths import get_depth_paths
from scripts.get_logger import get_logger
from scripts.merge_dataset import merge_dataset


gc.enable()

In [None]:
class PathHandler:
    """Namespace holding the filesystem locations used by this notebook.

    Every attribute is evaluated once, when the class body runs. The
    run-specific output directory is created at that moment as well.
    """

    # Input data locations.
    competition_dir = pathlib.Path('../../inputs')
    parquet_files_dir = competition_dir / 'parquet_files'

    # Directory holding the feature-list CSV files.
    feature_dir = pathlib.Path('../../outputs/features')

    # Timestamp taken at class-definition time; it names the output directory
    # as <date>-<hour>-<minute>.
    now_time = datetime.datetime.now()
    output_dir = pathlib.Path(
        f'../../outputs/output_lgb/model_outputs_{now_time.date()}-{now_time.hour:02}-{now_time.minute:02}'
    )
    # Create the directory (and missing parents) unless it already exists.
    output_dir.mkdir(parents=True, exist_ok=True)


paths = PathHandler()

### load cast type features
- Bool features are cast to `pl.Int8` because these columns contain `None` and NaN values.

In [None]:
# Read each per-type feature list and tag it with the polars dtype to cast to.
bool_features = pd.read_csv(paths.feature_dir / 'bool_features.csv')
bool_features['cast_dtype'] = pl.Int8

float64_features = pd.read_csv(paths.feature_dir / 'float64_features.csv')
float64_features['cast_dtype'] = pl.Float32

string_features = pd.read_csv(paths.feature_dir / 'string_features.csv')
string_features['cast_dtype'] = pl.String

date_features = pd.read_csv(paths.feature_dir / 'date_features.csv')
date_features['cast_dtype'] = pl.Date

# The useful-feature list is optional: None is used when the file is absent.
useful_features_path = paths.feature_dir / 'useful_features.csv'
useful_features = pd.read_csv(useful_features_path) if useful_features_path.is_file() else None
# useful_features = None  # uncomment to force None even when the file exists
display(useful_features)

In [None]:
# Collect the parquet paths of the 'train' split.
train_depth_paths = get_depth_paths(paths.parquet_files_dir, 'train')

# Every value of the mapping is a sized collection of paths; report the total.
# (The message previously said "test" although these are the train paths.)
n_train_paths = sum(len(depth_paths) for depth_paths in train_depth_paths.values())
print(f'number of train paths: {n_train_paths}')

In [None]:
# Dtypes enforced on the columns of the base table.
train_base_schema = {
    'case_id': pl.Int64,
    'date_decision': pl.String,
    'MONTH': pl.Int64,
    'WEEK_NUM': pl.Int64,
    'target': pl.Int64,
}

# Read the base table and cast it in one chained expression.
train_base_data = pl.read_parquet(
    paths.parquet_files_dir / 'train/train_base.parquet'
).cast(train_base_schema)
display(train_base_data)

In [None]:
%%time


# The four per-type feature lists, in the positional order merge_dataset receives them.
feature_tables = (bool_features, float64_features, string_features, date_features)

# NOTE(review): '012' presumably selects which depths are merged - confirm
# against scripts.merge_dataset.
depth_data = merge_dataset(
    train_base_data,
    train_depth_paths,
    *feature_tables,
    useful_features,
    '012',
)

# Show the merged frame followed by the number of columns per dtype.
display(depth_data, depth_data.dtypes.value_counts())

In [None]:
# Show only the columns whose dtype is float64.
is_float64 = (depth_data.dtypes == 'float64').to_numpy()
display(depth_data[depth_data.columns[is_float64]])

### drop columns which have many nans

In [None]:
# Fraction of missing values in every column.
ratio_nan = depth_data.isna().mean()
display(ratio_nan)

# Keep only the columns in which fewer than half of the rows are missing.
kept_columns = ratio_nan.index[(ratio_nan < 0.5).to_numpy()]
depth_data = depth_data[kept_columns]
display(depth_data)

### encoding

In [None]:
def encode_objects(
        depth_data: pd.DataFrame,
        output_dir: pathlib.Path
    ) -> pd.DataFrame:
    """Ordinal-encode every object-dtype column except 'date_decision'.

    One ``OrdinalEncoder`` is fitted per column (unknown categories map to -1)
    and dumped to ``output_dir / f'encoder_{col}.joblib'``.

    Args:
        depth_data: input frame; it is not modified.
        output_dir: directory for the fitted encoders, created if missing.

    Returns:
        A new frame with the untouched columns first, followed by the encoded
        columns as float32. The row index of ``depth_data`` is preserved.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    object_columns = depth_data.dtypes.index[depth_data.dtypes==object].to_list()
    if 'date_decision' in object_columns:
        object_columns.remove('date_decision')
    print(len(object_columns))

    # Nothing to encode: np.concatenate would raise on an empty list.
    if not object_columns:
        return depth_data.copy()

    encoded_columns = []
    for col in object_columns:
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        encoded_class = encoder.fit_transform(depth_data[col].values.reshape(-1, 1))
        encoded_columns.append(encoded_class.astype(np.float32))
        dump(encoder, output_dir.joinpath(f'encoder_{col}.joblib'))

    # Reuse the input index so that the axis=1 concat below aligns row by row
    # even when depth_data does not carry a default RangeIndex.
    object_data = pd.DataFrame(
        np.concatenate(encoded_columns, axis=1),
        columns=object_columns,
        index=depth_data.index,
    )

    # drop() without inplace leaves the caller's frame untouched.
    return pd.concat([depth_data.drop(columns=object_columns), object_data], axis=1)

In [None]:
%%time


# The fitted encoders are stored inside this run's output directory.
encoder_dir = paths.output_dir / 'encoders'
depth_data = encode_objects(depth_data, encoder_dir)
display(depth_data)

### training

In [None]:
class CFG:
    """Plain container for the settings of one training run."""

    def __init__(
            self,
            outputs_dir: pathlib.Path,
            n_splits: int = 5,
            lr: float = 0.1,
            max_depth: int = -1,
            num_leaves: int = 31,
            min_data_in_leaf: int = 20,
            bagging_fraction: float = 1.0,
            feature_fraction_bynode: float = 1.0,
            num_boost_round: int = 1000,
            seed: int = 42,
            device: str = 'cpu',
            debag: bool = False,
        ):
        """Store every argument as an instance attribute of the same name."""
        # Attributes are set in signature order, so ``__dict__`` lists them
        # in that order as well.
        settings = (
            ('outputs_dir', outputs_dir),
            ('n_splits', n_splits),
            ('lr', lr),
            ('max_depth', max_depth),
            ('num_leaves', num_leaves),
            ('min_data_in_leaf', min_data_in_leaf),
            ('bagging_fraction', bagging_fraction),
            ('feature_fraction_bynode', feature_fraction_bynode),
            ('num_boost_round', num_boost_round),
            ('seed', seed),
            ('device', device),
            ('debag', debag),
        )
        for name, value in settings:
            setattr(self, name, value)

    def seed_everything(self):
        """Seed the global RNGs of ``random`` and ``numpy`` with ``self.seed``."""
        for seeder in (random.seed, np.random.seed):
            seeder(self.seed)

In [None]:
def train(config, logger, Xy):
    """Train one LightGBM model per fold and collect out-of-fold predictions.

    Folds come from ``StratifiedGroupKFold``, stratified on ``target`` and
    grouped by ``WEEK_NUM``. For every fold the model is saved at its best
    iteration to ``config.outputs_dir``. The names of the training features
    are saved there as ``training_features.npy``.

    Args:
        config: settings object exposing ``outputs_dir``, ``n_splits``,
            ``seed``, ``lr``, ``max_depth``, ``num_leaves``,
            ``min_data_in_leaf``, ``bagging_fraction``,
            ``feature_fraction_bynode``, ``num_boost_round`` and ``device``.
            An optional ``bagging_freq`` attribute is honoured when present.
        logger: logger used for progress messages; also registered with LightGBM.
        Xy: pandas DataFrame with the feature columns plus 'case_id',
            'date_decision', 'MONTH', 'WEEK_NUM' and 'target'.

    Returns:
        ``(oof, fimps)``: ``oof`` holds the five meta columns plus
        'probability' and 'fold' (0-based fold index) for every row.
        ``fimps`` holds the gain importance of every feature, one float
        column per fold.
    """
    # Columns that identify or label a row and must not be used as features.
    meta_columns = ['case_id', 'date_decision', 'MONTH', 'WEEK_NUM', 'target']

    kfold = StratifiedGroupKFold(
        n_splits=config.n_splits,
        shuffle=True,
        random_state=config.seed,
    )

    y = Xy['target'].copy()
    groups = Xy['WEEK_NUM'].copy()
    oof = Xy[meta_columns].copy()
    # -1 marks rows that never landed in a validation fold.
    oof_probas = -1 * np.ones(len(oof))
    folds = -1 * np.ones(len(oof))
    X = Xy[[col for col in Xy.columns if col not in meta_columns]].copy()
    display(X)
    display(X.dtypes.value_counts())
    display(oof)

    logger.info(f'number of features: {X.shape}')

    np.save(config.outputs_dir.joinpath('training_features.npy'), X.columns.to_numpy())

    # Drop the local reference to the full frame before training starts.
    del Xy
    gc.collect()

    # Float dtype up front: without it the frame is object-typed and every
    # later aggregation has to go through Python objects.
    fimps = pd.DataFrame(
        index=X.columns,
        columns=[f'fold_{k+1}_fimp' for k in range(config.n_splits)],
        dtype=np.float64,
    )

    # LightGBM only applies bagging_fraction when bagging_freq is non-zero, so
    # bagging is switched on whenever a fraction below 1.0 is requested.
    bagging_freq = getattr(
        config, 'bagging_freq', 1 if config.bagging_fraction < 1.0 else 0
    )

    model_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': ['average_precision', 'auc', 'binary_logloss'],
        'learning_rate': config.lr,
        'seed': config.seed,
        'max_depth': config.max_depth,
        'num_leaves': config.num_leaves,
        'min_data_in_leaf': config.min_data_in_leaf,
        'bagging_fraction': config.bagging_fraction,
        'bagging_freq': bagging_freq,
        'feature_fraction_bynode': config.feature_fraction_bynode,
        'device': config.device,
    }

    lgb.register_logger(logger)

    # File-name suffix built from the last four '-'-separated parts of the
    # output directory name; it is the same for every fold.
    date = ''.join(config.outputs_dir.stem.split('-')[-4:])

    for k, (train_indices, valid_indices) in enumerate(kfold.split(X=X, y=y, groups=groups)):
        logger.info(f'fold: {k+1}')
        logger.info(f'# of train: {len(train_indices)}')
        logger.info(f'# of valid: {len(valid_indices)}')

        logger.info(f'train WEEK_NUM: {oof.iloc[train_indices]["WEEK_NUM"].unique()}')
        logger.info(f'valid WEEK_NUM: {oof.iloc[valid_indices]["WEEK_NUM"].unique()}')

        train_X = X.iloc[train_indices]
        valid_X = X.iloc[valid_indices]

        train_y = y.iloc[train_indices]
        valid_y = y.iloc[valid_indices]

        train_dataset = lgb.Dataset(data=train_X, label=train_y)
        valid_dataset = lgb.Dataset(data=valid_X, label=valid_y, reference=train_dataset)

        # Early stopping watches only the first metric ('average_precision').
        callbacks = [
            lgb.early_stopping(stopping_rounds=100, first_metric_only=True, verbose=True),
            lgb.log_evaluation(100),
        ]

        model = lgb.train(
            params=model_params,
            train_set=train_dataset,
            valid_sets=[train_dataset, valid_dataset],
            valid_names=['train', 'valid'],
            callbacks=callbacks,
            num_boost_round=config.num_boost_round,
        )

        model.save_model(
            config.outputs_dir.joinpath(f'lgb_fold{k+1}_{date}.txt'),
            num_iteration=model.best_iteration
        )

        fimps.loc[train_X.columns, f'fold_{k+1}_fimp'] = model.feature_importance(importance_type='gain')

        # The indices are positional, so they address the numpy buffers directly.
        oof_probas[valid_indices] = model.predict(valid_X, num_iteration=model.best_iteration)
        folds[valid_indices] = k

        del train_X, train_y, valid_X, valid_y
        del train_dataset, valid_dataset
        del model
        gc.collect()

    oof.loc[:, 'probability'] = oof_probas
    oof.loc[:, 'fold'] = folds

    del oof_probas, folds
    gc.collect()

    logger.info('Done!\n')

    return oof, fimps

In [None]:
%%time


# Settings of this run; anything not listed keeps the CFG default.
config = CFG(
    outputs_dir=paths.output_dir,
    n_splits=5,
    lr=1e-02,
    max_depth=-1,
    min_data_in_leaf=100,
    bagging_fraction=0.8,
    feature_fraction_bynode=1.0,
    num_boost_round=10000,
    seed=42,
    device='CPU',
    debag=False,
)
config.seed_everything()

logger = get_logger(paths.output_dir.joinpath('output.log'))

# Write every setting of the run to the log, one "name = value" per line.
log = [
    f'{name} = {value}'
    for name, value in vars(config).items()
    if not name.startswith('__')
]
logger.info('\n'.join(log))
logger.info('\n')

# Debug mode trains on the first 10000 rows only; a full run uses the rows
# with WEEK_NUM < 63. The frame is passed straight into train() so that no
# extra module-level reference keeps it alive.
if config.debag:
    oof, fimps = train(config, logger, depth_data.iloc[:10000])
else:
    oof, fimps = train(config, logger, depth_data.query('WEEK_NUM < 63'))

In [None]:
display(oof)
print(oof['probability'].describe())

# Histogram of the out-of-fold probabilities, split by target, on a log-scaled count axis.
_, ax = plt.subplots()
sns.histplot(data=oof, x='probability', hue='target', bins=50, ax=ax)
ax.set_yscale('log')
plt.show()

In [None]:
# Aggregate over the per-fold columns only. Computing the std on the whole
# frame after 'fimp_mean' was added let the mean column leak into the std,
# and re-running the cell compounded the error.
fold_columns = [col for col in fimps.columns if str(col).startswith('fold_')]
fold_fimps = fimps[fold_columns].astype(float)
fimps['fimp_mean'] = fold_fimps.mean(axis=1)
fimps['fimp_std'] = fold_fimps.std(axis=1)
fimps.sort_values('fimp_mean', inplace=True)
display(fimps)

# 16 inches of height per full hundred features, but never zero: with fewer
# than 100 features the original formula produced a zero-height figure.
fig_height = 16 * max(1, len(fimps) // 100)
_, ax = plt.subplots(figsize=(8, fig_height))
ax.barh(y=fimps.index, width=fimps['fimp_mean'], xerr=fimps['fimp_std'], capsize=3)
plt.tight_layout()
plt.show()

In [None]:
# Persist the out-of-fold predictions.
oof_path = paths.output_dir / 'oof.csv'
oof.to_csv(oof_path, index=False)

# Move the feature names from the index into a column before saving.
fimps = fimps.reset_index(names=['processed_Variable'])
fimps_path = paths.output_dir / 'feature_importances.csv'
fimps.to_csv(fimps_path, index=False)

$$
\mathrm{stability\ metric} = \mathrm{mean}(\mathrm{gini}) + 88.0 \cdot \min(0, a) - 0.5 \cdot \mathrm{std}(\mathrm{residuals})
$$

In [None]:
# Evaluate the out-of-fold predictions with the project's Evaluator.
evaluater = Evaluator(oof)
gini_per_week, outcome = evaluater.plot_gini()
display(gini_per_week)
print(outcome)

# Log the three summary values, one per line.
stability_report = (
    f'stability: {outcome["stability"].item()}\n'
    f'slope: {outcome["slope"].item()}\n'
    f'intercept: {outcome["intercept"].item()}\n'
)
logger.info(stability_report)

outcome_path = paths.output_dir / 'outcome.csv'
outcome.to_csv(outcome_path, index=False)