### Library / Config

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import polars as pl
import os
import gc
from collections import defaultdict
from typing import *
from typing_extensions import Literal
import matplotlib.pyplot as plt
import pdb
import pickle
from tqdm import tqdm
from pathlib import Path
import itertools

from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb
import lightgbm as lgb

from src.data_process import load_train_df, load_labels, TrainIterator
from src.utils import reduce_memory, seed_everything, TimeUtil, Logger,  vis_feat_imp
from src.metrics import get_score_and_th
from src.gbdt.feature_engineering import execute_feature_engineering
from src.gbdt.model import all_level_group_train
from src.gbdt.feature_selection import select_feature

# Raise pandas' display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 5000
pd.options.display.max_columns = 5000
pd.options.display.max_colwidth = 1000

In [None]:
class CFG:
    # Experiment configuration, kept as plain class attributes.

    # Experiment id; also used to name the output directory below.
    exp = '090'
    run_type: Literal['all', 'half', 'dev', 'debug'] = 'all'

    # Data locations (relative paths).
    input_path = Path('../data/input')
    output_path = input_path.parent / 'output' / f'exp_{exp}'
    # The output directory is created as soon as the class body is executed.
    output_path.mkdir(parents=True, exist_ok=True)

    fold = 5
    seed = 77
    model_type: Literal['lgb', 'xgb', 'cat'] = 'xgb'

    # XGBoost parameters; 'seed' reuses the experiment seed defined above.
    xgb_params = dict(
        objective='binary:logistic',
        eval_metric='logloss',
        tree_method='hist',  # alternative noted by the author: gpu_hist
        learning_rate=0.01,
        max_depth=4,
        max_leaves=16,
        subsample=0.80,
        colsample_bytree=0.40,
        seed=seed,
    )

    # Threshold optimization
    all_opt = True
    each_opt = False

In [None]:
# Apply the configured seed through the project helper
# (presumably seeds the RNGs used downstream - confirm in src.utils).
seed_everything(CFG.seed)
# Project logger, given the output directory, the experiment id and the config class itself.
logger = Logger(CFG.output_path, CFG.exp, CFG)

### Feature Engineering

In [None]:
# Load the training data and labels from CFG.input_path.
# CFG.run_type is forwarded to the loader (presumably selects how much data is read - confirm in load_train_df).
train_df = load_train_df(CFG.run_type, CFG.input_path / 'train.parquet')
train_labels = load_labels(CFG.input_path / 'train_labels.csv')
# Wrap the training frame in the project's iterator, which the feature-engineering step consumes.
iter_train = TrainIterator(train_df)

In [None]:
# Run feature engineering over the training iterator.
# The result is indexed with 0, 1 and 2 in the following cells.
all_df = execute_feature_engineering(iter_train)

In [None]:
# Print the shape of the entries all_df[0], all_df[1] and all_df[2]
# (presumably one feature table per level group - confirm).
for i in range(3):
    print(all_df[i].shape)

### Train GBDT

In [None]:
# First training run: no feature subset is supplied (use_feat_dict=None),
# and the run is tagged 'non_fs' (presumably "no feature selection").
# Returns an out-of-fold frame and the trained models, indexed below as
# models_all[level_group][fold].
oof_df, models_all = all_level_group_train(
    CFG,
    all_df,
    train_labels,
    use_feat_dict = None,
    use_prev_pred = True,
    save_model = True,
    train_type = 'non_fs'
)

### Metric

In [None]:
# Score the out-of-fold predictions of the first training run; output goes through `logger`.
# NOTE(review): both flags are hard-coded to True here, while CFG defines
# all_opt=True / each_opt=False that are not read in this cell - confirm which is intended.
results = get_score_and_th(
    oof_df,
    all_opt_flag=True,
    each_opt_flag=True,
    logger=logger
)

### Feature Importance

In [None]:
# Visualize feature importance for the model of level group 0, fold 0.
# The second argument (50) is presumably the number of top features shown - confirm in vis_feat_imp.
level_group = 0
fold = 0
vis_feat_imp(models_all[level_group][fold], 50)

In [None]:
# Visualize feature importance for the model of level group 1, fold 0.
# The second argument (50) is presumably the number of top features shown - confirm in vis_feat_imp.
level_group = 1
fold = 0
vis_feat_imp(models_all[level_group][fold], 50)

In [None]:
# Visualize feature importance for the model of level group 2, fold 0.
# The second argument (50) is presumably the number of top features shown - confirm in vis_feat_imp.
level_group = 2
fold = 0
vis_feat_imp(models_all[level_group][fold], 50)

### Feature Selection

In [None]:
# Feature counts handed to select_feature (three values; presumably one per level group - confirm).
use_feat_nums = [1000, 2000, 3000]  # author's note: 1000, 1000, 1000
selected_feat_dict = select_feature(
    CFG.output_path,
    CFG.model_type,
    use_feat_nums=use_feat_nums,
)
# Displayed as the cell output: the length of every element inside each dict value.
# NOTE(review): if the values are flat lists of feature names, this shows name lengths
# rather than the number of selected features - confirm the structure returned by select_feature.
[len(feat) for feat_list in selected_feat_dict.values() for feat in feat_list]

### Retrain

In [None]:
# Second training run, restricted to the features in selected_feat_dict and tagged 'with_fs'.
# This rebinds oof_df and models_all, replacing the results of the earlier run.
oof_df, models_all = all_level_group_train(
    CFG,
    all_df,
    train_labels,
    use_feat_dict=selected_feat_dict,
    use_prev_pred=True,
    save_model = True,
    train_type = 'with_fs'
)

### Metric

In [None]:
# Score the out-of-fold predictions of the run trained with selected features; output goes through `logger`.
# NOTE(review): both flags are hard-coded to True here, while CFG defines
# all_opt=True / each_opt=False that are not read in this cell - confirm which is intended.
results = get_score_and_th(
    oof_df,
    all_opt_flag=True,
    each_opt_flag=True,
    logger=logger
)

### Feature Importance

In [None]:
# Visualize feature importance for level group 0, fold 0, using the current (most recently trained) models_all.
# The second argument (50) is presumably the number of top features shown - confirm in vis_feat_imp.
level_group = 0
fold = 0
vis_feat_imp(models_all[level_group][fold], 50)

In [None]:
# Visualize feature importance for level group 1, fold 0, using the current (most recently trained) models_all.
# The second argument (50) is presumably the number of top features shown - confirm in vis_feat_imp.
level_group = 1
fold = 0
vis_feat_imp(models_all[level_group][fold], 50)

In [None]:
# Visualize feature importance for level group 2, fold 0, using the current (most recently trained) models_all.
# The second argument (50) is presumably the number of top features shown - confirm in vis_feat_imp.
level_group = 2
fold = 0
vis_feat_imp(models_all[level_group][fold], 50)