In [1]:
import pandas as pd
import yaml
import sys
import os
from glob import glob
from pathlib import Path
import gc
import pickle
import joblib

import warnings
warnings.filterwarnings('ignore')

sys.path.append('/kaggle/src')
from utils.lgb import fit_lgb
from utils.metric import compute_comptetition_metric
from utils.set_seed import seed_base
from feature_engineering.fe_xgb import generate_features

MODEL_NAME = "lgb_model"
PACKAGE_DIR = Path("/kaggle/src")
CFG = yaml.safe_load(open(PACKAGE_DIR / "config.yaml", "r"))
print(CFG[MODEL_NAME]["execution"]["exp_id"])

CFG["output_dir"] = f"/kaggle/output/{CFG[MODEL_NAME]['execution']['exp_id']}"
!rm -r {CFG["output_dir"]}
os.makedirs(CFG["output_dir"], exist_ok=True)

seed_base(CFG["env"]["seed"])

2023-11-09 05:30:28.049068: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-09 05:30:28.117151: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-09 05:30:28.570288: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/u

exp_129
rm: cannot remove '/kaggle/output/exp_129': No such file or directory


In [2]:
# 特徴生成
files = glob(f"{CFG['dataset']['step_csv_dir']}/*.parquet")
train, features = generate_features(files)

# cv splitとマージ
cv_split = pd.read_csv(CFG['dataset']['cv_split_path'])
train["fold"] = train["series_id"].map(cv_split.set_index("series_id")["fold"])
display(train.head(5))
train.shape

generate features: 100%|██████████| 277/277 [04:27<00:00,  1.04it/s]


Unnamed: 0,anglez,anglez_diff_abs,anglez_diff_abs_clip5,anglez_mean,enmo,enmo_diff_abs,enmo_mean,minutes,same_count,total_seconds,...,enmo_rolling_median_100_shift_1000,anglez_diff_abs_clip5_rolling_median_100_shift_1000,enmo_std_rolling_median_100_shift_1000,anglez_diff_abs_clip5_std_rolling_median_100_shift_1000,enmo_rolling_square_mean_100_shift_1000,anglez_diff_abs_clip5_rolling_square_mean_100_shift_1000,enmo_std_rolling_square_mean_100_shift_1000,anglez_diff_abs_clip5_std_rolling_square_mean_100_shift_1000,reduce_step,fold
29,-89.259064,0.057546,0.057546,-10.070161,0.029858,0.000262,0.104063,837.5,0.0,47637.5,...,,,,,,,,,707,3
30,-87.457848,0.147454,0.147454,-0.726222,0.029458,0.000533,0.095024,957.5,0.0,47757.5,...,,,,,,,,,731,3
31,-86.225922,0.169739,0.169739,-5.723403,0.030287,0.001758,0.092043,1077.5,0.0,47877.5,...,,,,,,,,,755,3
32,-75.722588,5.349769,2.968245,-4.199389,0.083004,0.049383,0.083214,1197.5,0.0,47997.5,...,,,,,,,,,779,3
33,-0.061125,3.569804,1.781771,2.269062,0.021142,0.006329,0.088176,1317.5,0.0,48117.5,...,,,,,,,,,803,3


(493733, 1307)

In [3]:
from utils.postprocess import dynamic_range_nms
from multiprocessing import Pool
from tqdm import tqdm

def objective(trial):
    reg_alpha = trial.suggest_loguniform('reg_alpha', 1e-3, 1e3)
    reg_lambda = trial.suggest_loguniform('reg_lambda', 1e-3, 1e3)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.2, 0.8)
    num_leaves = trial.suggest_int('num_leaves', 10, 70)

    CFG[MODEL_NAME]["lightgbm"]["learning_rate"] = 0.1
    CFG[MODEL_NAME]["lightgbm"]["reg_alpha"] = reg_alpha
    CFG[MODEL_NAME]["lightgbm"]["reg_lambda"] = reg_lambda
    CFG[MODEL_NAME]["lightgbm"]["max_depth"] = max_depth
    CFG[MODEL_NAME]["lightgbm"]["colsample_bytree"] = colsample_bytree
    CFG[MODEL_NAME]["lightgbm"]["num_leaves"] = num_leaves

    # 学習
    dfs = []
    for event in ["wakeup", "onset"]:
        trn_oof, models = fit_lgb(
            X=train, 
            y=train[f"{event}_target"], 
            folds=train["fold"].astype(int), 
            features=features.all_features(),
            params=CFG[MODEL_NAME]["lightgbm"], 
            es_rounds=200,
            verbose=0,
            log=False,
            only_fold0=True,
        )
        train["score"] = trn_oof
        train["event"] = event
        dfs.append(train[["series_id", "step", "score", "event"]])
    gc.collect()
    oof = pd.concat(dfs, axis=0)
    oof["step"] = oof["step"].astype(int)
    oof = oof.sort_values(["series_id", "step"]).reset_index(drop=True)

    # post process
    oof = oof[oof["score"] > 0.005]
    oof["step"] = oof["step"].astype(int)
    groups = [group for _, group in oof.groupby("series_id")]
    with Pool(30) as p:  
        results = list(tqdm(p.imap(dynamic_range_nms, groups), total=len(groups)))
    sub = pd.concat(results)
    sub["score"] = sub["reduced_score"]
    
    labels = pd.read_csv(f"{CFG['dataset']['competition_dir']}/train_events.csv").dropna()
    labels = labels[labels["series_id"].isin(sub["series_id"].unique())]
    score, ap_table = compute_comptetition_metric(labels, sub)

    return score

In [4]:
import optuna

study = optuna.create_study(direction='maximize')
study.enqueue_trial({
    "reg_alpha": 0.02,
    "reg_lambda": 0.2,
    "max_depth": 7,
    "num_leaves": 40,
    "colsample_bytree": 0.7,
})
study.optimize(objective, n_trials=50)

print(f'best_params: {study.best_params}')
print(f'best_value: {study.best_value}')
display(study.trials_dataframe().sort_values("value", ascending=False).head(10))
study.trials_dataframe().to_csv(os.path.join(CFG["output_dir"], "xgb_optuna_result.csv"), index=False)

[32m[I 2023-11-09 05:35:05,993][0m A new study created in memory with name: no-name-663fa742-ab4a-4d16-8641-21b0ed720a65[0m


100%|██████████| 55/55 [00:01<00:00, 50.79it/s]
[32m[I 2023-11-09 05:37:23,135][0m Trial 0 finished with value: 0.7716001261243699 and parameters: {'reg_alpha': 0.02, 'reg_lambda': 0.2, 'max_depth': 7, 'colsample_bytree': 0.7, 'num_leaves': 40}. Best is trial 0 with value: 0.7716001261243699.[0m
100%|██████████| 55/55 [00:01<00:00, 47.65it/s]
[32m[I 2023-11-09 05:39:25,743][0m Trial 1 finished with value: 0.7720642160298445 and parameters: {'reg_alpha': 12.866118450307523, 'reg_lambda': 9.711159190476145, 'max_depth': 10, 'colsample_bytree': 0.6462188274051617, 'num_leaves': 45}. Best is trial 1 with value: 0.7720642160298445.[0m
100%|██████████| 55/55 [00:01<00:00, 44.39it/s]
[32m[I 2023-11-09 05:41:50,835][0m Trial 2 finished with value: 0.7738979065455843 and parameters: {'reg_alpha': 167.22110175249878, 'reg_lambda': 0.5309479510020025, 'max_depth': 8, 'colsample_bytree': 0.5649846715886738, 'num_leaves': 39}. Best is trial 2 with value: 0.7738979065455843.[0m
100%|███████

best_params: {'reg_alpha': 1.0085303887352641, 'reg_lambda': 10.32610667162794, 'max_depth': 9, 'colsample_bytree': 0.4540970973784533, 'num_leaves': 33}
best_value: 0.7779477664093645


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_max_depth,params_num_leaves,params_reg_alpha,params_reg_lambda,system_attrs_fixed_params,state
18,18,0.777948,2023-11-09 06:14:19.636183,2023-11-09 06:16:06.224571,0 days 00:01:46.588388,0.454097,9,33,1.00853,10.326107,,COMPLETE
46,46,0.777252,2023-11-09 07:08:06.712972,2023-11-09 07:10:07.949552,0 days 00:02:01.236580,0.424019,10,29,10.113795,212.770074,,COMPLETE
3,3,0.775994,2023-11-09 05:41:50.836664,2023-11-09 05:43:49.609420,0 days 00:01:58.772756,0.399357,7,44,42.162801,4.662809,,COMPLETE
27,27,0.775031,2023-11-09 06:30:47.008489,2023-11-09 06:32:38.518067,0 days 00:01:51.509578,0.401082,8,37,0.06317,6.549308,,COMPLETE
38,38,0.774912,2023-11-09 06:51:27.998788,2023-11-09 06:53:39.444832,0 days 00:02:11.446044,0.357407,7,35,2.510119,285.480974,,COMPLETE
13,13,0.77489,2023-11-09 06:04:45.894367,2023-11-09 06:06:41.228454,0 days 00:01:55.334087,0.32578,10,32,0.31378,175.318086,,COMPLETE
34,34,0.774861,2023-11-09 06:44:11.137343,2023-11-09 06:45:57.471653,0 days 00:01:46.334310,0.459855,10,32,0.014318,2.537349,,COMPLETE
40,40,0.774759,2023-11-09 06:55:44.475035,2023-11-09 06:57:50.567870,0 days 00:02:06.092835,0.417703,7,48,72.785865,76.781957,,COMPLETE
26,26,0.774473,2023-11-09 06:29:05.598146,2023-11-09 06:30:47.007095,0 days 00:01:41.408949,0.334965,10,50,12.539618,139.6575,,COMPLETE
24,24,0.774343,2023-11-09 06:25:16.711269,2023-11-09 06:27:11.403777,0 days 00:01:54.692508,0.273238,8,43,0.114707,995.45434,,COMPLETE
