In [1]:
import pandas as pd
import yaml
import sys
import os
from glob import glob
from pathlib import Path
import gc
import pickle

import warnings
warnings.filterwarnings('ignore')

sys.path.append('/kaggle/src')
from utils.xgb import fit_xgb
from utils.metric import compute_comptetition_metric
from utils.set_seed import seed_base
from feature_engineering.fe_xgb import generate_features

MODEL_NAME = "xgb_model"
PACKAGE_DIR = Path("/kaggle/src")
CFG = yaml.safe_load(open(PACKAGE_DIR / "config.yaml", "r"))
print(CFG[MODEL_NAME]["execution"]["exp_id"])

CFG["output_dir"] = f"/kaggle/output/{CFG[MODEL_NAME]['execution']['exp_id']}"
!rm -r {CFG["output_dir"]}
os.makedirs(CFG["output_dir"], exist_ok=True)

seed_base(CFG["env"]["seed"])

2023-11-09 03:06:43.935547: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-09 03:06:44.319007: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-09 03:06:45.382834: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/u

exp_128


In [2]:
# 特徴生成
files = glob(f"{CFG['dataset']['step_csv_dir']}/*.parquet")
train, features = generate_features(files)

# cv splitとマージ
cv_split = pd.read_csv(CFG['dataset']['cv_split_path'])
train["fold"] = train["series_id"].map(cv_split.set_index("series_id")["fold"])
display(train.head(5))
train.shape

generate features: 100%|██████████| 277/277 [04:27<00:00,  1.04it/s]


Unnamed: 0,anglez,anglez_diff_abs,anglez_diff_abs_clip5,anglez_mean,enmo,enmo_diff_abs,enmo_mean,minutes,same_count,total_seconds,...,enmo_rolling_median_100_shift_1000,anglez_diff_abs_clip5_rolling_median_100_shift_1000,enmo_std_rolling_median_100_shift_1000,anglez_diff_abs_clip5_std_rolling_median_100_shift_1000,enmo_rolling_square_mean_100_shift_1000,anglez_diff_abs_clip5_rolling_square_mean_100_shift_1000,enmo_std_rolling_square_mean_100_shift_1000,anglez_diff_abs_clip5_std_rolling_square_mean_100_shift_1000,reduce_step,fold
29,-89.259064,0.057546,0.057546,-10.070161,0.029858,0.000262,0.104063,837.5,0.0,47637.5,...,,,,,,,,,707,3
30,-87.457848,0.147454,0.147454,-0.726222,0.029458,0.000533,0.095024,957.5,0.0,47757.5,...,,,,,,,,,731,3
31,-86.225922,0.169739,0.169739,-5.723403,0.030287,0.001758,0.092043,1077.5,0.0,47877.5,...,,,,,,,,,755,3
32,-75.722588,5.349769,2.968245,-4.199389,0.083004,0.049383,0.083214,1197.5,0.0,47997.5,...,,,,,,,,,779,3
33,-0.061125,3.569804,1.781771,2.269062,0.021142,0.006329,0.088176,1317.5,0.0,48117.5,...,,,,,,,,,803,3


(493733, 1307)

In [5]:
from utils.postprocess import dynamic_range_nms
from multiprocessing import Pool
from tqdm import tqdm

def objective(trial):
    reg_alpha = trial.suggest_loguniform('reg_alpha', 1e-3, 1e3)
    reg_lambda = trial.suggest_loguniform('reg_lambda', 1e-3, 1e3)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.2, 0.8)
    subsample = trial.suggest_uniform('subsample', 0.2, 0.8)

    CFG[MODEL_NAME]["xgboost"]["learning_rate"] = 0.1
    CFG[MODEL_NAME]["xgboost"]["reg_alpha"] = reg_alpha
    CFG[MODEL_NAME]["xgboost"]["reg_lambda"] = reg_lambda
    CFG[MODEL_NAME]["xgboost"]["max_depth"] = max_depth
    CFG[MODEL_NAME]["xgboost"]["colsample_bytree"] = colsample_bytree
    CFG[MODEL_NAME]["xgboost"]["subsample"] = subsample

    # 学習
    dfs = []
    for event in ["wakeup", "onset"]:
        trn_oof, models = fit_xgb(
            X=train, 
            y=train[f"{event}_target"], 
            folds=train["fold"].astype(int), 
            features=features.all_features(),
            params=CFG[MODEL_NAME]["xgboost"], 
            es_rounds=200,
            verbose=0,
            log=False,
            only_fold0=True,
        )
        train["score"] = trn_oof
        train["event"] = event
        dfs.append(train[["series_id", "step", "score", "event"]])
    gc.collect()
    oof = pd.concat(dfs, axis=0)
    oof["step"] = oof["step"].astype(int)
    oof = oof.sort_values(["series_id", "step"]).reset_index(drop=True)

    # post process
    oof = oof[oof["score"] > 0.005]
    oof["step"] = oof["step"].astype(int)
    groups = [group for _, group in oof.groupby("series_id")]
    with Pool(30) as p:  
        results = list(tqdm(p.imap(dynamic_range_nms, groups), total=len(groups)))
    sub = pd.concat(results)
    sub["score"] = sub["reduced_score"]
    
    labels = pd.read_csv(f"{CFG['dataset']['competition_dir']}/train_events.csv").dropna()
    labels = labels[labels["series_id"].isin(sub["series_id"].unique())]
    score, ap_table = compute_comptetition_metric(labels, sub)

    return score

In [6]:
import optuna

study = optuna.create_study(direction='maximize')
study.enqueue_trial({
    'reg_alpha': 0.02,
    'reg_lambda': 0.2,
    'max_depth': 5,
    'colsample_bytree': 0.7,
    'subsample': 1.0,
})
study.optimize(objective, n_trials=200)

print(f'best_params: {study.best_params}')
print(f'best_value: {study.best_value}')
display(study.trials_dataframe().sort_values("value", ascending=False).head(10))
study.trials_dataframe().to_csv(os.path.join(CFG["output_dir"], "xgb_optuna_result.csv"), index=False)

[32m[I 2023-11-09 03:12:38,985][0m A new study created in memory with name: no-name-b2f46625-f56f-40a2-8d13-402a0f419556[0m
100%|██████████| 55/55 [00:01<00:00, 51.07it/s]
[32m[I 2023-11-09 03:13:28,723][0m Trial 0 finished with value: 0.7659111532210794 and parameters: {'reg_alpha': 0.02, 'reg_lambda': 0.2, 'max_depth': 5, 'colsample_bytree': 0.7, 'subsample': 1.0}. Best is trial 0 with value: 0.7659111532210794.[0m
100%|██████████| 55/55 [00:01<00:00, 45.88it/s]
[32m[I 2023-11-09 03:14:27,130][0m Trial 1 finished with value: 0.764807811197652 and parameters: {'reg_alpha': 7.754404731139009, 'reg_lambda': 144.24245511270348, 'max_depth': 8, 'colsample_bytree': 0.693819072509801, 'subsample': 0.31360995818183257}. Best is trial 0 with value: 0.7659111532210794.[0m
100%|██████████| 55/55 [00:01<00:00, 42.88it/s]
[32m[I 2023-11-09 03:17:02,769][0m Trial 2 finished with value: 0.7692462208521783 and parameters: {'reg_alpha': 361.25362278170536, 'reg_lambda': 11.581619579200968,

KeyboardInterrupt: 

```
[32m[I 2023-11-09 03:12:38,985][0m A new study created in memory with name: no-name-b2f46625-f56f-40a2-8d13-402a0f419556[0m
100%|██████████| 55/55 [00:01<00:00, 51.07it/s]
[32m[I 2023-11-09 03:13:28,723][0m Trial 0 finished with value: 0.7659111532210794 and parameters: {'reg_alpha': 0.02, 'reg_lambda': 0.2, 'max_depth': 5, 'colsample_bytree': 0.7, 'subsample': 1.0}. Best is trial 0 with value: 0.7659111532210794.[0m
100%|██████████| 55/55 [00:01<00:00, 45.88it/s]
[32m[I 2023-11-09 03:14:27,130][0m Trial 1 finished with value: 0.764807811197652 and parameters: {'reg_alpha': 7.754404731139009, 'reg_lambda': 144.24245511270348, 'max_depth': 8, 'colsample_bytree': 0.693819072509801, 'subsample': 0.31360995818183257}. Best is trial 0 with value: 0.7659111532210794.[0m
100%|██████████| 55/55 [00:01<00:00, 42.88it/s]
[32m[I 2023-11-09 03:17:02,769][0m Trial 2 finished with value: 0.7692462208521783 and parameters: {'reg_alpha': 361.25362278170536, 'reg_lambda': 11.581619579200968, 'max_depth': 4, 'colsample_bytree': 0.6372777034755515, 'subsample': 0.31160411711723834}. Best is trial 2 with value: 0.7692462208521783.[0m
100%|██████████| 55/55 [00:01<00:00, 44.32it/s]
[32m[I 2023-11-09 03:18:07,273][0m Trial 3 finished with value: 0.7722767009515699 and parameters: {'reg_alpha': 0.0010214278592556361, 'reg_lambda': 431.3715764353598, 'max_depth': 9, 'colsample_bytree': 0.3463000143186279, 'subsample': 0.4139444869474824}. Best is trial 3 with value: 0.7722767009515699.[0m
100%|██████████| 55/55 [00:01<00:00, 43.64it/s]
[32m[I 2023-11-09 03:19:01,345][0m Trial 4 finished with value: 0.7663349070284671 and parameters: {'reg_alpha': 61.10849499835949, 'reg_lambda': 3.295781510750199, 'max_depth': 5, 'colsample_bytree': 0.6332257717839725, 'subsample': 0.27646278043325123}. Best is trial 3 with value: 0.7722767009515699.[0m
100%|██████████| 55/55 [00:01<00:00, 44.68it/s]
[32m[I 2023-11-09 03:21:02,238][0m Trial 5 finished with value: 0.7765971729688774 and parameters: {'reg_alpha': 647.315233130446, 'reg_lambda': 0.002504317907017841, 'max_depth': 5, 'colsample_bytree': 0.6855170818614766, 'subsample': 0.7074097251040323}. Best is trial 5 with value: 0.7765971729688774.[0m
100%|██████████| 55/55 [00:01<00:00, 50.46it/s]
[32m[I 2023-11-09 03:21:57,128][0m Trial 6 finished with value: 0.7734747943331782 and parameters: {'reg_alpha': 0.0066459777206784795, 'reg_lambda': 9.701079996613258, 'max_depth': 9, 'colsample_bytree': 0.3435982010212464, 'subsample': 0.6108403906449434}. Best is trial 5 with value: 0.7765971729688774.[0m
100%|██████████| 55/55 [00:01<00:00, 49.65it/s]
[32m[I 2023-11-09 03:23:07,081][0m Trial 7 finished with value: 0.7627834812646899 and parameters: {'reg_alpha': 0.08995522825349633, 'reg_lambda': 11.225805856131577, 'max_depth': 10, 'colsample_bytree': 0.687608808269947, 'subsample': 0.21106949132875386}. Best is trial 5 with value: 0.7765971729688774.[0m
100%|██████████| 55/55 [00:01<00:00, 45.17it/s]
[32m[I 2023-11-09 03:24:22,141][0m Trial 8 finished with value: 0.7751016993184126 and parameters: {'reg_alpha': 6.128668695535251, 'reg_lambda': 128.62372384941955, 'max_depth': 10, 'colsample_bytree': 0.5517042881181866, 'subsample': 0.517949516829534}. Best is trial 5 with value: 0.7765971729688774.[0m
100%|██████████| 55/55 [00:01<00:00, 44.71it/s]
[32m[I 2023-11-09 03:25:47,417][0m Trial 9 finished with value: 0.7646399135012059 and parameters: {'reg_alpha': 0.003114711892725157, 'reg_lambda': 22.21106763867474, 'max_depth': 3, 'colsample_bytree': 0.5680066142762996, 'subsample': 0.6568445849173845}. Best is trial 5 with value: 0.7765971729688774.[0m
100%|██████████| 55/55 [00:01<00:00, 45.57it/s]
[32m[I 2023-11-09 03:27:31,249][0m Trial 10 finished with value: 0.77992148704883 and parameters: {'reg_alpha': 646.9540084235595, 'reg_lambda': 0.0019445318504395892, 'max_depth': 7, 'colsample_bytree': 0.45872479696247925, 'subsample': 0.7323540248936015}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 46.15it/s]
[32m[I 2023-11-09 03:29:01,892][0m Trial 11 finished with value: 0.7767068751313824 and parameters: {'reg_alpha': 818.3706284231081, 'reg_lambda': 0.0011620517847494576, 'max_depth': 7, 'colsample_bytree': 0.7852536763372584, 'subsample': 0.7312742103116459}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 45.56it/s]
[32m[I 2023-11-09 03:30:34,762][0m Trial 12 finished with value: 0.7783421582991752 and parameters: {'reg_alpha': 609.0209739686968, 'reg_lambda': 0.0012784881051895653, 'max_depth': 7, 'colsample_bytree': 0.7716653154458131, 'subsample': 0.7535747757020815}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 49.05it/s]
[32m[I 2023-11-09 03:31:31,434][0m Trial 13 finished with value: 0.7766044069085027 and parameters: {'reg_alpha': 94.02337261008222, 'reg_lambda': 0.011428414657265068, 'max_depth': 7, 'colsample_bytree': 0.44952329070843555, 'subsample': 0.7930793182096213}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 50.56it/s]
[32m[I 2023-11-09 03:32:20,478][0m Trial 14 finished with value: 0.7685724457191525 and parameters: {'reg_alpha': 50.37232482749595, 'reg_lambda': 0.019042672594285638, 'max_depth': 6, 'colsample_bytree': 0.4347850158692836, 'subsample': 0.5725117912297522}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 49.76it/s]
[32m[I 2023-11-09 03:33:07,574][0m Trial 15 finished with value: 0.7553366741622113 and parameters: {'reg_alpha': 0.4457208615653316, 'reg_lambda': 0.17779309375669705, 'max_depth': 8, 'colsample_bytree': 0.24693407327593966, 'subsample': 0.4779689512049458}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 45.61it/s]
[32m[I 2023-11-09 03:35:32,309][0m Trial 16 finished with value: 0.7741613672128598 and parameters: {'reg_alpha': 926.5329486384549, 'reg_lambda': 0.0011779227538626785, 'max_depth': 6, 'colsample_bytree': 0.798186724218621, 'subsample': 0.7826023535940081}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 49.48it/s]
[32m[I 2023-11-09 03:36:25,255][0m Trial 17 finished with value: 0.7682674372317964 and parameters: {'reg_alpha': 10.703548111461213, 'reg_lambda': 0.007363922173731001, 'max_depth': 8, 'colsample_bytree': 0.5064812974144216, 'subsample': 0.6519265348159572}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 49.43it/s]
[32m[I 2023-11-09 03:37:20,544][0m Trial 18 finished with value: 0.77510130130397 and parameters: {'reg_alpha': 138.77695140308919, 'reg_lambda': 0.04015518799051069, 'max_depth': 7, 'colsample_bytree': 0.5691982193039433, 'subsample': 0.5594675608916664}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 44.22it/s]
[32m[I 2023-11-09 03:38:27,220][0m Trial 19 finished with value: 0.7707833153555865 and parameters: {'reg_alpha': 197.44813038356648, 'reg_lambda': 0.004640748696295598, 'max_depth': 6, 'colsample_bytree': 0.5010708032483168, 'subsample': 0.4324690049380608}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 50.80it/s]
[32m[I 2023-11-09 03:39:27,075][0m Trial 20 finished with value: 0.7675061154088376 and parameters: {'reg_alpha': 21.856390042417583, 'reg_lambda': 0.04673395012527677, 'max_depth': 9, 'colsample_bytree': 0.43856573792551495, 'subsample': 0.6356692886168362}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 44.17it/s]
[32m[I 2023-11-09 03:40:57,125][0m Trial 21 finished with value: 0.7774420997363622 and parameters: {'reg_alpha': 830.6979330942446, 'reg_lambda': 0.001032121954773389, 'max_depth': 7, 'colsample_bytree': 0.7753014699132922, 'subsample': 0.7177287225881526}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 49.78it/s]
[32m[I 2023-11-09 03:41:53,970][0m Trial 22 finished with value: 0.7724517066524975 and parameters: {'reg_alpha': 197.88039011810952, 'reg_lambda': 0.002321105825098142, 'max_depth': 7, 'colsample_bytree': 0.7476790173109986, 'subsample': 0.7005214452389613}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 44.50it/s]
[32m[I 2023-11-09 03:43:04,769][0m Trial 23 finished with value: 0.776346772044944 and parameters: {'reg_alpha': 917.9925838861417, 'reg_lambda': 0.0010094535792412166, 'max_depth': 8, 'colsample_bytree': 0.7585398022881457, 'subsample': 0.7870758250145581}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 44.01it/s]
[32m[I 2023-11-09 03:44:17,664][0m Trial 24 finished with value: 0.772186609684403 and parameters: {'reg_alpha': 323.7639420066721, 'reg_lambda': 0.004583736117068223, 'max_depth': 6, 'colsample_bytree': 0.7504736752211844, 'subsample': 0.6842417598856664}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 43.81it/s]
[32m[I 2023-11-09 03:45:08,457][0m Trial 25 finished with value: 0.7695136970921159 and parameters: {'reg_alpha': 33.77400702904948, 'reg_lambda': 0.011629410199016465, 'max_depth': 7, 'colsample_bytree': 0.799692039310626, 'subsample': 0.612716315229106}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 47.26it/s]
[32m[I 2023-11-09 03:46:06,253][0m Trial 26 finished with value: 0.7765470380001004 and parameters: {'reg_alpha': 139.71079017429275, 'reg_lambda': 0.0031106383136578018, 'max_depth': 8, 'colsample_bytree': 0.6438131617717605, 'subsample': 0.7413845941113374}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 44.67it/s]
[32m[I 2023-11-09 03:47:32,716][0m Trial 27 finished with value: 0.7727453477979149 and parameters: {'reg_alpha': 361.18373307788914, 'reg_lambda': 0.005731052386766636, 'max_depth': 5, 'colsample_bytree': 0.7338201661069816, 'subsample': 0.7404579110654198}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 43.65it/s]
[32m[I 2023-11-09 03:48:17,837][0m Trial 28 finished with value: 0.7646853325038934 and parameters: {'reg_alpha': 2.696571080779577, 'reg_lambda': 0.0010708948015336638, 'max_depth': 6, 'colsample_bytree': 0.6144102284003101, 'subsample': 0.6716674787112018}. Best is trial 10 with value: 0.77992148704883.[0m
100%|██████████| 55/55 [00:01<00:00, 44.67it/s]
[32m[I 2023-11-09 03:49:11,271][0m Trial 29 finished with value: 0.7713013017373458 and parameters: {'reg_alpha': 75.57130005285961, 'reg_lambda': 0.24830546381957905, 'max_depth': 7, 'colsample_bytree': 0.59959913204779, 'subsample': 0.7972876104127654}. Best is trial 10 with value: 0.77992148704883.[0m
[33m[W 2023-11-09 03:49:14,901][0m Trial 30 failed with parameters: {'reg_alpha': 34.03466898942686, 'reg_lambda': 0.024198712356986025, 'max_depth': 4, 'colsample_bytree': 0.7157955412751139, 'subsample': 0.7465073649869591} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_101112/1424116022.py", line 22, in objective
    trn_oof, models = fit_xgb(
  File "/kaggle/src/utils/xgb.py", line 36, in fit_xgb
    dtrain = xgb.DMatrix(X[features][trn_idx], label=y[trn_idx], enable_categorical=True)
  File "/opt/conda/lib/python3.10/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/opt/conda/lib/python3.10/site-packages/xgboost/core.py", line 743, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
  File "/opt/conda/lib/python3.10/site-packages/xgboost/data.py", line 970, in dispatch_data_backend
    return _from_pandas_df(data, enable_categorical, missing, threads,
  File "/opt/conda/lib/python3.10/site-packages/xgboost/data.py", line 420, in _from_pandas_df
    return _from_numpy_array(data, missing, nthread, feature_names, feature_types)
  File "/opt/conda/lib/python3.10/site-packages/xgboost/data.py", line 214, in _from_numpy_array
    _LIB.XGDMatrixCreateFromDense(
KeyboardInterrupt
[33m[W 2023-11-09 03:49:14,906][0m Trial 30 failed with value None.[0m
```