In [1]:
import numpy as np
import pandas as pd
import yaml
import sys
import os
import pickle
from glob import glob
from pathlib import Path
import pickle
import warnings
warnings.filterwarnings('ignore')

sys.path.append('/kaggle/src')
from utils.xgb import fit_xgb
from utils.metric import compute_comptetition_metric
from utils.postprocess import post_process
from utils.set_seed import seed_base
from feature_engineering.stage1 import generate_1st_stage_features

PACKAGE_DIR = Path("/kaggle/src")
CFG = yaml.safe_load(open(PACKAGE_DIR / "config.yaml", "r"))
print(CFG["1st_stage"]["execution"]["exp_id"])

CFG["output_dir"] = f"/kaggle/output/{CFG['1st_stage']['execution']['exp_id']}"
!rm -r {CFG["output_dir"]}
os.makedirs(CFG["output_dir"], exist_ok=True)

seed_base(CFG["env"]["seed"])

2023-10-01 03:42:42.685456: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-01 03:42:42.771458: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-01 03:42:43.246552: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/u

exp_047


In [2]:
# 特徴生成
files = glob(f"{CFG['dataset']['step_csv_dir']}/*.csv")
train, features = generate_1st_stage_features(files)

# cv splitとマージ
cv_split = pd.read_csv(CFG['dataset']['cv_split_path'])
train["fold"] = train["series_id"].map(cv_split.set_index("series_id")["fold"])
display(train.head(5))
print(train.shape)

generate features: 100%|██████████| 277/277 [06:37<00:00,  1.44s/it]


Unnamed: 0,anglez_diff_abs,anglez_diff_abs_rolling_max_10,anglez_diff_abs_rolling_max_100,anglez_diff_abs_rolling_max_1000,anglez_diff_abs_rolling_max_50,anglez_diff_abs_rolling_mean_10,anglez_diff_abs_rolling_mean_100,anglez_diff_abs_rolling_mean_1000,anglez_diff_abs_rolling_mean_50,anglez_diff_abs_rolling_median_10,...,enmo_rolling_square_mean_50,enmo_rolling_std_10,enmo_rolling_std_100,enmo_rolling_std_1000,enmo_rolling_std_50,total_seconds,target,step,series_id,fold
0,0.006461,0.0432,,,,0.007771,,,,0.000428,...,,0.0,,,,40557.5,1,11.5,af91d9a50547,3
1,0.000117,0.000833,,,0.0474,0.000192,,,0.002144,4e-06,...,0.0,0.0,,,0.0,40677.5,1,35.5,af91d9a50547,3
2,0.000767,0.005475,0.5969,,0.457854,0.001198,0.022684,,0.024919,0.000319,...,3e-06,0.0,0.00146,,0.001634,40797.5,1,59.5,af91d9a50547,3
3,0.083833,0.347392,0.5969,,0.5969,0.083032,0.025412,,0.042692,0.036481,...,4e-06,0.001924,0.00146,,0.002064,40917.5,1,83.5,af91d9a50547,3
4,0.014787,0.048846,0.5969,,0.309038,0.012615,0.02846,,0.028898,0.00191,...,1e-06,0.0,0.00146,,0.000604,41037.5,1,107.5,af91d9a50547,3


(5331163, 203)


In [3]:
# tuning
import optuna
from sklearn.metrics import log_loss

def objective(trial):
    CFG["1st_stage"]["xgboost"]["learning_rate"] = 0.1
    CFG["1st_stage"]["xgboost"]["max_depth"] = trial.suggest_int('max_depth', 3, 10)
    CFG["1st_stage"]["xgboost"]["colsample_bytree"] = trial.suggest_uniform('colsample_bytree', 0.2, 0.8)
    CFG["1st_stage"]["xgboost"]["reg_alpha"] = trial.suggest_loguniform('reg_alpha', 1e-3, 1e3)
    CFG["1st_stage"]["xgboost"]["reg_lambda"] = trial.suggest_loguniform('reg_lambda', 1e-3, 1e3)

    trn_oof, models = fit_xgb(
        X=train, 
        y=train["target"], 
        folds=train["fold"].astype(int), 
        features=features.all_features(),
        params=CFG["1st_stage"]["xgboost"], 
        es_rounds=100,
        verbose=False,
        log=False,
    )
    train["oof"] = trn_oof
    score = log_loss(train['target'], train["oof"])
    return score

# 最適化
study = optuna.create_study(direction='minimize')
study.enqueue_trial({
    'max_depth': 5,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.02,
    'reg_lambda': 0.2,
})
study.optimize(objective, n_trials=100)

# 結果
print(f'best_params: {study.best_params}')
print(f'best_value: {study.best_value}')
display(study.trials_dataframe().sort_values(
    "value", ascending=False).head(10))
study.trials_dataframe().to_csv(os.path.join(CFG["output_dir"], "xgb_optuna_result.csv"), index=False)

# 割り当て
CFG["1st_stage"]["xgboost"]["learning_rate"] = 0.01
CFG["1st_stage"]["xgboost"]["max_depth"] = study.best_params["max_depth"]
CFG["1st_stage"]["xgboost"]["colsample_bytree"] = study.best_params["colsample_bytree"]
CFG["1st_stage"]["xgboost"]["reg_alpha"] = study.best_params["reg_alpha"]
CFG["1st_stage"]["xgboost"]["reg_lambda"] = study.best_params["reg_lambda"]

[32m[I 2023-10-01 03:49:44,180][0m A new study created in memory with name: no-name-b05d9f88-5b17-4372-857d-a93ad01711a0[0m
[32m[I 2023-10-01 03:52:15,181][0m Trial 0 finished with value: 0.09244305608856712 and parameters: {'max_depth': 5, 'colsample_bytree': 0.7, 'reg_alpha': 0.02, 'reg_lambda': 0.2}. Best is trial 0 with value: 0.09244305608856712.[0m
[32m[I 2023-10-01 03:55:31,148][0m Trial 1 finished with value: 0.09218542563732607 and parameters: {'max_depth': 4, 'colsample_bytree': 0.277057055539128, 'reg_alpha': 126.23934691423631, 'reg_lambda': 0.7442677006551333}. Best is trial 1 with value: 0.09218542563732607.[0m
[32m[I 2023-10-01 03:58:31,149][0m Trial 2 finished with value: 0.09234247607843528 and parameters: {'max_depth': 4, 'colsample_bytree': 0.39569896851093117, 'reg_alpha': 1.416051623766434, 'reg_lambda': 6.735194491802836}. Best is trial 1 with value: 0.09218542563732607.[0m
[32m[I 2023-10-01 04:01:11,088][0m Trial 3 finished with value: 0.09241811207

: 

In [None]:
# 学習
trn_oof, models = fit_xgb(
    X=train, 
    y=train["target"], 
    folds=train["fold"].astype(int), 
    features=features.all_features(),
    params=CFG["1st_stage"]["xgboost"], 
    es_rounds=100,
)
train["oof"] = trn_oof
train = train.drop(columns=features.all_features())
train = train.sort_values(["series_id", "step"]).reset_index(drop=True)

# 保存
for i, model in enumerate(models):
    model.save_model(os.path.join(CFG["output_dir"], f'xgb_fold{i}.model'))
with open(os.path.join(CFG["output_dir"], "features.pkl"), "wb") as f:
    pickle.dump(features, f)
train.to_csv(os.path.join(CFG["output_dir"], "oof.csv"), index=False)

  0%|          | 0/5 [00:00<?, ?it/s]

== fold 0 ==
[0]	eval-logloss:0.61107
[100]	eval-logloss:0.09862
[197]	eval-logloss:0.09901


 20%|██        | 1/5 [00:32<02:10, 32.60s/it]

== fold 1 ==
[0]	eval-logloss:0.61144
[100]	eval-logloss:0.08143
[200]	eval-logloss:0.08167
[231]	eval-logloss:0.08174


 40%|████      | 2/5 [01:02<01:33, 31.03s/it]

== fold 2 ==
[0]	eval-logloss:0.61098
[100]	eval-logloss:0.08480
[200]	eval-logloss:0.08403
[300]	eval-logloss:0.08427
[320]	eval-logloss:0.08447


 60%|██████    | 3/5 [01:36<01:04, 32.18s/it]

== fold 3 ==
[0]	eval-logloss:0.61126
[100]	eval-logloss:0.09344
[200]	eval-logloss:0.09207
[276]	eval-logloss:0.09270


 80%|████████  | 4/5 [02:07<00:31, 31.74s/it]

== fold 4 ==
[0]	eval-logloss:0.61197
[100]	eval-logloss:0.10467
[200]	eval-logloss:0.10409
[284]	eval-logloss:0.10402


100%|██████████| 5/5 [02:37<00:00, 31.48s/it]


In [None]:
# 後処理
sub = post_process(train)
sub.to_csv(os.path.join(CFG["output_dir"], "submission.csv"), index=False)

# スコア計算
labels = pd.read_csv(f"{CFG['dataset']['competition_dir']}/train_events.csv").dropna()
score, ap_table = compute_comptetition_metric(labels, sub)
print(f"score: {score:.4f}")
display(ap_table)

post process: 100%|██████████| 277/277 [00:46<00:00,  5.98it/s]


score: 0.5143


event   tolerance
onset   12           0.007410
        36           0.063323
        60           0.180587
        90           0.356458
        120          0.539562
        150          0.616559
        180          0.676332
        240          0.729481
        300          0.763573
        360          0.778655
wakeup  12           0.025611
        36           0.183351
        60           0.365586
        90           0.530580
        120          0.618199
        150          0.685890
        180          0.730202
        240          0.787793
        300          0.813421
        360          0.832656
dtype: float64

In [None]:
# 次の候補を決定
next_cand_size = 0
count = 0
next_dict = {}
for series_id, train_df in train.groupby("series_id"):
    train_df = train_df[(train_df["oof"] >= 0.1) & (train_df["oof"] <= 0.9)]
    sub_df = sub[(sub["series_id"] == series_id)]
    label_df = labels[labels["series_id"] == series_id]
    pred_steps = train_df["step"].values
    sub_steps = sub_df["step"].values
    label_steps = label_df["step"].values

    if len(train_df) == 0:
        continue
    next_cand = np.zeros(int(max(max(pred_steps if len(pred_steps) > 0 else [0]), max(sub_steps if len(sub_steps) > 0 else [0]))) + CFG["feature"]["agg_freq"])
    for sub_step in sub_steps:
        next_cand[int(sub_step - CFG["feature"]["agg_freq"] * 10): int(sub_step + CFG["feature"]["agg_freq"] * 10)] = 1
    for pred_step in pred_steps:
        next_cand[int(pred_step - CFG["feature"]["agg_freq"] * 10): int(pred_step + CFG["feature"]["agg_freq"] * 10)] = 1
    next_cand_size += np.sum(next_cand)
    next_dict[series_id] = np.where(next_cand)[0]

    for label_step in label_steps:
        if label_step < next_cand.shape[0]:
            count += next_cand[int(label_step)]
    
recall = count / len(labels)
print(f"recall: {recall:.4f}")
print(f"next_cand_size: {next_cand_size}")

with open(f"{CFG['output_dir']}/next_cands.pkl", "wb") as f:
    pickle.dump(next_dict, f)

recall: 0.9729
next_cand_size: 21062664.0
