In [1]:
import gc
import os
import pickle
import shutil
import sys
import warnings
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
import yaml
from tqdm import tqdm
warnings.filterwarnings('ignore')

sys.path.append('/kaggle/src')
from utils.xgb import fit_xgb, inference_xgb, plot_importances
from utils.metric import compute_comptetition_metric
from utils.postprocess import post_process
from utils.set_seed import seed_base
from feature_engineering.stage1 import generate_1st_stage_features

# Load the experiment configuration. Path.read_text() opens and closes the file
# itself, so no file handle is left dangling (a bare open(...) passed straight to
# yaml.safe_load is never closed explicitly).
PACKAGE_DIR = Path("/kaggle/src")
CFG = yaml.safe_load((PACKAGE_DIR / "config.yaml").read_text())
print(CFG["1st_stage"]["execution"]["exp_id"])

# Every run starts from an empty, experiment-specific output directory.
# shutil.rmtree(..., ignore_errors=True) is a no-op when the directory does not
# exist yet (first run of a new exp_id) and, unlike a `!rm -r {path}` shell line,
# does not pass the path through a shell.
CFG["output_dir"] = f"/kaggle/output/{CFG['1st_stage']['execution']['exp_id']}"
shutil.rmtree(CFG["output_dir"], ignore_errors=True)
os.makedirs(CFG["output_dir"], exist_ok=True)

# seed_base comes from utils.set_seed; presumably it seeds the RNGs used below
# (TODO confirm against its implementation).
seed_base(CFG["env"]["seed"])

2023-10-10 16:15:31.795349: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-10 16:15:32.230783: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-10 16:15:33.634044: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/u

exp_076


In [2]:
# Feature generation.
# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps the
# order in which the series are processed identical across runs and machines.
files = sorted(glob(f"{CFG['dataset']['step_csv_dir']}/*.csv"))
if not files:
    # Fail here with a clear message instead of deep inside feature generation.
    raise FileNotFoundError(f"no step CSV files found in {CFG['dataset']['step_csv_dir']}")
train, features = generate_1st_stage_features(files, downsample_rate=CFG["1st_stage"]["execution"]["downsample_rate"])

# Merge with the CV split: each row receives the fold of its series_id.
cv_split = pd.read_csv(CFG['dataset']['cv_split_path'])
train["fold"] = train["series_id"].map(cv_split.set_index("series_id")["fold"])

# A series that is absent from the split file gets a NaN fold, which would only
# surface later as an opaque error when the fold column is cast to int.
series_without_fold = train.loc[train["fold"].isna(), "series_id"].unique()
assert len(series_without_fold) == 0, f"series missing from cv split: {list(series_without_fold)[:5]}"
print(train.shape)

generate features: 100%|██████████| 277/277 [06:34<00:00,  1.42s/it]


(6397317, 208)


In [3]:
# Training: fit_xgb receives the full training frame together with the target
# column, the integer fold assignment, the feature list and the XGBoost
# parameters from the config. It returns `trn_oof` and the fitted `models`.
trn_oof, models = fit_xgb(
    params=CFG["1st_stage"]["xgboost"],
    es_rounds=100,
    features=features.all_features(),
    folds=train["fold"].astype(int),
    y=train["target"],
    X=train,
)

# Persist one model file per entry of `models`, numbered by position.
for fold_idx, booster in enumerate(models):
    model_path = os.path.join(CFG["output_dir"], f'xgb_fold{fold_idx}.model')
    booster.save_model(model_path)

# Persist the feature definition object next to the models.
features_path = os.path.join(CFG["output_dir"], "features.pkl")
with open(features_path, "wb") as features_file:
    features_file.write(pickle.dumps(features))

# The training frame is no longer needed in this cell; release it.
del train
gc.collect()

  0%|          | 0/5 [00:00<?, ?it/s]

== fold 0 ==
[0]	eval-logloss:0.61108
[100]	eval-logloss:0.09822
[200]	eval-logloss:0.09895
[239]	eval-logloss:0.09923


 20%|██        | 1/5 [00:43<02:54, 43.53s/it]

== fold 1 ==
[0]	eval-logloss:0.61122
[100]	eval-logloss:0.08163
[200]	eval-logloss:0.08097
[228]	eval-logloss:0.08104


 40%|████      | 2/5 [01:20<01:59, 39.76s/it]

== fold 2 ==
[0]	eval-logloss:0.61074
[100]	eval-logloss:0.08441
[200]	eval-logloss:0.08389
[250]	eval-logloss:0.08400


 60%|██████    | 3/5 [01:55<01:14, 37.36s/it]

== fold 3 ==
[0]	eval-logloss:0.61145
[100]	eval-logloss:0.09297
[200]	eval-logloss:0.09231
[268]	eval-logloss:0.09236


 80%|████████  | 4/5 [02:31<00:37, 37.09s/it]

== fold 4 ==
[0]	eval-logloss:0.61185
[100]	eval-logloss:0.10486
[200]	eval-logloss:0.10439
[286]	eval-logloss:0.10514


100%|██████████| 5/5 [03:08<00:00, 37.79s/it]


0

In [5]:
# Out-of-fold inference.
# For each fold, features are regenerated for that fold's series (without the
# downsample_rate argument that the training cell passes, so the function's
# default applies) and scored with the model stored at that fold's position.
SERIES_PER_CHUNK = 8  # series whose features are built and scored together
OOF_COLUMNS = ["series_id", "step", "oof", "target"]

dfs = []
for fold, fold_df in cv_split.groupby("fold"):
    # Only the ids are needed; reading them into an array avoids resetting the
    # index of the groupby slice in place.
    fold_series_ids = fold_df["series_id"].to_numpy()
    # int(...) keeps the list lookup valid even if the fold column was parsed
    # as a float or numpy scalar.
    model = models[int(fold)]

    for start in tqdm(range(0, len(fold_series_ids), SERIES_PER_CHUNK)):
        # Slicing past the end is safe, so the last chunk is simply shorter.
        series_ids = fold_series_ids[start:start + SERIES_PER_CHUNK]
        chunk_files = [f"{CFG['dataset']['step_csv_dir']}/{series_id}.csv" for series_id in series_ids]

        # Feature generation for this chunk. Separate names are used so that the
        # notebook-level `features` and `files` objects are not overwritten.
        chunk, chunk_features = generate_1st_stage_features(chunk_files, pbar=False)

        # Inference with the single model of the current fold.
        chunk["oof"] = inference_xgb([model], chunk[chunk_features.all_features()])

        # Keep only the columns that are saved; the wide feature frame can then
        # be freed as soon as the next chunk is generated.
        dfs.append(chunk[OOF_COLUMNS])

train = pd.concat(dfs, axis=0, ignore_index=True)
del dfs
gc.collect()

# Save the out-of-fold predictions.
train.to_parquet(os.path.join(CFG["output_dir"], "oof.parquet"))

100%|██████████| 7/7 [03:53<00:00, 33.35s/it]
100%|██████████| 7/7 [03:46<00:00, 32.40s/it]
100%|██████████| 7/7 [03:52<00:00, 33.28s/it]
100%|██████████| 7/7 [03:50<00:00, 32.93s/it]
100%|██████████| 7/7 [04:00<00:00, 34.41s/it]


In [6]:
# Post-processing: post_process (utils.postprocess) is applied to the OOF frame
# and its result is written out as submission.csv.
sub = post_process(train)
submission_path = os.path.join(CFG["output_dir"], "submission.csv")
sub.to_csv(submission_path, index=False)

# Score computation against the event labels; rows with any missing value are
# dropped before scoring.
events_path = f"{CFG['dataset']['competition_dir']}/train_events.csv"
labels = pd.read_csv(events_path).dropna()
score, ap_table = compute_comptetition_metric(labels, sub)
print(f"score: {score:.4f}")
display(ap_table)

Processing series:   0%|          | 0/277 [00:00<?, ?it/s]

score: 0.5224


event   tolerance
onset   12           0.008861
        36           0.091589
        60           0.267211
        90           0.477007
        120          0.593797
        150          0.647902
        180          0.677465
        240          0.707337
        300          0.734636
        360          0.750056
wakeup  12           0.021398
        36           0.193058
        60           0.371167
        90           0.532530
        120          0.632525
        150          0.686993
        180          0.715681
        240          0.752416
        300          0.781686
        360          0.804758
dtype: float64

In [7]:
# # Determine the next-stage candidates
# next_cand_size = 0
# count = 0
# next_dict = {}
# for series_id, train_df in train.groupby("series_id"):
#     train_df = train_df[(train_df["oof"] >= 0.1) & (train_df["oof"] <= 0.9)]
#     sub_df = sub[(sub["series_id"] == series_id)]
#     label_df = labels[labels["series_id"] == series_id]
#     pred_steps = train_df["step"].values
#     sub_steps = sub_df["step"].values
#     label_steps = label_df["step"].values

#     if len(train_df) == 0:
#         continue
#     next_cand = np.zeros(int(max(max(pred_steps if len(pred_steps) > 0 else [0]), max(sub_steps if len(sub_steps) > 0 else [0]))) + CFG["feature"]["agg_freq"])
#     for sub_step in sub_steps:
#         next_cand[int(sub_step - CFG["feature"]["agg_freq"] * 10): int(sub_step + CFG["feature"]["agg_freq"] * 10)] = 1
#     for pred_step in pred_steps:
#         next_cand[int(pred_step - CFG["feature"]["agg_freq"] * 10): int(pred_step + CFG["feature"]["agg_freq"] * 10)] = 1
#     next_cand_size += np.sum(next_cand)
#     next_dict[series_id] = np.where(next_cand)[0]

#     for label_step in label_steps:
#         if label_step < next_cand.shape[0]:
#             count += next_cand[int(label_step)]
    
# recall = count / len(labels)
# print(f"recall: {recall:.4f}")
# print(f"next_cand_size: {next_cand_size}")

# with open(f"{CFG['output_dir']}/next_cands.pkl", "wb") as f:
#     pickle.dump(next_dict, f)

In [8]:
# Pass the fitted models to plot_importances (utils.xgb) together with the path
# of the PNG inside the experiment's output directory; keep whatever it returns.
importances_png = f"{CFG['output_dir']}/importances.png"
importances = plot_importances(models, save_path=importances_png)