In [1]:
import numpy as np
import pandas as pd
import yaml
import sys
import os
import pickle
from glob import glob
from pathlib import Path
from tqdm import tqdm
import pickle
import gc
import warnings
warnings.filterwarnings('ignore')

sys.path.append('/kaggle/src')
from utils.xgb import fit_xgb, inference_xgb, plot_importances
from utils.metric import compute_comptetition_metric
from utils.postprocess import post_process
from utils.set_seed import seed_base
from feature_engineering.stage2 import generate_2nd_stage_features

# Load the experiment configuration shipped alongside the source package.
# A context manager is used so the file handle is closed deterministically.
PACKAGE_DIR = Path("/kaggle/src")
with open(PACKAGE_DIR / "config.yaml", "r") as config_file:
    CFG = yaml.safe_load(config_file)
print(CFG["2nd_stage"]["execution"]["exp_id"])

# Start every run from an empty output directory named after the experiment id.
# shutil.rmtree replaces the `!rm -r` shell magic: no shell interpolation of the
# path, no error output when the directory does not exist yet, and the cell
# stays plain Python.
import shutil

CFG["output_dir"] = f"/kaggle/output/{CFG['2nd_stage']['execution']['exp_id']}"
shutil.rmtree(CFG["output_dir"], ignore_errors=True)
os.makedirs(CFG["output_dir"], exist_ok=True)

# Seed with the value stored in the config (seed_base is a project helper).
seed_base(CFG["env"]["seed"])

2023-10-11 00:39:59.560835: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-11 00:39:59.952975: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-11 00:40:01.213740: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/u

exp_077


In [2]:
# Load the `next_cands.pkl` artifact written under the best 1st-stage experiment's
# output directory.
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts
# produced by a trusted run of this pipeline.
best_first_stage_exp = CFG["1st_stage"]["execution"]["best_exp_id"]
cand_path = os.path.join("/kaggle/output", best_first_stage_exp, "next_cands.pkl")
with open(cand_path, "rb") as cand_file:
    next_cands = pickle.load(cand_file)

In [3]:
# Feature generation.
# glob() returns paths in arbitrary (filesystem-dependent) order, so the list is
# sorted before truncation to make the 50-file subset reproducible across runs.
# NOTE(review): the [:50] cap looks like a quick-iteration limit -- confirm
# whether the full file list should be used for a real run.
files = sorted(glob(f"{CFG['dataset']['step_csv_dir']}/*.csv"))[:50]
train, features = generate_2nd_stage_features(files, downsample_rate=CFG["2nd_stage"]["execution"]["downsample_rate"])

# Attach the CV fold of each series by looking it up in the split file.
cv_split = pd.read_csv(CFG['dataset']['cv_split_path'])
fold_by_series = cv_split.set_index("series_id")["fold"]
train["fold"] = train["series_id"].map(fold_by_series)

# A series missing from the split file yields a NaN fold, which would otherwise
# only surface later as an opaque integer-cast error; fail here with the ids.
unmapped_series = train.loc[train["fold"].isna(), "series_id"].unique()
if len(unmapped_series) > 0:
    raise ValueError(f"series_id without a CV fold: {list(unmapped_series)[:10]}")
print(train.shape)

generate features:   2%|▏         | 1/50 [02:26<2:00:00, 146.96s/it]

In [None]:
# Training: fit_xgb is a project helper; its two return values are used below as
# per-row predictions (`trn_oof`) and an iterable of fitted models (`models`).
trn_oof, models = fit_xgb(
    X=train,
    y=train["target"],
    folds=train["fold"].astype(int),
    features=features.all_features(),
    params=CFG["2nd_stage"]["xgboost"],
    es_rounds=100,
)

# Keep the predictions next to the rows, drop every feature column except
# "minutes", and order the rows by series and step.
# NOTE(review): `train` is deleted at the end of this cell, so this frame is
# not persisted anywhere -- confirm whether it was meant to be saved.
train["oof"] = trn_oof
non_minute_features = set(features.all_features()) - {"minutes"}
train = (
    train.drop(columns=list(non_minute_features))
    .sort_values(["series_id", "step"])
    .reset_index(drop=True)
)

# Save one model file per entry of `models`, plus the pickled feature definition.
output_dir = CFG["output_dir"]
for fold_idx, fold_model in enumerate(models):
    fold_model.save_model(os.path.join(output_dir, f'xgb_fold{fold_idx}.model'))
with open(os.path.join(output_dir, "features.pkl"), "wb") as features_file:
    pickle.dump(features, features_file)

# Release the training frame before the inference cell rebuilds its own.
del train
gc.collect()

  0%|          | 0/4 [00:00<?, ?it/s]

== fold 0 ==
[0]	eval-logloss:0.00000
[100]	eval-logloss:0.00000


  0%|          | 0/4 [00:02<?, ?it/s]






TypeError: NumPy boolean array indexing assignment requires a 0 or 1-dimensional input, input has 2 dimensions

In [None]:
# Inference: regenerate features batch by batch for every series listed in the
# CV split and score each batch with the model stored at index `fold`
# (presumably the model for which that fold was the validation set -- confirm
# against fit_xgb).
stride = 8  # number of series whose features are generated per batch
dfs = []
for fold, fold_df in cv_split.groupby("fold"):
    # Rebind instead of mutating the groupby slice in place.
    fold_df = fold_df.reset_index(drop=True)
    model = models[int(fold)]
    for start in tqdm(range(0, len(fold_df), stride)):
        end = min(start + stride, len(fold_df))
        series_ids = fold_df.iloc[start:end]["series_id"].values
        files = [f"{CFG['dataset']['step_csv_dir']}/{series_id}.csv" for series_id in series_ids]

        # Feature generation (downsample_rate is left at the helper's default).
        train, features = generate_2nd_stage_features(files, pbar=False)

        # Prediction for this batch.
        preds = inference_xgb([model], train[features.all_features()])
        train["oof"] = preds
        # Keep "minutes" (as the training cell does) because it is selected in
        # the merge below; dropping every feature would remove it whenever it
        # is part of the feature set.
        droppable = list(set(features.all_features()) - {"minutes"})
        dfs.append(train.drop(columns=droppable))

train = pd.concat(dfs, axis=0).reset_index(drop=True)
del dfs
gc.collect()

# 1st stage: load its oof predictions and use them as the base row set.
cand_path = os.path.join("/kaggle/output", CFG["1st_stage"]["execution"]["best_exp_id"], "oof.parquet")
last_train = pd.read_parquet(cand_path).rename(columns={"oof": "oof_1st"})
# "target" is carried from the 2nd-stage frame so the column selection below
# does not raise KeyError. Because this is a left merge, rows that exist only
# in the 1st-stage frame get NaN for "target" and "minutes".
train = last_train[["series_id", "oof_1st", "step"]].merge(
    train[["series_id", "oof", "step", "target", "minutes"]],
    on=["series_id", "step"],
    how="left")
# Fall back to the 1st-stage prediction where the 2nd stage produced none.
train["oof"] = train["oof"].fillna(train["oof_1st"])
train = train.sort_values(["series_id", "step"]).reset_index(drop=True)

# Save the combined oof predictions.
train = train[["series_id", "step", "oof", "target", "minutes"]]
train.to_parquet(os.path.join(CFG["output_dir"], "oof.parquet"))

100%|██████████| 7/7 [03:53<00:00, 33.35s/it]
100%|██████████| 7/7 [03:46<00:00, 32.40s/it]
100%|██████████| 7/7 [03:52<00:00, 33.28s/it]
100%|██████████| 7/7 [03:50<00:00, 32.93s/it]
100%|██████████| 7/7 [04:00<00:00, 34.41s/it]


In [None]:
# Post-processing: pass the oof frame through the project's post_process helper
# and write the result as the submission file.
sub = post_process(train)
submission_path = os.path.join(CFG["output_dir"], "submission.csv")
sub.to_csv(submission_path, index=False)

# Scoring: compare the submission with the labelled events, after dropping
# label rows that contain missing values.
events_csv = f"{CFG['dataset']['competition_dir']}/train_events.csv"
labels = pd.read_csv(events_csv).dropna()
score, ap_table = compute_comptetition_metric(labels, sub)
print(f"score: {score:.4f}")
display(ap_table)

Processing series:   0%|          | 0/277 [00:00<?, ?it/s]

score: 0.5224


event   tolerance
onset   12           0.008861
        36           0.091589
        60           0.267211
        90           0.477007
        120          0.593797
        150          0.647902
        180          0.677465
        240          0.707337
        300          0.734636
        360          0.750056
wakeup  12           0.021398
        36           0.193058
        60           0.371167
        90           0.532530
        120          0.632525
        150          0.686993
        180          0.715681
        240          0.752416
        300          0.781686
        360          0.804758
dtype: float64

In [None]:
# Feature importances: plot_importances is a project helper that receives the
# fitted models and a path inside the experiment's output directory.
importances_png = f"{CFG['output_dir']}/importances.png"
importances = plot_importances(models, save_path=importances_png)