In [1]:
import pandas as pd
import yaml
import sys
import os
from glob import glob
from pathlib import Path
from tqdm import tqdm
import gc
import pickle

import warnings
warnings.filterwarnings('ignore')

sys.path.append('/kaggle/src')
from utils.xgb import fit_xgb
from utils.metric import compute_comptetition_metric
from utils.postprocess import dynamic_range_nms
from utils.set_seed import seed_base

PACKAGE_DIR = Path("/kaggle/src")
# Load the experiment configuration. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open until GC.
with open(PACKAGE_DIR / "config.yaml", "r") as config_file:
    CFG = yaml.safe_load(config_file)
# Seed using the value stored under env.seed in the configuration.
seed_base(CFG["env"]["seed"])

2023-11-03 01:15:17.141416: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-03 01:15:17.557447: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-03 01:15:18.790954: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/u

In [2]:
# Alternative source (stacking experiment, parquet):
# oof_df = pd.read_parquet(f'/kaggle/output/{CFG["stacking"]["execution"]["best_exp_id"]}/oof.parquet')
# Out-of-fold predictions written by the XGB experiment named in the config.
oof_path = f'/kaggle/output/{CFG["xgb_model"]["execution"]["exp_id"]}/oof.csv'
# NOTE(review): the displayed frame carries an "Unnamed: 0" column, presumably
# a saved index; consider index_col=0 if nothing downstream needs it.
oof_df = pd.read_csv(oof_path)
# Debug subset (first 50 series only):
# oof_df = oof_df[oof_df["series_id"].isin(oof_df["series_id"].unique()[:50])]
oof_df.head()

Unnamed: 0,series_id,step,score,event
0,038441c925bb,299,-0.03658,wakeup
1,038441c925bb,299,0.001942,onset
2,038441c925bb,323,-0.052111,wakeup
3,038441c925bb,323,0.01166,onset
4,038441c925bb,347,-0.02159,wakeup


In [3]:
# Keep only rows whose predicted score exceeds 0.01 as NMS candidates.
is_candidate = oof_df["score"].gt(0.01)
train = oof_df[is_candidate].reset_index(drop=True)
len(train)

426912

In [4]:
# dfs = []
# df = oof_df[["series_id", "step", "wakeup_oof"]]
# df = df[df["wakeup_oof"] > 0.1]
# df["event"] = "wakeup"
# df["score"] = df["wakeup_oof"]
# dfs.append(df[['series_id', 'step', 'event', 'score']])

# df = oof_df[["series_id", "step", "onset_oof"]]
# df = df[df["onset_oof"] > 0.1]
# df["event"] = "onset"
# df["score"] = df["onset_oof"]
# dfs.append(df[['series_id', 'step', 'event', 'score']])

# train = pd.concat(dfs).reset_index(drop=True)
# train["step"] = train["step"].astype(int)
# train["score"].hist()
# len(train)

In [5]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import sys
import yaml
import matplotlib.pyplot as plt
import gc
from typing import Optional
from scipy.interpolate import interp1d


# Suppression hyper-parameters read by dynamic_range_nms at call time:
#   RANGE - half-width (in steps) of the neighbourhood penalised around a pick
#   COEFF - penalty added right next to a pick (decays to 0 at RANGE steps)
#   EXP   - exponent controlling how fast the penalty decays with distance
RANGE = 917
COEFF = 11
EXP = 5

def dynamic_range_nms(df: pd.DataFrame, max_events: int = 100) -> pd.DataFrame:
    """Dynamic-Range NMS

    Greedily picks the row with the highest remaining score, then divides the
    scores of rows within ``RANGE`` steps of that pick by a distance-dependent
    penalty so that nearby candidates are suppressed before the next pick.

    Parameters
    ----------
    df : pd.DataFrame
        Submission-format rows for a single series_id. Must provide an
        integer ``step`` column and a ``score`` column.
    max_events : int, default 100
        Upper bound on the number of rows selected.

    Returns
    -------
    pd.DataFrame
        The selected rows in pick order, with ``reduced_score`` holding each
        row's suppressed score at the moment it was picked. An empty input
        yields an empty frame that still has a ``reduced_score`` column.
    """
    df = df.sort_values("score", ascending=False).reset_index(drop=True)
    if df.empty:
        # Nothing to select; step.max() would be NaN and break the buffer
        # allocation below.
        df["reduced_score"] = np.array([], dtype=float)
        return df

    scores = df["score"]
    steps = df["step"].to_numpy()
    span = int(RANGE)

    # Penalty at distance r = 1 .. span-1 from a pick. Built once with the
    # same scalar arithmetic a per-offset loop would use, then reused for
    # every pick.
    kernel = np.array(
        [((RANGE - r) / RANGE) ** EXP * COEFF for r in range(1, span)],
        dtype=float,
    )

    # Divisor per step. Sized from RANGE so the right-hand penalty window of
    # the largest step always fits, whatever RANGE is set to.
    reduce_rate = np.ones(int(steps.max()) + max(span, 1))

    used = []
    used_scores = []
    for _ in range(min(len(df), max_events)):
        reduced = scores / reduce_rate[steps]
        # After reset_index the labels equal positions, so idxmax can be used
        # for both .loc and positional array access.
        best_idx = reduced.idxmax()
        best_step = int(steps[best_idx])
        used.append(best_idx)
        used_scores.append(reduced.loc[best_idx])

        # Right side of the pick: steps best_step+1 .. best_step+span-1.
        reduce_rate[best_step + 1:best_step + span] += kernel
        # Left side, clipped at step 0; kernel is reversed so the nearest
        # step receives the largest penalty.
        n_left = max(min(span - 1, best_step), 0)
        if n_left:
            reduce_rate[best_step - n_left:best_step] += kernel[:n_left][::-1]
        # Make the picked step itself effectively unselectable afterwards.
        reduce_rate[best_step] = 1e10

    selected = df.iloc[used].copy()
    selected["reduced_score"] = np.asarray(used_scores, dtype=float)
    return selected


In [6]:
from multiprocessing import Pool
import optuna

def objective(trial):
    """Optuna objective: score one (RANGE, COEFF, EXP) setting of the NMS.

    Samples the three integer hyper-parameters from ``trial``, writes them to
    the module-level globals that ``dynamic_range_nms`` reads, applies the NMS
    to every ``series_id`` group of ``train`` in a 30-process pool, and
    returns the first value produced by ``compute_comptetition_metric``.
    """
    global RANGE, COEFF, EXP
    RANGE = trial.suggest_int("RANGE", 100, 1000)
    COEFF = trial.suggest_int("COEFF", 1, 30)
    EXP = trial.suggest_int("EXP", 1, 10)

    # NOTE(review): worker processes presumably see the freshly assigned
    # globals because the pool is created after the assignments (fork start
    # method) - confirm if the start method ever changes.
    per_series = [frame for _, frame in train.groupby("series_id")]
    with Pool(30) as pool:
        kept = list(pool.imap(dynamic_range_nms, per_series))
    sub = pd.concat(kept)
    # The metric is evaluated on the suppressed scores.
    sub["score"] = sub["reduced_score"]

    # Score calculation
    labels_path = f"{CFG['dataset']['competition_dir']}/train_events.csv"
    labels = pd.read_csv(labels_path).dropna()
    # labels = labels[labels["series_id"].isin(sub["series_id"].unique())].reset_index(drop=True)
    score, _ap_table = compute_comptetition_metric(labels, sub)
    return score


study = optuna.create_study(direction="maximize")
# Hand-picked (RANGE, COEFF, EXP) settings evaluated before the sampler
# takes over, queued in this order.
seed_params = [
    (261, 28, 5),
    (917, 11, 5),
    (695, 14, 3),
    (360, 10, 2),
    (638, 17, 3),
    (570, 5, 3),
]
for nms_range, coeff, exp in seed_params:
    study.enqueue_trial({"RANGE": nms_range, "COEFF": coeff, "EXP": exp})
study.optimize(objective, n_trials=60)
print(study.best_params)

[32m[I 2023-11-03 01:15:22,575][0m A new study created in memory with name: no-name-e67bee34-e9aa-4336-a734-76cb5651166a[0m
[32m[I 2023-11-03 01:16:23,551][0m Trial 0 finished with value: 0.763911710517406 and parameters: {'RANGE': 261, 'COEFF': 28, 'EXP': 5}. Best is trial 0 with value: 0.763911710517406.[0m
[32m[I 2023-11-03 01:17:25,292][0m Trial 1 finished with value: 0.7570498417415169 and parameters: {'RANGE': 917, 'COEFF': 11, 'EXP': 5}. Best is trial 0 with value: 0.763911710517406.[0m
[32m[I 2023-11-03 01:18:26,792][0m Trial 2 finished with value: 0.7554902233173839 and parameters: {'RANGE': 695, 'COEFF': 14, 'EXP': 3}. Best is trial 0 with value: 0.763911710517406.[0m
[32m[I 2023-11-03 01:19:28,013][0m Trial 3 finished with value: 0.7605563972412005 and parameters: {'RANGE': 360, 'COEFF': 10, 'EXP': 2}. Best is trial 0 with value: 0.763911710517406.[0m
[32m[I 2023-11-03 01:20:29,184][0m Trial 4 finished with value: 0.7566003228710735 and parameters: {'RANGE':

KeyboardInterrupt: 

---

In [8]:
# Suppression hyper-parameters read by dynamic_range_nms at call time:
#   RANGE - half-width (in steps) of the neighbourhood penalised around a pick
#   COEFF - penalty added right next to a pick (decays to 0 at RANGE steps)
#   EXP   - exponent controlling how fast the penalty decays with distance
RANGE = 523
COEFF = 27
EXP = 7

def dynamic_range_nms(df: pd.DataFrame, max_events: int = 1000) -> pd.DataFrame:
    """Dynamic-Range NMS

    Greedily picks the row with the highest remaining score, then divides the
    scores of rows within ``RANGE`` steps of that pick by a distance-dependent
    penalty so that nearby candidates are suppressed before the next pick.

    Parameters
    ----------
    df : pd.DataFrame
        Submission-format rows for a single series_id. Must provide an
        integer ``step`` column and a ``score`` column.
    max_events : int, default 1000
        Upper bound on the number of rows selected.

    Returns
    -------
    pd.DataFrame
        The selected rows in pick order, with ``reduced_score`` holding each
        row's suppressed score at the moment it was picked. An empty input
        yields an empty frame that still has a ``reduced_score`` column.
    """
    df = df.sort_values("score", ascending=False).reset_index(drop=True)
    if df.empty:
        # Nothing to select; step.max() would be NaN and break the buffer
        # allocation below.
        df["reduced_score"] = np.array([], dtype=float)
        return df

    scores = df["score"]
    steps = df["step"].to_numpy()
    span = int(RANGE)

    # Penalty at distance r = 1 .. span-1 from a pick. Built once with the
    # same scalar arithmetic a per-offset loop would use, then reused for
    # every pick.
    kernel = np.array(
        [((RANGE - r) / RANGE) ** EXP * COEFF for r in range(1, span)],
        dtype=float,
    )

    # Divisor per step. Sized from RANGE so the right-hand penalty window of
    # the largest step always fits, whatever RANGE is set to.
    reduce_rate = np.ones(int(steps.max()) + max(span, 1))

    used = []
    used_scores = []
    for _ in range(min(len(df), max_events)):
        reduced = scores / reduce_rate[steps]
        # After reset_index the labels equal positions, so idxmax can be used
        # for both .loc and positional array access.
        best_idx = reduced.idxmax()
        best_step = int(steps[best_idx])
        used.append(best_idx)
        used_scores.append(reduced.loc[best_idx])

        # Right side of the pick: steps best_step+1 .. best_step+span-1.
        reduce_rate[best_step + 1:best_step + span] += kernel
        # Left side, clipped at step 0; kernel is reversed so the nearest
        # step receives the largest penalty.
        n_left = max(min(span - 1, best_step), 0)
        if n_left:
            reduce_rate[best_step - n_left:best_step] += kernel[:n_left][::-1]
        # Make the picked step itself effectively unselectable afterwards.
        reduce_rate[best_step] = 1e10

    selected = df.iloc[used].copy()
    selected["reduced_score"] = np.asarray(used_scores, dtype=float)
    return selected


# Re-select candidates with a 0.005 score threshold and run the NMS once per
# series_id using a 30-process pool.
train = oof_df.loc[oof_df["score"].gt(0.005)].reset_index(drop=True)
groups = [series_rows for _, series_rows in train.groupby("series_id")]
with Pool(30) as p:
    results = list(p.imap(dynamic_range_nms, groups))
sub = pd.concat(results)
# Evaluate on the suppressed scores rather than the raw ones.
sub["score"] = sub["reduced_score"]

# Score calculation
labels_path = f"{CFG['dataset']['competition_dir']}/train_events.csv"
labels = pd.read_csv(labels_path).dropna()
# labels = labels[labels["series_id"].isin(sub["series_id"].unique())].reset_index(drop=True)
score, ap_table = compute_comptetition_metric(labels, sub)
print(score)
ap_table

0.7790080201168637


event   tolerance
onset   12           0.263531
        36           0.676661
        60           0.772403
        90           0.820726
        120          0.840973
        150          0.857832
        180          0.867965
        240          0.883518
        300          0.889090
        360          0.893921
wakeup  12           0.223737
        36           0.694710
        60           0.780328
        90           0.822538
        120          0.847852
        150          0.863858
        180          0.875253
        240          0.891365
        300          0.901725
        360          0.912176
dtype: float64