In [26]:
import pandas as pd
import yaml
import sys
import os
from glob import glob
from pathlib import Path
from tqdm import tqdm
import gc
import pickle

import warnings
warnings.filterwarnings('ignore')

sys.path.append('/kaggle/src')
from utils.xgb import fit_xgb
from utils.metric import compute_comptetition_metric
from utils.postprocess import dynamic_range_nms
from utils.set_seed import seed_base

PACKAGE_DIR = Path("/kaggle/src")
# Read the YAML through Path.read_text() so the file is opened and closed in
# one call; the previous bare open(...) left the handle to the garbage collector.
CFG = yaml.safe_load((PACKAGE_DIR / "config.yaml").read_text())
# seed_base comes from utils.set_seed; it is called with the seed stored under
# env.seed in the config (presumably seeds the RNGs used later — confirm in utils).
seed_base(CFG["env"]["seed"])

In [27]:
# Load the out-of-fold predictions written by the XGB experiment named in CFG.
# Alternative source (stacking experiment), kept for reference:
# oof_df = pd.read_parquet(f'/kaggle/output/{CFG["stacking"]["execution"]["best_exp_id"]}/oof.parquet')
oof_csv_path = f'/kaggle/output/{CFG["xgb_model"]["execution"]["exp_id"]}/oof.csv'
oof_df = pd.read_csv(oof_csv_path)
# Debug switch: restrict to the first 50 series_id values.
# oof_df = oof_df[oof_df["series_id"].isin(oof_df["series_id"].unique()[:50])]
oof_df.head()

Unnamed: 0,series_id,step,score,event
0,038441c925bb,299,-0.009271,wakeup
1,038441c925bb,299,-0.000296,onset
2,038441c925bb,323,-0.031838,wakeup
3,038441c925bb,323,-0.003623,onset
4,038441c925bb,347,-0.015792,wakeup


In [28]:
# Keep only candidates whose score exceeds 0.01; these rows feed the NMS search below.
train = oof_df.loc[oof_df["score"].gt(0.01)].reset_index(drop=True)
len(train)

347547

In [29]:
# dfs = []
# df = oof_df[["series_id", "step", "wakeup_oof"]]
# df = df[df["wakeup_oof"] > 0.1]
# df["event"] = "wakeup"
# df["score"] = df["wakeup_oof"]
# dfs.append(df[['series_id', 'step', 'event', 'score']])

# df = oof_df[["series_id", "step", "onset_oof"]]
# df = df[df["onset_oof"] > 0.1]
# df["event"] = "onset"
# df["score"] = df["onset_oof"]
# dfs.append(df[['series_id', 'step', 'event', 'score']])

# train = pd.concat(dfs).reset_index(drop=True)
# train["step"] = train["step"].astype(int)
# train["score"].hist()
# len(train)

In [30]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
import sys
import yaml
import matplotlib.pyplot as plt
import gc
from typing import Optional
from scipy.interpolate import interp1d


# Suppression hyper-parameters, read by dynamic_range_nms at call time.
RANGE = 917  # half-width (in steps) of the suppression window around a kept step
COEFF = 11   # scale of the penalty added to the divisor
EXP = 5      # exponent shaping how fast the penalty decays with distance


def dynamic_range_nms(df: pd.DataFrame) -> pd.DataFrame:
    """Dynamic-range non-maximum suppression for a single series.

    The candidate with the highest remaining score is kept, then every step
    at distance ``r`` (``1 <= r < RANGE``) from it has its divisor increased
    by ``((RANGE - r) / RANGE) ** EXP * COEFF``; the divisor of the kept step
    itself is set to ``1e10``.  This repeats up to ``min(len(df), 100)``
    times.  ``RANGE``, ``COEFF`` and ``EXP`` are module-level globals looked
    up when the function runs.

    Parameters
    ----------
    df : pd.DataFrame
        Submission-format candidates for one series_id.  Needs an integer,
        non-negative ``step`` column and a ``score`` column (scores are
        expected to be non-NaN).

    Returns
    -------
    pd.DataFrame
        The kept rows in selection order, with an extra ``reduced_score``
        column holding the suppressed score each row had when it was picked.
        Each input row appears at most once.  An empty input yields an empty
        frame that still carries the ``reduced_score`` column.
    """
    df = df.sort_values("score", ascending=False).reset_index(drop=True)
    if df.empty:
        # Nothing to suppress; the original crashed on step.max() being NaN.
        df["reduced_score"] = df["score"]
        return df

    steps = df["step"].to_numpy()
    scores = df["score"].to_numpy()

    # Penalty for distances 1 .. RANGE-1, computed once with the same Python
    # expression the per-pick loop used, so the values are unchanged.
    kernel = np.array(
        [((RANGE - r) / RANGE) ** EXP * COEFF for r in range(1, int(RANGE))],
        dtype=float,
    )
    # Size the buffer from the window width instead of a fixed +1000, so a
    # RANGE above 1000 no longer indexes past the end.
    reduce_rate = np.ones(int(steps.max()) + len(kernel) + 1)
    # Rows already emitted are masked out so a row can never be picked twice
    # (this happened when several candidates shared the same step).
    available = np.ones(len(df), dtype=bool)

    used = []
    used_scores = []
    for _ in range(min(len(df), 100)):
        reduced = np.where(available, scores / reduce_rate[steps], -np.inf)
        best_idx = int(np.argmax(reduced))  # first maximum, like idxmax
        best_step = int(steps[best_idx])
        used.append(best_idx)
        used_scores.append(reduced[best_idx])
        available[best_idx] = False

        # Right side: steps best_step+1 .. best_step+RANGE-1.
        reduce_rate[best_step + 1 : best_step + 1 + len(kernel)] += kernel
        # Left side, clipped at step 0; nearest neighbour gets kernel[0].
        n_left = max(0, min(len(kernel), best_step))
        reduce_rate[best_step - n_left : best_step] += kernel[:n_left][::-1]
        reduce_rate[best_step] = 1e10
    df = df.iloc[used].copy()
    df["reduced_score"] = used_scores
    return df


In [31]:
from multiprocessing import Pool
import optuna

def objective(trial):
    """Optuna objective: score one (RANGE, COEFF, EXP) setting of the NMS.

    The sampled values are written to the module-level globals that
    ``dynamic_range_nms`` reads.  The function is then mapped over the
    per-``series_id`` groups of ``train`` in a 30-process pool, the suppressed
    scores replace ``score``, and the result is evaluated against
    ``train_events.csv`` with ``compute_comptetition_metric``.

    NOTE(review): the pool workers only see the new globals if they inherit
    the parent's module state (e.g. the fork start method) — confirm on the
    target platform.

    Returns the first element returned by the metric (the overall score).
    """
    global RANGE, COEFF, EXP
    RANGE = trial.suggest_int("RANGE", 100, 1000)
    COEFF = trial.suggest_int("COEFF", 1, 30)
    EXP = trial.suggest_int("EXP", 1, 5)

    per_series = [frame for _, frame in train.groupby("series_id")]
    with Pool(30) as pool:
        suppressed = list(pool.imap(dynamic_range_nms, per_series))
    submission = pd.concat(suppressed)
    submission["score"] = submission["reduced_score"]

    # Score calculation: rows of train_events.csv with any missing value are dropped.
    events = pd.read_csv(f"{CFG['dataset']['competition_dir']}/train_events.csv").dropna()
    # events = events[events["series_id"].isin(submission["series_id"].unique())].reset_index(drop=True)
    metric_value, _ = compute_comptetition_metric(events, submission)
    return metric_value


# Fixed parameter sets queued on the study before any sampled trial.
warm_start_params = (
    {"RANGE": 917, "COEFF": 11, "EXP": 5},
    {"RANGE": 695, "COEFF": 14, "EXP": 3},
    {"RANGE": 360, "COEFF": 10, "EXP": 2},
    {"RANGE": 638, "COEFF": 17, "EXP": 3},
    {"RANGE": 570, "COEFF": 5, "EXP": 3},
)

study = optuna.create_study(direction="maximize")
for params in warm_start_params:
    study.enqueue_trial(params)
# 60 trials in total, the queued ones included.
study.optimize(objective, n_trials=60)
print(study.best_params)

[32m[I 2023-11-02 19:40:02,121][0m A new study created in memory with name: no-name-b8e95c14-71ce-4ecb-8991-c4bc28938e1d[0m


[32m[I 2023-11-02 19:41:03,439][0m Trial 0 finished with value: 0.7530295341646096 and parameters: {'RANGE': 917, 'COEFF': 11, 'EXP': 5}. Best is trial 0 with value: 0.7530295341646096.[0m
[32m[I 2023-11-02 19:42:03,932][0m Trial 1 finished with value: 0.7515153248108886 and parameters: {'RANGE': 695, 'COEFF': 14, 'EXP': 3}. Best is trial 0 with value: 0.7530295341646096.[0m
[32m[I 2023-11-02 19:43:03,826][0m Trial 2 finished with value: 0.755294806083858 and parameters: {'RANGE': 360, 'COEFF': 10, 'EXP': 2}. Best is trial 2 with value: 0.755294806083858.[0m
[32m[I 2023-11-02 19:44:04,009][0m Trial 3 finished with value: 0.7526455339519498 and parameters: {'RANGE': 638, 'COEFF': 17, 'EXP': 3}. Best is trial 2 with value: 0.755294806083858.[0m
[32m[I 2023-11-02 19:45:05,219][0m Trial 4 finished with value: 0.749983330468935 and parameters: {'RANGE': 570, 'COEFF': 5, 'EXP': 3}. Best is trial 2 with value: 0.755294806083858.[0m
[32m[I 2023-11-02 19:46:05,894][0m Trial 5 f

KeyboardInterrupt: 

---

In [34]:
# Suppression hyper-parameters used for the final scoring run; read by
# dynamic_range_nms at call time.
RANGE = 261  # half-width (in steps) of the suppression window around a kept step
COEFF = 28   # scale of the penalty added to the divisor
EXP = 5      # exponent shaping how fast the penalty decays with distance


def dynamic_range_nms(df: pd.DataFrame) -> pd.DataFrame:
    """Dynamic-range non-maximum suppression for a single series.

    The candidate with the highest remaining score is kept, then every step
    at distance ``r`` (``1 <= r < RANGE``) from it has its divisor increased
    by ``((RANGE - r) / RANGE) ** EXP * COEFF``; the divisor of the kept step
    itself is set to ``1e10``.  This repeats up to ``min(len(df), 1000)``
    times.  ``RANGE``, ``COEFF`` and ``EXP`` are module-level globals looked
    up when the function runs.

    Parameters
    ----------
    df : pd.DataFrame
        Submission-format candidates for one series_id.  Needs an integer,
        non-negative ``step`` column and a ``score`` column (scores are
        expected to be non-NaN).

    Returns
    -------
    pd.DataFrame
        The kept rows in selection order, with an extra ``reduced_score``
        column holding the suppressed score each row had when it was picked.
        Each input row appears at most once.  An empty input yields an empty
        frame that still carries the ``reduced_score`` column.
    """
    df = df.sort_values("score", ascending=False).reset_index(drop=True)
    if df.empty:
        # Nothing to suppress; the original crashed on step.max() being NaN.
        df["reduced_score"] = df["score"]
        return df

    steps = df["step"].to_numpy()
    scores = df["score"].to_numpy()

    # Penalty for distances 1 .. RANGE-1, computed once with the same Python
    # expression the per-pick loop used, so the values are unchanged.
    kernel = np.array(
        [((RANGE - r) / RANGE) ** EXP * COEFF for r in range(1, int(RANGE))],
        dtype=float,
    )
    # Size the buffer from the window width instead of a fixed +1000, so a
    # RANGE above 1000 no longer indexes past the end.
    reduce_rate = np.ones(int(steps.max()) + len(kernel) + 1)
    # Rows already emitted are masked out so a row can never be picked twice
    # (this happened when several candidates shared the same step).
    available = np.ones(len(df), dtype=bool)

    used = []
    used_scores = []
    for _ in range(min(len(df), 1000)):
        reduced = np.where(available, scores / reduce_rate[steps], -np.inf)
        best_idx = int(np.argmax(reduced))  # first maximum, like idxmax
        best_step = int(steps[best_idx])
        used.append(best_idx)
        used_scores.append(reduced[best_idx])
        available[best_idx] = False

        # Right side: steps best_step+1 .. best_step+RANGE-1.
        reduce_rate[best_step + 1 : best_step + 1 + len(kernel)] += kernel
        # Left side, clipped at step 0; nearest neighbour gets kernel[0].
        n_left = max(0, min(len(kernel), best_step))
        reduce_rate[best_step - n_left : best_step] += kernel[:n_left][::-1]
        reduce_rate[best_step] = 1e10
    df = df.iloc[used].copy()
    df["reduced_score"] = used_scores
    return df


# Final run: candidates with score above 0.005, suppressed per series_id with
# the RANGE / COEFF / EXP values currently set at module level.
train = oof_df.loc[oof_df["score"].gt(0.005)].reset_index(drop=True)
groups = [frame for _, frame in train.groupby("series_id")]
with Pool(30) as p:
    results = list(p.imap(dynamic_range_nms, groups))
# The suppressed score replaces the raw score in the submission frame.
sub = pd.concat(results).assign(score=lambda frame: frame["reduced_score"])

# Score calculation: rows of train_events.csv with any missing value are dropped.
labels = pd.read_csv(f"{CFG['dataset']['competition_dir']}/train_events.csv").dropna()
# labels = labels[labels["series_id"].isin(sub["series_id"].unique())].reset_index(drop=True)
score, ap_table = compute_comptetition_metric(labels, sub)
print(score)
ap_table

0.7762369696107978


event   tolerance
onset   12           0.305681
        36           0.687888
        60           0.768869
        90           0.813232
        120          0.831152
        150          0.847011
        180          0.856668
        240          0.869057
        300          0.874683
        360          0.880350
wakeup  12           0.246568
        36           0.711111
        60           0.783142
        90           0.819134
        120          0.841468
        150          0.854962
        180          0.864171
        240          0.879552
        300          0.890293
        360          0.899747
dtype: float64