In [1]:
import pandas as pd
import yaml
import sys
import os
from glob import glob
from pathlib import Path
import numpy as np
from tqdm import tqdm
import gc
import pickle

import warnings
warnings.filterwarnings('ignore')

sys.path.append('/kaggle/src')
from utils.xgb import fit_xgb
from utils.metric import compute_comptetition_metric
from utils.postprocess import dynamic_range_nms
from utils.set_seed import seed_base

PACKAGE_DIR = Path("/kaggle/src")
CFG = yaml.safe_load(open(PACKAGE_DIR / "config.yaml", "r"))
seed_base(CFG["env"]["seed"])

2023-12-03 10:36:31.184848: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-03 10:36:31.359921: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-03 10:36:31.947315: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/u

In [2]:
oof_df = pd.read_parquet(f'/kaggle/output/exp_ensemble/oof.parquet')
oof_df = oof_df[["series_id", "step", "onset_oof", "wakeup_oof"]]
oof_df.head()

Unnamed: 0,series_id,step,onset_oof,wakeup_oof
0,038441c925bb,0,0.006805,0.003592
1,038441c925bb,1,0.005428,0.002768
2,038441c925bb,2,0.002674,0.001122
3,038441c925bb,3,0.001379,0.000308
4,038441c925bb,4,0.001545,0.000329


In [3]:
unused_series_ids = ['0ce74d6d2106', '0f9e60a8e56d', '154fe824ed87', '2fc653ca75c7',
       '390b487231ce', '44a41bba1ee7', '89c7daa72eee', 'a3e59c2ce3f6',
       'c5d08fc3e040', 'c7b1283bb7eb', 'e11b9d69f856', 'efbfc4526d58',
       'f8a8da8bdd00']

labels = pd.read_csv(f"{CFG['dataset']['competition_dir']}/train_events.csv").dropna()
labels = labels[~labels["series_id"].isin(unused_series_ids)]

In [4]:
def wbf_nikhil(preds_orig, max_thresh=0.1, max_count=500, hyperparams=None):
     
    k_dist = hyperparams['k_dist']
    log_base = hyperparams['log_base']
    log_scale = hyperparams['log_scale']
    
    
    preds = preds_orig.copy()
    preds = np.convolve(preds, np.array([0.25, 0.5, 0.25]), mode='same')

    count = 0

    indices = []
    scores = []

    while count < max_count:
        curr_max_idx = np.argmax(preds)
        curr_max = preds[curr_max_idx]

        if curr_max < max_thresh:
            break

        k = int(k_dist - curr_max)

        start_idx = max(curr_max_idx - k, 0)
        end_idx = min(curr_max_idx + k + 1, len(preds))

        section = preds[start_idx:end_idx]

        # Logarithmic weight calculation
        distances = np.abs(np.arange(len(section)) - k)
        weights = 1 / (log_base ** (distances / (k * log_scale)))

        weighted_avg = np.sum(section * weights) / np.sum(weights)

        scores.append(weighted_avg)
        indices.append(curr_max_idx)

        preds[start_idx:end_idx] *= ((1 - weights))**4

        count += 1

    return indices, scores

In [5]:
def scale_array(arr, new_min, new_max):
    """
    Scale a numpy array to a new given range [new_min, new_max].

    :param arr: numpy array to be scaled
    :param new_min: new minimum value of the range
    :param new_max: new maximum value of the range
    :return: scaled numpy array
    """
    min_val = np.min(arr)
    max_val = np.max(arr)
    
    # Scale the array
    scaled_arr = ((arr - min_val) / (max_val - min_val)) * (new_max - new_min) + new_min
    
    return scaled_arr


def scale(arr, new_min, new_max, power=1.25):
    arr = scale_array(arr, new_min, new_max)
    arr = arr**power
    arr = scale_array(arr, new_min, new_max)
    return arr

In [6]:
from multiprocessing import Pool
import optuna

def objective(trial):
    k_dist = trial.suggest_int("k_dist", 50, 200)
    log_base = trial.suggest_int("log_base", 5, 50)
    log_scale = trial.suggest_uniform("log_scale", 1, 10)

    oof_df = pd.read_parquet(f'/kaggle/output/exp_ensemble/oof.parquet')
    oof_df = oof_df[["series_id", "step", "onset_oof", "wakeup_oof"]]
    power = trial.suggest_uniform("power", 0.5, 2)

    oof_df["onset_oof"] = scale(oof_df["onset_oof"], 0, 15, power)
    oof_df["wakeup_oof"] = scale(oof_df["wakeup_oof"], 0, 15, power)

    hyperparams = {
        "k_dist": k_dist,
        "log_base": log_base,
        "log_scale": log_scale
    }   
    sub_dfs = []
    for sid, df in tqdm(oof_df.groupby("series_id")):
        for event in ["onset", "wakeup"]:
            preds_orig = df[f"{event}_oof"].values
            indices, scores = wbf_nikhil(preds_orig, hyperparams=hyperparams)
            sub_df = pd.DataFrame({
                "series_id": sid,
                "event": event,
                "step": indices,
                "score": scores
            })
            sub_dfs.append(sub_df)
    sub = pd.concat(sub_dfs)
    score, ap_table = compute_comptetition_metric(labels, sub)
    return score

study = optuna.create_study(direction="maximize")
study.enqueue_trial({
    "k_dist": 84,
    "log_base": 12,
    "log_scale": 4.845,
})
study.optimize(objective, n_trials=60)
print(study.best_params)

[32m[I 2023-12-03 10:36:42,312][0m A new study created in memory with name: no-name-a660f0da-36ca-4e0c-84f8-b7583c81ac41[0m
100%|██████████| 277/277 [00:12<00:00, 22.77it/s]
[32m[I 2023-12-03 10:37:18,359][0m Trial 0 finished with value: 0.8211579956850635 and parameters: {'k_dist': 84, 'log_base': 12, 'log_scale': 4.845, 'power': 1.5907702043086134}. Best is trial 0 with value: 0.8211579956850635.[0m
100%|██████████| 277/277 [00:35<00:00,  7.90it/s]
[32m[I 2023-12-03 10:38:25,026][0m Trial 1 finished with value: 0.8232571483974418 and parameters: {'k_dist': 97, 'log_base': 24, 'log_scale': 7.051190748724591, 'power': 1.0975701145543941}. Best is trial 1 with value: 0.8232571483974418.[0m
100%|██████████| 277/277 [00:07<00:00, 38.03it/s] 
[32m[I 2023-12-03 10:38:53,615][0m Trial 2 finished with value: 0.811878960227173 and parameters: {'k_dist': 175, 'log_base': 36, 'log_scale': 3.2672064520167385, 'power': 1.945753902904981}. Best is trial 1 with value: 0.8232571483974418.

{'k_dist': 83, 'log_base': 38, 'log_scale': 8.705642017910971, 'power': 0.7588796533153248}
