In [1]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import datetime
import sys
from tqdm import tqdm
import os
import random
import pickle
from glob import glob
import gc
from multiprocessing import Pool, cpu_count

import warnings
warnings.filterwarnings('ignore')

sys.path.append('/kaggle/src')
from utils.xgb import fit_xgb, inference_xgb
from utils.metric import compute_comptetition_metric
from utils.feature_contena import Features
from utils.pandas_utils import reduce_mem_usage



In [2]:
# Experiment configuration, kept inline as YAML so the notebook is self-contained.
# A plain (non-f) string is used: the text contains no placeholders.
# NOTE(review): the inline YAML comment on `learning_rate` mentions 0.01 while the
# configured value is 0.1 -- confirm which one is intended.
config = """
execution:
    stage2_exp_id: exp_044
    reg_exp_id: exp_045
    debug: False
    submit: False
    multiprocessing: True

dataset: 
    competition_dir: /kaggle/input/child-mind-institute-detect-sleep-states
    cv_split_path: /kaggle/input/cv_split/train_folds.csv
    train_base_path: /kaggle/input/train_base/train_base.csv
    step_csv_dir: /kaggle/input/save_series_csv/csvs

feature:
    agg_freq: 24 # [step]

xgboost:    
    objective: "binary:logistic"
    learning_rate: 0.1  # 0.01で固定。学習時間とのトレードオフ
    reg_alpha: 0.02  # L1正則化。0.1が推奨。
    reg_lambda: 0.2  # L2正則化。0.1が推奨
    random_state: 42
    max_depth: 5  # 3-8。7くらいでいい。
    colsample_bytree: 0.7  # カラムが多い時は少なめ(0.4とか)にする。
    
seed: 46
"""

# Parse with the safe loader (no arbitrary object construction).
CFG = yaml.safe_load(config)

# Output directory is keyed by the regressor experiment id; create it up front.
CFG["output_dir"] = f"/kaggle/output/{CFG['execution']['reg_exp_id']}"
os.makedirs(CFG["output_dir"], exist_ok=True)

In [3]:
# Out-of-fold predictions written by the stage-2 experiment.
stage2 = pd.read_csv(
    os.path.join("/kaggle/output", CFG["execution"]["stage2_exp_id"], "oof.csv")
)
stage2.head()

Unnamed: 0,series_id,oof_1st,step,oof,minutes
0,038441c925bb,0.999923,11.5,0.999923,
1,038441c925bb,0.999829,35.5,0.999829,
2,038441c925bb,0.994979,59.5,0.994979,
3,038441c925bb,0.993882,83.5,0.993882,
4,038441c925bb,0.993127,107.5,0.993127,


In [4]:
# Event predictions (submission file) written by the regressor experiment.
reg = pd.read_csv(
    os.path.join("/kaggle/output", CFG["execution"]["reg_exp_id"], "submission.csv")
)
reg.head()

Unnamed: 0,step,sub_step_before_modify,key_step,series_id,score,event,oof_stage2,oof_regressor,target,minutes
0,4995,5039.0,5051.5,038441c925bb,0.834676,onset,0.12036,-43.046986,-47.0,1857.5
1,10928,10895.0,10907.5,038441c925bb,0.754868,wakeup,0.170375,33.655094,37.0,2337.5
2,20288,20375.0,20387.5,038441c925bb,0.63036,onset,0.252375,-86.351067,-131.0,2937.5
3,27432,27455.0,27467.5,038441c925bb,0.880441,wakeup,0.775471,-22.161205,37.0,2337.5
4,39989,40007.0,40019.5,038441c925bb,0.87992,onset,0.17507,-17.894793,-11.0,297.5


In [5]:
# Ground-truth events; rows containing any missing value are dropped.
labels = (
    pd.read_csv(f"{CFG['dataset']['competition_dir']}/train_events.csv")
    .dropna()
)
labels.head()

Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400


In [6]:
step_per_day = 12 * 60 * 24  # 12 * minutes per day, i.e. 12 steps per minute
WINDOW_SIZE = step_per_day / 2  # half a day on either side of each window centre
MAX_DAYS = 100  # number of consecutive day-windows tried for every series
PLOT_MARGIN = step_per_day * 0.1  # extra x-range drawn around the events of a window
TOLERANCE_BAND = 12 * 30  # half-width (in steps) of the hatched band around label edges
RECALL_THRESHOLDS = [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]  # in steps

# (event, column holding the x position, line colour, line style)
# Solid lines are the final predicted steps, dashed lines the steps before modification.
EVENT_MARKERS = [
    ("onset", "step", "red", "solid"),
    ("wakeup", "step", "green", "solid"),
    ("onset", "sub_step_before_modify", "red", "dashed"),
    ("wakeup", "sub_step_before_modify", "green", "dashed"),
]


def _between(df, lo, hi):
    """Return the rows of ``df`` whose ``step`` lies in the closed interval [lo, hi].

    A NaN bound makes every comparison False, so the result is empty.
    """
    return df[(lo <= df["step"]) & (df["step"] <= hi)]


def _window_recall(label_df, pred_df, thresholds):
    """Fraction of (label, threshold) pairs that have a matching prediction.

    A label counts as matched at a threshold when a prediction of the same
    ``event`` lies strictly closer than that threshold (in steps). The result is
    the mean over all events, labels and thresholds.

    Parameters
    ----------
    label_df : DataFrame with ``event`` and ``step`` columns (ground truth).
    pred_df : DataFrame with ``event`` and ``step`` columns (predictions).
    thresholds : sequence of distance thresholds in steps.

    Returns
    -------
    float
        The recall, or NaN when either frame is empty.
    """
    if len(label_df) == 0 or len(pred_df) == 0:
        return np.nan
    limits = np.asarray(thresholds)[None, :]
    hits = []
    for event, event_labels in label_df.groupby("event"):
        label_steps = event_labels["step"].to_numpy()
        pred_steps = pred_df.loc[pred_df["event"] == event, "step"].to_numpy()
        if len(pred_steps) == 0:
            # No prediction of this event type: nothing can be matched.
            nearest = np.full(len(label_steps), np.inf)
        else:
            nearest = np.abs(label_steps[:, None] - pred_steps[None, :]).min(axis=1)
        hits.append(nearest[:, None] < limits)
    return float(np.concatenate(hits).mean())


# All figures of this experiment go to one directory; create it once.
image_dir = os.path.join(CFG["output_dir"], "image")
os.makedirs(image_dir, exist_ok=True)

for sid, stage2_df in tqdm(stage2.groupby("series_id")):
    reg_df = reg[reg["series_id"] == sid].reset_index(drop=True)

    # Windows are centred on the mean within-day position of the predicted events.
    sleep_time_mean = (reg_df["step"] % step_per_day).mean()
    if pd.isna(sleep_time_mean):
        # Without predictions no window can be placed; skip before loading sensor data.
        continue

    label_df = labels[labels["series_id"] == sid].reset_index(drop=True)
    sensor_df = pd.read_csv(os.path.join(CFG["dataset"]["step_csv_dir"], f"{sid}.csv"))

    for day in range(MAX_DAYS):
        base = day * step_per_day + sleep_time_mean
        reg_window_df = _between(reg_df, base - WINDOW_SIZE, base + WINDOW_SIZE)
        label_window_df = _between(label_df, base - WINDOW_SIZE, base + WINDOW_SIZE)
        if reg_window_df.empty and label_window_df.empty:
            # Nothing happened in this day-window, so there is nothing to plot.
            continue

        # Plot range: all events of the window plus a margin on both sides.
        # nanmin/nanmax ignore the NaN produced by whichever frame is empty.
        step_min = np.nanmin([reg_window_df["step"].min(), label_window_df["step"].min()])
        step_max = np.nanmax([reg_window_df["step"].max(), label_window_df["step"].max()])
        plot_min = step_min - PLOT_MARGIN
        plot_max = step_max + PLOT_MARGIN

        stage2_window_df = _between(stage2_df, plot_min, plot_max)
        if len(stage2_window_df) == 0:
            continue
        sensor_window_df = _between(sensor_df, plot_min, plot_max)
        # Re-select events over the widened plot range.
        reg_window_df = _between(reg_df, plot_min, plot_max)
        label_window_df = _between(label_df, plot_min, plot_max)

        fig, axs = plt.subplots(3, 1, figsize=(10, 7))

        # One panel each for the stage-2 prediction and the two sensor channels.
        panels = [
            (stage2_window_df, "oof"),
            (sensor_window_df, "enmo"),
            (sensor_window_df, "anglez"),
        ]
        for ax, (frame, column) in zip(axs, panels):
            ax.plot(frame["step"], frame[column])
            ax.grid()
            ax.set_xlim(plot_min, plot_max)
            ax.set_ylabel(column)
        axs[0].set_ylim(0, 1)

        # Overlay data is identical for all panels, so compute it once.
        label_start = label_window_df["step"].min()
        label_end = label_window_df["step"].max()
        has_label = pd.notna(label_start) and pd.notna(label_end)
        markers = [
            (reg_window_df.loc[reg_window_df["event"] == event, column].to_numpy(), color, style)
            for event, column, color, style in EVENT_MARKERS
        ]

        for ax in axs:
            if has_label:
                # `facecolor` (not `color`) is used so that `edgecolor` takes effect:
                # matplotlib's `color` overrides both face and edge colour.
                ax.axvspan(label_start, label_end, facecolor="gray", edgecolor="black", alpha=0.2)
                ax.axvline(x=label_start, color="black", lw=1)
                ax.axvline(x=label_end, color="black", lw=1)
                for edge in (label_start, label_end):
                    ax.axvspan(
                        edge - TOLERANCE_BAND,
                        edge + TOLERANCE_BAND,
                        facecolor="gray",
                        edgecolor="black",
                        alpha=0.1,
                        hatch="/",
                    )
            for positions, color, style in markers:
                for x in positions:
                    ax.axvline(x=x, color=color, linestyle=style, lw=1)

        # Recall of the predictions against the labels inside the plotted range.
        recall = _window_recall(label_window_df, reg_window_df, RECALL_THRESHOLDS)

        fig.suptitle(f"sid: {sid}, day: {day}, recall: {recall:.3f}")
        fig.tight_layout()
        fig.savefig(os.path.join(image_dir, f"{sid}_{day}.png"))
        # Close explicitly: thousands of figures are created over the whole run.
        plt.close(fig)

100%|██████████| 277/277 [16:44<00:00,  3.63s/it]
