In [2]:
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import datetime
import sys
from tqdm import tqdm
import os
import random
import pickle
from glob import glob
import gc
from multiprocessing import Pool, cpu_count

import warnings
warnings.filterwarnings('ignore')




In [3]:
# Experiment configuration, kept inline as YAML so the whole run is described
# in one place. It is a plain (non-f) string: nothing is interpolated into it.
config = """
execution:
    exp_id: exp_015
    debug: False
    submit: False
    multiprocessing: True

dataset: 
    competition_dir: /kaggle/input/child-mind-institute-detect-sleep-states
    cv_split_path: /kaggle/input/cv_split/train_folds.csv
    train_base_path: /kaggle/input/train_base/train_base.csv
    step_csv_dir: /kaggle/input/save_series_csv/csvs

feature:
    agg_freq: 24 # [step]

xgboost:    
    objective: "binary:logistic"
    learning_rate: 0.1  # 0.01で固定。学習時間とのトレードオフ
    reg_alpha: 0.02  # L1正則化。0.1が推奨。
    reg_lambda: 0.2  # L2正則化。0.1が推奨
    random_state: 42
    max_depth: 5  # 3-8。7くらいでいい。
    colsample_bytree: 0.7  # カラムが多い時は少なめ(0.4とか)にする。
    
seed: 46
"""

# Parse with the safe loader: only plain YAML scalars/mappings are constructed.
CFG = yaml.safe_load(config)

# Each experiment writes into its own directory, named after its exp_id.
CFG["output_dir"] = f"/kaggle/output/{CFG['execution']['exp_id']}"
os.makedirs(CFG["output_dir"], exist_ok=True)

In [4]:
# Read the annotated events and keep only rows in which every column is
# present (rows containing any missing value are dropped).
train_events_path = f"{CFG['dataset']['competition_dir']}/train_events.csv"
labels = pd.read_csv(train_events_path)
labels = labels.dropna()
labels.head()

Unnamed: 0,series_id,night,event,step,timestamp
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400


In [5]:
dfs = []  # not used in this cell; kept in case a later cell still references it

# Half-width of the window plotted around each labelled event [step].
ZOOM_HALF_WINDOW = 100
# An x-tick is drawn at every step divisible by this value. Going by the
# timestamps in train_events.csv, 12 steps appear to span one minute.
TICK_EVERY = 12
# |anglez diff| is capped at this value so large jumps do not flatten the panel.
ANGLEZ_DIFF_CAP = 5
# Directory that receives one PNG per labelled event.
ZOOM_IMAGE_DIR = "/kaggle/notebook/gt_images_zoom"


def save_event_zoom_plot(series_df, series_id, step, event, out_dir=ZOOM_IMAGE_DIR):
    """Plot the signals around one labelled event and save the figure as a PNG.

    Four stacked panels share the step axis: raw ``enmo``, raw ``anglez``,
    whether ``anglez`` is unchanged from the previous sample, and the capped
    absolute ``anglez`` difference. A red vertical line marks the event step.

    Args:
        series_df: Per-step frame of one series with "step", "enmo" and
            "anglez" columns.
        series_id: Identifier used in the figure title and the file name.
        step: Integer step of the labelled event.
        event: Event name shown in the title.
        out_dir: Existing directory the PNG is written to.

    Returns:
        The path of the saved image.
    """
    start_step = step - ZOOM_HALF_WINDOW
    end_step = step + ZOOM_HALF_WINDOW
    in_window = (series_df["step"] >= start_step) & (series_df["step"] <= end_step)
    window = series_df[in_window]

    ticks = window.loc[window["step"] % TICK_EVERY == 0, "step"].values

    # The difference is taken inside the window, so its first sample is NaN.
    anglez_diff = window["anglez"].diff()
    panels = [
        ("enmo", window["enmo"]),
        ("anglez", window["anglez"]),
        ("anglez diff==0", anglez_diff == 0),
        ("anglez diff abs", anglez_diff.abs().clip(upper=ANGLEZ_DIFF_CAP)),
    ]

    fig, axs = plt.subplots(len(panels), 1, figsize=(20, 12))
    try:
        for ax, (ylabel, values) in zip(axs, panels):
            ax.plot(window["step"], values)
            ax.axvline(step, color="red")
            ax.set_ylabel(ylabel)
            ax.set_xticks(ticks)
            ax.grid()

        fig.suptitle(f"series_id: {series_id} ({event})")
        fig.tight_layout()

        # `step` is an int here, so `:06` really zero-pads to six digits.
        savepath = os.path.join(
            out_dir, f"{series_id}_{start_step:06}-{end_step:06}.png"
        )
        fig.savefig(savepath)
    finally:
        # Always release the figure, even if plotting or saving failed;
        # otherwise the many figures created by the loop pile up in memory.
        plt.close(fig)
    return savepath


# Create the output directory once instead of once per image.
os.makedirs(ZOOM_IMAGE_DIR, exist_ok=True)

for series_id, label_df in tqdm(labels.groupby("series_id")):
    df = pd.read_csv(f"{CFG['dataset']['step_csv_dir']}/{series_id}.csv")

    for step, event in zip(label_df["step"], label_df["event"]):
        # The step column is float (e.g. 4992.0); cast it so that the
        # zero-padded file name does not end up as "4892.0".
        save_event_zoom_plot(df, series_id, int(step), event)

 37%|███▋      | 99/269 [23:29<43:27, 15.34s/it]  