In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import datetime
from tqdm import tqdm
import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Notebook-wide settings, written as inline YAML text so every path and the
# seed can be edited in one place.
config = """
dataset: 
    competition_dir: /kaggle/input/child-mind-institute-detect-sleep-states
    cv_split_path: /kaggle/input/cv_split/train_folds.csv

seed: 46
"""

# safe_load is the shorthand for yaml.load(..., Loader=yaml.SafeLoader):
# the text is parsed into plain dicts / strings / numbers only.
CFG = yaml.safe_load(config)

In [3]:
# Load the annotated sleep events for the training series.
events_path = os.path.join(CFG['dataset']['competition_dir'], 'train_events.csv')
labels = pd.read_csv(events_path)

# Keep only the nights for which both events are present, i.e. exactly two
# non-null `step` values exist for the (series_id, night) pair.
# `count()` ignores missing steps, so a night with an unlabelled event is not "safe".
check = (
    labels.groupby(["series_id", "night"])["step"]
    .count()
    .eq(2)
    .rename("safe")
    .reset_index()
)

# Attach the per-night flag to every event row, then drop the unsafe nights.
labels = labels.merge(check, how="left", on=["series_id", "night"])
labels = labels[labels["safe"].eq(True)]

labels.head()

Unnamed: 0,series_id,night,event,step,timestamp,safe
0,038441c925bb,1,onset,4992.0,2018-08-14T22:26:00-0400,True
1,038441c925bb,1,wakeup,10932.0,2018-08-15T06:41:00-0400,True
2,038441c925bb,2,onset,20244.0,2018-08-15T19:37:00-0400,True
3,038441c925bb,2,wakeup,27492.0,2018-08-16T05:41:00-0400,True
4,038441c925bb,3,onset,39996.0,2018-08-16T23:03:00-0400,True


In [4]:
# Load the training series; the preview below shows one row per
# (series_id, step) with the anglez / enmo readings.
series_path = os.path.join(CFG["dataset"]["competition_dir"], "train_series.parquet")
train = pd.read_parquet(series_path)
train.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0,2018-08-14T15:30:00-0400,2.6367,0.0217
1,038441c925bb,1,2018-08-14T15:30:05-0400,2.6368,0.0215
2,038441c925bb,2,2018-08-14T15:30:10-0400,2.637,0.0216
3,038441c925bb,3,2018-08-14T15:30:15-0400,2.6368,0.0213
4,038441c925bb,4,2018-08-14T15:30:20-0400,2.6368,0.0215


In [5]:
# Tolerance radii, in steps, used to grade how close a row is to a labelled event.
thresholds = np.array([12, 36, 60, 90, 120, 150, 180, 240, 300, 360])

# Directory that receives one CSV per series.
# NOTE(review): on hosted Kaggle kernels /kaggle/input is normally read-only;
# this presumably targets a local mirror of that layout — confirm.
output_dir = "/kaggle/input/save_reg_series_csv/csvs"


def _proximity_target(steps, event_steps, thresholds):
    """Score each row by how many tolerance windows of the given events cover it.

    For every event step ``e`` a row at step ``s`` earns one point per
    threshold ``t`` with ``|s - e| <= t``; the points of all events are summed.
    A row that coincides with an event therefore earns ``len(thresholds)``
    points from it, and a row farther away than the largest threshold earns none.

    Parameters
    ----------
    steps : np.ndarray
        1-D array with the step value of every row of the series.
    event_steps : iterable of float
        Step values of the labelled events of one kind (may be empty).
    thresholds : array-like
        Tolerance radii; any order is accepted.

    Returns
    -------
    np.ndarray
        float64 array with the same length as ``steps``.
    """
    sorted_thresholds = np.sort(np.asarray(thresholds))
    total = np.zeros(len(steps))
    for event_step in event_steps:
        distance = np.abs(steps - event_step)
        # Number of thresholds >= distance equals all thresholds minus those
        # strictly below it. This matches (distance[:, None] <= thresholds).sum(axis=1)
        # without materialising an (n_rows x n_thresholds) boolean matrix.
        total += sorted_thresholds.size - np.searchsorted(
            sorted_thresholds, distance, side="left"
        )
    return total


# The destination does not depend on the series, so create it once up front.
os.makedirs(output_dir, exist_ok=True)

for series_id, df in tqdm(train.groupby("series_id")):
    # Fresh frame with a 0..n-1 index so the new columns are added to a copy
    # rather than to a slice of `train`.
    df = df.reset_index(drop=True)

    # Work in float64 so the subtraction cannot wrap around should the step
    # column be stored in an unsigned integer dtype.
    steps = df["step"].to_numpy(dtype=np.float64)

    label_df = labels[labels["series_id"] == series_id]

    # Same scoring for both event kinds; wakeup first to keep the column order.
    for event in ("wakeup", "onset"):
        event_steps = label_df.loc[label_df["event"] == event, "step"].to_numpy()
        df[f"target_{event}"] = _proximity_target(steps, event_steps, thresholds)

    df.to_csv(f"{output_dir}/{series_id}.csv", index=False)

100%|██████████| 277/277 [05:40<00:00,  1.23s/it]
