In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import datetime
from tqdm import tqdm
import os

import warnings
# Blanket filter: with no category/module arguments this ignores every warning
# raised after this point, including deprecation and dtype warnings from pandas.
# NOTE(review): consider narrowing to specific categories so real problems stay visible.
warnings.filterwarnings('ignore')

In [2]:
# Notebook configuration kept as inline YAML: input locations and the seed.
# This is deliberately a plain string, not an f-string: nothing is
# interpolated, and with an f-string any literal brace later added to the
# YAML would be parsed as a replacement field.
config = """
dataset: 
    competition_dir: /kaggle/input/child-mind-institute-detect-sleep-states
    cv_split_path: /kaggle/input/cv_split/train_folds.csv
    train_base_path: /kaggle/input/train_base/train_base.csv

seed: 46
"""

# safe_load is the shorthand for yaml.load(..., Loader=yaml.SafeLoader); it
# yields a nested dict accessed below as CFG["dataset"][...] and CFG["seed"].
CFG = yaml.safe_load(config)

In [3]:
# Load the table of sample windows (one row per series_id / sample_id with a
# start_time and end_time) from the path given in the config.
train_base = pd.read_csv(CFG['dataset']['train_base_path'])

# Parse both window bounds as timezone-aware UTC datetimes so they can be
# compared directly against the UTC sensor timestamps.
for time_col in ("start_time", "end_time"):
    train_base[time_col] = pd.to_datetime(train_base[time_col], utc=True)

train_base.head()

Unnamed: 0,series_id,start_time,end_time,target_type,target_step,target_timestamp,sample_id,target
0,038441c925bb,2018-08-13 23:00:00+00:00,2018-08-14 22:59:59+00:00,wakeup,,,0,
1,038441c925bb,2018-08-14 23:00:00+00:00,2018-08-15 22:59:59+00:00,wakeup,10932.0,2018-08-15 10:41:00+00:00,1,8412.0
2,038441c925bb,2018-08-15 23:00:00+00:00,2018-08-16 22:59:59+00:00,wakeup,27492.0,2018-08-16 09:41:00+00:00,2,7692.0
3,038441c925bb,2018-08-16 23:00:00+00:00,2018-08-17 22:59:59+00:00,wakeup,44400.0,2018-08-17 09:10:00+00:00,3,7320.0
4,038441c925bb,2018-08-17 23:00:00+00:00,2018-08-18 22:59:59+00:00,wakeup,62856.0,2018-08-18 10:48:00+00:00,4,8496.0


In [4]:
# Read the sensor series (series_id, step, timestamp, anglez, enmo) from the
# competition directory named in the config.
series_path = CFG["dataset"]["competition_dir"] + "/train_series.parquet"
train = pd.read_parquet(series_path)

# Convert the timestamp column to timezone-aware UTC datetimes.
train["timestamp"] = train["timestamp"].pipe(pd.to_datetime, utc=True)

train.head()

Unnamed: 0,series_id,step,timestamp,anglez,enmo
0,038441c925bb,0,2018-08-14 19:30:00+00:00,2.6367,0.0217
1,038441c925bb,1,2018-08-14 19:30:05+00:00,2.6368,0.0215
2,038441c925bb,2,2018-08-14 19:30:10+00:00,2.637,0.0216
3,038441c925bb,3,2018-08-14 19:30:15+00:00,2.6368,0.0213
4,038441c925bb,4,2018-08-14 19:30:20+00:00,2.6368,0.0215


In [5]:
# Export one CSV per sample window: for every series, cut out the rows whose
# timestamp falls inside each (start_time, end_time) window listed in
# train_base, tag them with the window's metadata, and write them to
# <DAY_CSV_DIR>/<series_id>/<sample_id>.csv.
#
# NOTE(review): the output root lives under /kaggle/input, which is presumably
# a read-only mount when run on Kaggle itself — confirm it is writable in the
# environment this notebook targets (otherwise /kaggle/working is the usual
# place for outputs).
DAY_CSV_DIR = "/kaggle/input/save_day_csv/day_csvs"
WINDOW_COLUMNS = ["start_time", "end_time", "target_type", "sample_id"]

# Group the window table once up front instead of re-scanning all of
# train_base with a boolean mask for every series. Row order inside each group
# matches the original order in train_base.
windows_by_series = {
    key: group[WINDOW_COLUMNS].values
    for key, group in train_base.groupby("series_id")
}

for series_id, df in tqdm(train.groupby("series_id")):
    windows = windows_by_series.get(series_id)
    if windows is None:
        # No windows for this series: nothing is written and no directory is created.
        continue

    # Earliest reading of the whole series; attached to every exported window.
    global_start_time = df["timestamp"].min()

    # The directory depends only on the series, so create it once per series
    # rather than once per window.
    series_dir = f"{DAY_CSV_DIR}/{series_id}"
    os.makedirs(series_dir, exist_ok=True)

    for start_time, end_time, target_type, sample_id in windows:
        # Both window bounds are inclusive.
        in_window = (df["timestamp"] >= start_time) & (df["timestamp"] <= end_time)
        df_sample = df[in_window].copy()

        # Constant metadata columns repeated on every row of the window.
        df_sample["sample_id"] = sample_id
        df_sample["start_time"] = start_time
        df_sample["end_time"] = end_time
        df_sample["target_type"] = target_type
        df_sample["global_start_time"] = global_start_time

        # The index is not written, so the group's original row index is irrelevant.
        savepath = f"{series_dir}/{sample_id}.csv"
        df_sample.to_csv(savepath, index=False)

100%|██████████| 277/277 [40:28<00:00,  8.77s/it]
