In [1]:
# Notebook-level switch. No cell visible in this view reads it — presumably it
# gates an evaluation step elsewhere; TODO confirm it is still needed.
EVALUATE = True

In [2]:
from datetime import datetime as dt
from datetime import timedelta
from pathlib import Path

from matplotlib.dates import date2num
import polars as pl
from matplotlib import pyplot as plt

# Select matplotlib's "ggplot" style sheet for figures created after this point.
plt.style.use("ggplot")

In [3]:
# Locations of the competition inputs, all resolved relative to DATA_PATH.
DATA_PATH = Path("/kaggle/input/child-mind-institute-detect-sleep-states")
TRAIN_EVENTS_PATH = DATA_PATH / "train_events.csv"
TRAIN_SERIES_PATH = DATA_PATH / "train_series.parquet"
TEST_SERIES_PATH = DATA_PATH / "test_series.parquet"
SAMPLE_SUBMISSION_PATH = DATA_PATH / "sample_submission.csv"
# SLEEP_DATA_PATH = Path("/kaggle/input/sleep-data/sleep_time_probability.parquet")

# Fail fast if an input is missing, and say which one, so a mis-attached
# dataset is diagnosed here rather than deep inside a later read call.
for path in [
    DATA_PATH,
    TRAIN_EVENTS_PATH,
    TRAIN_SERIES_PATH,
    TEST_SERIES_PATH,
    SAMPLE_SUBMISSION_PATH,
    # SLEEP_DATA_PATH,
]:
    assert path.exists(), f"Required input path not found: {path}"

In [4]:
import polars as pl


def transform(df, night_offset=20):
    """Parse ``timestamp`` and derive calendar / grouping columns.

    Expects ``df`` to carry a string ``timestamp`` column and a string
    ``series_id`` column. The following columns are added (``timestamp`` is
    replaced by its parsed datetime value):

    * ``tz_offset`` – the three characters starting five from the end of the
      raw timestamp string, cast to Int8 (for a string ending in ``±HHMM`` this
      is the signed hour offset).
    * ``is_dst`` – ``tz_offset == -4``.
    * ``year`` (minus 2000), ``month``, ``day``, ``hour``, ``minute``,
      ``second``, ``weekday`` – all Int8.
    * ``group_id`` – ``"<series_id>_<YYYYMMDD>"`` where the date is the
      timestamp's date, moved forward one day when ``hour >= night_offset``.
    * ``norm_step`` – running row count inside each ``group_id``.

    Args:
        df: polars frame with ``timestamp`` and ``series_id`` columns.
        night_offset: hour from which a row is attributed to the next date.

    Returns:
        The frame with the columns above appended.
    """
    ts = pl.col("timestamp")

    # The offset has to be read from the raw string before the column is
    # overwritten with the parsed datetime below.
    staged = df.with_columns(ts.str.slice(-5, 3).cast(pl.Int8).alias("tz_offset"))
    staged = staged.with_columns((pl.col("tz_offset") == -4).alias("is_dst"))
    staged = staged.with_columns(
        ts.str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%S%z").alias("timestamp")
    )

    # NOTE(review): if the parser normalises offset-aware strings to UTC, the
    # fields below (and therefore the night boundary) are UTC-based rather than
    # local wall-clock — confirm against the polars version in use.
    calendar_fields = ("month", "day", "hour", "minute", "second", "weekday")
    staged = staged.with_columns(
        [(ts.dt.year() - 2000).cast(pl.Int8).alias("year")]
        + [getattr(ts.dt, field)().cast(pl.Int8).alias(field) for field in calendar_fields]
    )

    # Rows at or after `night_offset` o'clock are attributed to the following
    # calendar date, so one group spans an evening and the next morning.
    night_date = (
        pl.when(pl.col("hour") < night_offset)
        .then(ts)
        .otherwise(ts + pl.duration(days=1))
        .dt.date()
    )
    staged = staged.with_columns(night_date.alias("night_group"))

    night_tag = pl.col("night_group").cast(pl.Datetime).dt.strftime("%Y%m%d")
    staged = staged.with_columns(
        (pl.col("series_id") + pl.lit("_") + night_tag).alias("group_id")
    )

    # Position of each row inside its group.
    staged = staged.with_columns(ts.cumcount().over("group_id").alias("norm_step"))

    # `night_group` was only needed to build `group_id`.
    return staged.drop(["night_group"])


def transform_series(df):
    """Apply ``transform`` and flag rows whose ``enmo`` is exactly zero.

    Args:
        df: polars frame accepted by ``transform`` that also has an ``enmo``
            column.

    Returns:
        The transformed frame with a boolean ``is_enmo_clipped`` column.
    """
    enmo_is_zero = pl.col("enmo") == 0
    return transform(df).with_columns(enmo_is_zero.alias("is_enmo_clipped"))


def transform_events(df):
    """Apply ``transform`` to the events table and pivot it on ``event``.

    ``night`` is cast to UInt32, then the frame is pivoted so that each
    (``series_id``, ``group_id``, ``night``) combination becomes one row and
    the ``step``, ``timestamp`` and ``tz_offset`` values are spread into
    columns by distinct ``event`` value.

    Args:
        df: polars frame accepted by ``transform`` with ``night``, ``step``
            and ``event`` columns.

    Returns:
        The pivoted events frame.
    """
    value_cols = ["step", "timestamp", "tz_offset"]
    index_cols = ["series_id", "group_id", "night"]

    events = transform(df).with_columns(
        pl.col("night").cast(pl.UInt32).alias("night"),
    )
    # Positional order is (values, index, columns), as in the polars version
    # this notebook was written against.
    return events.pivot(value_cols, index_cols, "event")


In [5]:
def add_feature(
    df,
    group_col="series_id",
    day_group_col="group_id",
    term1=(5 * 60) // 5,
    term2=(30 * 60) // 5,
    term3=(60 * 60) // 5,
    min_threshold=0.005,
    max_threshold=0.04,
    center=True,
):
    """Append heuristic sleep flags computed from ``anglez`` and ``enmo``.

    Stages, in order (every intermediate column except the ``tmp*`` scratch
    columns is kept in the result):

    1. ``anglez_diff`` / ``enmo_diff``: absolute first difference per
       ``group_col``.
    2. ``*_diff_median_5min``: rolling median of those over ``term1`` rows.
    3. ``critical_threshold``: 10 % quantile of ``anglez_diff_median_5min``
       within each ``day_group_col`` group, clipped to
       ``[min_threshold, max_threshold]``.
    4. ``is_static``: rolling median below 15 x ``critical_threshold``.
    5. ``is_static_sum_30min`` / ``is_sleep_block``: rolling count of static
       rows over ``term2`` rows; a row is a sleep block when the row
       ``term2 // 2`` before or after it has a completely static window.
    6. ``is_gap`` / ``gap_length`` / ``is_large_gap`` / ``is_sleep_episode``:
       the same construction applied to non-sleep-block rows with window
       ``term3``; a sleep episode is anything that is not a large gap.
    7. ``sleep_episode_id`` / ``sleep_episode_length`` /
       ``max_sleep_episode_length`` / ``is_longest_sleep_episode``: episodes
       are numbered per ``day_group_col`` group and the longest one is flagged.

    Args:
        df: polars frame with ``anglez``, ``enmo`` and the two grouping columns.
        group_col: column identifying one continuous recording.
        day_group_col: column identifying one night within a recording.
        term1: window (rows) for the rolling medians. The default expression
            suggests 5 minutes at one row per 5 seconds.
        term2: window (rows) a stretch must be fully static to seed a sleep
            block (default suggests 30 minutes).
        term3: window (rows) of uninterrupted gap that counts as a large gap
            (default suggests 60 minutes).
        min_threshold: lower clip bound for ``critical_threshold``.
        max_threshold: upper clip bound for ``critical_threshold``.
        center: passed to every rolling call.

    Returns:
        ``df`` with the columns listed above appended.
    """
    return (
        df.with_columns(
            [
                pl.col("anglez").diff(1).abs().over(group_col).alias("anglez_diff"),
                pl.col("enmo").diff(1).abs().over(group_col).alias("enmo_diff"),
            ]
        )
        .with_columns(
            [
                pl.col("anglez_diff")
                .rolling_median(term1, center=center)  # 5 min window
                .over(group_col)
                .alias("anglez_diff_median_5min"),
                pl.col("enmo_diff")
                .rolling_median(term1, center=center)  # 5 min window
                .over(group_col)
                .alias("enmo_diff_median_5min"),
            ]
        )
        .with_columns(
            [
                # Per-night movement baseline, bounded on both sides.
                pl.col("anglez_diff_median_5min")
                .quantile(0.1)
                .clip(min_threshold, max_threshold)
                .over(day_group_col)
                .alias("critical_threshold")
            ]
        )
        .with_columns(
            [
                # 15 is a hard-coded multiplier on the per-night baseline.
                (pl.col("anglez_diff_median_5min") < pl.col("critical_threshold") * 15)
                .over(group_col)
                .alias("is_static")
            ]
        )
        .with_columns(
            [
                pl.col("is_static")
                .cast(pl.Int32)
                .rolling_sum(term2, center=center)
                .over(group_col)
                .alias("is_static_sum_30min"),
            ]
        )
        .with_columns(
            # Window is "fully static" when every one of its term2 rows is static.
            # Compare against term2 (not a literal) so a custom window still works.
            [(pl.col("is_static_sum_30min") == term2).over(group_col).alias("tmp")]
        )
        .with_columns(
            [
                pl.col("tmp").shift(term2 // 2).over(group_col).alias("tmp_left"),
                pl.col("tmp").shift(-(term2 // 2)).over(group_col).alias("tmp_right"),
            ]
        )
        .with_columns(
            [
                (pl.col("tmp_left") | pl.col("tmp_right")).alias("is_sleep_block"),
            ]
        )
        .drop(["tmp", "tmp_left", "tmp_right"])
        .with_columns([pl.col("is_sleep_block").not_().alias("is_gap")])
        .with_columns(
            [
                pl.col("is_gap")
                .cast(pl.Int32)
                .rolling_sum(term3, center=center)
                .over(group_col)
                .alias("gap_length")
            ]
        )
        .with_columns([(pl.col("gap_length") == term3).over(group_col).alias("tmp")])
        .with_columns(
            [
                pl.col("tmp").shift(term3 // 2).over(group_col).alias("tmp_left"),
                pl.col("tmp").shift(-(term3 // 2)).over(group_col).alias("tmp_right"),
            ]
        )
        .with_columns(
            [
                (pl.col("tmp_left") | pl.col("tmp_right")).alias("is_large_gap"),
            ]
        )
        .drop(["tmp", "tmp_left", "tmp_right"])
        .with_columns([pl.col("is_large_gap").not_().alias("is_sleep_episode")])
        #
        # extract longest sleep episode
        #
        .with_columns(
            [
                # Count false->true transitions so each episode within a night
                # gets its own running id. Grouping honours day_group_col.
                (
                    (
                        pl.col("is_sleep_episode")
                        & pl.col("is_sleep_episode")
                        .shift_and_fill(pl.lit(False), periods=1)
                        .not_()
                    )
                    .cumsum()
                    .over(day_group_col)
                ).alias("sleep_episode_id")
            ]
        )
        .with_columns(
            [
                # Only True rows contribute, so rows trailing an episode that
                # share its id do not inflate the length.
                pl.col("is_sleep_episode")
                .sum()
                .over([day_group_col, "sleep_episode_id"])
                .alias("sleep_episode_length")
            ]
        )
        .with_columns(
            [
                pl.col("sleep_episode_length")
                .max()
                .over([day_group_col])
                .alias("max_sleep_episode_length")
            ]
        )
        .with_columns(
            [
                (
                    pl.col("is_sleep_episode")
                    & (pl.col("sleep_episode_length") == pl.col("max_sleep_episode_length"))
                ).alias("is_longest_sleep_episode")
            ]
        )
    )

In [6]:
%%time

# Read the raw inputs. `sample_submission` is loaded here but is not referenced
# by any cell visible in this view.
tr_events = pl.read_csv(TRAIN_EVENTS_PATH)
tr_series = pl.read_parquet(TRAIN_SERIES_PATH)
sample_submission = pl.read_csv(SAMPLE_SUBMISSION_PATH)

# Counts are taken from the raw events table, i.e. before it is pivoted below.
series_ids = tr_events["series_id"].unique()
print(f"#Events: {len(tr_events)}")
print(f"#Series: {len(series_ids)}")

# Each name is rebound to its transformed frame step by step; the raw frames
# are not kept under a separate name.
tr_series = transform_series(tr_series)
tr_series = add_feature(tr_series)
tr_events = transform_events(tr_events)

#Events: 14508
#Series: 277
CPU times: user 11min 5s, sys: 38.3 s, total: 11min 43s
Wall time: 1min 31s


In [7]:
# Keep the row keys plus the heuristic flags that get exported, and turn the
# nulls left in the flag columns into False.
use_columns = [
    "series_id",
    "step",
    "is_longest_sleep_episode",
    "is_sleep_block",
    "is_gap",
    "is_large_gap",
]
tr_series = tr_series.select(use_columns).fill_null(False)
tr_series.head()

series_id,step,is_longest_sleep_episode,is_sleep_block,is_gap,is_large_gap
str,u32,bool,bool,bool,bool
"""038441c925bb""",0,False,False,False,False
"""038441c925bb""",1,False,False,False,False
"""038441c925bb""",2,False,False,False,False
"""038441c925bb""",3,False,False,False,False
"""038441c925bb""",4,False,False,False,False


In [10]:
# Write one parquet file per series, rows ordered by step.
# The directory name is kept exactly as-is (including its spelling) because
# downstream readers may depend on it — rename only together with them.
out_dir = Path("heauristic_features")
out_dir.mkdir(parents=True, exist_ok=True)  # loop-invariant, so done once

for key, sdf in tr_series.group_by("series_id"):
    # Depending on the polars version, iterating a single-key group_by yields
    # either the bare key or a 1-tuple; normalise so the file name is the id.
    sid = key[0] if isinstance(key, tuple) else key
    path = out_dir / f"{sid}.parquet"
    sdf.sort("step").write_parquet(path)