In [None]:
# Sleep Definitions, Rules, and Constraints for the Kaggle Sleep Detection Competition

# Goal:
# - Detect two types of events: sleep onset (beginning of sleep) and wake-up (end of sleep) from accelerometer data.

# Sleep Definition:
# - Sleep is defined as the longest single period of inactivity while the accelerometer is being worn.

# Important Guidelines:
# - A single sleep period must be at least 30 minutes in length.
# - A sleep period can be interrupted by bouts of activity that do not exceed 30 consecutive minutes.
# - Only the longest sleep window is recorded per night.
# - No sleep windows can be detected unless the watch is deemed to be worn for the duration.
# - If no valid sleep window is identifiable, neither an onset nor a wakeup event is recorded for that night.
# - Sleep events do not need to straddle the day-line, and there's no hard rule defining how many may occur within a given period.
# - However, no more than one window should be assigned per night.
# - For example, it's valid to have a sleep window from 01:00–06:00 and 19:00–23:30 in the same calendar day, though assigned to consecutive nights.

# Data Handling Guidelines:
# - There might be periods within a series where the accelerometer device was removed. No events should be predicted during these periods, because any such predictions will be scored as false positives.

In [3]:
import pandas as pd
import numpy as np

# Read the competition inputs from ../data: the train and test series
# (parquet files) and the training events table (CSV).
train_series = pd.read_parquet('../data/train_series.parquet')
test_series = pd.read_parquet('../data/test_series.parquet')
train_events = pd.read_csv('../data/train_events.csv')

# First look at each dataset: one line with its shape, then its leading rows
# from DataFrame.head(). Passing both pieces to a single print call with a
# newline separator produces the same text as printing them one after another.
print(f"Training Series: {train_series.shape}", train_series.head(), sep="\n")
print(f"Test Series: {test_series.shape}", test_series.head(), sep="\n")
print(f"Training Events: {train_events.shape}", train_events.head(), sep="\n")


Training Series: (127946340, 5)
      series_id  step                 timestamp  anglez    enmo
0  038441c925bb     0  2018-08-14T15:30:00-0400  2.6367  0.0217
1  038441c925bb     1  2018-08-14T15:30:05-0400  2.6368  0.0215
2  038441c925bb     2  2018-08-14T15:30:10-0400  2.6370  0.0216
3  038441c925bb     3  2018-08-14T15:30:15-0400  2.6368  0.0213
4  038441c925bb     4  2018-08-14T15:30:20-0400  2.6368  0.0215
Test Series: (450, 5)
      series_id  step                 timestamp  anglez    enmo
0  038441c925bb     0  2018-08-14T15:30:00-0400  2.6367  0.0217
1  038441c925bb     1  2018-08-14T15:30:05-0400  2.6368  0.0215
2  038441c925bb     2  2018-08-14T15:30:10-0400  2.6370  0.0216
3  038441c925bb     3  2018-08-14T15:30:15-0400  2.6368  0.0213
4  038441c925bb     4  2018-08-14T15:30:20-0400  2.6368  0.0215
Training Events: (14508, 5)
      series_id  night   event     step                 timestamp
0  038441c925bb      1   onset   4992.0  2018-08-14T22:26:00-0400
1  038441c925bb   

In [4]:
# Report how many distinct series_id values each dataset contains.
# A single print call with a newline separator emits one count per line.
print(
    f"Training Series: {train_series.series_id.nunique()}",
    f"Test Series: {test_series.series_id.nunique()}",
    f"Training Events: {train_events.series_id.nunique()}",
    sep="\n",
)


Training Series: 277
Test Series: 3
Training Events: 277
