In [1]:
!pip install pandas
!pip install seaborn
!pip install pyarrow
!pip install matplotlib





In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import gc

In [3]:
# === 1. Load raw data ===
# Event annotations come from the CSV, the raw series from the parquet file.
train_events = pd.read_csv('data/train_events.csv')
df = pd.read_parquet('data/train_series.parquet')

📥 Lade Rohdaten...


In [5]:
# === 2. Timestamp conversion in batches ===
# Parse the `timestamp` column to timezone-aware UTC datetimes one slice at a
# time (bounds the parser's peak memory). The parsed slices are collected and
# the column is replaced in ONE assignment afterwards: writing the datetimes
# back slice by slice via .iloc keeps the existing column's dtype, so a column
# loaded as strings would end up as an object column of boxed Timestamp values
# (far larger in memory and much slower in drop_duplicates below).
print("🔄 Convert Timestamp-column...")
total_rows = len(df)
batch_size = 1_000_000
converted_batches = []
for start in range(0, total_rows, batch_size):
    end = min(start + batch_size, total_rows)
    converted_batches.append(
        pd.to_datetime(df['timestamp'].iloc[start:end], utc=True)
    )
    print(f"Pocessed rows {start + 1} bis {end}... ({(end / total_rows) * 100:.2f}% finished)")

if converted_batches:  # an empty frame has nothing to convert
    # `.array` keeps the UTC-aware datetime dtype and is assigned positionally,
    # so the result does not depend on the frame's index labels.
    df['timestamp'] = pd.concat(converted_batches).array
del converted_batches

# Remove exact duplicate rows, then persist the cleaned series for the next steps.
df = df.drop_duplicates()
df.to_parquet('processed/train_series_cleaned.parquet', index=False)

🔄 Konvertiere Timestamp-Spalte...
Pocessed rows 1 bis 1000000... (0.78% finished)
Pocessed rows 1000001 bis 2000000... (1.56% finished)
Pocessed rows 2000001 bis 3000000... (2.34% finished)
Pocessed rows 3000001 bis 4000000... (3.13% finished)
Pocessed rows 4000001 bis 5000000... (3.91% finished)
Pocessed rows 5000001 bis 6000000... (4.69% finished)
Pocessed rows 6000001 bis 7000000... (5.47% finished)
Pocessed rows 7000001 bis 8000000... (6.25% finished)
Pocessed rows 8000001 bis 9000000... (7.03% finished)
Pocessed rows 9000001 bis 10000000... (7.82% finished)
Pocessed rows 10000001 bis 11000000... (8.60% finished)
Pocessed rows 11000001 bis 12000000... (9.38% finished)
Pocessed rows 12000001 bis 13000000... (10.16% finished)
Pocessed rows 13000001 bis 14000000... (10.94% finished)
Pocessed rows 14000001 bis 15000000... (11.72% finished)
Pocessed rows 15000001 bis 16000000... (12.51% finished)
Pocessed rows 16000001 bis 17000000... (13.29% finished)
Pocessed rows 17000001 bis 1800000

In [10]:
# Release the in-memory series frame; the cleaned copy is read back from disk
# in the next step. (Re-reading the whole parquet file here only to delete it
# on the following line would cost a full load for nothing.)
try:
    del df
except NameError:
    # Already released, e.g. because this cell was executed a second time.
    pass
gc.collect()

1753

In [11]:
# === 3. Reload the cleaned series and tidy the event table ===
df_cleaned = pd.read_parquet('processed/train_series_cleaned.parquet')

# Parse the event timestamps as UTC, drop exact duplicates and events without
# a timestamp, then order the events by step within each series.
train_events = (
    train_events
    .assign(timestamp=pd.to_datetime(train_events['timestamp'], utc=True))
    .drop_duplicates()
    .dropna(subset=['timestamp'])
    .sort_values(by=["series_id", "step"])
)


In [12]:
# === 4. Drop every series_id whose annotated nights have gaps ===
# (where nights are missing, timestamps are missing as well)
print("🧹 Removing series_ids with night gaps...")
gaps = []

# Per series: compare the nights that carry a step against the full range 1..max(night).
for series_id, group in train_events.groupby("series_id"):
    annotated_nights = group.loc[group["step"].notna(), "night"].unique()
    expected_nights = set(range(1, group["night"].max() + 1))
    missing_nights = expected_nights.difference(annotated_nights)
    if missing_nights:
        print(f"🚨 {series_id} has gaps in nights {sorted(missing_nights)}!")
        gaps.append(series_id)

# Summarise what was found
if len(gaps) == 0:
    print("No gaps found!")
removed_ids = len(gaps)
print(f" {removed_ids:,} series_ids removed due to night gaps.")

# Drop the affected series from the series frame and report the row counts
print(f"Before: {df_cleaned.shape[0]} rows")
keep_series_rows = ~df_cleaned["series_id"].isin(gaps)
df_cleaned = df_cleaned[keep_series_rows]
print(f"After: {df_cleaned.shape[0]} rows")

# The event table gets the same treatment (stored under a new name)
keep_event_rows = ~train_events["series_id"].isin(gaps)
event_cleaned = train_events[keep_event_rows]

🧹 Removing series_ids with night gaps...
🚨 038441c925bb has gaps in nights [5, 9, 14]!
🚨 03d92c9f6f8a has gaps in nights [4, 6, 7, 8, 9, 10, 13]!
🚨 0402a003dae9 has gaps in nights [3, 4, 5, 19]!
🚨 04f547b8017d has gaps in nights [1, 4, 17, 20, 23]!
🚨 05e1944c3818 has gaps in nights [1]!
🚨 062cae666e2a has gaps in nights [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]!
🚨 062dbd4c95e6 has gaps in nights [2, 3, 7, 8, 12, 13, 16, 17, 18, 19, 21, 22, 23]!
🚨 0cd1e3d0ed95 has gaps in nights [6, 7, 8, 9, 10, 11, 12]!
🚨 0d0ad1e77851 has gaps in nights [5, 6, 9]!
🚨 0dee4fda51c3 has gaps in nights [15, 16, 18, 22]!
🚨 0ec9fc461819 has gaps in nights [23, 24, 25]!
🚨 0f572d690310 has gaps in nights [4, 5, 6, 10, 13, 19, 20, 21]!
🚨 10469f6765bf has gaps in nights [2]!
🚨 12d01911d509 has gaps in nights [5, 20]!
🚨 137771d19ca2 has gaps in nights [1]!
🚨 137b99e936ab has gaps in nights [2, 3, 8, 11, 12, 18, 21, 25, 26, 27]!
🚨 13b4d6a01d27 has gaps in nights [1]!
🚨 148471991ffb has gaps in nights [3, 4, 5, 6, 7, 9, 10,

In [13]:
# Write the gap-free event table to disk as CSV, without the index column.
fi = "processed/event_cleaned.csv"
event_cleaned.to_csv(path_or_buf=fi, index=False)

In [14]:
# Keep only the series rows whose series_id also occurs in the event table.
print(f"Before: {len(df_cleaned)} rows")
in_event_table = df_cleaned['series_id'].isin(train_events['series_id'])
df_cleaned = df_cleaned.loc[in_event_table]
print(f"After: {len(df_cleaned)} rows")

Before: 36869220 rows
After: 33744960 rows


In [15]:
# How many 'onset' and 'wakeup' rows the cleaned event table contains.
event_counts = event_cleaned['event'].value_counts()
event_counts.loc[['onset', 'wakeup']]

event
onset     1476
wakeup    1477
Name: count, dtype: int64

In [16]:
# === 5. Drop series rows that lie beyond the last event step of their series ===
# Vectorised on purpose: DataFrame.apply(axis=1) makes one Python call per row,
# which is extremely slow on a frame with tens of millions of rows.
print("🔍 Filtering steps outside event boundaries...")
max_steps = train_events.groupby("series_id")["step"].max().to_dict()

# Per-row upper bound; series_ids missing from max_steps map to NaN here.
last_event_step = df_cleaned["series_id"].map(max_steps)
# A series without an entry in max_steps has no upper bound and is kept whole.
unbounded = ~df_cleaned["series_id"].isin(list(max_steps))
# Comparing against a NaN bound is False, so such rows are dropped.
df_cleaned = df_cleaned[unbounded | (df_cleaned["step"] <= last_event_step)]

🔍 Filtering steps outside event boundaries...


In [17]:
# 1. Attach the events to the series rows; the left join keeps every series row
merged_dff = df_cleaned.merge(train_events, how='left', on=['series_id', 'timestamp'])

# The un-merged series frame is not needed any more
del df_cleaned
gc.collect()

# 2. Remember which rows carry an event that came directly from the event table
original_mask = merged_dff["event"].notna()

# 3. Carry 'night' and 'event' forward inside each series_id
fill_cols = ["night", "event"]
merged_dff[fill_cols] = merged_dff.groupby("series_id")[fill_cols].ffill()

# 4. Rows before the first event of a series are still empty after the forward fill.
#    They are treated as following a wakeup (the first event is always an onset,
#    so the person is awake when the time series starts) and assigned night 0.
merged_dff["night"] = merged_dff["night"].fillna(0)
merged_dff["event"] = merged_dff["event"].fillna("wakeup")

# 5. Turn the *filled* labels into states; the original event rows stay untouched
filled_mask = ~original_mask
state_after_event = {"wakeup": "awake", "onset": "asleep"}
for event_label, state_label in state_after_event.items():
    merged_dff.loc[filled_mask & (merged_dff["event"] == event_label), "event"] = state_label

In [18]:
# Rows still labelled with one of the two event names.
onset_total = merged_dff['event'].eq('onset').sum()
wakeup_total = merged_dff['event'].eq('wakeup').sum()
print(f"'onset' count: {onset_total:,}")
print(f"'wakeup' count: {wakeup_total:,}")

'onset' count: 1,476
'wakeup' count: 1,477


In [19]:
# Rows labelled with one of the two state names.
awake_total = merged_dff['event'].eq('awake').sum()
asleep_total = merged_dff['event'].eq('asleep').sum()
print(f"'awake' count: {awake_total:,}")
print(f"'asleep' count: {asleep_total:,}")

'awake' count: 16,104,952
'asleep' count: 9,045,648


In [20]:
# The merge produced two step columns ('step_x' / 'step_y').
# Drop 'step_y' when it exists ...
merged_dff = merged_dff.drop(columns=["step_y"], errors="ignore")

# ... and give 'step_x' its plain name 'step' back (a no-op when it is absent).
merged_dff = merged_dff.rename(columns={"step_x": "step"})

# Feature Engineering


In [21]:
# === 7. Time-based features ===
# NOTE: `timestamp` is timezone-aware UTC, so every feature below is expressed
# in UTC clock time.
merged_dff['hour'] = merged_dff['timestamp'].dt.hour.astype('int8')
merged_dff['minute'] = merged_dff['timestamp'].dt.minute.astype('int8')
merged_dff['day_of_week'] = merged_dff['timestamp'].dt.dayofweek.astype('int8')

# Minutes since midnight. `hour` is int8, and int8 * 60 is evaluated in int8:
# it silently wraps around for every hour >= 3 (e.g. 23 * 60 = 1380 does not
# fit into int8). Widen to int32 BEFORE multiplying, not afterwards.
merged_dff['elapsed_time_from_midnight'] = (
    merged_dff['hour'].astype('int32') * 60 + merged_dff['minute'].astype('int32')
).astype('int32')

# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
merged_dff['is_weekend'] = (merged_dff['day_of_week'] >= 5).astype('int8')

In [23]:
# === 8. Rolling statistical features ===
# The windows are counted in rows (steps). The "s" suffix of the column names
# is kept unchanged because later cells refer to these exact names.
#
# The statistics are computed per series_id: the frame holds several series
# stacked on top of each other, and a plain .rolling() over the whole column
# would let the first rows of one series include values from the end of the
# previous series.

window_sizes = [12, 60, 360]  # in 5-second steps
# column-name suffix -> rolling aggregation ("cumulative" is the rolling sum)
rolling_stats = {
    "mean": "mean",
    "std": "std",
    "min": "min",
    "max": "max",
    "median": "median",
    "cumulative": "sum",
}
for window in window_sizes:
    for col in ['anglez', 'enmo']:
        per_series_window = merged_dff.groupby("series_id", sort=False)[col].rolling(window, min_periods=1)
        for suffix, stat in rolling_stats.items():
            # The result is indexed by (series_id, original row label); dropping
            # the series_id level lets the assignment align on the row labels.
            stat_values = getattr(per_series_window, stat)().reset_index(level=0, drop=True)
            merged_dff[f'{col}_{suffix}_{window}s'] = stat_values.astype('float32')

📊 Calculating rolling statistical features...


In [25]:
# === 9. Advanced Feature Engineering ===
# Every shift / rolling feature below is computed per series_id. The frame holds
# several series stacked on top of each other, so a plain .shift() or .rolling()
# over the whole column would let the first rows of one series read values from
# the end of the previous series. The first rows of each series therefore hold
# NaN in the shifted features.
series_key = merged_dff["series_id"]

# is_night (as int8 for binary flag): hours 22-23 and 0-6 of the `hour` column
merged_dff["is_night"] = ((merged_dff["hour"] >= 22) | (merged_dff["hour"] <= 6)).astype('int8')

# anglez_delta: change of anglez versus the previous row of the same series
# (as float32 to save memory)
previous_anglez = merged_dff["anglez"].groupby(series_key, sort=False).shift(1)
merged_dff["anglez_delta"] = (merged_dff["anglez"] - previous_anglez).astype('float32')

# SMA of enmo (Simple Moving Average) within each series, cast to float32
sma_windows = [12, 60, 360]
for window in sma_windows:
    merged_dff[f'enmo_sma_{window}s'] = (
        merged_dff.groupby("series_id", sort=False)['enmo']
        .rolling(window, min_periods=1)
        .mean()
        # drop the series_id level so the assignment aligns on the row labels
        .reset_index(level=0, drop=True)
        .astype('float32')
    )

# Lag Features within each series (cast to float32 for efficiency)
lag_targets = [
    "anglez", "enmo",
    "anglez_mean_12s", "anglez_std_12s",
    "enmo_mean_12s", "enmo_std_12s",
    "anglez_mean_60s", "anglez_std_60s",
    "enmo_mean_60s", "enmo_std_60s"
]
lag_steps = [1, 2, 3]
for col in lag_targets:
    for lag in lag_steps:
        lagged = merged_dff[col].groupby(series_key, sort=False).shift(lag)
        merged_dff[f"{col}_lag_{lag}"] = lagged.astype('float32')

In [26]:
# Columns that contain at least one NaN
na_cols = merged_dff.columns[merged_dff.isna().any()]

# Only the numeric ones are imputed, each with its own column median
numeric_na_cols = [
    name for name in na_cols
    if pd.api.types.is_numeric_dtype(merged_dff[name])
]
for name in numeric_na_cols:
    column_median = merged_dff[name].median()
    merged_dff[name] = merged_dff[name].fillna(column_median)


In [27]:
# Show the NaN count of every column that still has missing values
# (an empty Series means nothing is left).
print("Noch NaNs übrig?")
nan_counts = merged_dff.isna().sum()
print(nan_counts[nan_counts > 0])

Noch NaNs übrig?
Series([], dtype: int64)


In [28]:
# Print the frame summary (index range, column names and dtypes) as a final schema check.
merged_dff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25153553 entries, 0 to 25153552
Data columns (total 83 columns):
 #   Column                      Dtype              
---  ------                      -----              
 0   series_id                   object             
 1   step                        uint32             
 2   timestamp                   datetime64[us, UTC]
 3   anglez                      float32            
 4   enmo                        float32            
 5   night                       float64            
 6   event                       object             
 7   hour                        int8               
 8   minute                      int8               
 9   day_of_week                 int8               
 10  elapsed_time_from_midnight  int32              
 11  is_weekend                  int8               
 12  anglez_mean_12s             float32            
 13  anglez_std_12s              float32            
 14  anglez_min_12s              floa

In [29]:
# === 10. Target variables ===
# Integer code for each label of the `event` column.
event_mapping = {
    "asleep": 0,
    "awake": 1,
    "onset": 2,
    "wakeup": 3
}
encoded_event = merged_dff["event"].map(event_mapping)

# Series.map turns every value that is not a key of the mapping into NaN. That
# also happens when this cell is executed a second time on the already encoded
# column, which would silently wipe all labels (and make every target 0).
# Fail loudly instead of writing a corrupted label column.
if encoded_event.isna().any():
    raise ValueError(
        "Column 'event' contains values outside event_mapping "
        "(was this cell already executed on the encoded column?)"
    )
merged_dff["event"] = encoded_event

# Binary target for classification: onset & wakeup = 1, everything else = 0.
# isin() is vectorised; a row-wise apply with a Python lambda is far slower on
# a frame of this size and yields the same values.
transition_codes = [event_mapping["onset"], event_mapping["wakeup"]]
merged_dff["target"] = merged_dff["event"].isin(transition_codes).astype("int64")


In [30]:
# === 11. Save ===
# Write the fully preprocessed frame to parquet (without the index) and report the path.
final_output = "processed/merged_dff_gold84_V3.parquet"
merged_dff.to_parquet(path=final_output, index=False)
print(f" Finales Preprocessed Dataset safed under: {final_output}")

 Finales Preprocessed Dataset safed under: processed/merged_dff_gold84_V3.parquet
