In [1]:
!pip install pandas
!pip install seaborn
!pip install pyarrow
!pip install matplotlib





In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

In [3]:
# === 1. Load raw data ===
# Reads the labelled events (CSV) and the series table (parquet) from the raw
# data folder. The two reads are independent of each other.
print("📥 Lade Rohdaten...")
train_events = pd.read_csv(
    filepath_or_buffer='/home/jovyan/AICOMP/Second/code/data/raw/train_events.csv'
)
df = pd.read_parquet(
    path='/home/jovyan/AICOMP/Second/code/data/raw/train_series.parquet'
)

📥 Lade Rohdaten...


KeyboardInterrupt: 

In [None]:
# === 2. Timestamp conversion in batches ===
# Converts the 'timestamp' column of `df` to UTC datetimes one slice of rows at
# a time (printing progress after each slice), then drops duplicate rows and
# writes the result to the processed-data folder.
print("🔄 Konvertiere Timestamp-Spalte...")
total_rows = len(df)
batch_size = 1_000_000
# Positional index of the column; it does not change between batches.
timestamp_col = df.columns.get_loc('timestamp')
for i in range(0, total_rows, batch_size):
    start = i
    end = min(i + batch_size, total_rows)
    df.iloc[start:end, timestamp_col] = pd.to_datetime(
        df.iloc[start:end, timestamp_col], utc=True
    )
    print(f"Pocessed rows {start + 1} bis {end}... ({(end / total_rows) * 100:.2f}% finished)")

df = df.drop_duplicates()
# NOTE(review): the raw data is read from .../AICOMP/Second/code/..., but this
# writes to .../AICOMP/code/... — confirm the output directory is intended.
df.to_parquet('/home/jovyan/AICOMP/code/data/processed/train_series_cleaned.parquet', index=False)

In [None]:
# === 3. Load the cleaned series and tidy up the event table ===
df_cleaned = pd.read_parquet(
    path='/home/jovyan/AICOMP/code/data/processed/train_series_cleaned.parquet'
)

# Events: parse timestamps as UTC, drop exact duplicates and rows without a
# timestamp, then order by series and step.
train_events = (
    train_events
    .assign(timestamp=pd.to_datetime(train_events['timestamp'], utc=True))
    .drop_duplicates()
    .dropna(subset=['timestamp'])
    .sort_values(by=["series_id", "step"])
)



In [None]:
# === 4. Drop every series_id whose event log skips a night ===
# A series has a gap when some night number between 1 and its highest recorded
# night has no event row with a non-null step.
print("🧹 Removing series_ids with night gaps...")
gaps = []

for series_id, group in train_events.groupby("series_id"):
    group = group.sort_values(by="step")
    recorded_nights = set(group.loc[group["step"].notna(), "night"].unique())
    expected_nights = set(range(1, group["night"].max() + 1))
    missing_nights = expected_nights - recorded_nights
    if not missing_nights:
        continue
    print(f"🚨 {series_id} has gaps in nights {sorted(missing_nights)}!")
    gaps.append(series_id)

# Summary of what was found
if len(gaps) == 0:
    print("No gaps found!")
removed_ids = len(gaps)
print(f" {removed_ids:,} series_ids removed due to night gaps.")

# Remove the affected series from the sensor data ...
has_gap = df_cleaned["series_id"].isin(gaps)
print(f"Before: {df_cleaned.shape[0]} rows")
df_cleaned = df_cleaned[~has_gap]
print(f"After: {df_cleaned.shape[0]} rows")

# ... and build an event table without them as well
event_cleaned = train_events[~train_events["series_id"].isin(gaps)]

In [None]:
# Persist the gap-free event table as CSV (written without the index column).
fi = "/home/jovyan/AICOMP/Second/code/data/raw/event_cleaned.csv"
event_cleaned.to_csv(path_or_buf=fi, index=False)

In [None]:
# Keep only sensor rows whose series_id also occurs in the event table
rows_before = len(df_cleaned)
print(f"Before: {rows_before} rows")
in_events = df_cleaned['series_id'].isin(train_events['series_id'])
df_cleaned = df_cleaned.loc[in_events]
print(f"After: {len(df_cleaned)} rows")

In [None]:
# Show how many 'onset' and 'wakeup' rows the cleaned event table contains
event_counts = event_cleaned['event'].value_counts()
event_counts.loc[['onset', 'wakeup']]

In [None]:
# === 5. Drop sensor rows that lie after the last labelled event of their series ===
# For each series_id the highest step in the event table is the cut-off; rows
# of a series without any cut-off are compared against +inf.
# The mask is built column-wise instead of with DataFrame.apply(axis=1), which
# would run one Python call per row.
print("🔍 Filtering steps outside event boundaries...")
max_steps = train_events.groupby("series_id")["step"].max().to_dict()

has_limit = df_cleaned["series_id"].isin(list(max_steps))
step_limit = df_cleaned["series_id"].map(max_steps).astype("float64")
# Only series that are absent from the event table get the +inf fallback; a NaN
# cut-off of a known series stays NaN, so the comparison below is False for it.
step_limit = step_limit.where(has_limit, float("inf"))

df_cleaned = df_cleaned[df_cleaned["step"] <= step_limit]

In [None]:
# Attach the event rows to the sensor rows (left join on series_id + timestamp);
# sensor rows without a matching event get NaN in the event columns.
merged_dff = df_cleaned.merge(train_events, how='left', on=['series_id', 'timestamp'])

# Remember which rows carry a genuine event before anything is filled in
original_mask = merged_dff["event"].notna()

# Carry the last seen night/event forward inside each series
carried = merged_dff.groupby("series_id")[["night", "event"]].ffill()

# Rows before the first event of a series are still empty: treat them as
# following a 'wakeup' (the code assumes a series starts awake) and as night 0.
merged_dff["event"] = carried["event"].fillna("wakeup")
merged_dff["night"] = carried["night"].fillna(0)

# Relabel only the filled-in rows as states; genuine event rows keep their label
filled_mask = ~original_mask
state_for_event = {"wakeup": "awake", "onset": "asleep"}
for event_label, state_label in state_for_event.items():
    is_filled_label = filled_mask & merged_dff["event"].eq(event_label)
    merged_dff.loc[is_filled_label, "event"] = state_label

In [None]:
# Number of rows that still carry a genuine event label
for label in ("onset", "wakeup"):
    print(f"'{label}' count: {(merged_dff['event'] == label).sum():,}")

In [None]:
# Number of rows that were relabelled as a state
for label in ("awake", "asleep"):
    print(f"'{label}' count: {(merged_dff['event'] == label).sum():,}")

In [None]:
# The merge leaves two step columns behind ('step_x' and 'step_y').
# Drop 'step_y' if it exists and give 'step_x' its plain name 'step' back;
# both operations do nothing when the column is absent.
merged_dff = (
    merged_dff
    .drop(columns=["step_y"], errors="ignore")
    .rename(columns={"step_x": "step"})
)

#Feature Engineering

In [None]:
# === 8. Time-based features ===
# Derived from the 'timestamp' column: hour, minute, day of week, minutes
# elapsed since midnight, and a weekend flag (day_of_week >= 5).
# NOTE(review): the timestamps were parsed with utc=True earlier in the
# notebook, so these are UTC clock values — confirm that is intended.
merged_dff['hour'] = merged_dff['timestamp'].dt.hour.astype('int8')
merged_dff['minute'] = merged_dff['timestamp'].dt.minute.astype('int8')
merged_dff['day_of_week'] = merged_dff['timestamp'].dt.dayofweek.astype('int8')
# Widen to int32 BEFORE the arithmetic: 'hour' and 'minute' are int8, so
# hour * 60 + minute would be evaluated in int8 and wrap around for most hours
# (the maximum, 23 * 60 + 59 = 1439, is far above the int8 limit of 127).
merged_dff['elapsed_time_from_midnight'] = (
    merged_dff['hour'].astype('int32') * 60 + merged_dff['minute'].astype('int32')
)
merged_dff['is_weekend'] = (merged_dff['day_of_week'] >= 5).astype('int8')

In [None]:
# Number of rows in the merged frame
merged_dff.shape[0]

In [None]:
# Rolling statistics of 'anglez' and 'enmo' over several window lengths.
# The windows are evaluated separately for each series_id, so the first rows of
# one recording never see values from the end of the previous recording.
print("📊 Calculating rolling statistical features...")
# Window lengths are counted in rows (the original note calls them 5-second
# steps); the 's' suffix in the column names is kept for compatibility.
window_sizes = [12, 60, 360]
# (column-name suffix, rolling method), in the order the columns are created
rolling_stats = [
    ("mean", "mean"),
    ("std", "std"),
    ("min", "min"),
    ("max", "max"),
    ("median", "median"),
    ("cumulative", "sum"),
]
series_key = merged_dff["series_id"]
for window in window_sizes:
    for col in ['anglez', 'enmo']:
        per_series = merged_dff[col].groupby(series_key, sort=False, observed=True)
        for suffix, method in rolling_stats:
            # transform() returns the result aligned to the original row index
            merged_dff[f'{col}_{suffix}_{window}s'] = per_series.transform(
                lambda values, method=method, window=window: getattr(
                    values.rolling(window, min_periods=1), method
                )()
            ).astype('float32')

In [None]:
# Preview the first rows of the merged frame
merged_dff.head()

In [None]:
# === 9. Advanced Feature Engineering ===
# Adds a night flag, the row-to-row change of anglez, moving averages of enmo
# and lagged copies of selected columns. Every shift/rolling operation is done
# per series_id so that values never leak from one recording into the next.
print("🧠 Adding more meaningful features...")

series_key = merged_dff["series_id"]


def _per_series(column):
    """Return `merged_dff[column]` grouped by series_id, keeping row order."""
    return merged_dff[column].groupby(series_key, sort=False, observed=True)


# is_night (int8 binary flag): hour is 22 or later, or 6 or earlier.
# NOTE(review): 'hour' comes from timestamps parsed with utc=True earlier in
# the notebook, so this is a UTC window, not local night time — confirm.
merged_dff["is_night"] = ((merged_dff["hour"] >= 22) | (merged_dff["hour"] <= 6)).astype('int8')

# anglez_delta: difference to the previous row of the same series (float32);
# the first row of each series has no predecessor and stays NaN.
merged_dff["anglez_delta"] = _per_series("anglez").diff().astype('float32')

# SMA of enmo (Simple Moving Average) per series, cast to float32
sma_windows = [12, 60, 360]
for window in sma_windows:
    merged_dff[f'enmo_sma_{window}s'] = _per_series("enmo").transform(
        lambda values, window=window: values.rolling(window, min_periods=1).mean()
    ).astype('float32')

# Lag features per series (float32); the first `lag` rows of a series are NaN
lag_targets = [
    "anglez", "enmo",
    "anglez_mean_12s", "anglez_std_12s",
    "enmo_mean_12s", "enmo_std_12s",
    "anglez_mean_60s", "anglez_std_60s",
    "enmo_mean_60s", "enmo_std_60s"
]
lag_steps = [1, 2, 3]
for col in lag_targets:
    grouped_col = _per_series(col)
    for lag in lag_steps:
        merged_dff[f"{col}_lag_{lag}"] = grouped_col.shift(lag).astype('float32')

In [None]:
# Columns that contain at least one missing value
na_cols = merged_dff.columns[merged_dff.isna().any()]

# Of those, only the numeric ones are imputed, each with its own column median;
# non-numeric columns are left untouched.
numeric_na_cols = [
    name for name in na_cols
    if pd.api.types.is_numeric_dtype(merged_dff[name])
]
for name in numeric_na_cols:
    column_median = merged_dff[name].median()
    merged_dff[name] = merged_dff[name].fillna(column_median)


In [None]:
# List every column that still has missing values, with its NaN count
print("Noch NaNs übrig?")
nan_counts = merged_dff.isna().sum()
print(nan_counts[nan_counts > 0])

In [None]:
# Print the column/dtype/memory summary of the merged frame
merged_dff.info()

In [None]:
# === 10. Re-encode the target variables ===
# Replace the string labels in 'event' with integer codes.
# Note: this cell is not re-runnable on its own — mapping the already encoded
# integers again yields NaN, because the integers are not keys of the dict.
event_mapping = {
    "asleep": 0,
    "awake": 1,
    "onset": 2,
    "wakeup": 3
}
merged_dff["event"] = merged_dff["event"].map(event_mapping)

# Binary target for classification: 1 on onset/wakeup rows, 0 everywhere else.
# Vectorised isin() replaces a per-row Python lambda; the codes are taken from
# event_mapping instead of being repeated as magic numbers.
transition_codes = [event_mapping["onset"], event_mapping["wakeup"]]
merged_dff["target"] = merged_dff["event"].isin(transition_codes).astype("int64")



In [None]:
# === 11. Save ===
# Write the fully preprocessed frame to parquet (without the index) and report
# where it was stored.
final_output = "/home/jovyan/AICOMP/code/data/processed/merged_dff_gold84_V3.parquet"
merged_dff.to_parquet(path=final_output, index=False)
print(f"✅ Finales Preprocessed Dataset gespeichert unter: {final_output}")