In [1]:
import gc
import time
from itertools import groupby

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
# Read the training table from the parquet file under processed/.
train = pd.read_parquet("processed/Zzzs_train_multi.parquet")

In [3]:
# Preview the first rows of the loaded frame.
train.head()


Unnamed: 0,series_id,step,timestamp,anglez,enmo,awake
0,08db4255286f,0,2018-11-05T10:00:00-0400,-30.845301,0.0447,1
1,08db4255286f,1,2018-11-05T10:00:05-0400,-34.181801,0.0443,1
2,08db4255286f,2,2018-11-05T10:00:10-0400,-33.877102,0.0483,1
3,08db4255286f,3,2018-11-05T10:00:15-0400,-34.282101,0.068,1
4,08db4255286f,4,2018-11-05T10:00:20-0400,-34.385799,0.0768,1


In [4]:
# --- Feature Engineering ---
def make_features(df, periods=20):
    """Add time-of-day, lagged-difference and rolling-window features to ``df``.

    All lag and window computations are done separately for each
    ``series_id`` so that values from one recording never bleed into the
    neighbouring recording at a series boundary.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``series_id``, ``timestamp``, ``anglez`` and
        ``enmo``. The frame is modified in place: ``timestamp`` is converted
        to naive wall-clock datetimes, ``anglez`` is replaced by its absolute
        value and the feature columns are added.
    periods : int, default 20
        Lag used for the differences and window length used for the centred
        rolling statistics, in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the additional feature columns.

    Notes
    -----
    Edge rows of each series (where the lag / centred window is incomplete)
    are filled from the nearest valid value *within the same series*. A series
    with fewer than ``periods`` rows has no valid window at all, so its
    lag/rolling features stay NaN.
    """
    df["timestamp"] = _to_naive_local(df["timestamp"])
    df["hour"] = df["timestamp"].dt.hour

    df["anglez"] = df["anglez"].abs()

    def per_series(col):
        # Re-group on every call so freshly added columns are always visible.
        return df.groupby("series_id", sort=False)[col]

    def rolled(col, stat):
        # Centred rolling statistic, edge-filled inside the series only.
        return per_series(col).transform(
            lambda s: getattr(s.rolling(periods, center=True), stat)().bfill().ffill()
        )

    for col in ["anglez", "enmo"]:
        # The first `periods` rows of each series have no lagged partner;
        # back-fill them from the first valid difference of that same series.
        df[f"{col}_diff"] = per_series(col).transform(
            lambda s: s.diff(periods=periods).bfill()
        )

    for col in ["anglez", "enmo"]:
        for stat in ("mean", "max", "std"):
            df[f"{col}_rolling_{stat}"] = rolled(col, stat)

    for col in ["anglez_diff", "enmo_diff"]:
        for stat in ("mean", "max"):
            df[f"{col}_rolling_{stat}"] = rolled(col, stat)

    return df


def _to_naive_local(ts):
    """Return ``ts`` as timezone-naive datetimes that keep the local wall-clock time.

    String timestamps have a trailing UTC offset (``Z``, ``+hhmm`` or
    ``+hh:mm``) stripped before parsing, so a column mixing several offsets
    (e.g. across a DST change) is parsed in one vectorised call instead of
    row by row. Columns that are already datetimes are only stripped of their
    timezone, which makes repeated calls harmless.
    """
    if pd.api.types.is_datetime64_any_dtype(ts):
        return ts.dt.tz_localize(None) if ts.dt.tz is not None else ts
    without_offset = ts.str.replace(r"(?:Z|[+-]\d{2}:?\d{2})$", "", regex=True)
    return pd.to_datetime(without_offset)


# Model input columns: the hour of day, followed by the same seven columns
# (raw value, rolling stats, lagged diff and its rolling stats) per signal.
features = ["hour"] + [
    f"{signal}{suffix}"
    for signal in ("anglez", "enmo")
    for suffix in (
        "",
        "_rolling_mean",
        "_rolling_max",
        "_rolling_std",
        "_diff",
        "_diff_rolling_mean",
        "_diff_rolling_max",
    )
]

In [None]:
# Add the engineered columns to the training frame.
train = make_features(train)

# Target labels and the grouping key used for the group-wise CV split.
y = train["awake"]
groups = train["series_id"]

# Keep `series_id` and `step` next to the model features so they stay
# available for the validation rows later on.
X = train[[*features, "series_id", "step"]].copy()


  df['timestamp'] = pd.to_datetime(df['timestamp']).apply(lambda t: t.tz_localize(None))


In [None]:
def get_event(df):
    """Label the first and last row of every active run as ``onset`` / ``wakeup``.

    Rows are scanned in order and split into runs of consecutive rows that
    share the same ``series_id`` and the same "active" state, where a row is
    active when its ``smooth`` value is non-zero and not null.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``series_id``, ``smooth`` and ``step``.

    Returns
    -------
    list of tuple
        One ``(series_id, step, event)`` tuple per input row, in input order.
        ``event`` is ``'onset'`` on the first row of an active run,
        ``'wakeup'`` on its last row and ``0`` (no event) everywhere else.
        An active run consisting of a single row only yields ``'onset'``.
    """
    def run_key(row):
        series, value, _step = row
        # bool() so NumPy booleans behave exactly like Python ones below.
        return series, bool(value != 0 and not pd.isnull(value))

    events = []
    rows = zip(df.series_id, df.smooth, df.step)
    for (series, active), run in groupby(rows, key=run_key):
        steps = [step for _series, _value, step in run]
        labels = [0] * len(steps)
        if active:
            labels[0] = 'onset'
            if len(steps) > 1:
                labels[-1] = 'wakeup'
        events.extend((series, step, label) for step, label in zip(steps, labels))
    return events

In [None]:
# Group-wise cross-validation: all rows of a series end up in the same fold.
gkf = GroupKFold(n_splits=5)
predictions = []
auc_scores = []
start_time = time.time()

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups), 1):
    print(f"\nFold {fold}")

    X_train, X_val = X.iloc[train_idx][features], X.iloc[val_idx][features]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    # `X` only carries the features plus series_id/step, so the timestamp is
    # taken from `train`, which has the same row order as `X`.
    val_meta = train.iloc[val_idx][["series_id", "step", "timestamp"]].copy()

    # --- Pipeline setup ---
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestClassifier(n_estimators=50, min_samples_leaf=300, random_state=42, n_jobs=-1))
    ])

    # Train the model and predict the validation fold.
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)

    # Evaluation
    print(classification_report(y_val, y_pred, digits=4))
    print(confusion_matrix(y_val, y_pred))

    # ROC AUC: probability of the second class for a binary target,
    # one-vs-rest over all class columns otherwise. A fold where the score is
    # undefined is reported instead of being silently skipped.
    try:
        proba = pipeline.predict_proba(X_val)
        if proba.shape[1] == 2:
            auc = roc_auc_score(y_val, proba[:, 1])
        else:
            auc = roc_auc_score(
                y_val,
                proba,
                multi_class="ovr",
                labels=pipeline.named_steps["model"].classes_,
            )
    except (ValueError, IndexError) as err:
        print("ROC AUC skipped for this fold:", err)
    else:
        auc_scores.append(auc)
        print("ROC AUC:", round(auc, 5))

    # Keep the predictions together with series_id, step and timestamp.
    val_meta["pred"] = y_pred
    predictions.append(val_meta)

    # --- Derive events (onset and wakeup) for this fold ---
    # get_event() brackets runs where `smooth` is non-zero with onset/wakeup,
    # so it needs a *sleep* indicator; `y_pred` is the awake label, hence the
    # rows predicted as 0 are the ones flagged here.
    # NOTE(review): assumes awake == 0 means asleep - confirm against the labels.
    # No smoothing is applied yet; the raw prediction is used.
    event_input = val_meta[["series_id", "step"]].assign(
        smooth=(y_pred == 0).astype(int)
    )
    events = get_event(event_input)

    # get_event() returns one (series_id, step, event) tuple per row.
    event_df = pd.DataFrame(events, columns=["series_id", "step", "event"])

    # The first fold creates the CSV (with header), later folds append to it.
    first_fold = fold == 1
    event_df.to_csv(
        "sleep_events.csv",
        index=False,
        mode='w' if first_fold else 'a',
        header=first_fold,
    )

    print(f"\n✅ Sleep events für Fold {fold} wurden gespeichert unter: sleep_events.csv")

# Report the mean AUC once cross-validation is finished.
if auc_scores:
    print("\nDurchschnittlicher ROC AUC über alle Folds:", round(np.mean(auc_scores), 5))
else:
    print("\nROC AUC could not be computed for any fold.")
print(f"Total cross-validation time: {time.time() - start_time:.1f}s")

In [None]:
# List the distinct label values present in the target `y`.
print("Unique values in target:", y.unique())