In [1]:
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from event_detection_ap import score, ParticipantVisibleError
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_squared_error, precision_score, recall_score, f1_score
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import GroupKFold

In [3]:
# Parameters
N_SPLITS = 5
THRESHOLD = 0.05

# File paths
DATA_PATH = "processed/merged_dff_gold84_V3.parquet"
CANDIDATES_PATH = "results/model1_candidates_kfold.csv"
EVENTS_PATH = "processed/event_cleaned.csv"




In [4]:
def candidate_windowing(df):
    df["step_int"] = df["step"].astype(int)
    df["step_offset"] = df.groupby("series_id")["step_int"].transform(lambda x: x - x.min())
    return df[df["step_offset"] % 3 == 0].reset_index(drop=True)

In [5]:
def expand_labels(df, radius=4):
    target_array = df["target"].values.copy()
    for shift in range(1, radius + 1):
        df[f"target_shift_minus_{shift}"] = df["target"].shift(-shift, fill_value=0)
        df[f"target_shift_plus_{shift}"] = df["target"].shift(shift, fill_value=0)
        target_array |= df[f"target_shift_minus_{shift}"].values
        target_array |= df[f"target_shift_plus_{shift}"].values
    df["target"] = target_array
    return df.drop(columns=[c for c in df.columns if c.startswith("target_shift_")])

In [7]:
df = pd.read_parquet(DATA_PATH)



# Feature Set
final_preset = [
    "anglez", "enmo", "hour", "minute", "is_night", "is_weekend",
    "anglez_delta", "anglez_lag_1", "enmo_lag_1",
    "anglez_mean_60s", "enmo_mean_60s",
    "enmo_cumulative_60s", "enmo_sma_60s",
    "enmo_mean_12s_lag_1", "anglez_std_60s_lag_1"
]

In [8]:
df = candidate_windowing(df)
df = expand_labels(df)
print(f"✅ Preprocessed shape: {df.shape}")

✅ Preprocessed shape: (8384569, 86)


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8384569 entries, 0 to 8384568
Data columns (total 86 columns):
 #   Column                      Dtype              
---  ------                      -----              
 0   series_id                   object             
 1   step                        uint32             
 2   timestamp                   datetime64[us, UTC]
 3   anglez                      float32            
 4   enmo                        float32            
 5   night                       float64            
 6   event                       int64              
 7   hour                        int8               
 8   minute                      int8               
 9   day_of_week                 int8               
 10  elapsed_time_from_midnight  int32              
 11  is_weekend                  int8               
 12  anglez_mean_12s             float32            
 13  anglez_std_12s              float32            
 14  anglez_min_12s              float3

In [10]:

def postprocess_predictions(y_probs, threshold):
    y_pred = (y_probs >= threshold).astype(int)
    event_indices = np.where(y_pred == 1)[0]
    filtered = []
    last_idx = -np.inf
    for idx in event_indices:
        if idx - last_idx >= 12:
            filtered.append(idx)
            last_idx = idx
    return filtered


In [11]:


gkf = GroupKFold(n_splits=N_SPLITS)  # Create group-based cross-validation splits
results = []

for fold, (train_idx, val_idx) in enumerate(gkf.split(df, df["target"], groups=df["series_id"])):
    print(f"Fold {fold + 1}/{N_SPLITS}")
    
    # Split data into training and validation sets
    X_train = df.iloc[train_idx][final_preset].astype(np.float32)
    y_train = df.iloc[train_idx]["target"]
    X_val = df.iloc[val_idx][final_preset].astype(np.float32)
    y_val = df.iloc[val_idx]["target"]
    meta_val = df.iloc[val_idx][["series_id", "step"]].reset_index(drop=True)

    # Initialize and train XGBoost model
    model1 = xgb.XGBClassifier(
        objective="binary:logistic",
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        eval_metric="logloss",
        use_label_encoder=False,
        tree_method="hist",
        random_state=42
    )

    model1.fit(X_train, y_train)

    # Predict probabilities and postprocess
    y_probs = model1.predict_proba(X_val)[:, 1]
    event_idxs = postprocess_predictions(y_probs, THRESHOLD)

    # Store results for current fold
    fold_results = meta_val.loc[event_idxs].copy()
    fold_results["model1_score"] = y_probs[event_idxs]
    results.append(fold_results)
    
    # Save model after each fold
    model_filename = f'Safed Models/xgb_model_fold2_{fold + 1}.joblib'
    joblib.dump(model1, model_filename)
    print(f"✅ Saved model for fold {fold + 1} to {model_filename}")

# Combine results from all folds
candidates = pd.concat(results, ignore_index=True)

# Save predictions to CSV
candidates.to_csv(CANDIDATES_PATH, index=False)
print(f"✅ Saved {len(candidates)} candidates to {CANDIDATES_PATH}")

Fold 1/5


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model for fold 1 to Safed Models/xgb_model_fold2_1.joblib
Fold 2/5


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model for fold 2 to Safed Models/xgb_model_fold2_2.joblib
Fold 3/5


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model for fold 3 to Safed Models/xgb_model_fold2_3.joblib
Fold 4/5


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model for fold 4 to Safed Models/xgb_model_fold2_4.joblib
Fold 5/5


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model for fold 5 to Safed Models/xgb_model_fold2_5.joblib
✅ Saved 10871 candidates to results/model1_candidates_kfold.csv


In [12]:
def evaluate_predictions(candidates_df, true_events_df, step_tolerance=12):
    """
    Compares predicted event candidates to ground-truth sleep events using a step tolerance.
    
    Args:
        candidates_df (pd.DataFrame): Model predictions with 'series_id' and 'step'.
        true_events_df (pd.DataFrame): True events with 'series_id' and 'step'.
        step_tolerance (int): Max allowed difference in step between prediction and true event.
        
    Returns:
        precision, recall, f1
    """
    preds = candidates_df.copy()
    trues = true_events_df.copy()
    
    preds["matched"] = False
    trues["matched"] = False

    for i, true_event in trues.iterrows():
        series_id = true_event["series_id"]
        true_step = true_event["step"]
        
        # Get predictions in same series and within step_tolerance
        candidates_in_series = preds[
            (preds["series_id"] == series_id) &
            (~preds["matched"]) &
            (np.abs(preds["step"] - true_step) <= step_tolerance)
        ]
        
        if not candidates_in_series.empty:
            closest_idx = candidates_in_series.iloc[
                np.argmin(np.abs(candidates_in_series["step"] - true_step))
            ].name
            preds.at[closest_idx, "matched"] = True
            trues.at[i, "matched"] = True

    TP = preds["matched"].sum()
    FP = (~preds["matched"]).sum()
    FN = (~trues["matched"]).sum()
    
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    
    print(f"✅ Evaluation with step_tolerance = {step_tolerance}")
    print(f"True Positives: {TP}")
    print(f"False Positives: {FP}")
    print(f"False Negatives: {FN}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1 Score: {f1:.3f}")
    
    return precision, recall, f1

In [13]:
# Load true events
true_events_df = pd.read_csv(EVENTS_PATH)

# Optionally filter to only relevant events and cast step to int
true_events_df = true_events_df[["series_id", "step"]].dropna()
true_events_df["step"] = true_events_df["step"].astype(int)

# Load candidate predictions
candidates_df = pd.read_csv(CANDIDATES_PATH)

# Run evaluation
evaluate_predictions(candidates_df, true_events_df, step_tolerance=12)

✅ Evaluation with step_tolerance = 12
True Positives: 569
False Positives: 10302
False Negatives: 2384
Precision: 0.052
Recall: 0.193
F1 Score: 0.082


(np.float64(0.05234109097599117),
 np.float64(0.19268540467321368),
 np.float64(0.08232060185185186))

In [14]:
import pandas as pd

# Load previously saved candidates
c = pd.read_csv(CANDIDATES_PATH)

# Sort by 'series_id' and 'step'
c = c.sort_values(by=["series_id", "step"])

# Function to select the highest 'model1_score' within a 4000-step range
def select_max_score_in_range(group):
    selected_rows = []  # Store filtered rows
    
    # Iterate over each row and keep only the best one in the step range
    for i, row in group.iterrows():
        # Define a 4000-step window around the current step
        step_range = (row['step'] - 3600, row['step'] + 3500)
        
        # Filter rows within this range
        range_group = group[(group['step'] >= step_range[0]) & (group['step'] <= step_range[1])]
        
        # Select the row with the highest model1_score
        best_row = range_group.loc[range_group['model1_score'].idxmax()]
        selected_rows.append(best_row)
    
    # Return selected rows as a DataFrame
    return pd.DataFrame(selected_rows)

# Apply filtering function to each series_id group
result = c.groupby('series_id').apply(select_max_score_in_range).reset_index(drop=True)

# Drop duplicates to ensure one prediction per step
result = result.drop_duplicates(subset=["step"], keep="first")

# Save the filtered candidates
result.to_csv('results/filtered_candidates.csv', index=False)

print(f"Filtered candidates saved: {result.shape[0]} rows")

✅ Gefilterte Kandidaten gespeichert: 2528 Zeilen


  result = c.groupby('series_id').apply(select_max_score_in_range).reset_index(drop=True)


In [15]:

# Load the already filtered candidates from the result file
result = pd.read_csv('results/filtered_candidates.csv')

# Sort the data by 'series_id' and 'step'
result = result.sort_values(by=["series_id", "step"])

# Calculate the 'time_diff' for each 'series_id'
result['time_diff'] = result.groupby('series_id')['step'].diff().fillna(result['step'])

# Save the result with the new 'time_diff' column to a new CSV file
result.to_csv('results/timediff.csv', index=False)

print(f"Saved the file with calculated time differences: {result.shape[0]} rows")

✅ Die Datei mit den berechneten Time Diff gespeichert: 2528 Zeilen


In [16]:
# Load the file with calculated time differences
timediff = pd.read_csv('results/timediff.csv')

# Sort data by 'series_id' and 'step'
timediff = timediff.sort_values(by=["series_id", "step"])

# Define a function to assign alternating labels (onset/wakeup)
def assign_alternating_labels(group):
    # Split into even and odd rows
    even_rows = group.iloc[::2]
    odd_rows = group.iloc[1::2]
    
    # Calculate average time_diff for even and odd rows
    avg_time_even = even_rows['time_diff'].mean() if not even_rows.empty else 0
    avg_time_odd = odd_rows['time_diff'].mean() if not odd_rows.empty else 0
    
    # Assign event labels based on which has the higher average time_diff
    if avg_time_even > avg_time_odd:
        group['event'] = ['onset' if i % 2 == 0 else 'wakeup' for i in range(len(group))]
    else:
        group['event'] = ['onset' if i % 2 != 0 else 'wakeup' for i in range(len(group))]
    
    return group

# Apply the labeling function to each group of 'series_id'
timediff = timediff.groupby('series_id').apply(assign_alternating_labels)

# Save the result with assigned event labels
timediff.to_csv('results/labeled_candidates.csv', index=False)

print(f"✅ Saved file with alternating labels: {timediff.shape[0]} rows")

✅ Die Datei mit den alternierenden Labels gespeichert: 2528 Zeilen


  timediff = timediff.groupby('series_id').apply(assign_alternating_labels)


In [17]:
solution = pd.read_csv('processed/event_cleaned.csv')
submission = pd.read_csv('results/labeled_candidates.csv')
tolerances = {
    "onset": [12, 36, 60, 90, 120, 150, 180, 240, 300, 360],
    "wakeup": [12, 36, 60, 90, 120, 150, 180, 240, 300, 360],
}

column_names = {
    'series_id_column_name': 'series_id',
    'time_column_name': 'step',
    'event_column_name': 'event',
    'score_column_name': 'model1_score',
}

try:
    ap_score = score(solution, submission, tolerances, **column_names)
    print(f"Average Precision Score: {ap_score}")
except ParticipantVisibleError as e:
    print(f"Error: {e}")

Average Precision Score: 0.024597218231661333
