In [1]:
!pip install xgboost




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from event_detection_ap import score, ParticipantVisibleError
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import GroupKFold
from datetime import datetime
import gc
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import precision_recall_curve, precision_score, recall_score, f1_score


In [3]:
# Parameter
N_SPLITS = 5


# Dateipfade
DATA_PATH = "processed/merged_dff_gold84_V3.parquet"

EVENTS_PATH = "processed/event_cleaned.csv"


In [4]:
def postprocess_predictions(y_probs, threshold):
    # Apply threshold to the probabilities
    y_pred = (y_probs >= threshold).astype(int)
    return np.where(y_pred == 1)[0]  # Return indices where predictions are 1

In [5]:

def candidate_windowing(df):
    df["step_int"] = df["step"].astype(int)
    df["step_offset"] = df.groupby("series_id")["step_int"].transform(lambda x: x - x.min())
    return df[df["step_offset"] % 3 == 0].reset_index(drop=True)

In [6]:
def expand_labels(df, radius=4):
    def expand_group(group):
        target_array = group["target"].values.copy()
        shifted_targets = {}
        for shift in range(1, radius + 1):
            shifted_targets[f"minus_{shift}"] = group["target"].shift(-shift, fill_value=0).values
            shifted_targets[f"plus_{shift}"] = group["target"].shift(shift, fill_value=0).values

        # apply shifts without fragmenting the dataframe
        for values in shifted_targets.values():
            target_array |= values

        group = group.copy()  
        group["target"] = target_array
        return group

    df = df.groupby("series_id", group_keys=False).apply(expand_group)
    return df

In [7]:
df = pd.read_parquet(DATA_PATH)


In [8]:
# Feature Set
final_preset = [
    "anglez", "enmo", "hour", "minute", "is_night", "is_weekend",
    "anglez_delta", "anglez_lag_1", "enmo_lag_1",
    "anglez_mean_60s", "enmo_mean_60s", "enmo_std_60s",
    "anglez_min_60s", "enmo_max_60s",
    "enmo_cumulative_60s", "enmo_sma_60s",
    "enmo_mean_12s_lag_1", "anglez_std_60s_lag_1"
]

In [9]:
missing_features = [feat for feat in final_preset if feat not in df.columns]
print(missing_features)

[]


In [10]:
def run_experiment_simple(
    feature_set,
    radius=6,
    model_params=None,
    save_results=True
):
    df_exp = candidate_windowing(df)
    df_exp = expand_labels(df_exp, radius=radius)
    X = df_exp[feature_set].astype(np.float32)
    y = df_exp["target"]
    groups = df_exp["series_id"]
    meta = df_exp[["series_id", "step"]].copy()

    gkf = GroupKFold(n_splits=N_SPLITS)

    all_preds = []
    all_true = []
    all_probs = []
    all_meta = []
    all_best_thresholds = []  # Store the best threshold for each fold

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        meta_val = meta.iloc[val_idx]

        if model_params is None:
            model_params = {
                "objective": "binary:logistic",
                "n_estimators": 200,
                "max_depth": 6,
                "learning_rate": 0.05,
                "eval_metric": "logloss",
                "tree_method": "hist",
                "scale_pos_weight": 170,
                "random_state": 42,
            }

        model = xgb.XGBClassifier(**model_params)
        weights = compute_sample_weight("balanced", y_train)
        model.fit(X_train, y_train, sample_weight=weights)

        y_probs = model.predict_proba(X_val)[:, 1]

        # Threshold optimization
        prec, rec, thresholds_pr = precision_recall_curve(y_val, y_probs)
        
        
        valid = rec[:-1] >= 0.85  # -1 because rec is longer than thresholds
        if np.any(valid):
            prec_valid = prec[:-1][valid]
            thresholds_valid = thresholds_pr[valid]
            best_idx = np.argmax(prec_valid)
            best_threshold = thresholds_valid[best_idx]
        else:
            # Fallback: choose threshold with best F1 score
            f1s = 2 * (prec * rec) / (prec + rec + 1e-8)
            best_threshold = thresholds_pr[np.argmax(f1s)]

        all_best_thresholds.append(best_threshold)

        # Apply best threshold
        y_pred = (y_probs >= best_threshold).astype(int)

        all_probs.extend(y_probs)
        all_preds.extend(y_pred)
        all_true.extend(y_val)
        all_meta.append(meta_val)

        # Clean up memory
        del model
        gc.collect()

    # Compute evaluation metrics
    precision = precision_score(all_true, all_preds)
    recall = recall_score(all_true, all_preds)
    f1 = f1_score(all_true, all_preds)

    # Calculate global threshold (average across folds)
    global_threshold = np.mean(all_best_thresholds)
    print(f"\n⮕ Global threshold (average from all folds): {global_threshold:.4f}")

    if save_results:
        results_df = pd.concat(all_meta).copy()
        results_df["true_label"] = all_true
        results_df["pred_label"] = all_preds
        results_df["score"] = all_probs
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"results/model_preds_radius{radius}_{timestamp}.csv"
        results_df.to_csv(filename, index=False)
        print(f"Saved predictions to: {filename}")

    return precision, recall, f1, global_threshold

In [None]:
feature_sets = {
    "base": final_preset
}

radii = [12, 30, 50]
n_estimators_list = [300, 800, 1500]
max_depth_list = [4, 8]
learning_rate_list = [0.05, 0.02, 0.005]
scale_pos_weight_list = [10, 100]

best_results = {}  # Store best results for each radius

for name, feats in feature_sets.items():
    for r in radii:
        best_precision = -1  
        best_entry = None

        for n_estimators in n_estimators_list:
            for max_depth in max_depth_list:
                for learning_rate in learning_rate_list:
                    for scale_pos_weight in scale_pos_weight_list:
                        model_params = {
                            "objective": "binary:logistic",
                            "n_estimators": n_estimators,
                            "max_depth": max_depth,
                            "learning_rate": learning_rate,
                            "eval_metric": "logloss",
                            "tree_method": "hist",
                            "scale_pos_weight": scale_pos_weight,
                            "subsample": 0.8,
                            "colsample_bytree": 0.8,
                            "gamma": 1,
                            "min_child_weight": 5,
                            "random_state": 42,
                        }

                        # IMPORTANT: run_experiment_simple no longer expects a threshold input
                        p, r_, f, best_threshold = run_experiment_simple(
                            feats,
                            radius=r,
                            model_params=model_params
                        )

                        print(f"[{name}] radius={r} | best threshold={best_threshold:.2f} | "
                              f"n_estimators={n_estimators} | max_depth={max_depth} | "
                              f"lr={learning_rate:.3f} | spw={scale_pos_weight} → "
                              f"Precision={p:.3f}, Recall={r_: .3f}, F1={f:.3f}")

                        # Save if precision is the best so far for this radius
                        if p > best_precision:
                            best_precision = p
                            best_entry = {
                                "radius": r,
                                "threshold": best_threshold,
                                "n_estimators": n_estimators,
                                "max_depth": max_depth,
                                "learning_rate": learning_rate,
                                "scale_pos_weight": scale_pos_weight,
                                "precision": p,
                                "recall": r_,
                                "f1": f
                            }

        # After all parameter combinations for a given radius
        best_results[r] = best_entry

# Print summary of best configurations per radius
print("\n🏆 Best results per radius:")
for radius, result in best_results.items():
    print(f"Radius {radius}: "
          f"Precision={result['precision']:.3f}, Recall={result['recall']:.3f}, F1={result['f1']:.3f} | "
          f"Params: best_threshold={result['threshold']:.2f}, "
          f"n_estimators={result['n_estimators']}, max_depth={result['max_depth']}, "
          f"lr={result['learning_rate']:.3f}, spw={result['scale_pos_weight']}")

  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.9127
Saved predictions to: results/model_preds_radius12_20250429_140838.csv
[base] radius=12 | best threshold=0.91 | n_estimators=300 | max_depth=4 | lr=0.050 | spw=10 → Precision=0.037, Recall= 0.850, F1=0.070


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.9895
Saved predictions to: results/model_preds_radius12_20250429_141134.csv
[base] radius=12 | best threshold=0.99 | n_estimators=300 | max_depth=4 | lr=0.050 | spw=100 → Precision=0.035, Recall= 0.850, F1=0.068


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.9195
Saved predictions to: results/model_preds_radius12_20250429_141438.csv
[base] radius=12 | best threshold=0.92 | n_estimators=300 | max_depth=4 | lr=0.020 | spw=10 → Precision=0.035, Recall= 0.850, F1=0.068


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.9905
Saved predictions to: results/model_preds_radius12_20250429_141738.csv
[base] radius=12 | best threshold=0.99 | n_estimators=300 | max_depth=4 | lr=0.020 | spw=100 → Precision=0.034, Recall= 0.850, F1=0.065


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.9045
Saved predictions to: results/model_preds_radius12_20250429_142030.csv
[base] radius=12 | best threshold=0.90 | n_estimators=300 | max_depth=4 | lr=0.005 | spw=10 → Precision=0.029, Recall= 0.851, F1=0.055


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.9671
Saved predictions to: results/model_preds_radius12_20250429_142320.csv
[base] radius=12 | best threshold=0.97 | n_estimators=300 | max_depth=4 | lr=0.005 | spw=100 → Precision=0.022, Recall= 0.856, F1=0.043


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.7019
Saved predictions to: results/model_preds_radius12_20250429_142719.csv
[base] radius=12 | best threshold=0.70 | n_estimators=300 | max_depth=8 | lr=0.050 | spw=10 → Precision=0.033, Recall= 0.850, F1=0.063


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.8900
Saved predictions to: results/model_preds_radius12_20250429_143119.csv
[base] radius=12 | best threshold=0.89 | n_estimators=300 | max_depth=8 | lr=0.050 | spw=100 → Precision=0.030, Recall= 0.850, F1=0.057


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.8457
Saved predictions to: results/model_preds_radius12_20250429_143517.csv
[base] radius=12 | best threshold=0.85 | n_estimators=300 | max_depth=8 | lr=0.020 | spw=10 → Precision=0.035, Recall= 0.850, F1=0.067


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.9686
Saved predictions to: results/model_preds_radius12_20250429_143927.csv
[base] radius=12 | best threshold=0.97 | n_estimators=300 | max_depth=8 | lr=0.020 | spw=100 → Precision=0.031, Recall= 0.850, F1=0.061


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.8764
Saved predictions to: results/model_preds_radius12_20250429_144329.csv
[base] radius=12 | best threshold=0.88 | n_estimators=300 | max_depth=8 | lr=0.005 | spw=10 → Precision=0.033, Recall= 0.850, F1=0.064


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.9634
Saved predictions to: results/model_preds_radius12_20250429_144722.csv
[base] radius=12 | best threshold=0.96 | n_estimators=300 | max_depth=8 | lr=0.005 | spw=100 → Precision=0.030, Recall= 0.850, F1=0.057


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.8892
Saved predictions to: results/model_preds_radius12_20250429_145236.csv
[base] radius=12 | best threshold=0.89 | n_estimators=800 | max_depth=4 | lr=0.050 | spw=10 → Precision=0.037, Recall= 0.850, F1=0.070


  df = df.groupby("series_id", group_keys=False).apply(expand_group)



⮕ Global threshold (average from all folds): 0.9839
Saved predictions to: results/model_preds_radius12_20250429_145737.csv
[base] radius=12 | best threshold=0.98 | n_estimators=800 | max_depth=4 | lr=0.050 | spw=100 → Precision=0.034, Recall= 0.850, F1=0.066


  df = df.groupby("series_id", group_keys=False).apply(expand_group)
