In [1]:
!pip install xgboost
!pip install wandb



In [2]:
# %% [markdown]
# # 📊 Model 1 & Model 2 Training & Evaluation Notebook
#
# Dieses Notebook führt dich durch den vollständigen Prozess:
# - Datenvorbereitung
# - Training von Model 1 (Binary Classifier)
# - Kandidatengenerierung
# - Training von Model 2 (Regressor zur Verfeinerung der Scores)
# - Evaluation

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from event_detection_ap import score, ParticipantVisibleError
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_squared_error, precision_score, recall_score, f1_score
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import joblib
from datetime import datetime
from sklearn.model_selection import GroupKFold

In [4]:
# Parameter
N_SPLITS = 5
THRESHOLD = 0.9

# Dateipfade
DATA_PATH = "/home/jovyan/AICOMP/code/data/processed/merged_dff_gold84_V2.parquet"
CANDIDATES_PATH = "model1_candidates_kfold.csv"
EVENTS_PATH = "/home/jovyan/AICOMP/Second/code/data/raw/event_cleaned.csv"
OUTPUT_PREDICTIONS = "model2_predictions.csv"


In [5]:
def postprocess_predictions(y_probs, threshold):
    # Apply threshold to the probabilities
    y_pred = (y_probs >= threshold).astype(int)
    return np.where(y_pred == 1)[0]  # Return indices where predictions are 1

In [6]:
# ## 📥 Daten laden und vorbereiten

# %%
def candidate_windowing(df):
    df["step_int"] = df["step"].astype(int)
    df["step_offset"] = df.groupby("series_id")["step_int"].transform(lambda x: x - x.min())
    return df[df["step_offset"] % 3 == 0].reset_index(drop=True)

In [7]:
def expand_labels(df, radius=4):
    def expand_group(group):
        target_array = group["target"].values.copy()
        shifted_targets = {}
        for shift in range(1, radius + 1):
            shifted_targets[f"minus_{shift}"] = group["target"].shift(-shift, fill_value=0).values
            shifted_targets[f"plus_{shift}"] = group["target"].shift(shift, fill_value=0).values

        # Now apply shifts without fragmenting the dataframe
        for values in shifted_targets.values():
            target_array |= values

        group = group.copy()  # To avoid SettingWithCopyWarning
        group["target"] = target_array
        return group

    df = df.groupby("series_id", group_keys=False).apply(expand_group)
    return df

In [8]:
def oversample(df):
    positives = df[df["target"] == 1]
    return pd.concat([df, positives, positives, positives]).sample(frac=1).reset_index(drop=True)


In [9]:
df = pd.read_parquet(DATA_PATH)


In [10]:
# Feature Set
final_preset = [
    "anglez", "enmo", "hour", "minute", "is_night", "is_weekend",
    "anglez_delta", "anglez_lag_1", "enmo_lag_1",
    "anglez_mean_60s", "enmo_mean_60s", "enmo_std_60s",
    "anglez_min_60s", "enmo_max_60s",
    "enmo_cumulative_60s", "enmo_sma_60s",
    "enmo_mean_12s_lag_1", "anglez_std_60s_lag_1"
]

In [11]:
missing_features = [feat for feat in final_preset if feat not in df.columns]
print(missing_features)

[]


In [12]:

def run_experiment_simple(
    feature_set,
    radius=6,
    threshold=0.9,
    model_params=None,
    save_results=True
):
    df_exp = candidate_windowing(df)
    df_exp = expand_labels(df_exp, radius=radius)

    X = df_exp[feature_set].astype(np.float32)
    y = df_exp["target"]
    groups = df_exp["series_id"]
    meta = df_exp[["series_id", "step"]].copy()

    gkf = GroupKFold(n_splits=N_SPLITS)
    all_preds = []
    all_true = []
    all_probs = []
    all_meta = []

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        meta_val = meta.iloc[val_idx]

        model = xgb.XGBClassifier(**model_params)
        weights = compute_sample_weight("balanced", y_train)
        model.fit(X_train, y_train, sample_weight=weights)

        y_probs = model.predict_proba(X_val)[:, 1]
        y_pred = (y_probs >= threshold).astype(int)

        all_probs.extend(y_probs)
        all_preds.extend(y_pred)
        all_true.extend(y_val)
        all_meta.append(meta_val)

    precision = precision_score(all_true, all_preds)
    recall = recall_score(all_true, all_preds)
    f1 = f1_score(all_true, all_preds)
    print(X_train.dtypes)
    
    if save_results:
        results_df = pd.concat(all_meta).copy()
        results_df["true_label"] = all_true
        results_df["pred_label"] = all_preds
        results_df["score"] = all_probs

        
        filename = f"newmodel_preds.csv"
        results_df.to_csv(filename, index=False)
        print(f"Saved predictions to: {filename}")
    
    import gc

    # After model training:
    del model
    gc.collect()
    return precision, recall, f1

In [13]:
from sklearn.utils.class_weight import compute_sample_weight

# Fixed set of parameters
feature_sets = {
    "base": final_preset  # Assuming final_preset is defined
}

radius = 50
threshold = 0.9

# Model hyperparameters
model_params = {
                "objective": "binary:logistic",
                "n_estimators": 500,        # fixed now
                "max_depth": 4,              # fixed now
                "learning_rate": 0.02,       # fixed now
                "eval_metric": "logloss",
                "use_label_encoder": False,
                 "tree_method": "hist",
                  "scale_pos_weight": 10,  # still depends on loop
                "subsample": 0.8,
                "colsample_bytree": 0.8,
                "gamma": 1,
                 "min_child_weight": 5,
                 "random_state": 42,
}

# Run the experiment for a fixed set of parameters
for name, feats in feature_sets.items():
    print(f"Running experiment for feature set: {name}")
    
    p, r_, f = run_experiment_simple(
        feats,
        radius=radius,
        threshold=threshold,
        model_params=model_params
    )

    print(f"Experiment Results for Radius={radius}, Threshold={threshold}: "
          f"Precision={p:.3f}, Recall={r_: .3f}, F1={f:.3f}")

Running experiment for feature set: base


  df = df.groupby("series_id", group_keys=False).apply(expand_group)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


anglez                  float32
enmo                    float32
hour                    float32
minute                  float32
is_night                float32
is_weekend              float32
anglez_delta            float32
anglez_lag_1            float32
enmo_lag_1              float32
anglez_mean_60s         float32
enmo_mean_60s           float32
enmo_std_60s            float32
anglez_min_60s          float32
enmo_max_60s            float32
enmo_cumulative_60s     float32
enmo_sma_60s            float32
enmo_mean_12s_lag_1     float32
anglez_std_60s_lag_1    float32
dtype: object
Saved predictions to: newmodel_preds.csv
Experiment Results for Radius=50, Threshold=0.9: Precision=0.101, Recall= 0.862, F1=0.180


In [14]:


# Step 1: Load the CSV
dfn = pd.read_csv("newmodel_preds.csv")

# Step 2: Keep only rows where pred_label == 1
dfn = dfn[dfn["pred_label"] == 1]

# Step 3: Drop the 'true_label' column
dfn = dfn.drop(columns=["true_label", "pred_label"])

# Step 4 (Optional): Save the cleaned CSV
dfn.to_csv("newmodel_preds_cleaned.csv", index=False)

print("✅ Cleaned file saved as 'newmodel_preds_cleaned.csv'")

✅ Cleaned file saved as 'newmodel_preds_cleaned.csv'


In [15]:
batch_size = 100_000  # you can adjust this depending on your memory
merged_batches = []

# Load your main dataframe only once
df_features = df[['series_id', 'step'] + final_preset]

# Process in batches
for start_idx in range(0, len(dfn), batch_size):
    end_idx = start_idx + batch_size
    batch = dfn.iloc[start_idx:end_idx]

    # Merge the batch
    merged_batch = pd.merge(batch, df_features, how='left', on=['series_id', 'step'])
    merged_batches.append(merged_batch)

    print(f"✅ Processed batch {start_idx} - {end_idx}")

# Combine all merged batches into one big DataFrame
merged_df = pd.concat(merged_batches, ignore_index=True)

# Save
merged_df.to_csv('merged_predictionsN.csv', index=False)

print(f"✅ Final merged DataFrame shape: {merged_df.shape}")

✅ Processed batch 0 - 100000
✅ Processed batch 100000 - 200000
✅ Processed batch 200000 - 300000
✅ Processed batch 300000 - 400000
✅ Processed batch 400000 - 500000
✅ Processed batch 500000 - 600000
✅ Processed batch 600000 - 700000
✅ Processed batch 700000 - 800000
✅ Processed batch 800000 - 900000
✅ Processed batch 900000 - 1000000
✅ Processed batch 1000000 - 1100000
✅ Processed batch 1100000 - 1200000
✅ Processed batch 1200000 - 1300000
✅ Processed batch 1300000 - 1400000
✅ Processed batch 1400000 - 1500000
✅ Processed batch 1500000 - 1600000
✅ Processed batch 1600000 - 1700000
✅ Processed batch 1700000 - 1800000
✅ Processed batch 1800000 - 1900000
✅ Processed batch 1900000 - 2000000
✅ Processed batch 2000000 - 2100000
✅ Processed batch 2100000 - 2200000
✅ Processed batch 2200000 - 2300000
✅ Processed batch 2300000 - 2400000
✅ Processed batch 2400000 - 2500000
✅ Processed batch 2500000 - 2600000
✅ Final merged DataFrame shape: (2518917, 21)
