In [2]:
#!/usr/bin/env python

import pandas as pd
import numpy as np
import xgboost as xgb
import json
from tqdm import tqdm
import time

# Paths (adjust if needed)
# Input: candidate rows read by main() as a CSV; must exist before running.
CANDIDATES_PATH = "/home/jovyan/AIcomp/processed/merged_predictionsN.csv"
# Input: ground-truth events CSV; match_candidates() reads its
# "series_id" and "step" columns.
EVENTS_PATH = "/home/jovyan/AIcomp/data/train_events.csv"
# Output: trained XGBoost model, written by main() via model.save_model().
SAVE_MODEL_PATH = "/home/jovyan/AIcomp/Safed Models/model2a_confidence_regressor.json"
# Output: JSON list of the feature column names used for training.
SAVE_FEATURES_PATH = "/home/jovyan/AIcomp/results/model2a_features.json"
# Output: candidates with "step_distance", "target" and "model2a_score" added.
SAVE_SCORED_CANDIDATES_PATH = "/home/jovyan/AIcomp/results/model2a_scored_candidates.csv"

# Hyperparameters
# NOTE(review): MATCH_WINDOW is not referenced by any function visible in this
# script; the "30 seconds" figure implies 5 s per step - TODO confirm.
MATCH_WINDOW = 6  # ±6 steps (30 seconds)
# Standard deviation (in steps) of the Gaussian used by
# apply_gaussian_labeling(): target = exp(-0.5 * (distance / GAUSS_SIGMA) ** 2).
GAUSS_SIGMA = 3   # how fast confidence decays with step distance

def match_candidates(candidates, events):
    """Add a ``step_distance`` column: distance to the nearest true event.

    For every candidate row the absolute difference between its ``step`` and
    the closest event ``step`` of the same ``series_id`` is computed.

    Args:
        candidates: DataFrame with at least ``series_id`` and ``step`` columns.
            It is modified in place (the ``step_distance`` column is added or
            overwritten).
        events: DataFrame with ``series_id`` and ``step`` columns holding the
            true event positions.

    Returns:
        The same ``candidates`` object, with a float ``step_distance`` column.
        Rows whose series has no usable event (or whose ``series_id`` is
        missing) get ``np.inf``; rows whose own ``step`` is NaN get NaN.

    Notes:
        Distances are written back by row position, so the result is correct
        regardless of how ``candidates`` is ordered or indexed. Missing (NaN)
        event steps are ignored; a single NaN would otherwise turn every
        distance of that series into NaN.
    """
    print(f"🔍 Matching {len(candidates)} candidates to true events...")

    # Sorted, NaN-free event steps per series, built once up front so the
    # per-series lookup below is a plain dict access.
    event_steps = {
        series_id: np.sort(group["step"].dropna().to_numpy(dtype=np.float64))
        for series_id, group in events.groupby("series_id")
    }

    candidate_steps = candidates["step"].to_numpy(dtype=np.float64)

    # Start from inf so series without events, and rows that groupby skips
    # (missing series_id), keep the "no event" marker.
    distances = np.full(len(candidates), np.inf, dtype=np.float64)

    # GroupBy.indices maps each series_id to the integer row positions of its
    # candidates, which lets results be scattered back to the right rows.
    positions_by_series = candidates.groupby("series_id").indices
    for series_id, positions in tqdm(positions_by_series.items()):
        true_steps = event_steps.get(series_id)
        if true_steps is None or true_steps.size == 0:
            continue  # no events for this series

        steps = candidate_steps[positions]
        # The nearest event is one of the two neighbours of the insertion
        # point in the sorted event array; clipping handles both ends.
        insert_at = np.searchsorted(true_steps, steps)
        last = true_steps.size - 1
        below = true_steps[np.clip(insert_at - 1, 0, last)]
        above = true_steps[np.clip(insert_at, 0, last)]
        distances[positions] = np.minimum(
            np.abs(steps - below), np.abs(steps - above)
        )

    candidates["step_distance"] = distances
    return candidates

def apply_gaussian_labeling(candidates, sigma=None):
    """Add a ``target`` column with a Gaussian confidence label.

    The label is ``exp(-0.5 * (step_distance / sigma) ** 2)``: 1.0 for a
    candidate exactly on an event, decaying towards 0.0 as the distance
    grows (an infinite distance yields exactly 0.0).

    Args:
        candidates: DataFrame with a numeric ``step_distance`` column. It is
            modified in place (the ``target`` column is added or overwritten).
        sigma: Width of the Gaussian, in the same unit as ``step_distance``.
            Defaults to the module-level ``GAUSS_SIGMA`` when omitted.

    Returns:
        The same ``candidates`` object, with the ``target`` column set.

    Raises:
        ValueError: If ``sigma`` is not strictly positive.
    """
    if sigma is None:
        # Resolved at call time so the default follows the module constant.
        sigma = GAUSS_SIGMA
    if not sigma > 0:
        raise ValueError(f"sigma must be positive, got {sigma!r}")

    scaled_distance = candidates["step_distance"] / sigma
    candidates["target"] = np.exp(-0.5 * scaled_distance ** 2)
    return candidates

def train_regressor(data, feature_cols):
    """Fit the Model 2A XGBoost regressor and return it.

    Args:
        data: DataFrame containing every column in ``feature_cols`` plus a
            ``target`` column used as the regression label.
        feature_cols: Names of the columns to use as model inputs; they are
            cast to ``float32`` before fitting.

    Returns:
        The fitted ``xgb.XGBRegressor``.
    """
    features = data[feature_cols].astype(np.float32)
    labels = data["target"]

    # Fixed configuration for the confidence regressor.
    params = {
        "objective": "reg:squarederror",
        "n_estimators": 200,
        "max_depth": 6,
        "learning_rate": 0.1,
        "tree_method": "hist",
        "random_state": 42,
    }
    regressor = xgb.XGBRegressor(**params)

    print("🧠 Training Model 2A (Confidence Regressor)...")
    started_at = time.time()
    regressor.fit(features, labels)
    elapsed_minutes = (time.time() - started_at) / 60
    print(f"✅ Model trained in {elapsed_minutes:.2f} min")

    return regressor

def main():
    """Run the Model 2A pipeline end to end.

    Steps: load candidates and true events, compute each candidate's distance
    to the nearest event, drop candidates farther than 500 steps, derive the
    Gaussian ``target`` label, train the regressor, save the model and its
    feature list, then score the retained candidates and write them to CSV.

    Note that the saved ``model2a_score`` values are predictions on the same
    rows the model was trained on.

    Raises:
        FileNotFoundError: If either input CSV does not exist.
        KeyError: If the candidates file lacks a required column.
        ValueError: If no candidate lies within the distance cut-off.
    """
    # Local import: only this entry point needs pathlib.
    from pathlib import Path

    print("📥 Loading data...")
    candidates = pd.read_csv(CANDIDATES_PATH)
    events = pd.read_csv(EVENTS_PATH)

    # Features used for training and scoring (also persisted next to the model).
    feature_cols = [
        "anglez", "enmo", "hour", "minute", "is_night", "is_weekend",
        "anglez_delta", "anglez_lag_1", "enmo_lag_1",
        "anglez_mean_60s", "enmo_mean_60s",
        "enmo_cumulative_60s", "enmo_sma_60s",
        "enmo_mean_12s_lag_1", "anglez_std_60s_lag_1"
    ]

    # Fail fast: check the schema and make sure every output location is
    # writable-in-principle before spending time on matching and training.
    required_cols = ["series_id", "step", *feature_cols]
    missing_cols = [col for col in required_cols if col not in candidates.columns]
    if missing_cols:
        raise KeyError(
            f"{CANDIDATES_PATH} is missing required columns: {missing_cols}"
        )
    for output_path in (
        SAVE_MODEL_PATH,
        SAVE_FEATURES_PATH,
        SAVE_SCORED_CANDIDATES_PATH,
    ):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Step 1: Match candidates to true events
    candidates = match_candidates(candidates, events)

    # Step 2: Keep candidates within reasonable distance (optional).
    # The comparison also drops rows with infinite or NaN distance.
    max_step_distance = 500
    within_range = candidates["step_distance"] <= max_step_distance
    candidates = candidates[within_range].reset_index(drop=True)
    if candidates.empty:
        raise ValueError(
            f"No candidates within {max_step_distance} steps of a true event; "
            "nothing to train on."
        )

    # Step 3: Apply Gaussian confidence label
    candidates = apply_gaussian_labeling(candidates)

    # Step 4: Train model
    model = train_regressor(candidates, feature_cols)

    # Step 5: Save model and feature list
    model.save_model(SAVE_MODEL_PATH)
    with open(SAVE_FEATURES_PATH, "w", encoding="utf-8") as f:
        json.dump(feature_cols, f)

    # Step 6: Predict confidence scores for all retained candidates
    X_all = candidates[feature_cols].astype(np.float32)
    candidates["model2a_score"] = model.predict(X_all)

    # Step 7: Save scored candidates for Model 2B
    candidates.to_csv(SAVE_SCORED_CANDIDATES_PATH, index=False)
    print(f"💾 Scored candidates saved to {SAVE_SCORED_CANDIDATES_PATH}")

    print("✅ Model 2A pipeline complete!")

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


📥 Loading data...


FileNotFoundError: [Errno 2] No such file or directory: '/home/jovyan/AIcomp/processed/merged_predictionsN.csv'