In [53]:
#!/usr/bin/env python

import pandas as pd
import numpy as np
import xgboost as xgb
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# === Read the inputs: Model 2A scored candidates and the labelled events ===
print("📥 Loading candidates...")
candidates_csv = "/home/jovyan/AIcomp/results/model2a_scored_candidates.csv"  # output of Model 2A
events_csv = "/home/jovyan/AIcomp/data/train_events.csv"  # true event labels
candidates = pd.read_csv(candidates_csv)
events = pd.read_csv(events_csv)

# === Retain only candidates whose step_distance is at most 90 ===
# (.copy() so later column assignments act on an independent frame)
within_range = candidates["step_distance"] <= 90
candidates = candidates.loc[within_range].copy()

# === Create binary label: onset (0) vs wakeup (1) ===
# Each candidate takes the label of the nearest true event (smallest absolute
# step distance) within the same series_id. Any event other than "onset" maps
# to 1, matching the previous row-by-row implementation.
print("🔍 Matching to nearest true event to get event labels...")

# Events with a missing step have no position to measure a distance to, so they
# are excluded up front rather than being able to win the "nearest" lookup when
# every distance is NaN.
events_with_step = events.dropna(subset=["step"])

# Per series: event steps sorted ascending, plus the aligned 0/1 event codes.
events_by_series = {}
for series_id, series_events in events_with_step.groupby("series_id"):
    series_events = series_events.sort_values("step", kind="mergesort")
    events_by_series[series_id] = (
        series_events["step"].to_numpy(dtype=np.float64),
        (series_events["event"] != "onset").to_numpy().astype(np.float64),
    )

candidate_steps = candidates["step"].to_numpy(dtype=np.float64)

# NaN marks "no label found"; those rows are dropped below.
event_labels = np.full(len(candidates), np.nan)

# .indices maps each series_id to the integer row positions of its candidates,
# so the lookup does not depend on the frame's index labels.
for series_id, positions in candidates.groupby("series_id").indices.items():
    if series_id not in events_by_series:
        continue  # series has no usable events -> label stays NaN
    event_steps, event_codes = events_by_series[series_id]
    steps = candidate_steps[positions]

    # Binary search for the neighbouring events on either side of each candidate.
    after = np.searchsorted(event_steps, steps, side="left")
    before = np.clip(after - 1, 0, len(event_steps) - 1)
    after = np.clip(after, 0, len(event_steps) - 1)

    # Strict "<" makes an exact tie resolve to the earlier event, which keeps
    # the result deterministic.
    pick_after = np.abs(event_steps[after] - steps) < np.abs(steps - event_steps[before])
    event_labels[positions] = event_codes[np.where(pick_after, after, before)]

candidates["target_event"] = event_labels
candidates = candidates.dropna(subset=["target_event"]).copy()
candidates["target_event"] = candidates["target_event"].astype(int)

# === Input columns for Model 2B (order is persisted alongside the model) ===
features2b = [
    # raw signals
    "anglez",
    "enmo",
    # calendar / time-of-day flags
    "hour",
    "minute",
    "is_weekend",
    "is_night",
    # deltas and lags
    "anglez_delta",
    "anglez_lag_1",
    "enmo_lag_1",
    # 60s window statistics
    "anglez_mean_60s",
    "enmo_mean_60s",
    "enmo_cumulative_60s",
    "enmo_sma_60s",
    # lagged window statistics
    "enmo_mean_12s_lag_1",
    "anglez_std_60s_lag_1",
]

# === Feature matrix (float32) and binary target ===
X = candidates.loc[:, features2b].astype(np.float32)
y = candidates["target_event"]

# === 80/20 hold-out, stratified on the target, fixed seed ===
# NOTE(review): the split is row-level, so rows from one series_id can land on
# both sides of it — confirm that is acceptable for the validation estimate.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# === Model 2B hyper-parameters ===
model2b_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "n_estimators": 100,
    "max_depth": 5,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
model2b = xgb.XGBClassifier(**model2b_params)

# === Fit on the training portion only ===
print("🧠 Training Model 2B (onset vs wakeup classifier)...")
model2b.fit(X_train, y_train)

# === Validation ===
# Probability of class 1 (wakeup) for each held-out row, thresholded at 0.5.
y_pred_probs = model2b.predict_proba(X_val)[:, 1]
y_pred_labels = (y_pred_probs >= 0.5).astype(int)

# RMSE over hard 0/1 labels equals sqrt(misclassification rate); it is kept for
# continuity with earlier runs, but accuracy is the directly readable figure
# for this binary classifier, so it is reported as well.
rmse = mean_squared_error(y_val, y_pred_labels) ** 0.5
print(f"✅ Validation RMSE: {rmse:.5f}")

accuracy = float(np.mean(y_pred_labels == np.asarray(y_val)))
print(f"✅ Validation accuracy: {accuracy:.5f}")

# === Persist the trained classifier and the feature list it was fitted on ===
model2b.save_model("model2b_classifier.json")

# The feature names are stored as a JSON array next to the model file.
with open("model2b_features.json", "w") as features_file:
    features_file.write(json.dumps(features2b))

print("💾 Model 2B and features saved.")

# === Assemble the submission table ===
print("📦 Generating final submission CSV...")

# Score every remaining candidate with Model 2B; column 1 is P(class 1 = wakeup).
full_X = candidates.loc[:, features2b].astype(np.float32)
event_probs = model2b.predict_proba(full_X)[:, 1]
is_wakeup = event_probs >= 0.5

# These two helper columns are added to `candidates` itself.
candidates["predicted_event_label"] = np.where(is_wakeup, "wakeup", "onset")
candidates["final_score"] = candidates["target"]  # Model 2A confidence carried over as the score

# Select, rename, number the rows, and put the columns in submission order.
submission = (
    candidates[["series_id", "step", "predicted_event_label", "final_score"]]
    .rename(columns={"predicted_event_label": "event", "final_score": "score"})
    .assign(row_id=range(len(candidates)))
    .loc[:, ["row_id", "series_id", "step", "event", "score"]]
)

# Write to disk without the pandas index.
submission.to_csv("submission_model2b.csv", index=False)
print("✅ Final submission saved: submission_model2b.csv")


📥 Loading candidates...
🔍 Matching to nearest true event to get event labels...
🧠 Training Model 2B (onset vs wakeup classifier)...
✅ Validation RMSE: 0.12212
💾 Model 2B and features saved.
📦 Generating final submission CSV...
✅ Final submission saved: submission_model2b.csv


In [57]:
# Filter by score threshold
threshold = 0.1
min_gap = 24  # kept predictions in one series must be MORE than this many steps apart

filtered = submission[submission["score"] > threshold].copy()

# Greedy overlap suppression per series: visit predictions from highest to
# lowest score so the strongest prediction in each ±min_gap neighbourhood is the
# one that survives (step is only a deterministic tie-breaker). Sorting by step
# first would keep the earliest prediction in a cluster regardless of its score.
# The index is reset so that index labels equal row positions for the mask below.
filtered = filtered.sort_values(
    ["series_id", "score", "step"], ascending=[True, False, True]
).reset_index(drop=True)

# NOTE(review): suppression ignores the event type, so an onset and a wakeup
# within min_gap steps of each other compete — confirm this is intended.
keep = np.zeros(len(filtered), dtype=bool)
for series_id, group in filtered.groupby("series_id"):
    taken_steps = []
    for pos, step in zip(group.index, group["step"].to_numpy()):
        if all(abs(step - s) > min_gap for s in taken_steps):
            keep[pos] = True
            taken_steps.append(step)

# Selecting with a boolean mask keeps the column dtypes and also works when
# nothing survives (the result is an empty frame that still has every column).
final_df = filtered[keep].sort_values(["series_id", "step"]).reset_index(drop=True)

# Reassign row_id
final_df["row_id"] = range(len(final_df))
final_df = final_df[["row_id", "series_id", "step", "event", "score"]]

# Save clean final submission
final_df.to_csv("submission_model2b_filtered.csv", index=False)
print("✅ Final filtered submission saved.")

✅ Final filtered submission saved.


In [58]:
import pandas as pd
import numpy as np
import xgboost as xgb
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Reload the filtered submission from disk for inspection.
# NOTE(review): the filtering cell writes "submission_model2b_filtered.csv" relative to the
# working directory, while this reads an absolute path (and the directory is spelled
# "AICOMP" here but "AIcomp" where the inputs are loaded) — confirm this is the same file.
output = pd.read_csv("/home/jovyan/AICOMP/code/data/raw/submission_model2b_filtered.csv")

In [59]:
# Number of rows in the reloaded filtered submission.
len(output)

1517

In [51]:
# Load event_cleaned.csv into `gt`.
# NOTE(review): presumably the ground-truth event table to compare the submission against — confirm.
gt = pd.read_csv("/home/jovyan/AICOMP/code/data/raw/event_cleaned.csv")

In [52]:
# Number of rows in `gt`.
len(gt)

2953