In [3]:
# simple_baseline.py
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score

def split_index(n_rows: int, train_fraction: float = 0.7) -> int:
    """Return the row position that separates the train part from the test part.

    Rows ``[:index]`` form the train part and rows ``[index:]`` the test part.

    Args:
        n_rows: Total number of rows available for splitting.
        train_fraction: Share of rows, strictly between 0 and 1, assigned to
            the leading (train) part.

    Returns:
        ``int(n_rows * train_fraction)``.

    Raises:
        ValueError: If ``train_fraction`` is not strictly between 0 and 1, or
            if the resulting train or test part would contain no rows.
    """
    if not 0 < train_fraction < 1:
        raise ValueError(
            f"train_fraction must be strictly between 0 and 1, got {train_fraction!r}"
        )
    cut = int(n_rows * train_fraction)
    # Fail early with a clear message instead of letting an empty part surface
    # later as NaN means or an opaque scoring error.
    if not 0 < cut < n_rows:
        raise ValueError(
            f"cannot split {n_rows} row(s) at fraction {train_fraction!r}: "
            "train or test part would be empty"
        )
    return cut


def mean_baseline(train_targets: np.ndarray, n_rows: int) -> np.ndarray:
    """Return a prediction array that repeats the per-column train mean.

    Args:
        train_targets: 2-D array of training targets (rows x target columns).
        n_rows: Number of prediction rows to produce.

    Returns:
        Array of shape ``(n_rows, train_targets.shape[1])`` in which every row
        equals ``train_targets.mean(axis=0)``.
    """
    column_means = train_targets.mean(axis=0)
    return np.tile(column_means, (n_rows, 1))


if __name__ == "__main__":
    # The statements stay at module scope (not wrapped in a function) so every
    # variable below remains a global, exactly as in the unguarded script.
    train = pd.read_csv('../data/raw/train.csv')
    labels = pd.read_csv('../data/raw/train_labels.csv')

    # merge() defaults to an inner join: only date_id values present in both
    # files survive.
    df = train.merge(labels, on='date_id')
    target_cols = [c for c in labels.columns if c.startswith('target_')]

    # Forward fill: a missing target takes the value of the closest earlier
    # row; rows that still have a gap (nothing earlier to copy) are dropped.
    df[target_cols] = df[target_cols].ffill()
    df = df.dropna(subset=target_cols)

    # Split 70/30 by row position.
    # NOTE(review): this (and the forward fill above) presumably relies on the
    # rows already being in date_id order -- confirm against the raw files.
    split = split_index(len(df), 0.7)
    train_targets = df[target_cols].iloc[:split].values
    test_targets = df[target_cols].iloc[split:].values

    # Baseline 1: Predict 0 (no change)
    baseline_zero = np.zeros_like(test_targets)
    # r2_score receives 2-D arrays here; with its default settings it reports
    # the uniform average of the per-column scores.
    r2_zero = r2_score(test_targets, baseline_zero)

    # Baseline 2: Predict train mean
    baseline_mean = mean_baseline(train_targets, len(test_targets))
    r2_mean = r2_score(test_targets, baseline_mean)

    print(f"Baseline (predict 0): R² = {r2_zero:.4f}")
    print(f"Baseline (predict mean): R² = {r2_mean:.4f}")
    # Plain string: there is nothing to interpolate, so no f-prefix.
    print("\nYour model must beat these to be useful")

Baseline (predict 0): R² = -0.0046
Baseline (predict mean): R² = -0.0027

Your model must beat these to be useful
