# 03 — Baselines ML (TimeSeriesSplit)

On compare plusieurs baselines sur la cible **score ∈ [-1,1]**.

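Métriques :
- **MSE** (erreur quadratique moyenne entre score prédit et score réel ; plus bas = mieux)
- **Directional accuracy** (part des prédictions dont le signe est correct ; plus haut = mieux)
- **Information Coefficient** (corrélation de rang de Spearman entre prédiction et cible ; plus haut = mieux)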


In [4]:
# Notebook setup: make the project's src/ directory importable, create the
# logger, and import every name used by the cells below.
import sys
from pathlib import Path

# ROOT is the parent of the current working directory, resolved to an
# absolute path; SRC is its "src" sub-directory.
ROOT = Path("..").resolve()
SRC = ROOT / "src"
# Guarded so that re-running this cell does not add a duplicate entry.
# Inserting at position 0 gives SRC priority over other sys.path entries.
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

import numpy as np
import pandas as pd

# Project-local imports start here; presumably these modules live under SRC,
# in which case they only resolve after the sys.path insertion above.
from utils import get_logger
# The log file is ROOT/logs/run.log.
# NOTE(review): whether get_logger creates the logs/ directory when it is
# missing is not visible from this notebook -- confirm.
logger = get_logger("notebook", log_file=str(ROOT/"logs"/"run.log"))

from data import load_ohlc_from_xlsx
from features import build_features
from labels import add_target_20d_score, fit_score_scaler, apply_score
from split import time_series_splits
from models import get_baselines
from metrics import mse, directional_accuracy, information_coefficient

In [5]:

# Walk-forward evaluation of every baseline model on a single asset.
#
# All imports, the sys.path bootstrap and ROOT come from the setup cell at the
# top of the notebook; they are intentionally not repeated here (this cell
# could not run without that cell anyway, since the project modules are only
# importable once src/ has been put on sys.path).

# --- Tunable parameters ------------------------------------------------------
ASSET = "Gold"    # sheet to load from the workbook; also the "asset" label in the results
HORIZON = 20      # value passed as `horizon` to add_target_20d_score
N_SPLITS = 5      # number of folds requested from time_series_splits
STD_MULT = 2.0    # value passed as `std_mult` to fit_score_scaler

XLSX = ROOT / "dataset_train.xlsx"

# --- Load -> features -> target ----------------------------------------------
# The raw frame keeps its own name so that the cleaned frame can be rebuilt
# without re-reading the workbook.
ohlc_raw = load_ohlc_from_xlsx(XLSX, sheet_name=ASSET)
df = (
    add_target_20d_score(build_features(ohlc_raw), horizon=HORIZON)
    .dropna()
    .reset_index(drop=True)
)
assert not df.empty, "no rows left after dropna(); check the input sheet and the feature windows"

# Everything that is not a raw price column, the date, or a target-derived
# column is used as a model input.
exclude = {"Date","Open","High","Low","Close","fut_ret_20","y_score"}
feature_cols = [c for c in df.columns if c not in exclude]
assert feature_cols, "no feature columns left after excluding price/target columns"

X = df[feature_cols].to_numpy()
future_ret = df["fut_ret_20"].to_numpy()

# --- Cross-validation loop ---------------------------------------------------
# NOTE(review): the target looks HORIZON rows ahead. If time_series_splits
# yields contiguous train/test blocks without a gap, the last HORIZON training
# labels are computed from prices inside the test window. The splitter is not
# visible from this notebook -- confirm it leaves a gap >= HORIZON, or purge
# those rows from `tr` before fitting.
rows = []

for split_id, (tr, te) in enumerate(time_series_splits(len(df), n_splits=N_SPLITS), start=1):
    # The score scale is fitted on the training returns only and then applied
    # to both train and test returns, so no test statistics enter the target.
    scale = fit_score_scaler(pd.Series(future_ret[tr]), std_mult=STD_MULT)
    y_tr = (future_ret[tr] / scale).clip(-1, 1)
    y_te = (future_ret[te] / scale).clip(-1, 1)

    # get_baselines() is called once per split, exactly as before; whether it
    # returns fresh estimator instances each time is not visible here.
    for spec in get_baselines():
        model = spec.model
        model.fit(X[tr], y_tr)
        pred = model.predict(X[te])

        rows.append({
            "asset": ASSET,
            "model": spec.name,
            "split": split_id,
            "ic": information_coefficient(y_te, pred),
            "dir_acc": directional_accuracy(y_te, pred),
            "mse": mse(y_te, pred)
        })

# One row per (model, split).
results_gold = pd.DataFrame(rows)
results_gold


2025-12-15 16:29:59,728 | INFO | data | Loading sheet=Gold from C:\Users\fayca\Downloads\hackathon_gold_project\hackathon_gold_project\dataset_train.xlsx
2025-12-15 16:30:03,479 | INFO | data | Loaded 11340 rows, columns=['Date', 'Open', 'High', 'Low', 'Close', 'smavg_50', 'smavg_100', 'smavg_240']
2025-12-15 16:30:03,480 | INFO | features | Building features...
2025-12-15 16:30:03,506 | INFO | features | Features built. Total columns=34
2025-12-15 16:30:03,529 | INFO | labels | Fitted score scale=0.192158 (std_mult=2.00, std=0.096079)
2025-12-15 16:30:12,456 | INFO | labels | Fitted score scale=0.151396 (std_mult=2.00, std=0.075698)
2025-12-15 16:30:29,389 | INFO | labels | Fitted score scale=0.127992 (std_mult=2.00, std=0.063996)
2025-12-15 16:30:54,733 | INFO | labels | Fitted score scale=0.118446 (std_mult=2.00, std=0.059223)
2025-12-15 16:31:26,713 | INFO | labels | Fitted score scale=0.117416 (std_mult=2.00, std=0.058708)


Unnamed: 0,asset,model,split,ic,dir_acc,mse
0,Gold,ridge,1,0.084885,0.52221,0.059001
1,Gold,random_forest,1,0.094134,0.419827,0.157641
2,Gold,gbrt,1,0.187243,0.561213,0.117895
3,Gold,ridge,2,0.114044,0.56013,0.035732
4,Gold,random_forest,2,0.036182,0.523294,0.041974
5,Gold,gbrt,2,0.024483,0.529252,0.058618
6,Gold,ridge,3,0.153824,0.517876,0.084193
7,Gold,random_forest,3,0.042797,0.44312,0.151104
8,Gold,gbrt,3,0.018202,0.470206,0.156136
9,Gold,ridge,4,0.004987,0.36403,0.710243


In [6]:
# Mean of each metric per model across all splits, highest mean IC first.
(
    results_gold
    .groupby(by="model")[["ic", "dir_acc", "mse"]]
    .mean()
    .sort_values(by="ic", ascending=False)
)


Unnamed: 0_level_0,ic,dir_acc,mse
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ridge,0.082537,0.481798,0.210531
gbrt,0.055438,0.476706,0.2211
random_forest,0.015276,0.441062,0.20515


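✅ Prends le meilleur compromis (souvent RF/GBRT) comme baseline officielle. Ici, sur Gold, c'est **ridge** qui obtient le meilleur IC moyen (0,083) et la meilleure directional accuracy (0,482) ; random_forest n'a qu'un léger avantage en MSE (0,205 contre 0,211). À noter : la directional accuracy moyenne reste inférieure à 0,5 pour les trois modèles.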