# 04 ‚Äî MerLin / QML (hybride)

But: tester **ML vs ML+MerLin**.

On met MerLin sur **peu de features** (4 max), puis on concat√®ne l'embedding au vecteur classique.

‚û°Ô∏è Dans l'environnement hackathon, remplace le placeholder par les appels MerLin r√©els (les mentors vous donneront l‚ÄôAPI).

In [1]:
import sys
from pathlib import Path

ROOT = Path("..").resolve()
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

import numpy as np
import pandas as pd

from utils import get_logger
logger = get_logger("notebook", log_file=str(ROOT/"logs"/"run.log"))

from data import load_ohlc_from_xlsx
from features import build_features
from labels import add_target_20d_score, fit_score_scaler
from split import time_series_splits
from merlin_embedder import MerlinEmbedder

from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from metrics import mse, directional_accuracy, information_coefficient

XLSX = str(ROOT / "dataset_train.xlsx")
df = load_ohlc_from_xlsx(XLSX, sheet_name="Gold")
df = add_target_20d_score(build_features(df), horizon=20)
df = df.dropna().reset_index(drop=True)

# üëá Pour QML: on limite volontairement √† quelques features "fortes"
qml_features = ["mom_20", "ret_vol_20", "rsi_14", "atr_rel_14"]
qml_features = [c for c in qml_features if c in df.columns]

exclude = {"Date","Open","High","Low","Close","fut_ret_20","y_score"}
all_features = [c for c in df.columns if c not in exclude]

X_all = df[all_features].to_numpy()
X_qml = df[qml_features].to_numpy()
future_ret = df["fut_ret_20"].to_numpy()

rows = []
for split_id, (tr, te) in enumerate(time_series_splits(len(df), n_splits=5, embargo=0), start=1):
    scale = fit_score_scaler(pd.Series(future_ret[tr]), std_mult=2.0)
    y_tr = np.clip(future_ret[tr] / scale, -1.0, 1.0)
    y_te = np.clip(future_ret[te] / scale, -1.0, 1.0)

    # 1) Baseline (sans QML)
    baseline = Pipeline([("scaler", StandardScaler()), ("model", Ridge(alpha=1.0))])
    baseline.fit(X_all[tr], y_tr)
    pred_b = baseline.predict(X_all[te])

    rows.append({
        "split": split_id, "variant": "baseline_ridge",
        "mse": mse(y_te, pred_b),
        "dir_acc": directional_accuracy(y_te, pred_b),
        "ic": information_coefficient(y_te, pred_b)
    })

    # 2) QML embedding sur quelques features + concat avec features classiques
    emb = MerlinEmbedder(n_qubits=4, n_features_in=X_qml.shape[1])
    Z_tr = emb.fit_transform(X_qml[tr], y_tr)
    Z_te = emb.transform(X_qml[te])

    X_tr_h = np.concatenate([X_all[tr], Z_tr], axis=1)
    X_te_h = np.concatenate([X_all[te], Z_te], axis=1)

    hybrid = Pipeline([("scaler", StandardScaler()), ("model", Ridge(alpha=1.0))])
    hybrid.fit(X_tr_h, y_tr)
    pred_h = hybrid.predict(X_te_h)

    rows.append({
        "split": split_id, "variant": "hybrid_ridge_with_merlin",
        "mse": mse(y_te, pred_h),
        "dir_acc": directional_accuracy(y_te, pred_h),
        "ic": information_coefficient(y_te, pred_h)
    })

res = pd.DataFrame(rows)
res


2025-12-15 16:07:18,581 | INFO | data | Loading sheet=Gold from C:\Users\fayca\Downloads\hackathon_gold_project\hackathon_gold_project\dataset_train.xlsx
2025-12-15 16:07:20,612 | INFO | data | Loaded 11340 rows, columns=['Date', 'Open', 'High', 'Low', 'Close', 'smavg_50', 'smavg_100', 'smavg_240']
2025-12-15 16:07:20,614 | INFO | features | Building features...
2025-12-15 16:07:20,645 | INFO | features | Features built. Total columns=34
2025-12-15 16:07:20,674 | INFO | labels | Fitted score scale=0.192158 (std_mult=2.00, std=0.096079)
2025-12-15 16:07:20,733 | INFO | labels | Fitted score scale=0.151396 (std_mult=2.00, std=0.075698)
2025-12-15 16:07:20,762 | INFO | labels | Fitted score scale=0.127992 (std_mult=2.00, std=0.063996)
2025-12-15 16:07:20,803 | INFO | labels | Fitted score scale=0.118446 (std_mult=2.00, std=0.059223)
2025-12-15 16:07:20,846 | INFO | labels | Fitted score scale=0.117416 (std_mult=2.00, std=0.058708)


Unnamed: 0,split,variant,mse,dir_acc,ic
0,1,baseline_ridge,0.059001,0.52221,0.084885
1,1,hybrid_ridge_with_merlin,0.059008,0.52221,0.084778
2,2,baseline_ridge,0.035732,0.56013,0.114044
3,2,hybrid_ridge_with_merlin,0.035736,0.56013,0.114048
4,3,baseline_ridge,0.084193,0.517876,0.153824
5,3,hybrid_ridge_with_merlin,0.084193,0.517876,0.153819
6,4,baseline_ridge,0.710243,0.36403,0.004987
7,4,hybrid_ridge_with_merlin,0.710246,0.36403,0.00499
8,5,baseline_ridge,0.163484,0.444745,0.054947
9,5,hybrid_ridge_with_merlin,0.163483,0.444745,0.054967


In [2]:

res.groupby("variant")[["mse","dir_acc","ic"]].mean().sort_values("mse")


Unnamed: 0_level_0,mse,dir_acc,ic
variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
baseline_ridge,0.210531,0.481798,0.082537
hybrid_ridge_with_merlin,0.210533,0.481798,0.08252
