# 06 — Multi-asset models comparison (ML vs ML+MerLin)

Objectif :
1) Comparer plusieurs modèles **classiques** (Ridge / RandomForest / GBRT) sur **tous les actifs** (toutes les feuilles Excel).
2) Ajouter une comparaison **QML hybride** via **MerLin** (embedding) sur une liste d'actifs ciblés.

Métriques :
- **IC (Spearman)** : métrique principale en finance
- Directional accuracy
- MSE (sur score)

⚠️ Notes :
- Validation temporelle (TimeSeriesSplit)
- Scaling du score [-1,1] calibré **sur le train uniquement**
- MerLin appliqué sur **peu de features** (3–5 max)


In [9]:
import sys
from pathlib import Path

# Assumes this notebook is inside <project>/notebooks/
ROOT = Path.cwd().parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

import numpy as np
import pandas as pd

from utils import get_logger
logger = get_logger("multi_asset", log_file=str(ROOT/"logs"/"multi_asset.log"))

logger.info("ROOT=%s", ROOT)
logger.info("SRC=%s (exists=%s)", SRC, SRC.exists())


2025-12-16 15:11:50,856 | INFO | multi_asset | ROOT=c:\Users\fayca\Downloads\hackathon_gold_project\hackathon_gold_project
2025-12-16 15:11:50,859 | INFO | multi_asset | SRC=c:\Users\fayca\Downloads\hackathon_gold_project\hackathon_gold_project\src (exists=True)


## 1) Charger la liste des actifs (feuilles Excel)

In [10]:
import pandas as pd

XLSX = ROOT / "dataset_train.xlsx"
assert XLSX.exists(), f"Excel not found: {XLSX}"

xls = pd.ExcelFile(XLSX, engine="openpyxl")
assets = xls.sheet_names
logger.info("Found %d sheets/assets: %s", len(assets), assets)

assets


2025-12-16 15:11:53,134 | INFO | multi_asset | Found 9 sheets/assets: ['MSCI world', 'S&P 500 index', 'Euro stoxx 50', 'Dax', 'CAC 40', 'Gold', 'Pétrole', 'Dollar Index', 'EURUSD']


['MSCI world',
 'S&P 500 index',
 'Euro stoxx 50',
 'Dax',
 'CAC 40',
 'Gold',
 'Pétrole',
 'Dollar Index',
 'EURUSD']

## 2) Imports du pipeline (features / labels / splits / modèles / MerLin)

In [11]:
from data import load_ohlc_from_xlsx
from features import build_features
from labels import add_target_20d_score, fit_score_scaler
from split import time_series_splits
from models import get_baselines
from metrics import mse, directional_accuracy, information_coefficient

from merlin_embedder import MerlinEmbedder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge


## 3) Paramètres d'expérience

In [12]:
N_SPLITS = 5
MIN_ROWS = 300  # ignorer les actifs avec trop peu d'historique

# QML : cibler 1-2 actifs max
default_qml = []
for name in ['MSCI world', 'S&P 500 index', 'Euro stoxx 50', 'Dax', 'CAC 40', 'Gold', 'Pétrole', 'Dollar Index', 'EURUSD']:
    if name in assets:
        default_qml.append(name)
if len(default_qml) < 2:
    default_qml = assets[:2]

QML_ASSETS = default_qml

# Features QML (3–5 max)
QML_FEATURES = ["mom_20", "ret_vol_20", "rsi_14", "atr_rel_14"]

logger.info("QML_ASSETS=%s", QML_ASSETS)
logger.info("QML_FEATURES=%s", QML_FEATURES)


2025-12-16 15:12:00,171 | INFO | multi_asset | QML_ASSETS=['MSCI world', 'S&P 500 index', 'Euro stoxx 50', 'Dax', 'CAC 40', 'Gold', 'Pétrole', 'Dollar Index', 'EURUSD']
2025-12-16 15:12:00,173 | INFO | multi_asset | QML_FEATURES=['mom_20', 'ret_vol_20', 'rsi_14', 'atr_rel_14']


## 4) Boucle multi-actifs : ML classique + variante MerLin

In [13]:
all_results = []

for asset in assets:
    try:
        df = load_ohlc_from_xlsx(str(XLSX), sheet_name=asset)
        df = add_target_20d_score(build_features(df), horizon=20)
        df = df.dropna().reset_index(drop=True)

        if len(df) < MIN_ROWS:
            logger.warning("Skip %s (rows=%d < MIN_ROWS=%d)", asset, len(df), MIN_ROWS)
            continue

        exclude = {"Date","Open","High","Low","Close","fut_ret_20","y_score"}
        feature_cols = [c for c in df.columns if c not in exclude]
        X = df[feature_cols].to_numpy()
        fut = df["fut_ret_20"].to_numpy()

        baselines = get_baselines()

        do_qml = asset in QML_ASSETS
        qml_cols = [c for c in QML_FEATURES if c in df.columns]
        X_qml = df[qml_cols].to_numpy() if do_qml and len(qml_cols) >= 2 else None

        for split_id, (tr, te) in enumerate(time_series_splits(len(df), n_splits=N_SPLITS, embargo=0), start=1):
            scale = fit_score_scaler(pd.Series(fut[tr]), std_mult=2.0)
            y_tr = np.clip(fut[tr] / scale, -1.0, 1.0)
            y_te = np.clip(fut[te] / scale, -1.0, 1.0)

            # Classic models
            for spec in baselines:
                model = spec.model
                model.fit(X[tr], y_tr)
                pred = model.predict(X[te])

                all_results.append({
                    "asset": asset,
                    "model": spec.name,
                    "variant": "classic",
                    "split": split_id,
                    "ic": information_coefficient(y_te, pred),
                    "dir_acc": directional_accuracy(y_te, pred),
                    "mse": mse(y_te, pred),
                })

            # MerLin hybrid (ridge on embedding)
            if do_qml and X_qml is not None:
                emb = MerlinEmbedder(n_qubits=min(4, X_qml.shape[1]), n_features_in=X_qml.shape[1])
                Z_tr = emb.fit_transform(X_qml[tr], y_tr)
                Z_te = emb.transform(X_qml[te])

                qml_model = Pipeline([("scaler", StandardScaler()), ("model", Ridge(alpha=1.0))])
                qml_model.fit(Z_tr, y_tr)
                pred_q = qml_model.predict(Z_te)

                all_results.append({
                    "asset": asset,
                    "model": "ridge_merlin",
                    "variant": "qml",
                    "split": split_id,
                    "ic": information_coefficient(y_te, pred_q),
                    "dir_acc": directional_accuracy(y_te, pred_q),
                    "mse": mse(y_te, pred_q),
                    "qml_features": ",".join(qml_cols),
                })

        logger.info("Done asset=%s (rows=%d, features=%d, qml=%s)", asset, len(df), len(feature_cols), do_qml)

    except Exception as e:
        logger.exception("Asset failed: %s | err=%s", asset, e)

results_all = pd.DataFrame(all_results)
logger.info("Total rows in results: %d", len(results_all))
results_all.head()


2025-12-16 15:12:04,174 | INFO | data | Loading sheet=MSCI world from c:\Users\fayca\Downloads\hackathon_gold_project\hackathon_gold_project\dataset_train.xlsx


2025-12-16 15:12:09,395 | INFO | data | Loaded 12387 rows, columns=['Date', 'Open', 'High', 'Low', 'Close', 'smavg_50', 'smavg_100', 'smavg_240']
2025-12-16 15:12:09,399 | INFO | features | Building features...
2025-12-16 15:12:09,468 | INFO | features | Features built. Total columns=34
2025-12-16 15:12:09,520 | INFO | labels | Fitted score scale=0.079461 (std_mult=2.00, std=0.039730)
2025-12-16 15:12:21,041 | INFO | labels | Fitted score scale=0.082479 (std_mult=2.00, std=0.041239)
2025-12-16 15:12:36,919 | INFO | labels | Fitted score scale=0.078186 (std_mult=2.00, std=0.039093)
2025-12-16 15:13:01,600 | INFO | labels | Fitted score scale=0.081490 (std_mult=2.00, std=0.040745)
2025-12-16 15:13:35,786 | INFO | labels | Fitted score scale=0.085597 (std_mult=2.00, std=0.042798)
2025-12-16 15:14:19,800 | INFO | multi_asset | Done asset=MSCI world (rows=12120, features=29, qml=True)
2025-12-16 15:14:19,801 | INFO | data | Loading sheet=S&P 500 index from c:\Users\fayca\Downloads\hackathon

Unnamed: 0,asset,model,variant,split,ic,dir_acc,mse,qml_features
0,MSCI world,ridge,classic,1,0.058053,0.430693,0.685453,
1,MSCI world,random_forest,classic,1,0.008372,0.394554,0.946715,
2,MSCI world,gbrt,classic,1,-0.075678,0.373762,1.5052,
3,MSCI world,ridge_merlin,qml,1,0.080159,0.594059,0.233141,"mom_20,ret_vol_20,rsi_14,atr_rel_14"
4,MSCI world,ridge,classic,2,-0.053449,0.568317,0.200035,


## 5) Résumé (moyenne sur splits)

In [14]:
summary = (
    results_all
    .groupby(["asset","model","variant"])
    .agg(ic=("ic","mean"), dir_acc=("dir_acc","mean"), mse=("mse","mean"))
    .reset_index()
    .sort_values(["asset","ic"], ascending=[True, False])
)
summary


Unnamed: 0,asset,model,variant,ic,dir_acc,mse
0,CAC 40,gbrt,classic,0.104343,0.540435,0.36738
1,CAC 40,random_forest,classic,0.054719,0.524883,0.410935
3,CAC 40,ridge_merlin,qml,0.047535,0.559409,0.206009
2,CAC 40,ridge,classic,0.013629,0.471073,0.705627
7,Dax,ridge_merlin,qml,0.060737,0.576441,0.250892
5,Dax,random_forest,classic,0.030571,0.478231,0.442123
4,Dax,gbrt,classic,0.021556,0.468191,0.522334
6,Dax,ridge,classic,-0.010995,0.465308,0.480299
9,Dollar Index,random_forest,classic,0.106363,0.50095,0.313029
8,Dollar Index,gbrt,classic,0.093039,0.503948,0.365078


## 6) Comparaison Classic vs QML sur les actifs ciblés

In [15]:
qml_compare = summary[summary["asset"].isin(QML_ASSETS)].copy()
qml_compare.sort_values(["asset","variant","ic"], ascending=[True, True, False])


Unnamed: 0,asset,model,variant,ic,dir_acc,mse
0,CAC 40,gbrt,classic,0.104343,0.540435,0.36738
1,CAC 40,random_forest,classic,0.054719,0.524883,0.410935
2,CAC 40,ridge,classic,0.013629,0.471073,0.705627
3,CAC 40,ridge_merlin,qml,0.047535,0.559409,0.206009
5,Dax,random_forest,classic,0.030571,0.478231,0.442123
4,Dax,gbrt,classic,0.021556,0.468191,0.522334
6,Dax,ridge,classic,-0.010995,0.465308,0.480299
7,Dax,ridge_merlin,qml,0.060737,0.576441,0.250892
9,Dollar Index,random_forest,classic,0.106363,0.50095,0.313029
8,Dollar Index,gbrt,classic,0.093039,0.503948,0.365078


## 7) Export des résultats

In [16]:
out_dir = ROOT / "outputs"
out_dir.mkdir(exist_ok=True)

raw_path = out_dir / "results_all_assets_models_with_merlin_raw.csv"
sum_path = out_dir / "results_all_assets_models_with_merlin_summary.csv"

results_all.to_csv(raw_path, index=False)
summary.to_csv(sum_path, index=False)

logger.info("Exported raw: %s", raw_path)
logger.info("Exported summary: %s", sum_path)

(raw_path, sum_path)


2025-12-16 15:30:30,114 | INFO | multi_asset | Exported raw: c:\Users\fayca\Downloads\hackathon_gold_project\hackathon_gold_project\outputs\results_all_assets_models_with_merlin_raw.csv
2025-12-16 15:30:30,117 | INFO | multi_asset | Exported summary: c:\Users\fayca\Downloads\hackathon_gold_project\hackathon_gold_project\outputs\results_all_assets_models_with_merlin_summary.csv


(WindowsPath('c:/Users/fayca/Downloads/hackathon_gold_project/hackathon_gold_project/outputs/results_all_assets_models_with_merlin_raw.csv'),
 WindowsPath('c:/Users/fayca/Downloads/hackathon_gold_project/hackathon_gold_project/outputs/results_all_assets_models_with_merlin_summary.csv'))

## Notes soutenance
- IC est la métrique principale.
- Si `ridge_merlin` ≈ `ridge` : résultat neutre (OK).
- Si `ridge_merlin` > `ridge` sur 1-2 actifs : petit gain (bonus).
- Si GBRT reste meilleur : conclusion honnête (classique très compétitif).
