In [18]:
import sys, pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Put the parent of the current working directory at the front of sys.path
# (once) so that the project's `src.*` modules can be imported below.
proj_root = pathlib.Path("..").resolve()
root_entry = str(proj_root)
if not any(entry == root_entry for entry in sys.path):
    sys.path.insert(0, root_entry)


In [19]:
from src.dataset import build_dataset
from src.config import FEATURE_COLS, TARGET_COL, TEST_SIZE

# Build the feature frame with every optional external series switched on.
dataset_options = {
    "include_vix": True,
    "include_rates": True,
    "include_usd_index": True,
    "include_sp500": True,
}
df_feat = build_dataset(**dataset_options)

# Quick sanity summary: size, covered date range, and class balance.
date_col = df_feat["date"]
period_start = date_col.min().date()
period_end = date_col.max().date()
target_share = df_feat[TARGET_COL].value_counts(normalize=True).round(3)

print("df_feat shape:", df_feat.shape)
print("Période:", period_start, "→", period_end)
print("Target balance:", target_share)


df_feat shape: (1250, 38)
Période: 2020-12-23 → 2025-10-23
Target balance: y_up
0    0.503
1    0.497
Name: proportion, dtype: float64


In [20]:
# Partition the configured feature names into those present in df_feat and
# those absent from it, preserving the order of FEATURE_COLS.
missing, present = [], []
for col in FEATURE_COLS:
    (present if col in df_feat.columns else missing).append(col)

print("Features manquantes (config mais pas dans df_feat):", missing)

print("\nFeatures utilisées (presentes):")
print(present)


Features manquantes (config mais pas dans df_feat): []

Features utilisées (presentes):
['ret_lag1', 'ret_lag2', 'ret_lag3', 'ret_lag5', 'ret_rollmean_5', 'ret_rollstd_5', 'ret_rollstd_10', 'ret_rollstd_20', 'abs_ret_lag1', 'range_pct', 'mom_10', 'ma20_ratio', 'rsi_14', 'macd_hist', 'dow_sin', 'dow_cos', 'vix_lag1', 'dgs2_lag1', 'dgs10_lag1', 'term_spread_lag1', 'dtwexbgs_ret_lag1', 'sp500_ret_lag1']


In [21]:
# Show the last rows of the price series together with whichever of the
# optional external columns actually made it into df_feat.
optional_cols = ["vix", "dgs2", "dgs10", "term_spread", "dtwexbgs", "sp500"]
cols_show = ["date", "close"] + [
    col for col in optional_cols if col in df_feat.columns
]

df_feat[cols_show].tail(10)


Unnamed: 0,date,close,vix,dgs2,dgs10,term_spread,dtwexbgs,sp500
1240,2025-10-10,1.16128,21.66,3.52,4.05,0.53,121.5217,6552.51
1241,2025-10-13,1.1572,19.03,3.52,4.05,0.53,121.5217,6654.72
1242,2025-10-14,1.16082,20.81,3.48,4.03,0.55,121.5815,6644.31
1243,2025-10-15,1.1644,20.64,3.5,4.05,0.55,121.2669,6671.06
1244,2025-10-16,1.16885,25.31,3.41,3.99,0.58,121.0834,6629.07
1245,2025-10-17,1.16708,20.78,3.46,4.02,0.56,121.1218,6664.01
1246,2025-10-20,1.16449,18.23,3.46,4.0,0.54,121.0394,6735.13
1247,2025-10-21,1.1605,17.87,3.45,3.98,0.53,121.302,6735.35
1248,2025-10-22,1.16128,18.6,3.45,3.97,0.52,121.3075,6699.4
1249,2025-10-23,1.15958,17.3,3.48,4.01,0.53,121.3633,6738.44


In [22]:
# add_features normally already drops NaN rows on FEATURE_COLS;
# the dropna is repeated here as a cheap safety net.
required_cols = [*FEATURE_COLS, TARGET_COL]
df_model = df_feat.dropna(subset=required_cols).copy()

model_target_share = df_model[TARGET_COL].value_counts(normalize=True).round(3)
print("df_model shape:", df_model.shape)
print("Target balance:", model_target_share)


df_model shape: (1250, 38)
Target balance: y_up
0    0.503
1    0.497
Name: proportion, dtype: float64


In [23]:
from src.modeling import train_test_split_time

# Chronological train/test split of the feature matrix and target.
X_train, X_test, y_train, y_test = train_test_split_time(df_model, test_size=TEST_SIZE)

# Re-derive the same split on the full frame so that dates and other
# non-feature columns are available next to the model inputs.
n = len(df_model)
cut = int(n * (1 - TEST_SIZE))
df_train = df_model.iloc[:cut].copy().reset_index(drop=True)
df_test  = df_model.iloc[cut:].copy().reset_index(drop=True)

# The manual cut above is computed independently of train_test_split_time.
# Later cells attach predictions made on X_test to df_test row by row, so
# fail loudly here if the two splits ever stop lining up.
assert len(df_train) == len(X_train), "df_train / X_train length mismatch"
assert len(df_test) == len(X_test), "df_test / X_test length mismatch"
assert np.array_equal(
    np.asarray(y_test), df_test[TARGET_COL].to_numpy()
), "df_test rows are not aligned with y_test"

print("Train:", X_train.shape, "| Test:", X_test.shape)
print("Test période:", df_test["date"].min().date(), "→", df_test["date"].max().date())


Train: (1000, 22) | Test: (250, 22)
Test période: 2024-11-05 → 2025-10-23


In [24]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Baseline 1: always predict the majority class seen in training.
dummy = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
pred = dummy.predict(X_test)
dummy_accuracy = accuracy_score(y_test, pred)
print("Dummy accuracy:", round(dummy_accuracy, 3))

# Baseline 2: stratified dummy with a fixed seed, scored with ROC-AUC on
# its positive-class column.
dummy_s = DummyClassifier(strategy="stratified", random_state=0).fit(X_train, y_train)
proba = dummy_s.predict_proba(X_test)[:, 1]
dummy_auc = roc_auc_score(y_test, proba)
print("Dummy ROC-AUC:", round(dummy_auc, 3))


Dummy accuracy: 0.508
Dummy ROC-AUC: 0.52


In [25]:
from src.modeling import (
    build_logreg_pipeline,
    build_decision_tree,
    build_random_forest,
    build_gradient_boosting,
)
from src.evaluation import evaluate_classifier

# Candidate models, instantiated in a fixed order from the project builders.
model_builders = [
    ("LogReg", build_logreg_pipeline),
    ("DecisionTree", build_decision_tree),
    ("RandomForest", build_random_forest),
    ("GradientBoosting", build_gradient_boosting),
]
models = {label: make_model() for label, make_model in model_builders}

# Fit each candidate on the training window, evaluate it, and keep both the
# (name, accuracy, roc_auc) row and the fitted estimator for later cells.
results, fitted = [], {}
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    acc, auc = evaluate_classifier(
        estimator, X_train, y_train, X_test, y_test, name=label
    )
    results.append((label, acc, auc))
    fitted[label] = estimator

leaderboard = pd.DataFrame(results, columns=["model", "accuracy", "roc_auc"])
leaderboard.sort_values("roc_auc", ascending=False)


=== LogReg ===
Accuracy: 0.468
ROC-AUC : 0.492
Confusion matrix:
 [[37 90]
 [43 80]]

Classification report:
               precision    recall  f1-score   support

           0      0.463     0.291     0.357       127
           1      0.471     0.650     0.546       123

    accuracy                          0.468       250
   macro avg      0.467     0.471     0.452       250
weighted avg      0.466     0.468     0.450       250

=== DecisionTree ===
Accuracy: 0.476
ROC-AUC : 0.474
Confusion matrix:
 [[53 74]
 [57 66]]

Classification report:
               precision    recall  f1-score   support

           0      0.482     0.417     0.447       127
           1      0.471     0.537     0.502       123

    accuracy                          0.476       250
   macro avg      0.477     0.477     0.475       250
weighted avg      0.477     0.476     0.474       250

=== RandomForest ===
Accuracy: 0.544
ROC-AUC : 0.534
Confusion matrix:
 [[64 63]
 [51 72]]

Classification report:
     

Unnamed: 0,model,accuracy,roc_auc
2,RandomForest,0.544,0.533513
0,LogReg,0.468,0.491838
3,GradientBoosting,0.476,0.480763
1,DecisionTree,0.476,0.474489


In [26]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score

# Walk-forward cross-validation of the logistic-regression pipeline.
#
# Read from df_model (the NaN-filtered frame used by the train/test split)
# rather than df_feat, so this cell sees the same rows as the rest of the
# modelling and cannot be handed missing values.
# NOTE: the folds span the whole sample, including the hold-out window used
# for the test metrics, so these scores are a stability check and not an
# independent estimate.
X = df_model[FEATURE_COLS].to_numpy()
y = df_model[TARGET_COL].to_numpy()

tscv = TimeSeriesSplit(n_splits=5)
scores = []

m = build_logreg_pipeline()
for tr_idx, va_idx in tscv.split(X):
    # Each fold refits the pipeline on the train indices and scores ROC-AUC
    # on the validation indices of that fold.
    m.fit(X[tr_idx], y[tr_idx])
    # Fold-local name: avoids overwriting the `proba` of the dummy baseline.
    fold_proba = m.predict_proba(X[va_idx])[:, 1]
    scores.append(roc_auc_score(y[va_idx], fold_proba))

print("LogReg CV ROC-AUC:", np.round(scores, 3))
print("Mean:", round(float(np.mean(scores)), 3))


LogReg CV ROC-AUC: [0.522 0.531 0.574 0.445 0.503]
Mean: 0.515


In [27]:
from src.modeling import grid_search_timeseries
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter search for a random forest, run on the training window
# only (X_train / y_train) through the project's grid_search_timeseries helper.
rf_base = RandomForestClassifier(random_state=0, n_jobs=-1)

param_grid_rf = {
    "n_estimators": [200, 400],
    "max_depth": [4, 6, 8],
    "min_samples_leaf": [20, 50],
}

rf_best, rf_params, rf_cvscore = grid_search_timeseries(
    rf_base, param_grid_rf, X_train, y_train, n_splits=5
)

print("Best RF params:", rf_params)
print("CV ROC-AUC:", round(rf_cvscore, 3))

tuned_label = "RandomForest tuned"
acc_rf_best, auc_rf_best = evaluate_classifier(
    rf_best, X_train, y_train, X_test, y_test, name=tuned_label
)

# Keep the cell idempotent: drop any row left by a previous run of this cell
# before appending, otherwise each re-run adds a duplicate entry to `results`.
results = [row for row in results if row[0] != tuned_label]
results.append((tuned_label, acc_rf_best, auc_rf_best))
fitted[tuned_label] = rf_best

pd.DataFrame(results, columns=["model", "accuracy", "roc_auc"]).sort_values("roc_auc", ascending=False)


Best RF params: {'max_depth': 8, 'min_samples_leaf': 50, 'n_estimators': 200}
CV ROC-AUC: 0.564
=== RandomForest tuned ===
Accuracy: 0.544
ROC-AUC : 0.536
Confusion matrix:
 [[67 60]
 [54 69]]

Classification report:
               precision    recall  f1-score   support

           0      0.554     0.528     0.540       127
           1      0.535     0.561     0.548       123

    accuracy                          0.544       250
   macro avg      0.544     0.544     0.544       250
weighted avg      0.544     0.544     0.544       250



Unnamed: 0,model,accuracy,roc_auc
4,RandomForest tuned,0.544,0.536265
2,RandomForest,0.544,0.533513
0,LogReg,0.468,0.491838
3,GradientBoosting,0.476,0.480763
1,DecisionTree,0.476,0.474489


In [28]:
# Pick the entry with the highest ROC-AUC (third field of each results row)
# and fetch the corresponding fitted estimator.
best_entry = max(results, key=lambda entry: entry[2])
best_name = best_entry[0]
best_model = fitted[best_name]
print("Best model (by TEST ROC-AUC):", best_name)


Best model (by TEST ROC-AUC): RandomForest tuned


In [30]:
from sklearn.metrics import roc_auc_score

# Positive-class probability of the selected model on the test features,
# stored next to the dates and targets of the test frame.
proba_test = best_model.predict_proba(X_test)[:, 1]
df_test = df_test.assign(proba_up=proba_test)

preview_cols = ["date", TARGET_COL, "proba_up"]
df_test[preview_cols].head()


Unnamed: 0,date,y_up,proba_up
0,2024-11-05,0,0.530613
1,2024-11-06,1,0.516384
2,2024-11-07,0,0.556379
3,2024-11-08,0,0.554542
4,2024-11-11,0,0.555941


In [34]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

# Per-calendar-year ROC-AUC of the selected model on the hold-out segment.

# Hold-out segment: same frame (df_model) and same cut formula as the
# train/test split cell, instead of a hard-coded 0.8 on df_feat.
cut = int(len(df_model) * (1 - TEST_SIZE))
df_test = df_model.iloc[cut:].copy().reset_index(drop=True)

# Probabilities from the selected model. The original cell referenced `best`,
# which no visible cell defines; `best_model` is the estimator chosen in the
# model-selection cell.
df_test["proba_up"] = best_model.predict_proba(df_test[FEATURE_COLS].values)[:, 1]
df_test["year"] = df_test["date"].dt.year

# Quick check: how many observations fall in each year?
print("Nb obs par année (TEST):")
print(df_test["year"].value_counts().sort_index())

# Minimum number of rows a year needs before its AUC is reported.
min_obs_per_year = 20

by_year = []
# Loop names are deliberately distinct from the notebook globals `y` and `n`
# so this cell no longer overwrites them.
for year, year_rows in df_test.groupby("year"):
    n_obs = len(year_rows)

    # ROC-AUC is undefined when a year contains a single class, and not
    # reported when it has too few rows; use NaN in both cases.
    has_both_classes = year_rows[TARGET_COL].nunique() == 2
    if n_obs >= min_obs_per_year and has_both_classes:
        year_auc = roc_auc_score(
            year_rows[TARGET_COL].values, year_rows["proba_up"].values
        )
    else:
        year_auc = np.nan

    by_year.append((year, year_auc, n_obs))

df_auc_year = pd.DataFrame(by_year, columns=["year", "roc_auc", "n"]).sort_values("year")
df_auc_year


Nb obs par année (TEST):
year
2024     40
2025    210
Name: count, dtype: int64


Unnamed: 0,year,roc_auc,n
0,2024,0.607143,40
1,2025,0.548279,210
