In [54]:
# --- Import de packages ---

import pandas as pd
import taceconomics
from datetime import datetime
import numpy as np

from gdeltdoc import GdeltDoc, Filters
from datetime import datetime, timedelta
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [None]:
# --- API Key et dates ---

taceconomics.api_key = "sk_o24BhJRqVpIvxVSXX3yiKRGgpDEejmyJ8pfLFX2q22s"  
start_date = '2023-01-01'
end_date = datetime.today().strftime("%Y-%m-%d")

In [59]:
# --- Importation des données ---

# Taux de change EUR/USD
usd_eur = taceconomics.getdata(f"EXR/EUR/WLD?start_date={start_date}")
usd_eur.columns = ["usd_eur"]
usd_eur = usd_eur.dropna()
eur_usd = 1 / usd_eur  # Inversion pour avoir EUR/USD
eur_usd.columns = ["close"]
eur_usd.index = pd.to_datetime(eur_usd.index)

print(eur_usd.head(10))
print(eur_usd.shape)
print(eur_usd.columns)

               close
timestamp           
2023-05-04  1.102256
2023-05-05  1.121026
2023-05-06  1.121026
2023-05-07  1.101831
2023-05-08  1.099578
2023-05-09  1.096431
2023-05-10  1.098370
2023-05-11  1.091179
2023-05-12  1.093967
2023-05-13  1.093967
(874, 1)
Index(['close'], dtype='object')


In [None]:
# --- Indicateurs de base ---

# Rendement logarithmique
eur_usd["rendement_log"] = np.log(eur_usd["close"] / eur_usd["close"].shift(1))

# Volatilité glissante (30 jours) basée sur les rendements log
eur_usd["vol_30j"] = eur_usd["rendement_log"].rolling(window=30).std()

# Rendement logarithmique cumulé à 10 jours
eur_usd["rendement_log_10j"] = np.log(eur_usd["close"] / eur_usd["close"].shift(10))

# print(eur_usd.head(60))
print(eur_usd.shape)
print(eur_usd.columns)

(874, 4)
Index(['close', 'rendement_log', 'vol_30j', 'rendement_log_10j'], dtype='object')


In [61]:
# --- Cible --- 

# Rendement futur à 10 jours (≈ 2 semaines)
eur_usd["rendement_futur_10j"] = eur_usd["close"].shift(-10).pct_change(periods=10)

# Seuils de choc dynamiques (±2 × volatilité)
seuil_haut = 2 * eur_usd["vol_30j"]
seuil_bas = -2 * eur_usd["vol_30j"]

# Target : 1 si choc de volatilité (hausse/baisse), 0 sinon
eur_usd["target"] = (
    (eur_usd["rendement_futur_10j"] >= seuil_haut) |
    (eur_usd["rendement_futur_10j"] <= seuil_bas)
).astype(int)

# print(eur_usd.head(60))
print(eur_usd.shape)
print(eur_usd.columns)

(874, 6)
Index(['close', 'rendement_log', 'vol_30j', 'rendement_log_10j',
       'rendement_futur_10j', 'target'],
      dtype='object')


  eur_usd["rendement_futur_10j"] = eur_usd["close"].shift(-10).pct_change(periods=10)


In [62]:
# --- Indicateurs techniques ---

# Moyennes mobiles
eur_usd["mm7"] = eur_usd["close"].rolling(window=7).mean()
eur_usd["mm21"] = eur_usd["close"].rolling(window=21).mean()

# Bandes de Bollinger (20 jours)
rolling_mean_20 = eur_usd["close"].rolling(window=20).mean()
rolling_std_20 = eur_usd["close"].rolling(window=20).std()
eur_usd["boll_haut"] = rolling_mean_20 + 2 * rolling_std_20
eur_usd["boll_bas"] = rolling_mean_20 - 2 * rolling_std_20

# RSI
def calculer_rsi(series, window=14):
    delta = series.diff()
    gain = np.where(delta > 0, delta, 0)
    perte = np.where(delta < 0, -delta, 0)
    avg_gain = pd.Series(gain, index=series.index).rolling(window=window).mean()
    avg_perte = pd.Series(perte, index=series.index).rolling(window=window).mean()
    rs = avg_gain / avg_perte
    rsi = 100 - (100 / (1 + rs))
    return rsi

eur_usd["rsi_14j"] = calculer_rsi(eur_usd["close"])

# --- Nettoyage final ---

eur_usd = eur_usd.drop(columns=["rendement_futur_10j"])
eur_usd = eur_usd.dropna().copy()

# print(eur_usd.head(60))
print(eur_usd.shape)
print(eur_usd.columns)
print(eur_usd.iloc[0])

(844, 10)
Index(['close', 'rendement_log', 'vol_30j', 'rendement_log_10j', 'target',
       'mm7', 'mm21', 'boll_haut', 'boll_bas', 'rsi_14j'],
      dtype='object')
close                 1.072559
rendement_log         0.000000
vol_30j               0.005411
rendement_log_10j    -0.002793
target                0.000000
mm7                   1.072378
mm21                  1.077441
boll_haut             1.088046
boll_bas              1.066056
rsi_14j              33.354206
Name: 2023-06-03 00:00:00, dtype: float64


In [63]:
eur_usd.to_excel("data/test_eur_usd_data.xlsx")

In [68]:
# --- Indicateurs de sentiment ---

# Télécharger VADER (une seule fois)
nltk.download('vader_lexicon')

# Initialiser VADER globalement
sid = SentimentIntensityAnalyzer()

def recuperer_sentiment_gdelt(start_date, end_date, keyword="EUR/USD", language='eng', chunk_days=30, num_records=250):
    """
    Récupère les articles GDELT pour un mot-clé donné entre deux dates,
    calcule le score de sentiment pour chaque article, et agrège par jour.
    """
    # Convertir les dates
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")

    gd = GdeltDoc()
    all_articles = []

    current_start = start
    while current_start <= end:
        current_end = min(current_start + timedelta(days=chunk_days), end)

        # print(f"Récupération articles: {current_start.date()} -> {current_end.date()}")

        # Créer filtre
        f = Filters(
            start_date=current_start.strftime("%Y-%m-%d"),
            end_date=current_end.strftime("%Y-%m-%d"),
            num_records=num_records,
            keyword=keyword,
            language=language
        )

        # Récupérer articles
        try:
            articles_df = gd.article_search(f)
            if not articles_df.empty:
                all_articles.append(articles_df)
        except Exception as e:
            print(f"Erreur récupération: {e}")

        current_start = current_end + timedelta(days=1)

    # Vérifier si on a récupéré des articles
    if not all_articles:
        print("Aucun article trouvé pour la période.")
        return pd.DataFrame(columns=['date', 'sentiment'])

    df = pd.concat(all_articles, ignore_index=True)

    # Parsing date GDELT
    def convertir_date_gdelt(date_str):
        try:
            # Format AAAAMMJJTHHMMSSZ ou AAAAMMJJ
            date_part = date_str.split('T')[0]
            return datetime.strptime(date_part, "%Y%m%d").date()
        except:
            return pd.NaT

    df['date'] = df['seendate'].apply(convertir_date_gdelt)
    df = df.dropna(subset=['date'])
    df['date'] = pd.to_datetime(df['date'])

    # Calculer sentiment pour chaque article
    def calculer_sentiment(texte):
        if isinstance(texte, str) and texte.strip():
            return sid.polarity_scores(texte)['compound']
        return 0

    df['sentiment'] = df['title'].apply(calculer_sentiment)

    # Agréger par jour : score moyen par jour
    df_daily = df.groupby('date')['sentiment'].mean().reset_index()

    return df_daily

start_date = eur_usd.iloc[0].name.strftime("%Y-%m-%d")
end_date = eur_usd.iloc[-1].name.strftime("%Y-%m-%d")

print(f"Récupération sentiment GDELT de {start_date} à {end_date}")

df_sentiment = recuperer_sentiment_gdelt(
    start_date=start_date,
    end_date=end_date,
    keyword="EUR/USD",
    language='eng',
    chunk_days=30,
    num_records=250
)

df_sentiment = df_sentiment.groupby('date')['sentiment'].mean()
df_sentiment.index = pd.to_datetime(df_sentiment.index)

print(df_sentiment.head(10))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/noe/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Récupération sentiment GDELT de 2023-06-03 à 2025-10-05
date
2023-06-03    0.283725
2023-06-05   -0.116700
2023-06-06   -0.440400
2023-06-07   -0.057000
2023-06-08   -0.256643
2023-06-09    0.017650
2023-06-10    0.000000
2023-06-12   -0.160760
2023-06-13   -0.027826
2023-06-14   -0.018286
Name: sentiment, dtype: float64


In [69]:
# --- Fusion des données de sentiment avec les données EUR/USD ---

eur_usd = eur_usd.merge(df_sentiment.rename("sentiment"), left_index=True, right_index=True, how='left')

# Remplir les NaN si aucun article
eur_usd['sentiment'] = eur_usd['sentiment'].fillna(0)

print(eur_usd.head(10))

               close  rendement_log   vol_30j  rendement_log_10j  target  \
timestamp                                                                  
2023-06-03  1.072559       0.000000  0.005411          -0.002793       0   
2023-06-04  1.069745      -0.002626  0.004245          -0.002487       1   
2023-06-05  1.071145       0.001308  0.004268          -0.001855       1   
2023-06-06  1.069770      -0.001285  0.003061          -0.003139       1   
2023-06-07  1.070710       0.000878  0.003072          -0.001455       1   
2023-06-08  1.078237       0.007006  0.003367           0.006984       1   
2023-06-09  1.076538      -0.001577  0.003343           0.003166       1   
2023-06-10  1.076538       0.000000  0.003153           0.006702       1   
2023-06-11  1.075061      -0.001373  0.003105          -0.000914       1   
2023-06-12  1.076264       0.001119  0.003119           0.003449       1   

                 mm7      mm21  boll_haut  boll_bas    rsi_14j  sentiment  
timestamp  

In [70]:
# --- Variables PCA & Clustering ---

# Copie sécurisée
df = eur_usd.copy().dropna().reset_index()

# Sélection des features quantitatives 
# On exclut les colonnes non numériques ou non pertinentes
features = df.drop(columns=['target'], errors='ignore').select_dtypes(include=[np.number])

# Standardisation 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# PCA (réduction de dimension) 
# Garde assez de composantes pour expliquer 90 % de la variance
pca = PCA(n_components=0.9, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Créer un DataFrame avec les composantes principales
df_pca = pd.DataFrame(
    X_pca,
    columns=[f'PC{i+1}' for i in range(X_pca.shape[1])],
    index=df.index
)

# KMeans (clustering sur l’espace PCA) 
# Trouve des structures cachées dans les données
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
df['cluster_kmeans'] = kmeans.fit_predict(X_pca)

# Fusion finale 
df_extended = pd.concat([df, df_pca], axis=1)

# Résumé 
print(f"Variance expliquée par PCA : {pca.explained_variance_ratio_.sum():.2%}")
print(f"Nombre de composantes PCA retenues : {pca.n_components_}")
print(f"Forme finale du DataFrame : {df_extended.shape}")
print(df_extended.head(10))
print(df_extended.columns)

Variance expliquée par PCA : 97.37%
Nombre de composantes PCA retenues : 5
Forme finale du DataFrame : (844, 18)
   timestamp     close  rendement_log   vol_30j  rendement_log_10j  target  \
0 2023-06-03  1.072559       0.000000  0.005411          -0.002793       0   
1 2023-06-04  1.069745      -0.002626  0.004245          -0.002487       1   
2 2023-06-05  1.071145       0.001308  0.004268          -0.001855       1   
3 2023-06-06  1.069770      -0.001285  0.003061          -0.003139       1   
4 2023-06-07  1.070710       0.000878  0.003072          -0.001455       1   
5 2023-06-08  1.078237       0.007006  0.003367           0.006984       1   
6 2023-06-09  1.076538      -0.001577  0.003343           0.003166       1   
7 2023-06-10  1.076538       0.000000  0.003153           0.006702       1   
8 2023-06-11  1.075061      -0.001373  0.003105          -0.000914       1   
9 2023-06-12  1.076264       0.001119  0.003119           0.003449       1   

        mm7      mm21  boll_

In [80]:
#----Variables macro----

start_date = df_extended["timestamp"].iloc[0].strftime("%Y-%m-%d")
print(start_date)
end_date = df_extended["timestamp"].iloc[-1].strftime("%Y-%m-%d")
print(end_date)

print(f"Récupération variables macro de {start_date} à {end_date}")

# Inflation
infl_eur = taceconomics.getdata(f"EUROSTAT/EI_CPHI_M_CP-HI00_NSA_HICP2015/EUZ?collapse=D&transform=growth_yoy&start_date={start_date}")
infl_us = taceconomics.getdata(f"FRED/CPIAUCSL/USA?collapse=D&transform=growth_yoy&start_date={start_date}")

 # Taux interets
ti_eur = taceconomics.getdata(f"ECB/FM_D_EUR_4F_KR_DFR_LEV/EUZ?collapse=D&collapse_mode=end_of_period&start_date={start_date}")
ti_us = taceconomics.getdata(f"DS/USPRATE./WLD?collapse=D&start_date={start_date}")

# Dataframe macro
df_macro = pd.DataFrame()
df_macro["inflation_eur"] = infl_eur
df_macro["inflation_us"] = infl_us
df_macro["interest_rate_eur"] = ti_eur
df_macro["interest_rate_us"] = ti_us

df_macro = df_macro.ffill()

print(df_macro.tail(10))
print(df_macro.shape)

2023-06-03
2025-10-05
Récupération variables macro de 2023-06-03 à 2025-10-05
            inflation_eur  inflation_us  interest_rate_eur  interest_rate_us
timestamp                                                                   
2025-09-21       2.227488       2.93922                2.0              4.25
2025-09-22       2.227488       2.93922                2.0              4.25
2025-09-23       2.227488       2.93922                2.0              4.25
2025-09-24       2.227488       2.93922                2.0              4.25
2025-09-25       2.227488       2.93922                2.0              4.25
2025-09-26       2.227488       2.93922                2.0              4.25
2025-09-27       2.227488       2.93922                2.0              4.25
2025-09-28       2.227488       2.93922                2.0              4.25
2025-09-29       2.227488       2.93922                2.0              4.25
2025-09-30       2.227488       2.93922                2.0              4.2

In [61]:
df_final = pd.concat([df_extended, df_macro], axis=1)
print(df_final.tail(10))
print(df_final.columns)

           timestamp  eur_usd  taux_croissance  vol  rendement_10j  \
2025-09-21       NaT      NaN              NaN  NaN            NaN   
2025-09-22       NaT      NaN              NaN  NaN            NaN   
2025-09-23       NaT      NaN              NaN  NaN            NaN   
2025-09-24       NaT      NaN              NaN  NaN            NaN   
2025-09-25       NaT      NaN              NaN  NaN            NaN   
2025-09-26       NaT      NaN              NaN  NaN            NaN   
2025-09-27       NaT      NaN              NaN  NaN            NaN   
2025-09-28       NaT      NaN              NaN  NaN            NaN   
2025-09-29       NaT      NaN              NaN  NaN            NaN   
2025-09-30       NaT      NaN              NaN  NaN            NaN   

            seuil_haut  seuil_bas  target  moyenne_mobile_7j  \
2025-09-21         NaN        NaN     NaN                NaN   
2025-09-22         NaN        NaN     NaN                NaN   
2025-09-23         NaN        NaN    

In [87]:
# MODELISATION

# xgb_single_model_timeseries.py
"""
XGBoost focalisé pour prédiction d'un choc à 2 semaines sur EUR/USD.
Sorties :
- AUC, Gini, courbe ROC (test),
- seuil optimal (Youden) et matrice de confusion + métriques associées,
- modèle final enregistré (joblib).
"""

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from datetime import datetime

# -----------------------
# Config 
# -----------------------
RANDOM_STATE = 42
# Fractions pour split contigu (doivent sommer à 1.0)
TRAIN_FRAC = 0.70
VAL_FRAC   = 0.15
TEST_FRAC  = 0.15

# TimeSeries CV splits (pour GridSearch)
TS_SPLITS = 5

# Grid search params 
XGB_PARAM_GRID = {
    'n_estimators': [100, 300],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Metric d'optimisation pour la recherche d'hyperparamètres (ici F1, tu peux changer en 'roc_auc')
GRID_SCORING = 'f1'

# Early stopping rounds pour re-entrainement final
EARLY_STOPPING_ROUNDS = 50

# Fichiers de sortie
MODEL_OUTPATH = "xgb_final_model.joblib"
IMPUTER_OUTPATH = "imputer.joblib"

# ===============================
# Décalage de la variable cible pour prédire le choc à 2 semaines
# ===============================

SHIFT_DAYS = 10  # environ 2 semaines ouvrées

# On crée une copie pour éviter les erreurs d’alignement
df_final = df_extended.copy()

# Décaler la target vers le passé : les features du jour t servent à prédire le choc à t+10
df_final["target_future"] = df_final["target"].shift(-SHIFT_DAYS)

# Supprimer les lignes où la cible future est manquante (en fin de série)
df_final = df_final.dropna(subset=["target_future"])

# Définir X (features actuelles) et y (choc futur)
X = df_final.drop(columns=["target", "target_future"])  # on garde uniquement les variables explicatives
y = df_final["target_future"].astype(int)

print("✅ Variable cible correctement décalée de 2 semaines.")
print(f"Shape finale : {X.shape}, target positive rate = {y.mean():.3f}")

# -----------------------
# Chargement & checks
# -----------------------
print("Chargement des données...")

df = df_final.copy().dropna().sort_values("timestamp").reset_index(drop=True)

# Features & target
X = df.drop(columns=["target", "target_future"])
y = df["target_future"].astype(int)

# On suppose tout numérique — sinon adapter types/catégoriques
num_cols = X.columns.tolist()

# -----------------------
# Split contigu (train / val / test)
# -----------------------
n = len(df)
if not abs(TRAIN_FRAC + VAL_FRAC + TEST_FRAC - 1.0) < 1e-8:
    raise ValueError("TRAIN_FRAC + VAL_FRAC + TEST_FRAC doit être égal à 1.0")

train_end = int(n * TRAIN_FRAC)
val_end = train_end + int(n * VAL_FRAC)

X_train = X.iloc[:train_end].copy()
y_train = y.iloc[:train_end].copy()

X_val = X.iloc[train_end:val_end].copy()
y_val = y.iloc[train_end:val_end].copy()

X_test = X.iloc[val_end:].copy()
y_test = y.iloc[val_end:].copy()

print(f"Tailles -> train: {X_train.shape}, val: {X_val.shape}, test: {X_test.shape}")

# -----------------------
# Imputation (median) - fit uniquement sur train
# -----------------------
imputer = SimpleImputer(strategy='median')
imputer.fit(X_train[num_cols])

X_train_imp = pd.DataFrame(imputer.transform(X_train[num_cols]), index=X_train.index, columns=num_cols)
X_val_imp   = pd.DataFrame(imputer.transform(X_val[num_cols]), index=X_val.index, columns=num_cols)
X_test_imp  = pd.DataFrame(imputer.transform(X_test[num_cols]), index=X_test.index, columns=num_cols)

# Save imputer for reproducibility / production
joblib.dump(imputer, IMPUTER_OUTPATH)
print(f"Imputer sauvegardé -> {IMPUTER_OUTPATH}")

# -----------------------
# GridSearchCV (TimeSeriesSplit) pour chercher les meilleurs hyperparams
# -----------------------
print("\nLancement GridSearchCV (TimeSeriesSplit) sur XGBoost...")

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE, n_jobs=-1)

tscv = TimeSeriesSplit(n_splits=TS_SPLITS)

# GridSearchCV sur les features imputées (pas de scaling nécessaire pour XGBoost)
gscv = GridSearchCV(
    estimator=xgb,
    param_grid=XGB_PARAM_GRID,
    scoring=GRID_SCORING,
    cv=tscv,
    n_jobs=-1,
    verbose=1,
    refit=True
)

gscv.fit(X_train_imp, y_train)
print("Meilleurs paramètres trouvés (GridSearchCV):")
print(gscv.best_params_)
print(f"Best CV {GRID_SCORING}: {gscv.best_score_:.4f}")

# -----------------------
# Ré-entraînement final avec early stopping sur l'échantillon de validation
# -----------------------
best_params = gscv.best_params_.copy()

# Conserver paramètres choisis et activer early stopping via eval_set
xgb_final = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE,
                          n_jobs=-1, **best_params)

print("\nRé-entrainement final avec early stopping sur validation (eval_set)...")
xgb_final.fit(
    X_train_imp, y_train,
    eval_set=[(X_val_imp, y_val)],
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    verbose=False
)

# Sauvegarde modèle
joblib.dump(xgb_final, MODEL_OUTPATH)
print(f"Modèle final sauvegardé -> {MODEL_OUTPATH}")

# -----------------------
# Prédiction out-of-sample (test) + évaluation
# -----------------------
print("\nÉvaluation out-of-sample (test)...")
probs_test = xgb_final.predict_proba(X_test_imp)[:, 1]
auc = roc_auc_score(y_test, probs_test)
gini = 2*auc - 1
fpr, tpr, thresholds = roc_curve(y_test, probs_test)

# Seuil optimal - Youden (TPR - FPR maximisé)
youden_idx = np.argmax(tpr - fpr)
opt_threshold_youden = thresholds[youden_idx]

# Seuil optimisant F1 (pour info)
f1_scores = [f1_score(y_test, (probs_test >= t).astype(int), zero_division=0) for t in thresholds]
opt_threshold_f1 = thresholds[np.argmax(f1_scores)]

# Choix du seuil final : tu peux choisir Youden ou F1 ; ici on utilise Youden tout en reportant F1-opt
threshold = opt_threshold_youden

preds = (probs_test >= threshold).astype(int)
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

precision = precision_score(y_test, preds, zero_division=0)
recall = recall_score(y_test, preds, zero_division=0)            # sensitivity
accuracy = accuracy_score(y_test, preds)
f1 = f1_score(y_test, preds, zero_division=0)
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
bal_acc = balanced_accuracy_score(y_test, preds)

# -----------------------
# Affichage résultats (out-of-sample uniquement)
# -----------------------
print(f"\nRésultats (test) - XGBoost")
print(f"AUC (ROC): {auc:.4f}")
print(f"Gini: {gini:.4f}")
print(f"Seuil Youden: {opt_threshold_youden:.4f} | Seuil F1-opt: {opt_threshold_f1:.4f}")
print("Matrice de confusion (tn, fp, fn, tp):", (int(tn), int(fp), int(fn), int(tp)))
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"F1: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {bal_acc:.4f}")

# -----------------------
# Tracer ROC (test)
# -----------------------
plt.figure(figsize=(7,6))
plt.plot(fpr, tpr, label=f"XGBoost (AUC={auc:.3f})")
plt.plot([0,1],[0,1], linestyle='--', alpha=0.6)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC - Out-of-sample (test)")
plt.legend()
plt.grid(True)
plt.show()

# -----------------------
# Tracer Matrice de confusion (test)
# -----------------------
cm = np.array([[tn, fp],[fn, tp]])
plt.figure(figsize=(4,3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title(f"Matrice de confusion (test) - seuil={threshold:.4f}")
plt.ylabel("Vraie classe")
plt.xlabel("Classe prédite")
plt.show()

# -----------------------
# Importance des features (optionnel, utile pour interprétation)
# -----------------------
try:
    imp = pd.Series(xgb_final.feature_importances_, index=num_cols).sort_values(ascending=False)
    print("\nTop 10 features par importance (XGBoost):")
    print(imp.head(10))
    plt.figure(figsize=(6,4))
    imp.head(15).plot(kind='bar')
    plt.title("Feature importances (XGBoost)")
    plt.tight_layout()
    plt.show()
except Exception:
    pass

# -----------------------
# Résumé final en dictionary (pratique pour reporting programmatique)
# -----------------------
report = {
    'auc': float(auc),
    'gini': float(gini),
    'threshold_youden': float(opt_threshold_youden),
    'threshold_f1': float(opt_threshold_f1),
    'threshold_used': float(threshold),
    'confusion': {'tn': int(tn), 'fp': int(fp), 'fn': int(fn), 'tp': int(tp)},
    'precision': float(precision),
    'recall': float(recall),
    'specificity': float(specificity),
    'f1': float(f1),
    'accuracy': float(accuracy),
    'balanced_accuracy': float(bal_acc),
    'best_params': best_params,
    'trained_at': datetime.utcnow().isoformat() + 'Z'
}

# Enregistrer le reporting si souhaité
pd.Series(report).to_json("xgb_report_test.json")
print("\nReport JSON sauvegardé -> xgb_report_test.json")
print("\nTerminé. Seuls les résultats out-of-sample (test) ont été affichés.")



✅ Variable cible correctement décalée de 2 semaines.
Shape finale : (954, 18), target positive rate = 0.059
Chargement des données...
Tailles -> train: (667, 18), val: (143, 18), test: (144, 18)


DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int32DType'>)