In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# ========= 1) CHARGEMENT DES DONN√âES =========
# Adapter le chemin si besoin
data_path = "BXY_PRIX.csv"
df = pd.read_csv(data_path)

# On suppose qu'il y a une colonne 'datetime' et 'BXY close'
df["datetime"] = pd.to_datetime(df["datetime"])
df = df.sort_values("datetime").set_index("datetime")

# Cible
df["close"] = df["BXY close"]

# ========= 2) CR√âATION DES FEATURES DE LAGS =========
max_lag = 10
for lag in range(1, max_lag + 1):
    df[f"lag_{lag}"] = df["close"].shift(lag)

# On enl√®ve les lignes avec NaN (au d√©but)
data = df.dropna().copy()

feature_cols = [f"lag_{lag}" for lag in range(1, max_lag + 1)]
X = data[feature_cols].values
y = data["close"].values

# ========= 3) SPLIT TEMPOREL TRAIN / TEST =========
n = len(data)
n_train = int(n * 0.8)

X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

print(f"Nombre de points total : {n}")
print(f"Train : {n_train} | Test : {n - n_train}")

# ========= 4) FONCTION M√âTRIQUES =========
from sklearn.metrics import mean_squared_error, mean_absolute_error

def compute_metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return {"RMSE": rmse, "MAE": mae, "MAPE(%)": mape}

results = {}
models = {}

# ========= 5) RANDOM FOREST =========
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
results["RandomForest"] = compute_metrics(y_test, rf_pred)
models["RandomForest"] = rf

print("RandomForest OK")

# ========= 6) XGBOOST =========
try:
    from xgboost import XGBRegressor

    xgb = XGBRegressor(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        objective="reg:squarederror",
        n_jobs=-1,
        tree_method="hist"
    )
    xgb.fit(X_train, y_train)
    xgb_pred = xgb.predict(X_test)
    results["XGBoost"] = compute_metrics(y_test, xgb_pred)
    models["XGBoost"] = xgb
    print("XGBoost OK")
except Exception as e:
    print("‚ö†Ô∏è XGBoost non disponible :", e)

# ========= 7) LIGHTGBM =========
try:
    import lightgbm as lgb

    lgbm = lgb.LGBMRegressor(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    lgbm.fit(X_train, y_train)
    lgbm_pred = lgbm.predict(X_test)
    results["LightGBM"] = compute_metrics(y_test, lgbm_pred)
    models["LightGBM"] = lgbm
    print("LightGBM OK")
except Exception as e:
    print("‚ö†Ô∏è LightGBM non disponible :", e)

# ========= 8) SARIMAX (ARIMA) =========
from statsmodels.tsa.statespace.sarimax import SARIMAX

# On travaille directement sur la s√©rie de prix
series = data["close"]
train_series = series.iloc[:n_train]
test_series = series.iloc[n_train:]

sarimax_model = SARIMAX(
    train_series,
    order=(2, 1, 2),          # tu peux tester (1,1,1), (3,1,2), etc.
    seasonal_order=(0, 0, 0, 0),
    enforce_stationarity=False,
    enforce_invertibility=False
)
sarimax_res = sarimax_model.fit(disp=False, maxiter=80)
sarimax_forecast = sarimax_res.forecast(steps=len(test_series))
results["SARIMAX(2,1,2)"] = compute_metrics(test_series.values, sarimax_forecast.values)

print("SARIMAX OK")

# ========= 9) TABLEAU COMPARATIF =========
results_df = pd.DataFrame(results).T.sort_values("RMSE")
print("\n===== COMPARAISON DES MOD√àLES (test set) =====")
print(results_df)

best_model_name = results_df.index[0]
print(f"\n‚úÖ Meilleur mod√®le selon le RMSE : {best_model_name}")

# On ne sauvegarde que les mod√®les ML (RF / XGB / LGBM), pas SARIMAX
from joblib import dump
models_dir = Path(".")
best_model = models.get(best_model_name)

if best_model is not None:
    model_path = models_dir / f"best_model_BXY_{best_model_name}.pkl"
    dump(best_model, model_path)
    print(f"\nüìÅ Mod√®le sauvegard√© dans : {model_path}")
else:
    print("\n‚ö†Ô∏è Le meilleur mod√®le est SARIMAX : pour l‚Äôutiliser dans Streamlit, "
          "il faudra sauvegarder l‚Äôobjet 'sarimax_res' avec joblib ou pickle.")


Nombre de points total : 9066
Train : 7252 | Test : 1814
RandomForest OK
XGBoost OK
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 7252, number of used features: 10
[LightGBM] [Info] Start training from score 128.579300




LightGBM OK


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


SARIMAX OK

===== COMPARAISON DES MOD√àLES (test set) =====
                    RMSE       MAE   MAPE(%)
XGBoost         0.182071  0.120791  0.095196
LightGBM        0.185016  0.126214  0.099414
RandomForest    0.196054  0.136600  0.107639
SARIMAX(2,1,2)  2.228515  1.949484  1.529554

‚úÖ Meilleur mod√®le selon le RMSE : XGBoost

üìÅ Mod√®le sauvegard√© dans : best_model_BXY_XGBoost.pkl


  return get_prediction_index(
  return get_prediction_index(


In [10]:
import pandas as pd

events = pd.read_csv("gbp_events.csv")
print(events.shape)       # nombre de lignes / colonnes
print(events.columns)     # noms des colonnes
events.head(10)

(2386, 12)
Index(['id', 'datetime', 'currency', 'event', 'impact', 'actual',
       'actual_unit', 'forecast', 'forecast_unit', 'previous', 'previous_unit',
       'previous_revised'],
      dtype='object')


Unnamed: 0,id,datetime,currency,event,impact,actual,actual_unit,forecast,forecast_unit,previous,previous_unit,previous_revised
0,116736,2021-01-04 09:30:00+00:00,GBP,Final Manufacturing PMI,Medium Impact Expected,57.5,unit_none,57.3,unit_none,57.3,unit_none,False
1,119457,2021-01-04 09:30:00+00:00,GBP,M4 Money Supply m/m,Low Impact Expected,0.8,unit_percentage,0.4,unit_percentage,0.7,unit_percentage,True
2,119456,2021-01-04 09:30:00+00:00,GBP,Mortgage Approvals,Low Impact Expected,105.0,unit_thousand,82.0,unit_thousand,98.0,unit_thousand,False
3,119458,2021-01-04 09:30:00+00:00,GBP,Net Lending to Individuals m/m,Low Impact Expected,4.1,unit_billion,3.0,unit_billion,3.8,unit_billion,True
4,112372,2021-01-06 00:01:00+00:00,GBP,BRC Shop Price Index y/y,Low Impact Expected,-1.8,unit_percentage,,,-1.8,unit_percentage,False
5,116818,2021-01-06 09:30:00+00:00,GBP,Final Services PMI,Low Impact Expected,49.4,unit_none,49.9,unit_none,49.9,unit_none,False
6,119815,2021-01-06 10:05:00+00:00,GBP,10-y Bond Auction,Low Impact Expected,,unit_none,,,,unit_none,False
7,121384,2021-01-06 14:30:00+00:00,GBP,BOE Gov Bailey Speaks,High Impact Expected,,,,,,,
8,119817,2021-01-07 09:30:00+00:00,GBP,Construction PMI,Low Impact Expected,54.6,unit_none,54.6,unit_none,54.7,unit_none,False
9,116406,2021-01-07 09:32:00+00:00,GBP,Housing Equity Withdrawal q/q,Low Impact Expected,-7.0,unit_billion,-7.0,unit_billion,-9.0,unit_billion,True


In [3]:
import pandas as pd
import numpy as np

# ===============================
# 1) LOAD BXY PRICE DATA
# ===============================
bxy = pd.read_csv("BXY_PRIX.csv")
bxy["datetime"] = pd.to_datetime(bxy["datetime"])
bxy = bxy.sort_values("datetime").set_index("datetime")
bxy["close"] = bxy["BXY close"]

# Create 10 lags
for lag in range(1, 11):
    bxy[f"lag_{lag}"] = bxy["close"].shift(lag)


# ===============================
# 2) LOAD & TRANSFORM NEWS DATA
# ===============================
events = pd.read_csv("gbp_events.csv")

events["datetime"] = pd.to_datetime(events["datetime"])
events = events.sort_values("datetime").set_index("datetime")

# A) Encode impact level
impact_map = {
    "Low Impact Expected": 1,
    "Medium Impact Expected": 2,
    "High Impact Expected": 3
}
events["impact_level"] = events["impact"].map(impact_map).fillna(0)

# B) Surprise variable
events["surprise"] = (events["actual"] - events["forecast"]).fillna(0)

# C) Delta vs previous
events["delta_prev"] = (events["actual"] - events["previous"]).fillna(0)

# D) Event flag
events["has_event"] = 1

# E) Dummies for main event types
events["event_pmi"] = events["event"].str.contains("PMI", case=False).astype(int)
events["event_speech"] = events["event"].str.contains("Speaks", case=False).astype(int)
events["event_money"] = events["event"].str.contains("Money", case=False).astype(int)
events["event_lending"] = events["event"].str.contains("Lending", case=False).astype(int)

# F) Resample to hourly (same frequency as BXY)
events_hourly = events.resample("1H").agg({
    "has_event": "max",
    "impact_level": "max",
    "surprise": "sum",
    "delta_prev": "sum",
    "event_pmi": "max",
    "event_speech": "max",
    "event_money": "max",
    "event_lending": "max"
}).fillna(0)


# ===============================
# 3) MERGE NEWS WITH BXY
# ===============================
df = bxy.merge(events_hourly, how="left", left_index=True, right_index=True)
df = df.fillna(0)

# Drop rows with missing lags
df = df.dropna().copy()


# ===============================
# 4) PREPARE ML DATA
# ===============================
feature_cols = (
    [f"lag_{i}" for i in range(1, 11)] +
    ["has_event", "impact_level", "surprise", "delta_prev",
     "event_pmi", "event_speech", "event_money", "event_lending"]
)

X = df[feature_cols].values
y = df["close"].values

n = len(df)
n_train = int(0.8 * n)

X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]


# ===============================
# 5) TRAIN MODELS
# ===============================
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

def metrics(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    return {"RMSE": rmse, "MAE": mae, "MAPE(%)": mape}

results = {}
models = {}

# RandomForest
rf = RandomForestRegressor(n_estimators=300, random_state=42)
rf.fit(X_train, y_train)
results["RandomForest"] = metrics(y_test, rf.predict(X_test))

# XGBoost
from xgboost import XGBRegressor
xgb = XGBRegressor(
    n_estimators=300, learning_rate=0.05, max_depth=5,
    subsample=0.8, colsample_bytree=0.8,
    objective="reg:squarederror"
)
xgb.fit(X_train, y_train)
results["XGBoost"] = metrics(y_test, xgb.predict(X_test))

# LightGBM
import lightgbm as lgb
lgbm = lgb.LGBMRegressor(n_estimators=400, learning_rate=0.05)
lgbm.fit(X_train, y_train)
results["LightGBM"] = metrics(y_test, lgbm.predict(X_test))

# SARIMAX
from statsmodels.tsa.statespace.sarimax import SARIMAX
sarimax_model = SARIMAX(df["close"].iloc[:n_train], order=(2,1,2))
sarimax_res = sarimax_model.fit(disp=False)
sarimax_pred = sarimax_res.forecast(steps=len(y_test))
results["SARIMAX"] = metrics(y_test, sarimax_pred.values)


# ===============================
# 6) SHOW COMPARISON
# ===============================
results_df = pd.DataFrame(results).T.sort_values("RMSE")
print(results_df)


  events_hourly = events.resample("1H").agg({


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000507 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2572
[LightGBM] [Info] Number of data points in the train set: 7260, number of used features: 14
[LightGBM] [Info] Start training from score 128.590211


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                  RMSE       MAE   MAPE(%)
LightGBM      0.181272  0.122675  0.096643
XGBoost       0.183855  0.123713  0.097466
RandomForest  0.193317  0.134122  0.105718
SARIMAX       2.258873  1.980127  1.553656


  return get_prediction_index(
  return get_prediction_index(


In [4]:
import joblib

# Suppose que ton mod√®le final s‚Äôappelle lgbm
joblib.dump(lgbm, "best_model_BXY_LightGBM.pkl")

print("Mod√®le LightGBM enregistr√© !")


Mod√®le LightGBM enregistr√© !


In [9]:
import lightgbm as lgb
import pandas as pd
import matplotlib.pyplot as plt

importance = pd.DataFrame({
    'feature': feature_cols,
    'importance_gain': lgbm.booster_.feature_importance(importance_type='gain'),
    'importance_split': lgbm.booster_.feature_importance(importance_type='split')
}).sort_values("importance_gain", ascending=False)

print(importance)


          feature  importance_gain  importance_split
0           lag_1     4.921893e+06              2701
1           lag_2     3.069071e+04               996
4           lag_5     7.359943e+03               777
5           lag_6     5.046380e+03               837
2           lag_3     4.222075e+03               816
3           lag_4     3.993561e+03               870
8           lag_9     5.445664e+02              1149
9          lag_10     5.327727e+02              1368
6           lag_7     3.753314e+02              1019
7           lag_8     2.344648e+02              1007
13     delta_prev     1.358685e+01               286
15   event_speech     1.284881e+01                53
10      has_event     1.269411e+01                69
11   impact_level     1.106267e+01                52
12       surprise     0.000000e+00                 0
14      event_pmi     0.000000e+00                 0
16    event_money     0.000000e+00                 0
17  event_lending     0.000000e+00            