In [54]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
import plotly.graph_objects as go

# ✅ Fonction pour supprimer toutes les colonnes finissant par _Open, _Low, _High

In [55]:
def drop_ohl_columns(df):
    cols_to_drop = [col for col in df.columns
                    if col.endswith(("_Open", "_Low", "_High"))]
    return df.drop(columns=cols_to_drop)

In [56]:
df = pd.read_csv(
    "Clean_Data.csv",
    sep=",",
    index_col="Date",
)

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3386 entries, 2012-01-26 to 2025-12-10
Data columns (total 87 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   VIX_Open         3386 non-null   float64
 1   VIX_High         3386 non-null   float64
 2   VIX_Low          3386 non-null   float64
 3   VIX_Close        3386 non-null   float64
 4   DX-Y.NYB_Open    3386 non-null   float64
 5   DX-Y.NYB_High    3386 non-null   float64
 6   DX-Y.NYB_Low     3386 non-null   float64
 7   DX-Y.NYB_Close   3386 non-null   float64
 8   PA_F_Open        3386 non-null   float64
 9   PA_F_High        3386 non-null   float64
 10  PA_F_Low         3386 non-null   float64
 11  PA_F_Close       3386 non-null   float64
 12  BZ_F_Open        3386 non-null   float64
 13  BZ_F_High        3386 non-null   float64
 14  BZ_F_Low         3386 non-null   float64
 15  BZ_F_Close       3386 non-null   float64
 16  OVX_Open         3386 non-null   float64
 17  OVX_

In [58]:
df.head()

Unnamed: 0_level_0,VIX_Open,VIX_High,VIX_Low,VIX_Close,DX-Y.NYB_Open,DX-Y.NYB_High,DX-Y.NYB_Low,DX-Y.NYB_Close,PA_F_Open,PA_F_High,...,rolling_mean7,vol_7,vol_15,vol_30,day_num,month,year,momentum_7,momentum_15,momentum_30
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-01-26,17.959999,19.17,16.799999,18.57,79.489998,79.540001,79.059998,79.410004,693.900024,693.900024,...,1677.942854,0.010289,0.008987,0.009661,26,1,2012,0.042956,0.066012,0.096342
2012-01-27,19.16,19.16,18.26,18.530001,79.449997,79.540001,78.769997,78.900002,689.599976,689.599976,...,1688.271432,0.010258,0.008823,0.009467,27,1,2012,0.043567,0.071592,0.08536
2012-01-30,20.33,20.33,19.379999,19.4,78.900002,79.449997,78.769997,79.160004,688.099976,688.099976,...,1699.25715,0.009878,0.008515,0.009463,30,1,2012,0.046491,0.076827,0.085675
2012-01-31,19.07,19.84,18.129999,19.440001,79.099998,79.480003,78.739998,79.279999,685.950012,685.950012,...,1709.842878,0.009927,0.008087,0.009257,31,1,2012,0.044539,0.065481,0.075638
2012-02-01,18.68,18.82,17.99,18.549999,79.309998,79.57,78.620003,78.919998,696.299988,696.299988,...,1719.714303,0.009877,0.008089,0.009226,1,2,2012,0.04118,0.065825,0.083876


In [59]:
df = df.drop(columns=['close_diff1','rolling_mean7', 'vol_7', 'vol_15',
       'vol_30', 'day_num', 'month', 'year', 'momentum_7', 'momentum_15',
       'momentum_30', 'target_tomorrow'])

In [60]:
# --- 2. Lags du prix ---
df["GC_close_lag1"] = df["GC_F_Close"].shift(1)
df["GC_close_lag2"] = df["GC_F_Close"].shift(2)

# --- 3. Range intraday ---
df["GC_range"] = df["GC_F_High"] - df["GC_F_Low"]
df["GC_range_pct"] = (df["GC_F_High"] - df["GC_F_Low"]) / df["GC_F_Close"]

# --- 4. True Range (volatilité intraday robuste) ---
df["GC_true_range"] = np.maximum(df["GC_F_High"] - df["GC_F_Low"],
                                 np.maximum(abs(df["GC_F_High"] - df["GC_F_Close"].shift(1)),
                                            abs(df["GC_F_Low"] - df["GC_F_Close"].shift(1))))

# --- 5. Position du close dans la journée ---
df["GC_close_position"] = (df["GC_F_Close"] - df["GC_F_Low"]) / (df["GC_F_High"] - df["GC_F_Low"])

# --- 6. Intraday return (Open → Close) ---
df["GC_intraday_return"] = (df["GC_F_Close"] - df["GC_F_Open"]) / df["GC_F_Open"]

In [62]:

# ============================
# 1. LISTE DES COLONNES
# ============================

price_cols = [
    'VIX_Close', 'DX-Y.NYB_Close', 'PA_F_Close', 'BZ_F_Close', 'OVX_Close',
    'USO_Close', 'CL_F_Close', 'PL_F_Close', 'TNX_Close', 'GVZ_Close',
    'MOVE_Close', 'GC_F_Close', 'EGO_Close', 'SI_F_Close', 'DJI_Close',
    'GDX_Close', 'GSPC_Close', 'EURUSD_X_Close'
]

volume_cols = ['USO_Volume', 'EGO_Volume', 'GDX_Volume']


# ============================
# 2. RETURNS
# ============================

for col in price_cols:
    df[col + "_ret"] = df[col].pct_change()


# ============================
# 3. LAGS (mémoire temporelle)
# ============================

lags = [1, 2, 3, 5]

for col in price_cols:
    for lag in lags:
        df[f"{col}_ret_lag{lag}"] = df[col + "_ret"].shift(lag)


# ============================
# 4. ROLLING STATS (vol, momentum, skew)
# ============================

windows = [5, 20, 60]

for col in price_cols:
    ret = df[col + "_ret"]
    for w in windows:
        df[f"{col}_vol_{w}"] = ret.rolling(w).std()
        df[f"{col}_mom_{w}"] = ret.rolling(w).mean()
        df[f"{col}_skew_{w}"] = ret.rolling(w).skew()
        df[f"{col}_kurt_{w}"] = ret.rolling(w).kurt()


# ============================
# 5. SPREADS & RATIOS
# ============================

df["gold_silver"] = df["GC_F_Close"] / df["SI_F_Close"]
df["gold_oil"] = df["GC_F_Close"] / df["CL_F_Close"]
df["gold_dxy"] = df["GC_F_Close"] * df["DX-Y.NYB_Close"]
df["gold_spx"] = df["GC_F_Close"] / df["GSPC_Close"]
df["gold_vix"] = df["GC_F_Close"] / df["VIX_Close"]
df["silver_oil"] = df["SI_F_Close"] / df["CL_F_Close"]
df["usd_spread"] = df["DX-Y.NYB_Close"] / df["EURUSD_X_Close"]
df["gold_rate"] = df["GC_F_Close"] / df["TNX_Close"]
df["spx_vix"] = df["GSPC_Close"] * df["VIX_Close"]

spread_cols = [
    "gold_silver", "gold_oil", "gold_dxy", "gold_spx", "gold_vix",
    "silver_oil", "usd_spread", "gold_rate", "spx_vix"
]

# returns + lags + rolling sur spreads
for col in spread_cols:
    df[col + "_ret"] = df[col].pct_change()
    for lag in lags:
        df[f"{col}_ret_lag{lag}"] = df[col + "_ret"].shift(lag)
    for w in windows:
        df[f"{col}_vol_{w}"] = df[col + "_ret"].rolling(w).std()
        df[f"{col}_mom_{w}"] = df[col + "_ret"].rolling(w).mean()


# ============================
# 6. VOLUMES
# ============================

for col in volume_cols:
    df[col + "_ret"] = df[col].pct_change()
    df[col + "_zscore"] = (df[col] - df[col].rolling(20).mean()) / df[col].rolling(20).std()
    for w in windows:
        df[f"{col}_mom_{w}"] = df[col + "_ret"].rolling(w).mean()


# ============================
# 7. CROSS-ASSET INTERACTIONS
# ============================

df["gold_vix_interact"] = df["GC_F_Close_ret"] * df["VIX_Close_ret"]
df["gold_dxy_interact"] = df["GC_F_Close_ret"] * df["DX-Y.NYB_Close_ret"]
df["gold_oil_interact"] = df["GC_F_Close_ret"] * df["CL_F_Close_ret"]
df["gold_spx_interact"] = df["GC_F_Close_ret"] * df["GSPC_Close_ret"]


# ============================
# 8. FEATURES SPÉCIFIQUES AU GOLD
# ============================

df["gold_vol_20"] = df["GC_F_Close_ret"].rolling(20).std()
df["gold_mom_20"] = df["GC_F_Close_ret"].rolling(20).mean()
df["gold_skew_20"] = df["GC_F_Close_ret"].rolling(20).skew()
df["gold_kurt_20"] = df["GC_F_Close_ret"].rolling(20).kurt()

# Création de la target return pour contrer la problématique de prédiction de valeurs jamais vues dans le train

In [127]:
#df["target_return"] = df["GC_F_Close"].pct_change().shift(-1)

In [63]:
df["target_return"] = df["GC_F_Close"].pct_change(1).shift(-1)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [64]:
df = df.dropna()

In [None]:
"""df = df.drop(columns=[
    'VIX_Close', 'DX-Y.NYB_Close', 'PA_F_Close', 'BZ_F_Close', 'OVX_Close',
    'USO_Close', 'CL_F_Close', 'PL_F_Close', 'TNX_Close', 'GVZ_Close',
    'MOVE_Close', 'EGO_Close', 'SI_F_Close', 'DJI_Close',
    'GDX_Close', 'GSPC_Close', 'EURUSD_X_Close','USO_Volume', 'EGO_Volume', 'GDX_Volume'
])"""

In [65]:
df.columns

Index(['VIX_Open', 'VIX_High', 'VIX_Low', 'VIX_Close', 'DX-Y.NYB_Open',
       'DX-Y.NYB_High', 'DX-Y.NYB_Low', 'DX-Y.NYB_Close', 'PA_F_Open',
       'PA_F_High',
       ...
       'GDX_Volume_mom_60', 'gold_vix_interact', 'gold_dxy_interact',
       'gold_oil_interact', 'gold_spx_interact', 'gold_vol_20', 'gold_mom_20',
       'gold_skew_20', 'gold_kurt_20', 'target_return'],
      dtype='object', length=520)

In [66]:
df = drop_ohl_columns(df)

# Split chronologique

In [67]:
# 1. On récupère les index
n = len(df)
n

3164

In [68]:
train_end = int(n * 0.80)
val_end   = int(n * 0.95)

# 2. Split chronologique
train_df = df.iloc[:train_end]
val_df   = df.iloc[train_end:val_end]
test_df  = df.iloc[val_end:]


In [69]:
X_train = train_df.drop(columns=["target_return"])
y_train = train_df["target_return"]

X_val = val_df.drop(columns=["target_return"])
y_val = val_df["target_return"]

X_test = test_df.drop(columns=["target_return"])
y_test = test_df["target_return"]

# Instanciation du modèle

In [70]:
early_stop = EarlyStopping(rounds=50, save_best=True)

model = XGBRegressor(
    n_estimators=600,
    learning_rate=0.02,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,
    reg_lambda=2.0,
    min_child_weight=3,
    objective="reg:squarederror",
    tree_method="hist",
    callbacks = [early_stop]
)

model.fit(
    X_train, y_train,
    eval_set=[(X_train,y_train),(X_val, y_val)],
    verbose=True,
)


[0]	validation_0-rmse:0.01017	validation_1-rmse:0.00921
[1]	validation_0-rmse:0.01017	validation_1-rmse:0.00921
[2]	validation_0-rmse:0.01017	validation_1-rmse:0.00921
[3]	validation_0-rmse:0.01017	validation_1-rmse:0.00921
[4]	validation_0-rmse:0.01017	validation_1-rmse:0.00921
[5]	validation_0-rmse:0.01017	validation_1-rmse:0.00921
[6]	validation_0-rmse:0.01017	validation_1-rmse:0.00921
[7]	validation_0-rmse:0.01016	validation_1-rmse:0.00921
[8]	validation_0-rmse:0.01016	validation_1-rmse:0.00921
[9]	validation_0-rmse:0.01016	validation_1-rmse:0.00921
[10]	validation_0-rmse:0.01016	validation_1-rmse:0.00921
[11]	validation_0-rmse:0.01016	validation_1-rmse:0.00921
[12]	validation_0-rmse:0.01016	validation_1-rmse:0.00921
[13]	validation_0-rmse:0.01016	validation_1-rmse:0.00921
[14]	validation_0-rmse:0.01016	validation_1-rmse:0.00921
[15]	validation_0-rmse:0.01016	validation_1-rmse:0.00921
[16]	validation_0-rmse:0.01016	validation_1-rmse:0.00921
[17]	validation_0-rmse:0.01015	validation

In [71]:
y_pred_test = model.predict(X_test)

In [72]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("Test RMSE:", rmse_test)

Test RMSE: 0.013873410907153667


In [73]:
mae_test = mean_absolute_error(y_test, y_pred_test)
print("Test MAE:", mae_test)

Test MAE: 0.01025367046204808


# Suppressin des returns

In [74]:
pred_price = X_test["GC_F_Close"] * (1 + y_pred_test)
true_price = X_test["GC_F_Close"] * (1 + y_test)

In [75]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=true_price.index,
    y=true_price,
    mode='lines',
    name='True Price'
))

fig.add_trace(go.Scatter(
    x=pred_price.index,
    y=pred_price,
    mode='lines',
    name='Predicted Price'
))

fig.update_layout(
    title="True vs Predicted Gold Price (Test Set)",
    xaxis_title="Date",
    yaxis_title="Price",
    template="plotly_white"
)

fig.show()

In [76]:
import plotly.express as px

fig = px.scatter(
    x=y_test,
    y=y_pred_test,
    labels={'x': 'True Return', 'y': 'Predicted Return'},
    title="True vs Predicted Returns (Test Set)"
)
fig.show()

In [77]:
y_test.describe()

count    159.000000
mean       0.001624
std        0.013889
min       -0.057352
25%       -0.005446
50%        0.002223
75%        0.009142
max        0.040947
Name: target_return, dtype: float64

In [78]:
y_pred_test.mean(), y_pred_test.std()

(0.0001764357, 0.00021668516)

In [79]:
df.columns

Index(['VIX_Close', 'DX-Y.NYB_Close', 'PA_F_Close', 'BZ_F_Close', 'OVX_Close',
       'USO_Close', 'USO_Volume', 'CL_F_Close', 'PL_F_Close', 'TNX_Close',
       ...
       'GDX_Volume_mom_60', 'gold_vix_interact', 'gold_dxy_interact',
       'gold_oil_interact', 'gold_spx_interact', 'gold_vol_20', 'gold_mom_20',
       'gold_skew_20', 'gold_kurt_20', 'target_return'],
      dtype='object', length=466)

In [80]:
y_pred_test = pd.Series(y_pred_test, index=y_test.index)

In [81]:
y_test.corr(y_pred_test)

0.22646107970623763

In [82]:
residuals = y_test - y_pred_test
residuals

Date
2025-04-11   -0.005573
2025-04-14    0.003705
2025-04-15    0.033321
2025-04-16   -0.005714
2025-04-17    0.027540
                ...   
2025-12-03    0.002860
2025-12-04    0.000228
2025-12-05   -0.006217
2025-12-08    0.004503
2025-12-09   -0.002689
Length: 159, dtype: float64

In [83]:
direction_acc = (np.sign(y_pred_test) == np.sign(y_test)).mean()
direction_acc

0.5849056603773585

In [None]:
mean_absolute_error(true_price,pred_price)

37.11350791443653

In [87]:
pred_price

Date
2025-04-11    3222.755767
2025-04-14    3206.825635
2025-04-15    3219.348402
2025-04-16    3327.708487
2025-04-17    3309.677738
                 ...     
2025-12-03    4199.790889
2025-12-04    4211.938882
2025-12-05    4213.393079
2025-12-08    4187.843105
2025-12-09    4207.710171
Name: GC_F_Close, Length: 159, dtype: float64

In [88]:
true_price

Date
2025-04-11    3204.800049
2025-04-14    3218.699951
2025-04-15    3326.600098
2025-04-16    3308.699951
2025-04-17    3400.800049
                 ...     
2025-12-03    4211.799805
2025-12-04    4212.899902
2025-12-05    4187.200195
2025-12-08    4206.700195
2025-12-09    4196.399902
Length: 159, dtype: float64