In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import warnings

# -----------------------------------------------------
# 1. Càrrega i preprocesat inicial
# -----------------------------------------------------
BASE_PATH = r"C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock\Conjunt de dades Preprocessades\Datasets"
file_name = "Google_Stock_Price_output.csv"
file_path = os.path.join(BASE_PATH, file_name)

df = pd.read_csv(file_path)
df['Date'] = pd.to_datetime(df['Date'], dayfirst=False)
df.sort_values('Date', inplace=True)
df.reset_index(drop=True, inplace=True)

# -----------------------------------------------------
# 2. Crear target: retorn logarítmic diari

# -----------------------------------------------------
# 4. Compilar llista de features
# -----------------------------------------------------
features = [
    'Open', 'High', 'Low', 'Volume',
    'EMA_7', 'EMA_40', 'MACD', 'Signal_Line', 'MACD_Hist', 'RSI', 'ATR'
] 

target = 'Close'

# -----------------------------------------------------
# 5. Divisió: 80% inicial per a train+val, 20% final per a test
# -----------------------------------------------------
split_pct = 0.7
split_idx = int(len(df) * split_pct)

df_trainval = df.iloc[:split_idx].copy()
df_test     = df.iloc[split_idx:].copy()

X_trainval = df_trainval[features].values
y_trainval = df_trainval[target].values
X_test      = df_test[features].values
y_test      = df_test[target].values

dates_test = df_test['Date'].values  # Per al plot posterior

# -----------------------------------------------------
# 6. Walk‐forward validation amb TimeSeriesSplit (n_splits = 5)
# -----------------------------------------------------
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# -----------------------------------------------------
# 7. Cercar hiperparàmetres amb RandomizedSearchCV reduït
# -----------------------------------------------------
param_dist = {
    'n_estimators':       [100, 200, 300],
    'learning_rate':      [0.01, 0.05, 0.1],
    'max_depth':          [3, 4, 5],
    'subsample':          [0.6, 0.8],
    'colsample_bytree':   [0.6, 0.8],
    'reg_alpha':          [0, 0.1],
    'reg_lambda':         [1, 2],
    'gamma':              [0, 0.1],
    'booster':            ['gbtree'],  # Només gbtree per agilitzar
}

xgb_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    eval_metric='rmse'
)

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=100,  
    scoring='neg_root_mean_squared_error',
    cv=tscv,
    verbose=1,
    random_state=42,
    n_jobs=-1,
    refit=False
)

random_search.fit(X_trainval, y_trainval)

best_params = random_search.best_params_
best_score  = -random_search.best_score_
print("Millors hiperparàmetres trobats (CV):")
print(best_params)
print(f"RMSE mitjà a validació (CV): {best_score:.6f}")

# -----------------------------------------------------
# 8. Reentrenar tot df_trainval amb els millors paràmetres i early stopping
# -----------------------------------------------------
final_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    eval_metric='rmse',
    **best_params
)

# Reservem el 10% final de trainval com a validació interna
n_trainval = len(X_trainval)
val_split_idx = int(n_trainval * 0.9)

X_tr_full = X_trainval[:val_split_idx]
y_tr_full = y_trainval[:val_split_idx]
X_va_full = X_trainval[val_split_idx:]
y_va_full = y_trainval[val_split_idx:]

# Entrenem 
final_model.fit(
    X_tr_full,
    y_tr_full,
    eval_set=[(X_va_full, y_va_full)],
    verbose=False
)

# -----------------------------------------------------
# 9. Predir sobre el test (sense indicar ntree_limit)
# -----------------------------------------------------
y_test_pred = final_model.predict(X_test)
y_test_true = y_test

# Calcular mètriques sobre el test
rmse_test = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
mae_test  = mean_absolute_error(y_test_true, y_test_pred)
r2_test   = r2_score(y_test_true, y_test_pred)

print("\nResultats en test final (retorns):")
print(f"RMSE: {rmse_test:.6f}")
print(f"MAE : {mae_test:.6f}")
print(f"R²  : {r2_test:.6f}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Millors hiperparàmetres trobats (CV):
{'subsample': 0.8, 'reg_lambda': 2, 'reg_alpha': 0, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.8, 'booster': 'gbtree'}
RMSE mitjà a validació (CV): 8.556406

Resultats en test final (retorns):
RMSE: 20.544221
MAE : 14.015249
R²  : 0.025234
