In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from collections import deque
import plotly.graph_objects as go

# --- Funció d'augmentació (jittering) ----------
def augment_features(X_orig, y_orig, n_copies=3, noise_level=0.01, random_state=42):
    """
    Crea còpies augmentades de X_orig i y_orig mitjançant jittering:
    - Afegeix soroll gaussià a cada feature contínua (totes les columnes de X).
    - Manté el target y sense canvis.
    """
    np.random.seed(random_state)
    cont_cols = X_orig.columns.tolist()

    X_list = [X_orig.copy()]
    y_list = [y_orig.copy()]

    for _ in range(n_copies):
        X_aug = X_orig.copy()
        stds = X_orig[cont_cols].std().values
        noise = np.random.normal(loc=0.0, scale=1.0, size=X_orig[cont_cols].shape)
        noise = noise * (noise_level * stds)
        X_aug.loc[:, cont_cols] = X_orig[cont_cols] + noise
        X_list.append(X_aug)
        y_list.append(y_orig.copy())

    X_all = pd.concat(X_list, ignore_index=True)
    y_all = pd.concat(y_list, ignore_index=True)
    return X_all, y_all

# Llista de fitxers a processar
datasets = [
    "Amazon_Stock_Price_output.csv",
    "Euro_Stoxx_50_Stock_Price_output.csv",
    "Google_Stock_Price_output.csv",
    "Hang_Seng_Stock_Price_output.csv",
    "IBEX_35_Stock_Price_output.csv",
    "Indra_Stock_Price_output.csv",
    "P&G_Stock_Price_output.csv",
    "S&P500_Stock_Price_output.csv"
]

# Rutes base
BASE_PATH = r"C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock\Conjunt de dades Preprocessades\Datasets"
BASE_RESULTS_FOLDER = r"C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock\RANDOM FOREST\resultats_RANDOM_FOREST"

# Paràmetres per RandomizedSearchCV
param_dist = {
    'n_estimators':      [int(x) for x in np.linspace(100, 500, num=5)],
    'max_depth':         [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf':  [1, 2, 4],
    'max_features':      ['auto', 'sqrt', 'log2']
}
tscv = TimeSeriesSplit(n_splits=5)

for file_name in datasets:
    # 1. Definir paths i noms
    file_path = os.path.join(BASE_PATH, file_name)
    dataset_name = os.path.splitext(file_name)[0]
    result_subfolder = os.path.join(BASE_RESULTS_FOLDER, dataset_name)
    os.makedirs(result_subfolder, exist_ok=True)

    # 2. Carregar dades
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values('Date').reset_index(drop=True)

    # 3. Enginyeria de característiques (lags de 'Close')
    for lag in [1, 2, 3, 5, 10]:
        df[f'Close_lag{lag}'] = df['Close'].shift(lag)

    # 4. Definir X i y
    target_col = "Close"
    feature_cols = [c for c in df.columns if c not in ['Date', target_col]]
    X = df[feature_cols].copy()
    y = df[target_col].copy()

    # 5. Split temporal 70% train_val / 30% test
    split_index = int(len(df) * 0.7)
    X_train_val = X.iloc[:split_index].reset_index(drop=True)
    y_train_val = y.iloc[:split_index].reset_index(drop=True)
    X_test = X.iloc[split_index:].reset_index(drop=True)
    y_test = y.iloc[split_index:].reset_index(drop=True)

    # 6. Augmentació (jittering)
    X_train_aug, y_train_aug = augment_features(
        X_train_val, y_train_val, n_copies=5, noise_level=0.01, random_state=42
    )

    # 7. Hyperparameter tuning amb RandomizedSearchCV
    rf = RandomForestRegressor(random_state=42)
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=50,
        cv=tscv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=0,
        random_state=42
    )
    random_search.fit(X_train_aug, y_train_aug)
    best_params = random_search.best_params_

    # 8. Entrenar model final sobre TOT train_val AUGMENTAT
    final_rf = RandomForestRegressor(**best_params, random_state=42)
    final_rf.fit(X_train_aug, y_train_aug)

    # 9. Avaluació sobre Test (sense augmentar)
    y_pred_test = final_rf.predict(X_test)
    rmse_test = mean_squared_error(y_test, y_pred_test)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    r2_test = r2_score(y_test, y_pred_test)

    # Guardar mètriques del model complet
    metrics_full = pd.DataFrame({
        "Model": ["RandomForest_Complet"],
        "RMSE_test": [rmse_test],
        "MAE_test": [mae_test],
        "R2_test": [r2_test]
    })
    metrics_full.to_csv(
        os.path.join(result_subfolder, f"{dataset_name}_full_metrics.csv"),
        index=False
    )

    # 10. Selecció de variables per importància
    importances = final_rf.feature_importances_
    feat_imp = pd.DataFrame({
        'feature': X_train_val.columns,
        'importance': importances
    }).sort_values('importance', ascending=False).reset_index(drop=True)
    llindar = 0.01
    features_sel = feat_imp[feat_imp['importance'] >= llindar]['feature'].tolist()

    # 11. Entrenar model reduït amb X_train_val_red
    X_train_val_red = X_train_val[features_sel]
    X_test_red = X_test[features_sel]
    final_rf_red = RandomForestRegressor(**best_params, random_state=42)
    final_rf_red.fit(X_train_val_red, y_train_val)
    y_pred_red = final_rf_red.predict(X_test_red)

    rmse_red = mean_squared_error(y_test, y_pred_red)
    mae_red = mean_absolute_error(y_test, y_pred_red)
    r2_red = r2_score(y_test, y_pred_red)

    # Guardar mètriques del model reduït
    metrics_red = pd.DataFrame({
        "Model": ["RandomForest_Reducit"],
        "RMSE_test": [rmse_red],
        "MAE_test": [mae_red],
        "R2_test": [r2_red]
    })
    metrics_red.to_csv(
        os.path.join(result_subfolder, f"{dataset_name}_reduced_metrics.csv"),
        index=False
    )

    # 12. Graficar Real vs Predicho (Test) model reduït
    dates_test = df['Date'].iloc[split_index:].reset_index(drop=True)
    y_true_red = y_test.reset_index(drop=True)
    y_pred_red_series = pd.Series(y_pred_red).reset_index(drop=True)

    fig_test = go.Figure()
    fig_test.add_trace(go.Scatter(
        x=dates_test,
        y=y_true_red,
        mode='lines',
        name='Real (Close)',
        line=dict(color='blue')
    ))
    fig_test.add_trace(go.Scatter(
        x=dates_test,
        y=y_pred_red_series,
        mode='lines',
        name='Predicho (RF reducido)',
        line=dict(color='red', dash='dash')
    ))
    fig_test.update_layout(
        title=f"{dataset_name} – Real vs Predicho (Test) [Model Reduït]",
        xaxis_title='Fecha',
        yaxis_title='Precio Close (USD)',
        template='plotly_dark',
        xaxis_rangeslider_visible=True
    )

    fig_test.write_html(
        os.path.join(result_subfolder, f"{dataset_name}_test_reduced_plot.html")
    )

    # 13. Predicció autoregresiva pròxims 10 dies
    lag_features = [f for f in features_sel if f.startswith("Close_lag")]
    if len(lag_features) == 0:
        print(f"No es poden generar prediccions futures per {dataset_name} (cap 'Close_lag')")
        continue

    # Reentrenar model amb només lag_features, si cal
    if len(lag_features) < len(features_sel):
        final_rf_lag = RandomForestRegressor(**best_params, random_state=42)
        final_rf_lag.fit(X_train_val[lag_features], y_train_val)
        modelo_para_futuro = final_rf_lag
    else:
        modelo_para_futuro = final_rf_red

    max_lag = max(int(f.split("Close_lag")[1]) for f in lag_features)
    last_closes = deque(df['Close'].iloc[-max_lag:].values, maxlen=max_lag)
    future_dates = pd.bdate_range(
        start=df['Date'].iloc[-1] + pd.Timedelta(days=1),
        periods=10
    )
    future_preds = []
    for fecha in future_dates:
        X_new = {}
        for f in lag_features:
            lag_num = int(f.split("Close_lag")[1])
            X_new[f] = last_closes[-lag_num]
        X_new_df = pd.DataFrame([X_new])
        y_pred_fut = modelo_para_futuro.predict(X_new_df)[0]
        future_preds.append(y_pred_fut)
        last_closes.append(y_pred_fut)

    df_fut_pred = pd.DataFrame({
        "Date": future_dates,
        "Predicted_Close": future_preds
    })
    df_fut_pred.to_csv(
        os.path.join(result_subfolder, f"{dataset_name}_future_10days_reduced.csv"),
        index=False
    )

    # 14. Graficar històric + prediccions futures
    fig_future = go.Figure()
    fig_future.add_trace(go.Scatter(
        x=df['Date'],
        y=df['Close'],
        mode='lines',
        name='Històric Close',
        line=dict(color='lightblue')
    ))
    fig_future.add_trace(go.Scatter(
        x=future_dates,
        y=np.array(future_preds),
        mode='lines+markers',
        name='Predicció futura (10 dies)',
        line=dict(color='orange', dash='dash'),
        marker=dict(size=6)
    ))
    fig_future.update_layout(
        title=f"{dataset_name} – Predicció Pròxims 10 Dies (Model Reduït)",
        xaxis_title='Fecha',
        yaxis_title='Precio Close (USD)',
        template='plotly_dark',
        xaxis_rangeslider_visible=True
    )

    fig_future.write_html(
        os.path.join(result_subfolder, f"{dataset_name}_future_reduced_plot.html")
    )

    print(f"Finalitzat: {dataset_name}. Resultats a {result_subfolder}\n")

print("Proces complet finalitzat per a tots els datasets.")


75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib

Finalitzat: Amazon_Stock_Price_output. Resultats a C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock\RANDOM FOREST\resultats_RANDOM_FOREST\Amazon_Stock_Price_output





75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
55 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\l

Finalitzat: Euro_Stoxx_50_Stock_Price_output. Resultats a C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock\RANDOM FOREST\resultats_RANDOM_FOREST\Euro_Stoxx_50_Stock_Price_output





75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\l

Finalitzat: Google_Stock_Price_output. Resultats a C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock\RANDOM FOREST\resultats_RANDOM_FOREST\Google_Stock_Price_output





75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\l

Finalitzat: Hang_Seng_Stock_Price_output. Resultats a C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock\RANDOM FOREST\resultats_RANDOM_FOREST\Hang_Seng_Stock_Price_output





75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\l

Finalitzat: IBEX_35_Stock_Price_output. Resultats a C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock\RANDOM FOREST\resultats_RANDOM_FOREST\IBEX_35_Stock_Price_output




Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[10288.71806311 10152.22332589  9999.76310388 10156.03100579
 10044.26102187 10446.14443565 10355.19734589 10401.05712867
 10701.74902786 10844.56132483 10822.89290478 10950.2282023
 11303.75765974 11089.03984355 11211.71690244 10986.09009675
 10962.23615189 10940.41523266 10934.93182008 10839.63314632
 10751.70242168 10445.51131146 10415.05318638 10571.71435903
 10736.67610655 10962.01545221 11087.22499597 10955.27151427
 11020.98088068 11278.02847771 11257.75485835 10931.79480617
 10985.80611074 10864.55851146 10796.61900793 10775.08597445
 10654.13538164 10158.40679961 10088.78084651  9607.96276585
  9644.10648523  9100.24447954  9953.98532928  9992.47839191
 10192.67887116 10145.40237324 10084.49369797  9162.87199075
  9378.99580706  9249.82514475  8041.46825923  7544.68712449
  7382.37585436  7933.80672637  7099.24465823  7464.3513764
  8627.06683218  7424.37705002  7432.4772466

Finalitzat: Indra_Stock_Price_output. Resultats a C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock\RANDOM FOREST\resultats_RANDOM_FOREST\Indra_Stock_Price_output





75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
26 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\l

Finalitzat: P&G_Stock_Price_output. Resultats a C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock\RANDOM FOREST\resultats_RANDOM_FOREST\P&G_Stock_Price_output





75 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
65 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\jesus\AppData\Local\Programs\Python\Python310\l

Finalitzat: S&P500_Stock_Price_output. Resultats a C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock\RANDOM FOREST\resultats_RANDOM_FOREST\S&P500_Stock_Price_output

Proces complet finalitzat per a tots els datasets.
