MODEL LSTM - GENERACIÓ DE PESOS

In [None]:
import os
import random
import json
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [None]:
# Preprocessed CSV files to run through the pipeline (one model per file)
DATASETS = [
    f"{ticker}_Stock_Price_output.csv"
    for ticker in (
        "Amazon",
        "Euro_Stoxx_50",
        "Google",
        "Hang_Seng",
        "IBEX_35",
        "Indra",
        "P&G",
        "S&P500",
    )
]

# Directory that holds the preprocessed CSV files
BASE_PATH = "Conjunt de dades Preprocessades/Datasets"

# Sequence construction and chronological split settings
N_STEPS = 30        # number of time steps in each LSTM input window
TEST_RATIO = 0.10   # fraction of the sequences held out for testing
VAL_RATIO = 0.10    # fraction of the sequences used for validation
TRAIN_RATIO = 1 - TEST_RATIO - VAL_RATIO  # whatever is left is used for training

# Random hyperparameter search settings
N_TRIALS = 20       # how many random hyperparameter combinations to try
EPOCHS = 50         # upper bound on training epochs per trial
BATCH_SIZE = 32     # mini-batch size used while fitting

# Model inputs (features) and the column to predict (target)
FEATURE_COLUMNS = (
    "Open High Low Volume "
    "EMA_7 EMA_40 MACD Signal_Line "
    "MACD_Hist RSI ATR"
).split()
TARGET_COLUMN = "Close"  # closing price is the regression target

# Funcions auxiliars

def create_sequences(X, y, n_steps=30):
    """
    Build sliding-window samples for sequence models.

    For every index ``i`` in ``range(n_steps, len(X))`` the sample is the
    window ``X[i - n_steps:i]`` (the ``n_steps`` rows preceding ``i``) and
    its label is ``y[i]``.

    Args:
        X: array-like of per-step inputs, indexed along the first axis.
        y: array-like of targets, aligned with ``X`` along the first axis.
        n_steps: window length; must be at least 1.

    Returns:
        Tuple ``(Xs, ys)`` of numpy arrays. ``Xs`` has shape
        ``(n_samples, n_steps, *X.shape[1:])`` and ``ys`` has shape
        ``(n_samples, *y.shape[1:])``. When ``X`` has ``n_steps`` rows or
        fewer, both arrays are empty but keep their trailing dimensions.

    Raises:
        ValueError: if ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError(f"n_steps must be >= 1, got {n_steps}")

    X = np.asarray(X)
    y = np.asarray(y)

    target_indices = range(n_steps, len(X))
    if not target_indices:
        # Not enough rows for a single window: return correctly shaped empty
        # arrays so downstream slicing/reshaping still sees the right rank.
        empty_X = np.empty((0, n_steps) + X.shape[1:], dtype=X.dtype)
        empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_X, empty_y

    Xs = np.array([X[i - n_steps:i] for i in target_indices])
    ys = np.array([y[i] for i in target_indices])
    return Xs, ys


def build_lstm_model(sequence_length, n_features, units, n_layers, dropout, learning_rate):
    """
    Build and compile a stacked-LSTM regression model.

    The network is ``n_layers`` LSTM layers of ``units`` cells each, every
    one followed by a Dropout layer, and a final ``Dense(1)`` with linear
    activation. All LSTM layers except the last return full sequences so
    they can feed the next LSTM layer.

    Args:
        sequence_length: number of time steps in each input window.
        n_features: number of features per time step.
        units: number of cells in every LSTM layer.
        n_layers: number of stacked LSTM layers; must be at least 1.
        dropout: dropout rate applied after each LSTM layer.
        learning_rate: learning rate passed to the Adam optimizer.

    Returns:
        A compiled Keras ``Sequential`` model (Huber loss, Adam optimizer,
        mean absolute error as the tracked metric).

    Raises:
        ValueError: if ``n_layers`` is smaller than 1.
    """
    if n_layers < 1:
        raise ValueError(f"n_layers must be >= 1, got {n_layers}")

    model = Sequential()
    # Declare the input with an explicit Input object instead of passing
    # `input_shape=` to the first LSTM; the latter is deprecated in recent
    # Keras versions and emits a UserWarning on every model construction.
    model.add(tf.keras.Input(shape=(sequence_length, n_features)))

    for layer_idx in range(n_layers):
        is_last_lstm = layer_idx == n_layers - 1
        # Intermediate layers must emit the whole sequence for the next LSTM;
        # the last one emits only its final state for the Dense head.
        model.add(LSTM(units, return_sequences=not is_last_lstm))
        model.add(Dropout(dropout))  # regularization against overfitting

    # Linear output unit for regression
    model.add(Dense(1, activation="linear"))

    # Compile with Huber loss and Adam
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss="huber", optimizer=optimizer, metrics=["mean_absolute_error"])
    return model

def compute_metrics(model, X_scaled, y_scaled, scaler_y):
    """
    Compute MAE, RMSE and R2 for a model, expressed in the target's original units.

    The model predicts on the scaled inputs; predictions and scaled targets
    are then both passed through ``scaler_y.inverse_transform`` before the
    metrics are calculated.

    Returns:
        Tuple ``(mae, rmse, r2)``.
    """
    # Predict in scaled space, then undo the target scaling on both sides
    predicted = scaler_y.inverse_transform(model.predict(X_scaled, verbose=0))
    actual = scaler_y.inverse_transform(y_scaled)

    return (
        mean_absolute_error(actual, predicted),
        np.sqrt(mean_squared_error(actual, predicted)),
        r2_score(actual, predicted),
    )

# Procés principal per cada dataset 

def process_dataset(filename):
    """
    Run the full LSTM pipeline for one preprocessed CSV file.

    Steps:
      1. Load ``BASE_PATH/filename``, sort it by ``Date`` and drop rows with
         NaN in any feature or in the target column.
      2. Build sliding windows of ``N_STEPS`` rows and split them
         chronologically into train / validation / test.
      3. Fit MinMax scalers on the training split only and apply them to
         all three splits.
      4. Run ``N_TRIALS`` random hyperparameter trials and keep the one with
         the lowest validation loss.
      5. Write the best hyperparameters (JSON), the test metrics (CSV) and
         the best model's weights under ``resultats_LSTM/<dataset_name>/``.

    Args:
        filename: name of the CSV file inside ``BASE_PATH``.

    Raises:
        ValueError: if the dataset is too short to produce non-empty
            train, validation and test splits.
        RuntimeError: if no trial produced a finite validation loss.
    """
    dataset_name = os.path.splitext(filename)[0]  # file name without extension
    print(f"\n=== Procesando dataset: {dataset_name} ===")

    # Load the CSV and put it in chronological order
    path = os.path.join(BASE_PATH, filename)
    df = pd.read_csv(path)
    df['Date'] = pd.to_datetime(df['Date'])  # make sure the column is datetime
    df.sort_values("Date", inplace=True)

    # Drop rows with NaN in any feature or in the target
    df.dropna(subset=FEATURE_COLUMNS + [TARGET_COLUMN], inplace=True)

    # Build the input windows and the aligned target vector
    data_X = df[FEATURE_COLUMNS].values
    data_y = df[TARGET_COLUMN].values.reshape(-1, 1)
    X_seq, y_seq = create_sequences(data_X, data_y, n_steps=N_STEPS)

    # Chronological split into train, validation and test
    n_total = len(X_seq)
    train_end = int(n_total * TRAIN_RATIO)
    val_end = train_end + int(n_total * VAL_RATIO)

    # Fail early with a clear message instead of an obscure scaler/fit error
    # when any of the three splits would be empty.
    if train_end == 0 or val_end == train_end or val_end >= n_total:
        raise ValueError(
            f"Dataset '{dataset_name}' yields only {n_total} sequences of "
            f"length {N_STEPS}; not enough for non-empty train/validation/test splits."
        )

    X_train, y_train = X_seq[:train_end], y_seq[:train_end]
    X_val, y_val     = X_seq[train_end:val_end], y_seq[train_end:val_end]
    X_test, y_test   = X_seq[val_end:], y_seq[val_end:]

    # Scale X with a MinMaxScaler fitted on the training split only.
    # Windows are flattened to 2-D for the scaler and reshaped back afterwards.
    n_features = len(FEATURE_COLUMNS)
    scaler_X = MinMaxScaler()
    X_train_scaled = scaler_X.fit_transform(X_train.reshape(-1, n_features)).reshape(X_train.shape)
    X_val_scaled   = scaler_X.transform(X_val.reshape(-1, n_features)).reshape(X_val.shape)
    X_test_scaled  = scaler_X.transform(X_test.reshape(-1, n_features)).reshape(X_test.shape)

    # Scale y with its own MinMaxScaler, also fitted on the training split only
    scaler_y = MinMaxScaler()
    y_train_scaled = scaler_y.fit_transform(y_train)
    y_val_scaled   = scaler_y.transform(y_val)
    y_test_scaled  = scaler_y.transform(y_test)

    # Random hyperparameter search: keep the trial with the lowest val_loss.
    # Only a snapshot of the winning weights is kept (not the model object),
    # so Keras state can be cleared safely between trials.
    best_val_loss = np.inf
    best_params = None
    best_weights = None

    for trial in range(N_TRIALS):
        # Many models are created in this loop; reset Keras' global state so
        # it does not keep growing from one trial to the next.
        tf.keras.backend.clear_session()

        # Random draw of hyperparameters
        n_layers = random.choice([1, 2, 3])
        units    = random.choice([64, 128, 256, 512])
        dropout  = random.choice([0.1, 0.3, 0.5])
        lr       = random.choice([1e-5, 1e-4, 1e-3])

        # Build and train the model
        model = build_lstm_model(N_STEPS, n_features, units, n_layers, dropout, lr)
        callbacks = [
            EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True),
            ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6)
        ]
        history = model.fit(
            X_train_scaled, y_train_scaled,
            validation_data=(X_val_scaled, y_val_scaled),
            epochs=EPOCHS, batch_size=BATCH_SIZE,
            callbacks=callbacks, verbose=0
        )

        # Check whether this trial beats the best one so far
        current_val_loss = min(history.history["val_loss"])
        if current_val_loss < best_val_loss:
            best_val_loss = current_val_loss
            best_params   = (n_layers, units, dropout, lr)
            best_weights  = model.get_weights()

        print(f" Trial {trial+1}/{N_TRIALS} - val_loss: {current_val_loss:.6f}  (layers={n_layers}, units={units}, dropout={dropout}, lr={lr})")

        # Release the trial model before the next one is built
        del model

    if best_params is None:
        # Happens when N_TRIALS is 0 or every trial ended with a non-finite loss
        raise RuntimeError(
            f"No trial produced a finite validation loss for dataset '{dataset_name}'."
        )

    # Summary of the best configuration found
    print(f"→ Millor val_loss: {best_val_loss:.6f} amb params: layers={best_params[0]}, units={best_params[1]}, dropout={best_params[2]}, lr={best_params[3]}")

    # Rebuild the winning architecture and load the snapshot of its weights
    tf.keras.backend.clear_session()
    best_model = build_lstm_model(
        N_STEPS, n_features,
        best_params[1], best_params[0], best_params[2], best_params[3]
    )
    best_model.set_weights(best_weights)

    # Save the hyperparameters as JSON for later analysis
    params_folder = os.path.join("resultats_LSTM", dataset_name)
    os.makedirs(params_folder, exist_ok=True)
    params_path = os.path.join(params_folder, f"{dataset_name}_best_params.json")
    with open(params_path, "w", encoding="utf-8") as fp:
        json.dump({
            "n_layers": best_params[0],
            "units": best_params[1],
            "dropout": best_params[2],
            "learning_rate": best_params[3]
        }, fp)
    print(f" Paràmetres guardats a: {params_path}")

    # Evaluate the best model on the held-out test split
    mae_test, rmse_test, r2_test = compute_metrics(best_model, X_test_scaled, y_test_scaled, scaler_y)

    # Store the test metrics as a one-row CSV
    df_metrics = pd.DataFrame({
        "Dataset": [dataset_name],
        "MAE": [mae_test],
        "RMSE": [rmse_test],
        "R2": [r2_test]
    })
    metrics_path = os.path.join(params_folder, f"{dataset_name}_metrics_test.csv")
    df_metrics.to_csv(metrics_path, index=False)
    print(f" Métriques de test guardades a: {metrics_path}")

    # Save the trained weights for later reuse
    weights_path = os.path.join(params_folder, f"{dataset_name}_best_weights.weights.h5")
    best_model.save_weights(weights_path)
    print(f" Pesos guardats a: {weights_path}")


if __name__ == "__main__":
    # Make sure the top-level LSTM results directory is present
    os.makedirs("resultats_LSTM", exist_ok=True)
    # Run the pipeline once for every file listed in DATASETS
    for dataset_file in DATASETS:
        process_dataset(dataset_file)



=== Procesando dataset: Amazon_Stock_Price_output ===


  super().__init__(**kwargs)


 Trial 1/20 - val_loss: 0.002221  (layers=2, units=256, dropout=0.5, lr=1e-05)
 Trial 2/20 - val_loss: 0.001122  (layers=2, units=256, dropout=0.1, lr=0.0001)
 Trial 3/20 - val_loss: 0.016056  (layers=1, units=64, dropout=0.5, lr=1e-05)
 Trial 4/20 - val_loss: 0.000947  (layers=3, units=128, dropout=0.5, lr=0.001)
 Trial 5/20 - val_loss: 0.001289  (layers=1, units=256, dropout=0.3, lr=1e-05)
 Trial 6/20 - val_loss: 0.000454  (layers=1, units=512, dropout=0.1, lr=0.001)
 Trial 7/20 - val_loss: 0.001403  (layers=2, units=64, dropout=0.5, lr=0.001)
 Trial 8/20 - val_loss: 0.004922  (layers=2, units=64, dropout=0.1, lr=1e-05)
 Trial 9/20 - val_loss: 0.001414  (layers=2, units=512, dropout=0.1, lr=1e-05)
 Trial 10/20 - val_loss: 0.001382  (layers=3, units=64, dropout=0.3, lr=0.0001)
 Trial 11/20 - val_loss: 0.005753  (layers=2, units=64, dropout=0.1, lr=1e-05)
 Trial 12/20 - val_loss: 0.001435  (layers=2, units=64, dropout=0.5, lr=0.0001)
 Trial 13/20 - val_loss: 0.000606  (layers=1, units=

  super().__init__(**kwargs)


 Trial 1/20 - val_loss: 0.000267  (layers=1, units=512, dropout=0.1, lr=0.0001)
 Trial 2/20 - val_loss: 0.006269  (layers=3, units=64, dropout=0.5, lr=1e-05)
 Trial 3/20 - val_loss: 0.004667  (layers=2, units=64, dropout=0.5, lr=0.0001)
 Trial 4/20 - val_loss: 0.000360  (layers=3, units=64, dropout=0.1, lr=0.0001)
 Trial 5/20 - val_loss: 0.001136  (layers=1, units=256, dropout=0.3, lr=0.0001)
 Trial 6/20 - val_loss: 0.000278  (layers=1, units=512, dropout=0.5, lr=0.001)
 Trial 7/20 - val_loss: 0.000405  (layers=3, units=128, dropout=0.1, lr=0.0001)
 Trial 8/20 - val_loss: 0.000459  (layers=3, units=512, dropout=0.5, lr=0.0001)
 Trial 9/20 - val_loss: 0.001376  (layers=3, units=64, dropout=0.5, lr=0.0001)
 Trial 10/20 - val_loss: 0.000570  (layers=2, units=64, dropout=0.3, lr=0.0001)
 Trial 11/20 - val_loss: 0.000579  (layers=1, units=512, dropout=0.3, lr=1e-05)
 Trial 12/20 - val_loss: 0.001231  (layers=3, units=256, dropout=0.3, lr=1e-05)
 Trial 13/20 - val_loss: 0.000203  (layers=1, 

  super().__init__(**kwargs)


 Trial 1/20 - val_loss: 0.003044  (layers=1, units=64, dropout=0.1, lr=1e-05)
 Trial 2/20 - val_loss: 0.000915  (layers=2, units=512, dropout=0.1, lr=0.001)
 Trial 3/20 - val_loss: 0.006500  (layers=3, units=64, dropout=0.3, lr=1e-05)
 Trial 4/20 - val_loss: 0.002689  (layers=3, units=512, dropout=0.3, lr=1e-05)
 Trial 5/20 - val_loss: 0.002668  (layers=3, units=256, dropout=0.5, lr=0.0001)
 Trial 6/20 - val_loss: 0.008131  (layers=2, units=64, dropout=0.5, lr=1e-05)
 Trial 7/20 - val_loss: 0.001443  (layers=1, units=512, dropout=0.3, lr=1e-05)
 Trial 8/20 - val_loss: 0.001247  (layers=2, units=512, dropout=0.5, lr=0.001)
 Trial 9/20 - val_loss: 0.010568  (layers=3, units=64, dropout=0.5, lr=1e-05)
 Trial 10/20 - val_loss: 0.000867  (layers=1, units=512, dropout=0.5, lr=0.0001)
 Trial 11/20 - val_loss: 0.004847  (layers=2, units=64, dropout=0.1, lr=0.0001)
 Trial 12/20 - val_loss: 0.002670  (layers=3, units=256, dropout=0.1, lr=0.0001)
 Trial 13/20 - val_loss: 0.001727  (layers=2, unit

  super().__init__(**kwargs)


 Trial 1/20 - val_loss: 0.002066  (layers=2, units=256, dropout=0.5, lr=1e-05)
 Trial 2/20 - val_loss: 0.000269  (layers=1, units=128, dropout=0.1, lr=0.0001)
 Trial 3/20 - val_loss: 0.000222  (layers=1, units=256, dropout=0.5, lr=0.0001)
 Trial 4/20 - val_loss: 0.000175  (layers=1, units=256, dropout=0.1, lr=0.001)
 Trial 5/20 - val_loss: 0.000383  (layers=1, units=64, dropout=0.3, lr=0.0001)
 Trial 6/20 - val_loss: 0.002330  (layers=2, units=128, dropout=0.1, lr=1e-05)
 Trial 7/20 - val_loss: 0.000521  (layers=3, units=128, dropout=0.3, lr=0.0001)
 Trial 8/20 - val_loss: 0.000821  (layers=3, units=512, dropout=0.1, lr=1e-05)
 Trial 9/20 - val_loss: 0.000285  (layers=3, units=512, dropout=0.3, lr=0.001)
 Trial 10/20 - val_loss: 0.000955  (layers=2, units=128, dropout=0.5, lr=1e-05)
 Trial 11/20 - val_loss: 0.000468  (layers=3, units=64, dropout=0.3, lr=0.001)
 Trial 12/20 - val_loss: 0.001084  (layers=1, units=512, dropout=0.3, lr=1e-05)
 Trial 13/20 - val_loss: 0.000688  (layers=3, u

  super().__init__(**kwargs)


 Trial 1/20 - val_loss: 0.018995  (layers=2, units=64, dropout=0.1, lr=1e-05)
 Trial 2/20 - val_loss: 0.010921  (layers=1, units=64, dropout=0.5, lr=0.0001)
 Trial 3/20 - val_loss: 0.005927  (layers=2, units=256, dropout=0.3, lr=1e-05)
 Trial 4/20 - val_loss: 0.004911  (layers=1, units=256, dropout=0.3, lr=1e-05)
 Trial 5/20 - val_loss: 0.006583  (layers=2, units=64, dropout=0.5, lr=0.001)
 Trial 6/20 - val_loss: 0.005502  (layers=3, units=256, dropout=0.3, lr=1e-05)
 Trial 7/20 - val_loss: 0.002501  (layers=2, units=512, dropout=0.3, lr=0.001)
 Trial 8/20 - val_loss: 0.010455  (layers=2, units=64, dropout=0.1, lr=1e-05)
 Trial 9/20 - val_loss: 0.003028  (layers=1, units=256, dropout=0.3, lr=1e-05)
 Trial 10/20 - val_loss: 0.002413  (layers=3, units=64, dropout=0.1, lr=0.001)
 Trial 11/20 - val_loss: 0.013358  (layers=3, units=128, dropout=0.3, lr=1e-05)
 Trial 12/20 - val_loss: 0.006358  (layers=2, units=256, dropout=0.5, lr=1e-05)
 Trial 13/20 - val_loss: 0.000802  (layers=2, units=1

  super().__init__(**kwargs)


 Trial 1/20 - val_loss: 0.001103  (layers=3, units=128, dropout=0.1, lr=0.001)
 Trial 2/20 - val_loss: 0.002013  (layers=2, units=256, dropout=0.5, lr=0.0001)
 Trial 3/20 - val_loss: 0.001402  (layers=3, units=512, dropout=0.1, lr=0.0001)
 Trial 4/20 - val_loss: 0.002421  (layers=3, units=512, dropout=0.3, lr=1e-05)
 Trial 5/20 - val_loss: 0.000905  (layers=1, units=512, dropout=0.5, lr=0.001)
 Trial 6/20 - val_loss: 0.006336  (layers=3, units=64, dropout=0.1, lr=0.0001)
 Trial 7/20 - val_loss: 0.001598  (layers=2, units=256, dropout=0.1, lr=0.0001)
 Trial 8/20 - val_loss: 0.002482  (layers=2, units=64, dropout=0.1, lr=0.001)
 Trial 9/20 - val_loss: 0.001802  (layers=2, units=256, dropout=0.1, lr=0.001)
 Trial 10/20 - val_loss: 0.001801  (layers=3, units=512, dropout=0.1, lr=0.001)
 Trial 11/20 - val_loss: 0.000977  (layers=2, units=256, dropout=0.3, lr=0.001)
 Trial 12/20 - val_loss: 0.000930  (layers=1, units=256, dropout=0.3, lr=0.0001)
 Trial 13/20 - val_loss: 0.002322  (layers=1, 

  super().__init__(**kwargs)


 Trial 1/20 - val_loss: 0.000618  (layers=1, units=128, dropout=0.5, lr=0.001)
 Trial 2/20 - val_loss: 0.003561  (layers=3, units=256, dropout=0.3, lr=1e-05)
 Trial 3/20 - val_loss: 0.004473  (layers=3, units=128, dropout=0.5, lr=1e-05)
 Trial 4/20 - val_loss: 0.000463  (layers=1, units=128, dropout=0.3, lr=0.001)
 Trial 5/20 - val_loss: 0.001104  (layers=3, units=512, dropout=0.1, lr=0.0001)
 Trial 6/20 - val_loss: 0.001167  (layers=2, units=256, dropout=0.1, lr=0.0001)
 Trial 7/20 - val_loss: 0.000649  (layers=2, units=128, dropout=0.3, lr=0.001)
 Trial 8/20 - val_loss: 0.000651  (layers=2, units=64, dropout=0.3, lr=0.001)
 Trial 9/20 - val_loss: 0.000677  (layers=3, units=256, dropout=0.1, lr=0.001)
 Trial 10/20 - val_loss: 0.000759  (layers=3, units=128, dropout=0.1, lr=0.001)
 Trial 11/20 - val_loss: 0.003510  (layers=1, units=128, dropout=0.5, lr=0.0001)
 Trial 12/20 - val_loss: 0.000757  (layers=2, units=128, dropout=0.3, lr=0.0001)
 Trial 13/20 - val_loss: 0.006462  (layers=2, 

  super().__init__(**kwargs)


 Trial 1/20 - val_loss: 0.001440  (layers=1, units=128, dropout=0.1, lr=0.0001)
 Trial 2/20 - val_loss: 0.003107  (layers=3, units=64, dropout=0.5, lr=0.0001)
 Trial 3/20 - val_loss: 0.000633  (layers=1, units=128, dropout=0.1, lr=0.0001)
 Trial 4/20 - val_loss: 0.000409  (layers=1, units=64, dropout=0.1, lr=0.0001)
 Trial 5/20 - val_loss: 0.000722  (layers=3, units=128, dropout=0.5, lr=0.0001)
 Trial 6/20 - val_loss: 0.000457  (layers=1, units=256, dropout=0.3, lr=0.001)
 Trial 7/20 - val_loss: 0.001985  (layers=3, units=64, dropout=0.3, lr=0.001)
 Trial 8/20 - val_loss: 0.000395  (layers=2, units=256, dropout=0.5, lr=0.001)
 Trial 9/20 - val_loss: 0.008657  (layers=1, units=64, dropout=0.5, lr=1e-05)
 Trial 10/20 - val_loss: 0.001508  (layers=3, units=256, dropout=0.3, lr=1e-05)
 Trial 11/20 - val_loss: 0.001956  (layers=3, units=256, dropout=0.3, lr=1e-05)
 Trial 12/20 - val_loss: 0.000219  (layers=2, units=128, dropout=0.1, lr=0.001)
 Trial 13/20 - val_loss: 0.000926  (layers=1, un