In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import plotly.graph_objects as go

# -----------------------------------
# Funció per recalcular indicadors tècnics
# -----------------------------------
def recompute_indicators(df):
    """
    Recompute the technical-indicator columns of ``df`` in place.

    Writes EMA_7, EMA_40, MACD, Signal_Line, MACD_Hist and RSI from the
    'Close' column, and ATR from 'High', 'Low' and the previous 'Close'.
    Existing columns with those names are overwritten. RSI and ATR use a
    14-row simple rolling mean that requires a full window, so their first
    13 rows are NaN. Nothing is returned.
    """
    close = df['Close']
    prev_close = close.shift(1)

    def ema(series, span):
        # Recursive (adjust=False) exponential moving average.
        return series.ewm(span=span, adjust=False).mean()

    # Short and long exponential moving averages of the close
    df['EMA_7'] = ema(close, 7)
    df['EMA_40'] = ema(close, 40)

    # MACD = EMA(12) - EMA(26); signal line = EMA(9) of MACD; histogram = difference
    macd = ema(close, 12) - ema(close, 26)
    signal = ema(macd, 9)
    df['MACD'] = macd
    df['Signal_Line'] = signal
    df['MACD_Hist'] = macd - signal

    # RSI over 14 rows. The first diff is NaN and both `where` calls map it
    # to 0.0, so row 0 still counts towards the first full window.
    change = close.diff()
    ups = change.where(change > 0, 0.0)
    downs = -change.where(change < 0, 0.0)
    rolling_args = dict(window=14, min_periods=14)
    mean_up = ups.rolling(**rolling_args).mean()
    mean_down = downs.rolling(**rolling_args).mean()
    df['RSI'] = 100 - (100 / (1 + mean_up / mean_down))

    # ATR over 14 rows: true range is the row-wise maximum of the three
    # candidate ranges (the max skips the NaN produced by the shift on row 0).
    range_candidates = pd.concat(
        [
            df['High'] - df['Low'],
            (df['High'] - prev_close).abs(),
            (df['Low'] - prev_close).abs(),
        ],
        axis=1,
    )
    df['ATR'] = range_candidates.max(axis=1).rolling(**rolling_args).mean()


# -----------------------------------
# Llista de fitxers CSV a processar
# -----------------------------------
# Every input file is named "<dataset>_Stock_Price_output.csv".
csv_files = [
    f"{stem}_Stock_Price_output.csv"
    for stem in (
        "Amazon",
        "Euro_Stoxx_50",
        "Google",
        "Hang_Seng",
        "IBEX_35",
        "Indra",
        "P&G",
        "S&P500",
    )
]

# Directory that holds the input CSV files
BASE_PATH = (
    r"C:\Users\jesus\Desktop\TFG\GitHUb\TFG_PredictStock"
    r"\Conjunt de dades Preprocessades\Datasets"
)

# Root output directory (created on first run, reused afterwards)
RESULTS_BASE = "resultats_SVR"
os.makedirs(RESULTS_BASE, exist_ok=True)

# Model inputs: raw price/volume columns followed by the technical indicators
FEATURE_COLUMNS = ['Open', 'High', 'Low', 'Volume'] + [
    'EMA_7', 'EMA_40',
    'MACD', 'Signal_Line', 'MACD_Hist',
    'RSI', 'ATR',
]
# Column the model is trained to predict
TARGET_COLUMN = 'Close'


# Per-dataset pipeline: load the CSV, rebuild the indicators, split the rows
# chronologically, tune an RBF SVR, evaluate it on validation and test, and
# export metrics, test predictions, plots and a 10-business-day forecast.
for file_name in csv_files:
    # 1) Build the paths and names for this dataset
    file_path = os.path.join(BASE_PATH, file_name)
    name_no_ext = os.path.splitext(file_name)[0]                # e.g. "Amazon_Stock_Price_output"
    dataset_name = name_no_ext.replace("_Stock_Price_output", "").lower()  # e.g. "amazon"
    model_folder = os.path.join(RESULTS_BASE, dataset_name)
    os.makedirs(model_folder, exist_ok=True)
    print(f"\n=== Processant {file_name} (dataset: {dataset_name}) ===")
    print(f"Carpeta de resultats: {model_folder}")

    # 2) Read the CSV and put the rows in chronological order
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'], dayfirst=False)
    df.sort_values('Date', inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Rebuild the indicator columns from the price columns (in place)
    recompute_indicators(df)

    # Drop rows with NaN in any feature or in the target (this includes the
    # warm-up rows where the 14-row rolling indicators are still undefined)
    before_drop = len(df)
    df.dropna(subset=FEATURE_COLUMNS + [TARGET_COLUMN], inplace=True)
    after_drop = len(df)
    print(f"  Files originals: {before_drop}, després de dropna: {after_drop} (–{before_drop - after_drop} files).")
    df.reset_index(drop=True, inplace=True)

    # 3) Chronological split: train (70%) / val (15%) / test (remaining rows)
    n_total = len(df)
    train_size = int(n_total * 0.70)
    val_size   = int(n_total * 0.15)
    test_size  = n_total - train_size - val_size  # informational; not used below

    # Feature matrix and target vector, already in date order
    X_raw = df[FEATURE_COLUMNS].values
    y_raw = df[TARGET_COLUMN].values

    # Contiguous slices, so no later row ends up in an earlier split
    X_train_raw = X_raw[:train_size]
    y_train_raw = y_raw[:train_size]

    X_val_raw   = X_raw[train_size : train_size + val_size]
    y_val_raw   = y_raw[train_size : train_size + val_size]

    X_test_raw  = X_raw[train_size + val_size :]
    y_test_raw  = y_raw[train_size + val_size :]

    # Print the date range covered by each split as a sanity check
    dates = df['Date']
    d_train_start = dates.iloc[0].date()
    d_train_end   = dates.iloc[train_size - 1].date()
    d_val_start   = dates.iloc[train_size].date()
    d_val_end     = dates.iloc[train_size + val_size - 1].date()
    d_test_start  = dates.iloc[train_size + val_size].date()
    d_test_end    = dates.iloc[-1].date()

    print(f"  Rangs de dates:")
    print(f"    Train: {d_train_start} → {d_train_end} ({len(X_train_raw)} mostres)")
    print(f"    Val:   {d_val_start} → {d_val_end} ({len(X_val_raw)} mostres)")
    print(f"    Test:  {d_test_start} → {d_test_end} ({len(X_test_raw)} mostres)")

    # Guard against NaNs reaching the scalers or the model
    assert not np.isnan(X_train_raw).any()
    assert not np.isnan(y_train_raw).any()
    assert not np.isnan(X_val_raw).any()
    assert not np.isnan(y_val_raw).any()
    assert not np.isnan(X_test_raw).any()
    assert not np.isnan(y_test_raw).any()

    # 4) Standardise X and y; both scalers are fitted on the training rows only
    scaler_X = StandardScaler()
    X_train = scaler_X.fit_transform(X_train_raw)
    X_val   = scaler_X.transform(X_val_raw)
    X_test  = scaler_X.transform(X_test_raw)

    assert not np.isnan(X_train).any()
    assert not np.isnan(X_val).any()
    assert not np.isnan(X_test).any()

    scaler_y = StandardScaler()
    y_train = scaler_y.fit_transform(y_train_raw.reshape(-1, 1)).ravel()
    # y_val / y_test (scaled) are only checked by the assertions below; the
    # metrics are computed on the original scale.
    y_val   = scaler_y.transform(y_val_raw.reshape(-1, 1)).ravel()
    y_test  = scaler_y.transform(y_test_raw.reshape(-1, 1)).ravel()

    assert not np.isnan(y_train).any()
    assert not np.isnan(y_val).any()
    assert not np.isnan(y_test).any()

    # 5) Hyperparameter search with time-ordered cross-validation folds
    svr = SVR(kernel='rbf')
    param_grid = {
        'C':       [1, 10, 100, 1000],
        'gamma':   [0.001, 0.01, 0.1, 1],
        'epsilon': [0.01, 0.1, 1]
    }

    tscv = TimeSeriesSplit(n_splits=5)
    grid_search = GridSearchCV(
        estimator=svr,
        param_grid=param_grid,
        cv=tscv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=0
    )
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    print(f"  Millors hiperparàmetres (GridSearchCV): {best_params}")

    # 6) Final SVR with the best hyperparameters.
    # GridSearchCV is created with its default refit=True, so it has already
    # re-trained an SVR with best_params on the whole (X_train, y_train).
    # Reuse that estimator instead of fitting an identical model a second time.
    best_svr = grid_search.best_estimator_

    # 6.1) Evaluate on VALIDATION (metrics on the original price scale)
    y_val_pred_scaled = best_svr.predict(X_val)
    y_val_pred = scaler_y.inverse_transform(y_val_pred_scaled.reshape(-1, 1)).ravel()
    y_val_true = y_val_raw  # already on the original scale

    rmse_val = np.sqrt(mean_squared_error(y_val_true, y_val_pred))
    mae_val = mean_absolute_error(y_val_true, y_val_pred)
    r2_val  = r2_score(y_val_true, y_val_pred)

    print(f"  VALIDATION → RMSE: {rmse_val:.4f}, MAE: {mae_val:.4f}, R²: {r2_val:.4f}")

    # 6.2) Evaluate on TEST (metrics on the original price scale)
    y_test_pred_scaled = best_svr.predict(X_test)
    y_test_pred = scaler_y.inverse_transform(y_test_pred_scaled.reshape(-1, 1)).ravel()
    y_test_true = y_test_raw  # already on the original scale

    rmse_test = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
    mae_test = mean_absolute_error(y_test_true, y_test_pred)
    r2_test  = r2_score(y_test_true, y_test_pred)

    print(f"  TEST       → RMSE: {rmse_test:.4f}, MAE: {mae_test:.4f}, R²: {r2_test:.4f}")

    # 7) Save the metrics as a one-row CSV
    df_metrics = pd.DataFrame([{
        "Dataset": dataset_name,
        "MAE_val": mae_val,
        "RMSE_val": rmse_val,
        "R2_val": r2_val,
        "MAE_test": mae_test,
        "RMSE_test": rmse_test,
        "R2_test": r2_test
    }])
    metrics_csv = os.path.join(model_folder, "metrics_summary.csv")
    df_metrics.to_csv(metrics_csv, index=False)
    print(f"  ✓ Mètriques guardades a: {metrics_csv}")

    # 8) Save the TEST predictions next to the true values
    df_test_preds = pd.DataFrame({
        "Date":       dates.iloc[train_size + val_size :].reset_index(drop=True),
        "Close_true": y_test_true,
        "Close_pred": y_test_pred
    })
    test_preds_csv = os.path.join(model_folder, "test_predictions.csv")
    df_test_preds.to_csv(test_preds_csv, index=False)
    print(f"  ✓ Prediccions (Test) guardades a: {test_preds_csv}")

    # 9) Plot real vs predicted (Test) with Plotly and save it as HTML
    dates_test = df['Date'].iloc[train_size + val_size :].reset_index(drop=True)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=dates_test,
        y=y_test_true,
        mode='lines',
        name='Real (Close)',
        line=dict(color='blue')
    ))
    fig.add_trace(go.Scatter(
        x=dates_test,
        y=y_test_pred,
        mode='lines',
        name='Predicho',
        line=dict(color='red', dash='dash')
    ))
    # NOTE(review): the axis label says USD for every dataset; confirm the
    # currency/unit of the non-US series before publishing these plots.
    fig.update_layout(
        title=f"{dataset_name.capitalize()} – Real vs Predicción (Test)",
        xaxis_title='Fecha',
        yaxis_title='Precio Close (USD)',
        template='plotly_white',
        xaxis_rangeslider_visible=True
    )

    plot_html = os.path.join(model_folder, f"{dataset_name}_test_plot.html")
    fig.write_html(plot_html)
    print(f"  ✓ Gràfica Test guardada en: {plot_html}")

    # 10) Autoregressive forecast for the next 10 business days.
    # Each step predicts from the features of the current last row, appends
    # that prediction as a synthetic next-day row and recomputes the indicators.
    # NOTE(review): the model is trained to map a row's own Open/High/Low/Volume
    # and indicators to that row's Close; here the last row's features are used
    # to produce the following day's value. Confirm this is the intended setup.
    df_future = df.copy().reset_index(drop=True)
    future_dates = pd.bdate_range(
        start=df_future['Date'].iloc[-1] + pd.Timedelta(days=1),
        periods=10
    )
    future_preds = []

    for date in future_dates:
        # Last row, whose indicators are fully computed
        last_row = df_future.iloc[-1]
        feature_values = last_row[FEATURE_COLUMNS].values.reshape(1, -1)

        # Scale, predict, and map the prediction back to the price scale
        feature_scaled = scaler_X.transform(feature_values)
        y_pred_scaled = best_svr.predict(feature_scaled)[0]
        y_pred_real   = scaler_y.inverse_transform([[y_pred_scaled]])[0][0]
        future_preds.append(y_pred_real)

        # Append the synthetic row via .loc (label new_index does not exist yet)
        prev = df_future.iloc[-1]
        new_index = len(df_future)
        df_future.loc[new_index, 'Date']        = date
        df_future.loc[new_index, 'Open']        = prev['Close']
        # High, Low and Close all take the predicted value
        df_future.loc[new_index, 'High']        = y_pred_real
        df_future.loc[new_index, 'Low']         = y_pred_real
        df_future.loc[new_index, 'Close']       = y_pred_real
        # Volume is carried forward from the previous row
        df_future.loc[new_index, 'Volume']      = prev['Volume']
        # Indicator placeholders; overwritten by recompute_indicators below
        df_future.loc[new_index, 'EMA_7']       = np.nan
        df_future.loc[new_index, 'EMA_40']      = np.nan
        df_future.loc[new_index, 'MACD']        = np.nan
        df_future.loc[new_index, 'Signal_Line'] = np.nan
        df_future.loc[new_index, 'MACD_Hist']   = np.nan
        df_future.loc[new_index, 'RSI']         = np.nan
        df_future.loc[new_index, 'ATR']         = np.nan

        # Recompute the indicators so the new row has a full feature vector
        recompute_indicators(df_future)

    # 11) Save the future predictions as CSV
    df_fut_pred = pd.DataFrame({
        "Date":           future_dates,
        "Predicted_Close": future_preds
    })
    fut_csv = os.path.join(model_folder, f"{dataset_name}_future_10days.csv")
    df_fut_pred.to_csv(fut_csv, index=False)
    print(f"  ✓ Prediccions futures guardades en: {fut_csv}")

    # 12) Plot the historical close plus the future predictions with Plotly
    fig_future = go.Figure()
    fig_future.add_trace(go.Scatter(
        x=df['Date'], y=df['Close'],
        mode='lines', name='Histórico Close', line=dict(color='lightblue')
    ))
    fig_future.add_trace(go.Scatter(
        x=future_dates, y=np.array(future_preds),
        mode='lines+markers', name='Predicción futura',
        line=dict(color='orange', dash='dash'),
        marker=dict(size=6)
    ))
    fig_future.update_layout(
        title=f"{dataset_name.capitalize()} – Predicción Próximos 10 Días",
        xaxis_title='Fecha',
        yaxis_title='Precio Close (USD)',
        template='plotly_white',
        xaxis_rangeslider_visible=True
    )

    fut_html = os.path.join(model_folder, f"{dataset_name}_future_plot.html")
    fig_future.write_html(fut_html)
    print(f"  ✓ Gràfica futura guardada en: {fut_html}")



=== Processant Amazon_Stock_Price_output.csv (dataset: amazon) ===
Carpeta de resultats: resultats_SVR\amazon
  Files originals: 1278, després de dropna: 1265 (–13 files).
  Rangs de dates:
    Train: 2020-01-22 → 2023-07-27 (885 mostres)
    Val:   2023-07-28 → 2024-04-26 (189 mostres)
    Test:  2024-04-29 → 2025-01-31 (191 mostres)
  Millors hiperparàmetres (GridSearchCV): {'C': 1000, 'epsilon': 0.01, 'gamma': 0.001}
  VALIDATION → MSE: 0.6528, MAE: 0.6378, R²: 0.9981
  TEST       → MSE: 4.1463, MAE: 1.5237, R²: 0.9882
  ✓ Mètriques guardades a: resultats_SVR\amazon\metrics_summary.csv
  ✓ Prediccions (Test) guardades a: resultats_SVR\amazon\test_predictions.csv
  ✓ Gràfica Test guardada en: resultats_SVR\amazon\amazon_test_plot.html
  ✓ Prediccions futures guardades en: resultats_SVR\amazon\amazon_future_10days.csv
  ✓ Gràfica futura guardada en: resultats_SVR\amazon\amazon_future_plot.html

=== Processant Euro_Stoxx_50_Stock_Price_output.csv (dataset: euro_stoxx_50) ===
Carpeta d