# In [None]:
import os # https://docs.python.org/3/library/os.html
import pandas as pd # https://pandas.pydata.org/docs/user_guide/10min.html
import numpy as np # https://numpy.org/doc/2.2/
import matplotlib.pyplot as plt # https://matplotlib.org/stable/index.html
import optuna # https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/002_configurations.html
from sklearn.preprocessing import MinMaxScaler, LabelEncoder # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error # https://scikit-learn.org/stable/api/sklearn.metrics.html
from keras.models import Sequential # https://keras.io/guides/sequential_model/
from keras.layers import LSTM, Dropout, Dense # https://keras.io/api/layers/recurrent_layers/lstm/
from keras.optimizers import Adam # https://keras.io/api/optimizers/adam/
from keras.callbacks import EarlyStopping # https://keras.io/api/callbacks/early_stopping/
from sklearn.model_selection import TimeSeriesSplit # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html

# Folder paths for the training and test datasets.
# Both values are placeholders (a single space) and must be replaced with real
# directory paths before running; each folder is scanned for *.csv files.
train_folder = " " # Path to the folder holding the training CSV files
test_folder = " " # Path to the folder holding the test CSV files

# Function to load and concatenate all CSV files in a folder into a single DataFrame
# https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
# https://pandas.pydata.org/docs/reference/api/pandas.concat.html
def load_multiple_csv(folder_path):
    """Read every ``.csv`` file in *folder_path* into one DataFrame.

    Files are read in sorted filename order so the row order of the result is
    reproducible; ``os.listdir`` on its own returns entries in arbitrary order.

    Args:
        folder_path: Directory to scan (not recursive). Only entries whose
            name ends with ``.csv`` are read.

    Returns:
        The concatenation of all CSV files with a fresh 0..n-1 index, or
        ``None`` when the folder contains no ``.csv`` file.
    """
    # Sort for a deterministic concatenation order across platforms/filesystems
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
    if not csv_files:
        return None
    # Read each CSV, then stack them and renumber the rows
    dfs = [pd.read_csv(os.path.join(folder_path, csv_file)) for csv_file in csv_files]
    return pd.concat(dfs, ignore_index=True)

# Read the training and test CSV folders into one DataFrame each
train_df = load_multiple_csv(train_folder)
test_df = load_multiple_csv(test_folder)

# Stop right away when either folder produced no data
if any(loaded is None for loaded in (train_df, test_df)):
    raise ValueError("Error: Could not load training or testing data.")

# Parse the Date column as datetimes and use it as the row index
# https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.set_index.html
train_df = train_df.assign(Date=pd.to_datetime(train_df["Date"])).set_index("Date")
test_df = test_df.assign(Date=pd.to_datetime(test_df["Date"])).set_index("Date")

# Map each 'Symbol' to an integer code; the encoder is fitted on the training
# symbols only and then applied unchanged to the test symbols
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
encoder = LabelEncoder()
train_df["Symbol_Encoded"] = encoder.fit_transform(train_df["Symbol"])
test_df["Symbol_Encoded"] = encoder.transform(test_df["Symbol"])

# Lookup tables from integer code back to the ticker symbol / asset name
symbol_mapping = {
    code: symbol
    for code, symbol in zip(train_df["Symbol_Encoded"], train_df["Symbol"])
}
name_mapping = {
    code: name
    for code, name in zip(train_df["Symbol_Encoded"], train_df["Name"])
}

# The text columns are no longer needed once the codes and lookups exist
text_columns = ["Symbol", "Name"]
train_df = train_df.drop(columns=text_columns)
test_df = test_df.drop(columns=text_columns)

# Accumulators: one forecast table and one metrics row are appended per asset
all_forecasts, test_metrics = [], []

# Columns fed to the model (the 'Close' column is also the prediction target)
features = [
    'Close', 'High', 'Low', 'Open', 'Marketcap',
    'Volume', 'trend', 'MA_7d', 'MA_14d', 'MA_30d',
]

# Number of past rows in each model input window
time_step = 30

# Time-ordered cross-validation splitter used during hyperparameter search
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
tscv = TimeSeriesSplit(n_splits=5)

# Stop a fit once validation loss has not improved for 10 epochs and roll the
# weights back to the best epoch
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Number of leading test rows that are forecast and evaluated for every asset
forecast_horizon = 7
close_idx = features.index("Close")
n_features = len(features)

# Loop over each crypto asset (by encoded symbol): scale, tune with Optuna,
# refit on all training windows, roll a forecast over the first test rows,
# then record metrics, a plot and a forecast table.
for symbol_encoded in train_df["Symbol_Encoded"].unique(): # https://pandas.pydata.org/docs/reference/api/pandas.Series.unique.html
    train_crypto_data = train_df[train_df["Symbol_Encoded"] == symbol_encoded].copy() # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.copy.html
    test_crypto_data = test_df[test_df["Symbol_Encoded"] == symbol_encoded].copy()

    # Chronological order is required for windowing and for TimeSeriesSplit
    train_crypto_data.sort_index(inplace=True) # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_index.html
    test_crypto_data.sort_index(inplace=True)

    print(f"\n Running LSTM Forecast for {symbol_mapping[symbol_encoded]} ({name_mapping[symbol_encoded]})")

    # Guard: windowing needs more than `time_step` rows, TimeSeriesSplit needs
    # more samples than splits, and the forecast needs `forecast_horizon` test
    # rows. Without this the asset would crash the whole run further down.
    n_windows = len(train_crypto_data) - time_step
    if n_windows <= tscv.n_splits or len(test_crypto_data) < forecast_horizon:
        print(f"  -> Skipping: not enough data "
              f"(train rows={len(train_crypto_data)}, test rows={len(test_crypto_data)})")
        continue

    # Scale each feature individually using MinMaxScaler (fitted on train only)
    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
    scaler_dict = {col: MinMaxScaler() for col in features}
    # Explicit float buffers: zeros_like() would inherit the source dtype and
    # truncate the scaled [0, 1] values if the raw columns were integers.
    scaled_train_data = np.zeros((len(train_crypto_data), n_features), dtype=float)
    scaled_test_data = np.zeros((len(test_crypto_data), n_features), dtype=float)

    for idx, col in enumerate(features):
        scaled_train_data[:, idx] = scaler_dict[col].fit_transform(train_crypto_data[[col]]).ravel()
        scaled_test_data[:, idx] = scaler_dict[col].transform(test_crypto_data[[col]]).ravel()

    # Supervised windows: `time_step` consecutive rows -> scaled Close of the next row
    X = np.array([scaled_train_data[i - time_step:i]
                  for i in range(time_step, len(scaled_train_data))])
    y = scaled_train_data[time_step:, close_idx]

    # Define LSTM model using Optuna
    def objective(trial):
        """Build a two-layer LSTM from the trial's hyperparameters and score it.

        Returns the validation loss (MSE) on the last TimeSeriesSplit fold.
        """
        model = Sequential()
        model.add(LSTM(trial.suggest_int("lstm1_units", 32, 128), return_sequences=True, input_shape=(X.shape[1], X.shape[2])))
        model.add(Dropout(trial.suggest_float("dropout1", 0.2, 0.5)))
        model.add(LSTM(trial.suggest_int("lstm2_units", 16, 64)))
        model.add(Dropout(trial.suggest_float("dropout2", 0.2, 0.5)))
        model.add(Dense(1))
        model.compile(optimizer=Adam(learning_rate=trial.suggest_float("lr", 1e-4, 1e-2, log=True)), loss='mean_squared_error')

        # NOTE(review): the same model keeps training across the folds and only
        # the final fold is scored; kept as in the original design.
        for train_idx, val_idx in tscv.split(X):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0,
                      validation_data=(X_val, y_val), callbacks=[early_stopping])
        return model.evaluate(X_val, y_val, verbose=0)

    print("  -> Running Optuna...")
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=30)
    print(f"  -> Best Params: {study.best_params}")

    # Rebuild the network with the best hyperparameters and fit on all windows
    best_params = study.best_params
    model = Sequential()
    model.add(LSTM(best_params["lstm1_units"], return_sequences=True, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dropout(best_params["dropout1"]))
    model.add(LSTM(best_params["lstm2_units"]))
    model.add(Dropout(best_params["dropout2"]))
    model.add(Dense(1))
    model.compile(optimizer=Adam(learning_rate=best_params["lr"]), loss='mean_squared_error')
    # No validation data is passed to this fit, so a callback monitoring
    # 'val_loss' has nothing to watch; monitor the training loss instead.
    final_early_stopping = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)
    model.fit(X, y, epochs=10, batch_size=16, verbose=0, callbacks=[final_early_stopping])

    # Predict the first `forecast_horizon` days of the test set with a rolling
    # forecast. The seed window is the MOST RECENT `time_step` training rows,
    # matching the window length the model was trained on.
    last_window = scaled_train_data[-time_step:]
    rolling_predictions = []

    for i in range(forecast_horizon):
        input_seq = last_window.reshape(1, time_step, n_features)
        pred = model.predict(input_seq, verbose=0)[0][0]

        # Build the next input row: the observed test features for day i with
        # the Close replaced by the model's own prediction.
        # NOTE(review): this feeds observed non-Close test features into later
        # steps, so steps 2+ are not a pure out-of-sample multi-step forecast.
        next_row = scaled_test_data[i].copy()
        next_row[close_idx] = pred

        # Slide the window forward by one row
        last_window = np.vstack([last_window[1:], next_row]) # https://numpy.org/devdocs/reference/generated/numpy.vstack.html
        rolling_predictions.append(pred)

    # Unscale predictions with the Close scaler; actual values are already unscaled
    forecast_unscaled = scaler_dict["Close"].inverse_transform(
        np.asarray(rolling_predictions, dtype=float).reshape(-1, 1)
    ).flatten()
    actual_values = test_crypto_data["Close"].values[:forecast_horizon]
    forecast_dates = test_crypto_data.index[:forecast_horizon]

    # Calculate test metrics
    test_mae = mean_absolute_error(actual_values, forecast_unscaled) # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html
    test_mse = mean_squared_error(actual_values, forecast_unscaled)  # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html
    test_rmse = np.sqrt(test_mse) # https://numpy.org/doc/2.1/reference/generated/numpy.sqrt.html
    test_mape = mean_absolute_percentage_error(actual_values, forecast_unscaled) * 100 # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_percentage_error.html
    test_metrics.append([symbol_mapping[symbol_encoded], test_mae, test_mse, test_rmse, test_mape])

    # Plot predictions against actual values
    # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.plot.html
    # https://matplotlib.org/stable/tutorials/index.html
    plt.figure(figsize=(10, 5))
    plt.plot(forecast_dates, actual_values, label="Actual", marker='o')
    plt.plot(forecast_dates, forecast_unscaled, label="Predicted", marker='x')
    plt.title(f"{symbol_mapping[symbol_encoded]}: Forecast for First {forecast_horizon} Days")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Build final forecast table for this crypto
    forecast_table = pd.DataFrame({
        "Date": forecast_dates,
        "Symbol": symbol_mapping[symbol_encoded],
        "Actual Value": actual_values,
        "Predicted Value": forecast_unscaled
    })
    print(forecast_table)
    all_forecasts.append(forecast_table)

# Stack the per-asset forecast tables into one table and display it in full
final_forecast_df = pd.concat(all_forecasts, ignore_index=True)
print("\nFinal Forecasted vs. Actual Values:")
print(final_forecast_df.to_string(index=False)) # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_string.html

# Turn the collected metric rows into a table (one row per asset) and display it
metric_columns = [
    "Symbol",
    "Test MAE",
    "Test MSE",
    "Test RMSE",
    "Test MAPE (%) 7d LSTM Optuna",
]
test_metrics_df = pd.DataFrame(test_metrics, columns=metric_columns)
print("\nTest Metrics (First 7 Days):")
print(test_metrics_df)

# Write both tables to CSV files in the current working directory
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
test_metrics_df.to_csv("LSTM_Optuna_7d_metrics.csv", index=False)
final_forecast_df.to_csv("LSTM_Optuna_7d_predictions.csv", index=False)