# In [None]:
import os # https://docs.python.org/3/library/os.html
import pandas as pd # https://pandas.pydata.org/docs/user_guide/10min.html
import numpy as np # https://numpy.org/doc/2.2/
import matplotlib.pyplot as plt # https://matplotlib.org/stable/index.html
from sklearn.preprocessing import MinMaxScaler, LabelEncoder # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from keras.models import Sequential # https://keras.io/guides/sequential_model/
from keras.layers import LSTM, Dropout, Dense # https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM
from sklearn.model_selection import TimeSeriesSplit # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
from keras.optimizers import Adam # https://keras.io/api/optimizers/
from keras.callbacks import EarlyStopping # https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/EarlyStopping

# Dataset locations. Both values are placeholders and must be replaced with
# the directories that hold the training and test CSV files before running.
train_folder, test_folder = " ", " "  # (path to train set, path to test set)

# Function to load and concatenate all CSV files in a folder into a single DataFrame
# https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
# https://pandas.pydata.org/docs/reference/api/pandas.concat.html
def load_multiple_csv(folder_path):
    """Load every CSV file in a folder into one combined DataFrame.

    Files are read in sorted filename order so that the row order of the
    result is reproducible; ``os.listdir`` returns entries in arbitrary order.
    The ``.csv`` extension is matched case-insensitively and directories are
    skipped. The scan is not recursive.

    Args:
        folder_path: Directory containing the CSV files.

    Returns:
        A DataFrame with the rows of all files and a fresh 0..n-1 index, or
        None when the folder contains no CSV files.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    # Collect candidate files deterministically
    csv_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.csv')
        and os.path.isfile(os.path.join(folder_path, name))
    )
    # Read each CSV into its own DataFrame
    dfs = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_files]
    # Concatenate all DataFrames and reset index; None signals "nothing found"
    return pd.concat(dfs, ignore_index=True) if dfs else None

# Read the training set first, then the test set, from their folders
train_df, test_df = (
    load_multiple_csv(folder) for folder in (train_folder, test_folder)
)

# Abort early when either folder yielded no CSV files (the loader returns None)
if any(frame is None for frame in (train_df, test_df)):
    raise ValueError("Error: Could not load training or testing data.")

# Parse the Date column and make it the index of both frames (in place)
# https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.set_index.html
for frame in (train_df, test_df):
    frame["Date"] = pd.to_datetime(frame["Date"])
    frame.set_index("Date", inplace=True)

# Map each 'Symbol' to an integer code; the encoder is fitted on the training
# symbols only and then reused for the test set
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
encoder = LabelEncoder()
train_df["Symbol_Encoded"] = encoder.fit_transform(train_df["Symbol"])
test_df["Symbol_Encoded"] = encoder.transform(test_df["Symbol"])

# Reverse lookups (code -> symbol, code -> name) kept for printing and output.
# When a code occurs on several rows, the last row's value is the one retained.
encoded_column = train_df["Symbol_Encoded"]
symbol_mapping = {code: symbol for code, symbol in zip(encoded_column, train_df["Symbol"])}
name_mapping = {code: name for code, name in zip(encoded_column, train_df["Name"])}

# The text columns are no longer needed once the mappings exist
text_columns = ["Symbol", "Name"]
train_df = train_df.drop(text_columns, axis=1)
test_df = test_df.drop(text_columns, axis=1)

# Feature columns fed to the model, in this fixed order
features = [
    'Close', 'High', 'Low', 'Open', 'Marketcap',
    'Volume', 'trend', 'MA_7d', 'MA_14d', 'MA_30d',
]

# Accumulators filled by the per-asset loop: forecast tables and metric rows
all_forecasts, validation_metrics, test_metrics = [], [], []

# Callback that watches 'val_loss', waits up to 10 epochs without improvement,
# and restores the best weights seen
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor='val_loss',
)

# Time-ordered cross-validator with 5 splits
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
tscv = TimeSeriesSplit(n_splits=5)

# Window length: number of consecutive past rows fed to the LSTM per sample
time_step = 30
# Number of leading test-set predictions reported per asset
forecast_horizon = 7
# Column position of the prediction target inside the scaled feature arrays
close_idx = features.index("Close")


def _make_sequences(scaled_data, window, target_idx):
    """Build sliding-window samples from a 2-D scaled array.

    Sample k holds rows k .. k+window-1 and its target is column
    ``target_idx`` of row k+window, i.e. target k belongs to original row
    ``k + window``. Returns ``(X, y)`` as NumPy arrays (both empty when the
    input has no more than ``window`` rows).
    """
    ends = range(window, len(scaled_data))
    X_seq = np.array([scaled_data[i - window:i] for i in ends])
    y_seq = np.array([scaled_data[i, target_idx] for i in ends])
    return X_seq, y_seq


def _regression_metrics(actual, predicted):
    """Return [MAE, MSE, RMSE, MAPE in percent] for two aligned 1-D arrays."""
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    mape = mean_absolute_percentage_error(actual, predicted) * 100
    return [mae, mse, np.sqrt(mse), mape]


def _to_price(close_scaler, scaled_predictions):
    """Map scaled 'Close' predictions back to the original price scale."""
    column = np.asarray(scaled_predictions, dtype=float).reshape(-1, 1)
    return close_scaler.inverse_transform(column).flatten()


# Loop over each crypto asset (by encoded symbol)
for symbol_encoded in train_df["Symbol_Encoded"].unique():
    symbol = symbol_mapping[symbol_encoded]

    # Per-asset slices, ordered chronologically by the Date index
    train_crypto_data = train_df[train_df["Symbol_Encoded"] == symbol_encoded].copy().sort_index()
    test_crypto_data = test_df[test_df["Symbol_Encoded"] == symbol_encoded].copy().sort_index()

    print(f"\n Running LSTM Forecast for {symbol} ({name_mapping[symbol_encoded]})")

    # Scale each feature individually using MinMaxScaler. The scalers are
    # fitted on the training rows only and reused for the test rows below.
    # An explicit float buffer is used so scaled values are never truncated
    # to the dtype of the source columns.
    scaler_dict = {col: MinMaxScaler() for col in features}
    scaled_train_data = np.zeros((len(train_crypto_data), len(features)), dtype=float)
    for idx, col in enumerate(features):
        scaled_train_data[:, idx] = scaler_dict[col].fit_transform(train_crypto_data[[col]]).ravel()

    X, y = _make_sequences(scaled_train_data, time_step, close_idx)

    # TimeSeriesSplit needs more samples than splits; skip assets that are too short
    if len(X) <= tscv.n_splits:
        print(f" Skipping {symbol}: only {len(X)} training sequences available.")
        continue

    # Time series cross-validation loop
    for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Define LSTM model
        model = Sequential([
            LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
            Dropout(0.3),
            LSTM(50),
            Dropout(0.3),
            Dense(1)
        ])

        # Train the model. The callback monitors 'val_loss', which only exists
        # when validation data is supplied to fit().
        # NOTE(review): the fold's validation split therefore also drives
        # early stopping, so the fold metrics below are mildly optimistic.
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
        model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=10, batch_size=16, verbose=0,
            callbacks=[early_stopping],
        )

        forecast_lstm = _to_price(scaler_dict["Close"], model.predict(X_val))

        # Sample k predicts original row k + time_step, so shift the fold
        # indices before looking up the true prices and their dates.
        target_rows = val_idx + time_step
        actual_values = train_crypto_data["Close"].iloc[target_rows].values
        val_dates = train_crypto_data.index[target_rows]

        # Validation metrics
        validation_metrics.append(
            [symbol, fold + 1] + _regression_metrics(actual_values, forecast_lstm)
        )

        # Plot predictions
        # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.plot.html
        plt.figure(figsize=(10, 5))
        plt.plot(val_dates, actual_values, label="Actual", marker='o')
        plt.plot(val_dates, forecast_lstm, label="Predicted", marker='o')
        plt.title(f"LSTM Forecast for {symbol} (Fold {fold+1})")
        plt.legend()
        plt.show()

    # A test window needs time_step rows of history plus at least one target row
    if len(test_crypto_data) <= time_step:
        print(f" Skipping test forecast for {symbol}: only {len(test_crypto_data)} test rows available.")
        continue

    # Apply feature-wise scaling to the test rows using the pre-fitted scalers
    scaled_test_data = np.zeros((len(test_crypto_data), len(features)), dtype=float)
    for idx, col in enumerate(features):
        scaled_test_data[:, idx] = scaler_dict[col].transform(test_crypto_data[[col]]).ravel()

    # Prepare sequences for LSTM input (the scaled targets are not needed here)
    X_test, _ = _make_sequences(scaled_test_data, time_step, close_idx)

    # One-step-ahead predictions for the first test windows, made with the
    # model trained in the last cross-validation fold. Each prediction uses
    # the actual previous time_step rows, not earlier predictions.
    n_forecast = min(forecast_horizon, len(X_test))
    forecast_lstm = _to_price(scaler_dict["Close"], model.predict(X_test[:n_forecast]))

    # Window k predicts test row k + time_step; compare against those rows
    target_slice = slice(time_step, time_step + n_forecast)
    actual_values = test_crypto_data["Close"].values[target_slice]
    forecast_dates = test_crypto_data.index[target_slice]

    # Append the test metrics, tagged with the corresponding crypto symbol
    test_metrics.append([symbol] + _regression_metrics(actual_values, forecast_lstm))

    # Plot the results
    plt.figure(figsize=(10, 5))
    plt.plot(forecast_dates, actual_values, label="Actual", marker='o')
    plt.plot(forecast_dates, forecast_lstm, label="Predicted", marker='o')
    plt.title(f"LSTM Test Forecast for {symbol}")
    plt.legend()
    plt.show()

    # Build final forecast table for this crypto
    forecast_table = pd.DataFrame({
        "Date": forecast_dates,
        "Symbol": symbol,
        "Actual Value": actual_values,
        "Predicted Value": forecast_lstm
    })
    print(forecast_table)
    all_forecasts.append(forecast_table)

# Combine forecasts for all cryptocurrencies. pd.concat raises on an empty
# list, so fall back to an empty table with the same columns when no asset
# produced a forecast.
if all_forecasts:
    final_forecast_df = pd.concat(all_forecasts, ignore_index=True)
else:
    final_forecast_df = pd.DataFrame(
        columns=["Date", "Symbol", "Actual Value", "Predicted Value"]
    )
print("\n Final Forecasted vs. Actual Values:")
print(final_forecast_df.to_string(index=False))

# Tabulate the per-fold validation metrics and the per-asset test metrics
validation_metrics_df = pd.DataFrame(
    validation_metrics,
    columns=["Symbol", "Fold", "Validation MAE", "Validation MSE", "Validation RMSE", "Validation MAPE"],
)
test_metrics_df = pd.DataFrame(
    test_metrics,
    columns=["Symbol", "Test MAE", "Test MSE", "Test RMSE", "Test MAPE"],
)

# Print evaluation metrics
print("\nValidation Metrics:")
print(validation_metrics_df)

print("\nTest Metrics:")
print(test_metrics_df)