In [None]:
import os  # https://docs.python.org/3/library/os.html
import pandas as pd # https://pandas.pydata.org/docs/user_guide/10min.html
import numpy as np # https://numpy.org/doc/2.2/
import matplotlib.pyplot as plt # https://matplotlib.org/stable/index.html
from sklearn.preprocessing import MinMaxScaler, LabelEncoder # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error # https://scikit-learn.org/stable/api/sklearn.metrics.html
from sklearn.model_selection import TimeSeriesSplit # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
import xgboost as xgb # https://xgboost.readthedocs.io/en/stable/python/python_intro.html

# Dataset locations. Both values are placeholders (a single space): point each
# one at the directory that contains that split's CSV files before running.
train_folder = " "  # directory holding the training CSVs
test_folder = " "  # directory holding the test CSVs

# https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
# https://pandas.pydata.org/docs/reference/api/pandas.concat.html
def load_multiple_csv(folder_path):
    """Load every ``.csv`` file found directly in a folder into one DataFrame.

    Files are read in sorted file-name order so that the row order of the
    result is reproducible across runs and platforms.

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        The row-wise concatenation of all CSV files with a fresh integer
        index, or ``None`` when the folder contains no ``.csv`` files.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from
            ``os.listdir``), e.g. when the directory does not exist.
    """
    # os.listdir() yields entries in arbitrary order; sort for determinism.
    csv_files = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
    if not csv_files:
        return None

    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_files]
    # ignore_index=True replaces the per-file indices with one running index.
    return pd.concat(frames, ignore_index=True)

# Read both splits from disk (training folder first, then test folder)
train_df, test_df = [load_multiple_csv(folder) for folder in (train_folder, test_folder)]

# Stop immediately when either folder produced no data
if any(frame is None for frame in (train_df, test_df)):
    raise ValueError("Could not load training or testing data.")

# Parse the Date column and promote it to the index of each split
# https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.set_index.html
for frame in (train_df, test_df):
    frame["Date"] = pd.to_datetime(frame["Date"])
    frame.set_index("Date", inplace=True)

# Turn the textual 'Symbol' into integer codes; the encoder is fitted on the
# training split only and then applied unchanged to the test split
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
encoder = LabelEncoder()
train_df["Symbol_Encoded"] = encoder.fit_transform(train_df["Symbol"])
test_df["Symbol_Encoded"] = encoder.transform(test_df["Symbol"])

# Lookup tables from integer code back to the ticker symbol / asset name,
# built before the text columns are dropped below
symbol_mapping = {
    code: symbol
    for code, symbol in zip(train_df["Symbol_Encoded"], train_df["Symbol"])
}
name_mapping = {
    code: name
    for code, name in zip(train_df["Symbol_Encoded"], train_df["Name"])
}

# The text columns are no longer needed once the codes and lookups exist
for frame in (train_df, test_df):
    frame.drop(columns=["Symbol", "Name"], inplace=True)

# Columns fed to the scaler and the model
features = [
    'Close', 'High', 'Low', 'Open', 'Marketcap', 'Volume',
    'trend', 'MA_7d', 'MA_14d', 'MA_30d',
]

# Forecast settings
FORECAST_HORIZON = 7  # number of leading test rows that are predicted and scored
N_CV_SPLITS = 5  # number of folds used by the time series cross-validation

# Initialize containers for forecasts and evaluation metrics
all_forecasts = []
validation_metrics = []
test_metrics = []


def _new_xgb_model():
    """Return an unfitted XGBRegressor with the hyper-parameters used for every fit."""
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRegressor
    return xgb.XGBRegressor(objective="reg:squarederror", n_estimators=500,
                            learning_rate=0.05, max_depth=5, random_state=42)


def _regression_metrics(y_true, y_pred):
    """Return ``[MAE, MSE, RMSE, MAPE in percent]`` for two equally long arrays."""
    # https://scikit-learn.org/stable/api/sklearn.metrics.html
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    return [mae, mse, rmse, mape]


def _unscale_column(scaler, scaled_values, column_idx, n_columns):
    """Map scaled values of a single column back to their original units.

    The scaler was fitted on ``n_columns`` features and only inverts full
    rows, so the values are placed in an otherwise zero-filled matrix and the
    requested column is read back. MinMaxScaler treats every column
    independently, so the filler values do not influence the result.
    """
    padded = np.zeros((len(scaled_values), n_columns))
    padded[:, column_idx] = scaled_values
    return scaler.inverse_transform(padded)[:, column_idx]


# Loop over each crypto asset (by encoded symbol)
# https://pandas.pydata.org/docs/reference/api/pandas.Series.unique.html
for symbol_encoded in train_df["Symbol_Encoded"].unique():
    # Filter data for the current asset and order it chronologically
    train_crypto = train_df[train_df["Symbol_Encoded"] == symbol_encoded].copy()
    test_crypto = test_df[test_df["Symbol_Encoded"] == symbol_encoded].copy()
    train_crypto.sort_index(inplace=True)
    test_crypto.sort_index(inplace=True)

    symbol = symbol_mapping[symbol_encoded]

    # Without test rows there is nothing to forecast or score; skip this asset
    # instead of letting the scaler fail on an empty frame and abort the run.
    if test_crypto.empty:
        print(f"\nSkipping {symbol}: no test rows available")
        continue

    print(f"\nRunning XGBoost Forecast for {symbol} ({name_mapping[symbol_encoded]})")

    # Scale every feature to [0, 1]; the scaler is fitted on training rows only
    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
    feature_scaler = MinMaxScaler()
    X_train_scaled = feature_scaler.fit_transform(train_crypto[features])
    X_test_scaled = feature_scaler.transform(test_crypto[features])

    # NOTE(review): "Close" is both a model input (it is listed in `features`)
    # and the target, and the test rows passed to predict() carry their own
    # Close value. The model is therefore shown the value it is asked to
    # predict; consider training on a shifted (next-day) target instead.
    close_idx = features.index("Close")
    y_train_scaled = X_train_scaled[:, close_idx]
    close_train = train_crypto["Close"].to_numpy()  # unscaled target for scoring

    # Time series cross-validation: each fold validates on rows that come
    # after the rows it was trained on
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html
    tscv = TimeSeriesSplit(n_splits=N_CV_SPLITS)
    fold_metrics = []
    for train_idx, val_idx in tscv.split(X_train_scaled):
        fold_model = _new_xgb_model()
        fold_model.fit(X_train_scaled[train_idx], y_train_scaled[train_idx])

        # Score in price units: on the scaled target the lowest training price
        # becomes 0, which makes MAPE blow up and the numbers incomparable
        # with the test metrics below.
        y_val_pred = _unscale_column(
            feature_scaler, fold_model.predict(X_train_scaled[val_idx]),
            close_idx, len(features))
        fold_metrics.append(_regression_metrics(close_train[val_idx], y_val_pred))

    avg_metrics = np.mean(fold_metrics, axis=0)  # https://numpy.org/doc/2.2/reference/generated/numpy.mean.html
    validation_metrics.append([symbol, *avg_metrics])

    # Refit on the complete training history. The fold models only saw a
    # prefix of it, so reusing the last one would ignore the newest data.
    model = _new_xgb_model()
    model.fit(X_train_scaled, y_train_scaled)

    # Predict the first FORECAST_HORIZON rows of the test set and convert the
    # scaled predictions back to prices
    y_pred_scaled = model.predict(X_test_scaled[:FORECAST_HORIZON])
    y_pred_inversed = _unscale_column(feature_scaler, y_pred_scaled, close_idx, len(features))

    actual_values = test_crypto["Close"].values[:FORECAST_HORIZON]
    forecast_dates = test_crypto.index[:FORECAST_HORIZON]

    # Calculate test metrics
    test_metrics.append([symbol, *_regression_metrics(actual_values, y_pred_inversed)])

    # Plot predictions
    # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.plot.html
    plt.figure(figsize=(10, 5))
    plt.plot(forecast_dates, actual_values, label="Actual", marker='o')
    plt.plot(forecast_dates, y_pred_inversed, label="Predicted", marker='o')
    plt.title(f"XGBoost Actual vs Predicted for {symbol}")
    plt.legend()
    plt.show()

    # Build final forecast table for this crypto
    forecast_table = pd.DataFrame({
        "Date": forecast_dates,
        "Symbol": symbol,
        "Actual Value": actual_values,
        "Predicted Value": y_pred_inversed
    })

    print(forecast_table)
    all_forecasts.append(forecast_table)

# Column layouts of the two metric tables
validation_columns = ["Symbol", "Validation MAE", "Validation MSE", "Validation RMSE", "Validation MAPE"]
test_columns = ["Symbol", "Test MAE", "Test MSE", "Test RMSE", "Test MAPE XGB"]

# One row per asset, collected by the forecasting loop
validation_metrics_df = pd.DataFrame(validation_metrics, columns=validation_columns)
test_metrics_df = pd.DataFrame(test_metrics, columns=test_columns)

# Show both metric tables, validation first
for heading, table in (
    ("\nValidation Metrics:", validation_metrics_df),
    ("\nTest Metrics:", test_metrics_df),
):
    print(heading)
    print(table)

# Stack the per-asset forecast tables into one table and show it without the index
# https://pandas.pydata.org/docs/reference/api/pandas.concat.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_string.html
final_forecast_df = pd.concat(all_forecasts, ignore_index=True)
print("\nFinal Forecasted vs. Actual Values:")
print(final_forecast_df.to_string(index=False))

# Persist the test metrics and the forecasts (the validation table is only printed)
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
for table, csv_name in (
    (test_metrics_df, "XGB_7d_metrics.csv"),
    (final_forecast_df, "XGB_7d_predictions.csv"),
):
    table.to_csv(csv_name, index=False)
