In [58]:
# Import necessary libraries
import sys
from pathlib import Path
# Absolute location of the project checkout on the machine this notebook was run on.
# NOTE(review): machine-specific path — adjust it when running elsewhere.
repo_root = Path("/home/ubuntu/michael/MSc-Machine-Learning-Project")
src_path = repo_root / "src"
# Prepend <repo_root>/src to the module search path (only once, so re-running the
# cell does not add duplicates) — presumably so the `prob_mamba` imports below resolve.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
import os
import time
import re
import math
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.feather as feather
from pathlib import Path
from tqdm import tqdm
from mamba_ssm import Mamba
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
from arch import arch_model
from mamba_ssm import Mamba
from torch.utils.data import TensorDataset, DataLoader
from typing import Optional, Literal
from prob_mamba.models import FeatureNet, ProbabilisticMamba 
from prob_mamba.ti_lgssm import TimeInvariantLGSSM
from prob_mamba.utils import param_count, softplus_pos

In [2]:
# Function to clean the equities datasets and select features
def clean_and_select_features(df: pd.DataFrame, redundant_features: list) -> pd.DataFrame:
    """
    Return a cleaned copy of a raw equities dataframe.

    Steps performed:
    1. Parse the 'Date' column and sort the rows chronologically;
    2. Lag by one row every column whose name contains '-F' plus the
       international indices (FTSE, GDAXI, FCHI, HSI, SSEC) that are present,
       to avoid look-ahead bias;
    3. Drop the momentum columns ('mom', 'mom1', 'mom2', 'mom3'), the constant
       'Name' column and every column listed in `redundant_features`.
       Columns that are not present are ignored.

    The input dataframe is left untouched; all work happens on a copy.

    Args:
        df: The raw input dataframe. Must contain a 'Date' column.
        redundant_features: Column names to drop (None is treated as an empty list).
    Returns:
        A new dataframe containing only the cleaned and selected features.
    """
    # assign() builds a new frame, so the parsed dates are never written back
    # into the caller's dataframe.
    df = (
        df.assign(Date=pd.to_datetime(df['Date']))
          .sort_values('Date')
          .reset_index(drop=True)
    )

    # Identify and shift "-F" columns to avoid look-ahead bias
    future_cols = [c for c in df.columns if '-F' in c]
    intl_indices = ['FTSE', 'GDAXI', 'FCHI', 'HSI', 'SSEC']
    col_to_shift = future_cols + [c for c in intl_indices if c in df.columns]
    print(f"\nShifting {len(col_to_shift)} columns by +1 day")

    # One vectorised shift instead of a per-column loop; guarded so that a frame
    # without any such column is passed through unchanged.
    if col_to_shift:
        df[col_to_shift] = df[col_to_shift].shift(1)

    # Leaky momentum features
    leaky_momentum = ['mom', 'mom1', 'mom2', 'mom3']

    # Name of the index (constant across all observations)
    name = ['Name']

    # Combine all lists of columns to drop
    extra_redundant = [] if redundant_features is None else list(redundant_features)
    cols_to_drop = leaky_momentum + name + extra_redundant
    cleaned_df = df.drop(columns=cols_to_drop, errors='ignore')

    print(f"Number of columns dropped: {len(set(cols_to_drop) & set(df.columns))}")

    return cleaned_df

In [3]:
# Redundant features
# Futures columns that are dropped for both equity datasets
_shared_futures = ['CAC-F', 'DAX-F', 'FTSE-F', 'HSI-F', 'DJI-F', 'S&P-F']

# Rates, credit, moving-average, term-spread and commodity columns dropped for both datasets
_shared_macro_technical = [
    'CTB1Y', 'CTB6M', 'DTB3', 'DTB4WK', 'DTB6',
    'DBAA',
    'EMA_20', 'EMA_50',
    'TE2', 'TE3',
    'oil',
]

# NYSE: the shared columns only
redundant_nyse = [*_shared_futures, *_shared_macro_technical]

# IXIC: the shared columns plus the NASDAQ future
redundant_ixic = [*_shared_futures, 'NASDAQ-F', *_shared_macro_technical]

In [4]:
# Clean the data and save as csv
# NYSE: read the raw combined CSV, clean it with the NYSE drop-list, and write the
# result to the Processed folder (index=False keeps the row index out of the file).
df_nyse_raw = pd.read_csv("/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/combined_dataframe_NYSE.csv")
df_nyse_cleaned = clean_and_select_features(df_nyse_raw, redundant_nyse)
df_nyse_cleaned.to_csv("/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_NYSE.csv", index=False)

# IXIC: same steps, using the IXIC drop-list (which additionally removes 'NASDAQ-F').
# NOTE(review): absolute, machine-specific paths — adjust when running elsewhere.
df_ixic_raw = pd.read_csv("/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/combined_dataframe_IXIC.csv")
df_ixic_cleaned = clean_and_select_features(df_ixic_raw, redundant_ixic)
df_ixic_cleaned.to_csv("/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_IXIC.csv", index=False)


Shifting 21 columns by +1 day
Number of columns dropped: 22

Shifting 21 columns by +1 day
Number of columns dropped: 23


In [5]:
# Function to clean the bitcoin dataset and select features
def clean_and_select_features_bitcoin(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned copy of a raw hourly Bitcoin dataframe.

    Steps performed:
    1. Parse the 'timestamp' column and sort the rows chronologically;
    2. Drop the Close/High/Low/Open/Volume columns of the cash-market indices;
    3. Drop the open/high/low columns of every cryptocurrency pair
       (close and volume are kept);
    4. Drop the redundant Google-trends columns.
    Columns that are not present are ignored. No column is shifted here.

    The input dataframe is left untouched; all work happens on a copy.

    Args:
        df: The raw input dataframe. Must contain a 'timestamp' column.
    Returns:
        A new dataframe containing only the cleaned and selected features.
    """
    # assign() builds a new frame, so the parsed timestamps are never written
    # back into the caller's dataframe.
    df = (
        df.assign(timestamp=pd.to_datetime(df['timestamp']))
          .sort_values('timestamp')
          .reset_index(drop=True)
    )

    # Cash-market indices: every OHLCV column is removed
    index_names = [
        'ibovespa', 'ipc_mexico', 'dax', 'nasdaq', 'russell_2000', 'vix',
        'cac_40', 'euro_stoxx_50', 'dow_jones', 'ftse_100', 'sptsx', 'sp500',
    ]
    index_fields = ['Close', 'High', 'Low', 'Open', 'Volume']
    cash_market_indices = [
        f"{field}_{index_name}"
        for index_name in index_names
        for field in index_fields
    ]

    # Cryptocurrencies: remove open/high/low, keep close and volume
    crypto_symbols = ['BNB', 'BTC', 'DOGE', 'ETH', 'SOL', 'XRP']
    crypto_fields = ['open', 'high', 'low']
    redundant_crypto = [
        f"{symbol}_USDT_1h_{field}"
        for symbol in crypto_symbols
        for field in crypto_fields
    ]

    # Redundant trend data
    redundant_trend = ['google_trends_buy_crypto', 'google_trends_bitcoin']

    # Combine all lists of columns to drop
    cols_to_drop = cash_market_indices + redundant_crypto + redundant_trend
    cleaned_df = df.drop(columns=cols_to_drop, errors='ignore')

    print(f"Number of columns dropped: {len(set(cols_to_drop) & set(df.columns))}")

    return cleaned_df

In [6]:
# Bitcoin: read the raw hourly CSV, clean it, and write the result to the
# Processed folder (index=False keeps the row index out of the file).
# NOTE(review): absolute, machine-specific paths — adjust when running elsewhere.
df_btc_raw = pd.read_csv("/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/Bitcoin_hourly_dataset.csv")
df_btc_cleaned = clean_and_select_features_bitcoin(df_btc_raw)
df_btc_cleaned.to_csv("/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_Bitcoin.csv", index=False)

Number of columns dropped: 80


In [7]:
# Function to convert csv to arrow
def csv_to_arrow(csv_filepath: str):
    """
    Convert a CSV file to Apache Arrow (feather) format, next to the original.

    The output path is the input path with its final extension replaced by
    '.arrow' (e.g. 'dir/data.csv' -> 'dir/data.arrow'). Only the last extension
    is touched, so '.csv' appearing elsewhere in the path is left alone, and a
    path without an extension simply gets '.arrow' appended, so the input file
    is never overwritten.

    If the CSV has a 'timestamp' column (preferred) or a 'Date' column, that
    column is parsed as datetime; otherwise the file is converted as-is.

    Conversion is best-effort: any error raised while reading or writing is
    reported on stdout and swallowed, so a batch of conversions can continue.

    Args:
        csv_filepath (str): Path to the CSV file to convert (path-like objects
            are accepted as well).
    Returns:
        None
    """
    csv_filepath = os.fspath(csv_filepath)

    # Swap only the trailing extension; str.replace would also rewrite '.csv'
    # inside directory names and would return the input path unchanged (and thus
    # overwrite the source) when there is no '.csv' at all.
    stem, extension = os.path.splitext(csv_filepath)
    if extension.lower() == '.arrow':
        # Input already ends in '.arrow': append instead of mapping onto itself
        stem = csv_filepath
    arrow_filepath = stem + '.arrow'

    try:
        print(f"Converting '{csv_filepath}' to '{arrow_filepath}'")

        # Read just the header to find the date column name
        header_df = pd.read_csv(csv_filepath, nrows=0)
        date_col = next((c for c in ('timestamp', 'Date') if c in header_df.columns), None)

        # Parse the date column as datetime when there is one
        read_kwargs = {'parse_dates': [date_col]} if date_col is not None else {}
        df = pd.read_csv(csv_filepath, **read_kwargs)

        # Write the DataFrame to a Feather file (which uses Arrow format)
        feather.write_feather(df, arrow_filepath)
        print("Successfully converted file")
    except Exception as e:
        print(f"An error occurred while converting '{csv_filepath}': {e}")

In [8]:
if __name__ == "__main__":
    # Directory where the cleaned CSV files are stored; the Arrow files are
    # written next to them.
    # NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
    data_dir = '/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/'

    # List of CSV files to convert
    csv_files = [
        'cleaned_IXIC.csv',
        'cleaned_NYSE.csv',
        'cleaned_Bitcoin.csv'
    ]
    # Conversion: csv_to_arrow reports failures on stdout instead of raising,
    # so one bad file does not stop the remaining conversions.
    for csv_file in csv_files:
        csv_filepath = os.path.join(data_dir, csv_file)
        csv_to_arrow(csv_filepath)

Converting '/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_IXIC.csv' to '/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_IXIC.arrow'
Successfully converted file
Converting '/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_NYSE.csv' to '/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_NYSE.arrow'
Successfully converted file
Converting '/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_Bitcoin.csv' to '/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_Bitcoin.arrow'
Successfully converted file


In [9]:
# Function to pre-process data
def preprocess_data(arrow_filepath: str, train_end: Optional[str] = '2019-12-31', val_end: Optional[str] = '2021-12-31'):
    """
    Load a feather/Arrow file and turn it into scaled train/validation/test frames.

    Pipeline:
    1. Load the file, parse the date column ('timestamp' if present, otherwise
       'Date') and sort chronologically;
    2. Engineer the target from the price column ('Price' if present, otherwise
       'BTC_USDT_1h_close'): `ret_t` is the current log-return and `y_next` the
       next-step log-return. Rows without a `y_next` (e.g. the last row) are dropped;
    3. Forward-fill missing values (no backward fill, to avoid look-ahead bias),
       drop numeric columns that are entirely NaN, and drop the leading rows
       that still contain NaNs;
    4. Split chronologically: train is `date <= train_end`, validation is
       `train_end < date <= val_end`, test is `date > val_end`;
    5. Standardise the numeric feature columns with a StandardScaler fitted on
       the training split only. `y_next` and `_log_price` are left unscaled.

    The returned frames keep the helper columns `_log_price`, `ret_t` and `y_next`.

    Note: a date-only cut-off such as '2024-09-01' is interpreted as midnight
    of that day, so for intraday data only rows up to 00:00 fall on the
    "<= cut-off" side.

    Args:
        arrow_filepath (str): The path to the Arrow file to be processed.
        train_end (str | None): Last date of the training set. None means
            "use the default" ('2019-12-31').
        val_end (str | None): Last date of the validation set. None means
            "use the default" ('2021-12-31').
    Returns:
        tuple: (train_df, val_df, test_df, scaler)
    Raises:
        ValueError: if `train_end` is later than `val_end`, if no numeric
            feature column or no complete row remains, or if the training
            split is empty.
    """
    print(f"Beginning pre-processing of {arrow_filepath}:")

    # Resolve the cut-off dates. An explicit None falls back to the default;
    # pd.Timestamp(None) would be NaT and silently empty every split.
    train_end_date = pd.Timestamp('2019-12-31' if train_end is None else train_end)
    val_end_date = pd.Timestamp('2021-12-31' if val_end is None else val_end)
    if train_end_date > val_end_date:
        # Otherwise the rows in (val_end, train_end] would land in both the
        # training and the test split.
        raise ValueError(
            f"train_end ({train_end_date.date()}) must not be later than val_end ({val_end_date.date()})."
        )

    # Step 1: Load and sort the data
    df = feather.read_feather(arrow_filepath)
    date_col = "timestamp" if "timestamp" in df.columns else "Date"
    df[date_col] = pd.to_datetime(df[date_col])
    df = df.sort_values(date_col).reset_index(drop=True)
    print(f"Loaded data from {arrow_filepath} with {df.shape[0]} rows and {df.shape[1]} columns.")

    # Step 2: Engineer target column: log-returns
    price_col = "Price" if "Price" in df.columns else "BTC_USDT_1h_close"
    df["_log_price"] = np.log(df[price_col])
    df["ret_t"] = df["_log_price"].diff()
    df["y_next"] = df["_log_price"].shift(-1) - df["_log_price"]
    df = df.dropna(subset=["y_next"]).reset_index(drop=True)

    # Step 3: Causal imputation (forward-fill only) and drop leading incomplete rows
    # Forward-fill all columns except date, target and log-price columns
    protect_cols = {date_col, "y_next", "_log_price"}
    ffill_cols = [c for c in df.columns if c not in protect_cols]
    df.loc[:, ffill_cols] = df[ffill_cols].ffill()

    # Build numeric feature set excluding target and log_price
    # (Index.difference returns the names sorted; that order is kept throughout)
    numeric_feature_cols = list(
        df.select_dtypes(include=np.number).columns.difference(["y_next", "_log_price"])
    )
    # Drop columns that are entirely NaN (prevents false "no complete rows")
    all_nan_cols = [c for c in numeric_feature_cols if df[c].isna().all()]
    if all_nan_cols:
        print(f"Dropping {len(all_nan_cols)} all-NaN columns (e.g., {all_nan_cols[:5]})")
        df = df.drop(columns=all_nan_cols)
        numeric_feature_cols = [c for c in numeric_feature_cols if c not in all_nan_cols]

    if len(numeric_feature_cols) == 0:
        raise ValueError("No numeric feature columns remain after exclusions.")

    complete_mask = df[numeric_feature_cols].notna().all(axis=1)
    if not complete_mask.any():
        na_counts = df[numeric_feature_cols].isna().sum().sort_values(ascending=False).head(10)
        raise ValueError(f"After forward-fill, no rows have complete numeric features. Top NA columns:\n{na_counts}")
    # The index was reset above, so the label of the first complete row is
    # also the number of rows in front of it.
    first_complete_idx = complete_mask.idxmax()
    if first_complete_idx > 0:
        print(f"Dropping {first_complete_idx} leading rows with unresolved NaNs.")
    df = df.loc[first_complete_idx:].reset_index(drop=True)

    print("Missing values handled.")

    # Step 4: Chronological split using the provided end dates
    # Random splitting ignores the temporal order of the data, which is crucial for time series.
    # Here we split the data chronologically into training, validation, and test sets.
    train_df = df[df[date_col] <= train_end_date].copy()
    val_df = df[(df[date_col] > train_end_date) & (df[date_col] <= val_end_date)].copy()
    test_df = df[df[date_col] > val_end_date].copy()
    print("Split data chronologically:")
    print(f"Training set: {train_df.shape[0]} rows (<= {train_end_date.date()})")
    print(f"Validation set: {val_df.shape[0]} rows (until {val_end_date.date()})")
    print(f"Test set: {test_df.shape[0]} rows (after {val_end_date.date()})")

    if train_df.empty:
        raise ValueError(
            f"Training split is empty: no rows dated on or before {train_end_date.date()}."
        )

    # Step 5: Normalisation
    # Target and log_price are excluded from scaling; the scaler only ever sees
    # training rows, so no validation/test statistics leak into it.
    scaler = StandardScaler()
    scaler.fit(train_df[numeric_feature_cols])
    print("Scaler fitted on training data.")
    # Transform the training, validation, and test sets
    for split_name, split_df in (("Training", train_df), ("Validation", val_df), ("Test", test_df)):
        if split_df.empty:
            # scaler.transform rejects zero-row input; an empty split has nothing to scale
            print(f"Warning: {split_name} set is empty; nothing to scale.")
            continue
        split_df[numeric_feature_cols] = scaler.transform(split_df[numeric_feature_cols])

    return train_df, val_df, test_df, scaler

In [10]:
# Data pre-processing
if __name__ == "__main__":
    # Data directory: the cleaned Arrow files are read from here and the
    # per-split Arrow files are written back next to them.
    # NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
    data_dir = "/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/"

    # Per-file cut-off dates
    split_params = {
        "cleaned_IXIC.arrow":   dict(train_end="2019-08-12", val_end="2021-07-14"),
        "cleaned_NYSE.arrow":   dict(train_end="2019-12-31", val_end="2021-12-31"),
        "cleaned_Bitcoin.arrow": dict(train_end="2024-09-01", val_end="2024-12-21"),
        # Files missing from this mapping fall back to preprocess_data's default cut-off dates.
    }
    # Process each file
    for arrow_file in ["cleaned_IXIC.arrow",
                       "cleaned_NYSE.arrow",
                       "cleaned_Bitcoin.arrow"]:
        arrow_path = os.path.join(data_dir, arrow_file)
        try:
            # Look up per-file dates (fallback to {} -> defaults)
            params = split_params.get(arrow_file, {})
            train_df, val_df, test_df, scaler = preprocess_data(
                arrow_path,
                **params  # expands to train_end=..., val_end=...
            )

            # Save processed splits as <stem>_train/_val/_test.arrow
            stem = arrow_file.replace(".arrow", "")
            train_df.to_feather(os.path.join(data_dir, f"{stem}_train.arrow"))
            val_df.to_feather(os.path.join(data_dir, f"{stem}_val.arrow"))
            test_df.to_feather(os.path.join(data_dir, f"{stem}_test.arrow"))
            print(f"Processed data saved for {arrow_file}.\n")

        # Errors are reported and the loop moves on to the next file, so one
        # failing dataset does not prevent the others from being processed.
        except FileNotFoundError as e:
            print(f"File not found: {e}\n")
        except Exception as e:
            print(f"An error occurred while processing {arrow_file}: {e}\n")
        

Beginning pre-processing of /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_IXIC.arrow:
Loaded data from /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_IXIC.arrow with 3470 rows and 61 columns.
Dropping 1 leading rows with unresolved NaNs.
Missing values handled.
Split data chronologically:
Training set: 2417 rows (<= 2019-08-12)
Validation set: 484 rows (until 2021-07-14)
Test set: 567 rows (after 2021-07-14)
Scaler fitted on training data.
Processed data saved for cleaned_IXIC.arrow.

Beginning pre-processing of /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_NYSE.arrow:
Loaded data from /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_NYSE.arrow with 3470 rows and 62 columns.
Dropping 1 leading rows with unresolved NaNs.
Missing values handled.
Split data chronologically:
Training set: 2515 rows (<= 2019-12-31)
Validation set: 505 rows (until 2021-12-31)
Test set: 448 