In [1]:
# Import necessary libraries
import sys
import pandas as pd
import numpy as np
# Put the project's src/ directory on sys.path (only once, so re-running this
# cell does not add duplicates) before importing from feature_selection below.
# NOTE(review): this is a hard-coded absolute path to one machine's checkout
# (presumably where feature_selection.py lives); the notebook will fail on the
# import below anywhere else. Consider deriving it from a configurable root.
repo_root = "/home/ubuntu/michael/MSc-Machine-Learning-Project/src/"
if repo_root not in sys.path:
    sys.path.append(repo_root)
# Project-local helper; must stay after the sys.path update above.
from feature_selection import run_vif

In [2]:
def mom_correlation(file_path:str, horizons=(1, 2, 3, 5, 10), min_obs: int = 20):
    """
    Load a dataset and test its momentum columns for look-ahead bias.

    The frame is sorted by 'Date'. For every momentum column that is present
    ('mom', 'mom1', 'mom2', 'mom3') and every horizon k, the Pearson
    correlation between the column and the k-row forward return of 'Price'
    (Price[t+k] / Price[t] - 1) is printed and stored. A correlation close to
    +/-1 with a *future* return indicates the column carries information that
    would not have been available at time t.

    Args:
        file_path: the path to the feather file. The file must contain a
            'Date' column, and a 'Price' column if any momentum column exists.
        horizons: iterable of forward-return horizons, expressed as a number
            of rows ahead after sorting by date. Defaults to (1, 2, 3, 5, 10).
        min_obs: minimum number of rows where both the momentum value and the
            forward return are non-null; horizons with fewer rows are skipped
            with a message. Defaults to 20.

    Returns:
        dict mapping each tested column to a dict of
        {'corr_with_{k}d_future_return': correlation}. Skipped horizons have
        no entry. Returns None when the file is missing or any other error
        occurs while processing (the error is printed, not raised).
    """
    print("Loading dataset: ", file_path)
    try:
        df = pd.read_feather(file_path)
        df['Date'] = pd.to_datetime(df['Date']) # Ensure 'Date' is in datetime format
        df = df.sort_values('Date').reset_index(drop=True)  # Sort by Date chronologically
        mom_cols = [c for c in ['mom','mom1','mom2', 'mom3'] if c in df.columns] # Momentum columns present in this file
        report = {}
        for col in mom_cols:
            report[col] = {}
            print(f"\n Testing '{col}':")
            s = pd.to_numeric(df[col], errors='coerce') # Convert to numeric, coerce invalid values to NaN

            for k in horizons: # Test against k-day future returns
                # shift(-k) pulls the price k rows later onto the current row
                future_returns = df['Price'].shift(-k)/df['Price']-1.0
                valid_mask = s.notna() & future_returns.notna() # Both momentum and future return must be valid
                if valid_mask.sum() < min_obs: # Minimum valid datapoints threshold
                    # f-string so the column name and horizon are actually interpolated
                    print(f"Not enough valid data to test {col} against {k}-day future returns")
                    continue
                mom_series = s[valid_mask] # Only valid momentum values
                future_series = future_returns[valid_mask] # Only valid future returns
                # Off-diagonal element of the 2x2 matrix is the Pearson correlation of the two series
                correlation = np.corrcoef(mom_series, future_series)[0,1]
                report[col][f'corr_with_{k}d_future_return'] = correlation # Store in report
                print(f"  Correlation with {k}-day future returns: {correlation:.4f}")
        return report
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        # Best-effort diagnostic: report the problem and fall through to return None
        print(f"An error occurred while processing {file_path}: {e}")
    return None

In [3]:
# Look-ahead check on both raw index datasets: NYSE first, then IXIC.
# Each result is the per-column correlation report returned by mom_correlation.
Nyse_test, Ixic_test = [
    mom_correlation(dataset_path)
    for dataset_path in (
        '/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/combined_dataframe_NYSE.feather',
        '/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/combined_dataframe_IXIC.feather',
    )
]

Loading dataset:  /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/combined_dataframe_NYSE.feather

 Testing 'mom':
  Correlation with 1-day future returns: -0.9991
  Correlation with 2-day future returns: -0.6735
  Correlation with 3-day future returns: -0.5941
  Correlation with 5-day future returns: -0.4331
  Correlation with 10-day future returns: -0.3227

 Testing 'mom1':
  Correlation with 1-day future returns: 0.0891
  Correlation with 2-day future returns: -0.6736
  Correlation with 3-day future returns: -0.4901
  Correlation with 5-day future returns: -0.4094
  Correlation with 10-day future returns: -0.2682

 Testing 'mom2':
  Correlation with 1-day future returns: -0.0893
  Correlation with 2-day future returns: -0.0042
  Correlation with 3-day future returns: -0.5941
  Correlation with 5-day future returns: -0.4665
  Correlation with 10-day future returns: -0.3243

 Testing 'mom3':
  Correlation with 1-day future returns: 0.0324
  Correlation with 2-day future

In [4]:
# Run VIF on NYSE and IXIC datasets
# NOTE(review): the four settings below are assigned here but are not passed to
# either run_vif call in this cell. Whether run_vif picks them up some other
# way is not visible from this notebook. The recorded NYSE output still lists
# 'weekday' among the retained features even though it appears in
# Feature_always_exclude, so confirm these settings are actually wired in.
Include_price = False
Save_artifacts = True
Feature_always_exclude = ["Date", "Price", "Name", "weekday"]
Drop_momentum = True

# Run VIF analysis on NYSE dataset
# train_end / val_end are presumably the last dates of the training and
# validation periods, with later rows forming the test set — TODO confirm in
# run_vif. vif_threshold is the cut-off above which features are dropped
# (the recorded output reports "high VIF (> 10.0)").
Nyse_vif = run_vif(
    feather_path='/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/combined_dataframe_NYSE.feather',
    train_end="2019-12-31",
    val_end="2020-12-31",
    vif_threshold=10.0,
    date_col="Date"
)
# Run VIF analysis on IXIC dataset
# Same split dates and threshold as the NYSE run; only the input file differs.
Ixic_vif = run_vif(
    feather_path='/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/combined_dataframe_IXIC.feather',
    train_end="2019-12-31",
    val_end="2020-12-31",
    vif_threshold=10.0,
    date_col="Date"
)


 VIF analysis for dataset: /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/combined_dataframe_NYSE.feather
Excluded features before VIF: ['Price', 'mom', 'mom1', 'mom2', 'mom3']
Rows in training set after imputation: 2515
Rows in validation set after imputation: 253
Rows in test set after imputation: 701
Features retained after VIF pruning: ['Vol.', 'weekday', 'ROC_5', 'ROC_10', 'ROC_15', 'ROC_20', 'EMA_10', 'WTI-oil', 'FTSE-F', 'HSI-F', 'Gold-F', 'NZD', 'Brent', 'DBAA', 'XAU', 'AUD', 'AMZN', 'RUSSELL-F', 'CNY', 'MSFT', 'silver-F', 'CAD', 'DAX-F', 'DJI-F', 'XAG', 'XOM', 'EUR', 'WFC', 'Dollar Index-F', 'GE', 'copper-F', 'RUT', 'JPM', 'GAS-F', 'JPY', 'wheat-F', 'GBP', 'SSEC', 'Nikkei-F', 'CHF', 'KOSPI-F', 'AAPL', 'CAC-F', 'NASDAQ-F', 'JNJ', 'TE1', 'DE1', 'DE2']
Dropped features due to high VIF (> 10.0): ['TE6', 'DTB6', 'DE4', 'TE5', 'DTB4WK', 'DAAA', 'DGS10', 'DE5', 'DTB3', 'DE6', 'EMA_20', 'CTB6M', 'CTB3M', 'EMA_50', 'CTB1Y', 'TE2', 'GSPC', 'DGS5', 'S&P-F', 'FCHI', 'EMA_