In [1]:
# Import necessary libraries
import sys
import os
import pandas as pd
from pathlib import Path
# Put the project's src/ folder at the front of the module search path so the
# local preprocessing_pipeline module can be imported; skipped when the entry
# is already present, so re-running the cell does not add duplicates.
repo_root = Path("/home/ubuntu/michael/MSc-Machine-Learning-Project")
src_path = repo_root.joinpath("src")
if sys.path.count(str(src_path)) == 0:
    sys.path.insert(0, str(src_path))
from preprocessing_pipeline import clean_and_select_features_equities, preprocess_data_equities, preprocess_data_crypto

In [2]:
# Column names treated as redundant for each equity index. Each list is handed
# to clean_and_select_features_equities together with the matching raw frame.

# Redundant columns for the NYSE dataset
redundant_nyse = [
    "TE6", "DTB6", "DE4", "TE5", "DTB4WK",
    "DAAA", "DGS10", "DE5", "DTB3", "DE6",
    "EMA_20", "CTB6M", "CTB3M", "EMA_50", "CTB1Y",
    "TE2", "GSPC", "DGS5", "S&P-F", "FCHI",
    "EMA_200", "GDAXI", "oil", "TE3", "IXIC",
    "HSI", "FTSE", "Dollar Index", "DJI",
]

# Redundant columns for the IXIC dataset
redundant_ixic = [
    "DAAA", "DTB6", "DTB4WK", "DGS10", "TE3",
    "DE4", "TE2", "DE5", "DTB3", "DE6",
    "EMA_20", "CTB6M", "CTB3M", "EMA_50", "CTB1Y",
    "EMA_200", "DGS5", "S&P-F", "FCHI", "GSPC",
    "GDAXI", "oil", "NYSE", "HSI", "FTSE",
    "Dollar Index", "TE6", "DJI",
]

In [3]:
# Clean the raw equity datasets and save the results as feather files.
datasets_dir = Path("/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets")
raw_dir = datasets_dir / "Raw"
processed_dir = datasets_dir / "Processed"

# Make sure the output folder exists before writing (a fresh checkout may not
# have it yet); the crypto pre-processing cell creates it the same way.
processed_dir.mkdir(parents=True, exist_ok=True)

# NYSE: load raw frame, drop/select features, persist the cleaned frame
df_nyse_raw = pd.read_feather(raw_dir / "combined_dataframe_NYSE.feather")
df_nyse_cleaned = clean_and_select_features_equities(df_nyse_raw, redundant_nyse)
df_nyse_cleaned.to_feather(processed_dir / "cleaned_NYSE.feather")

# IXIC: same steps with its own redundant-feature list
df_ixic_raw = pd.read_feather(raw_dir / "combined_dataframe_IXIC.feather")
df_ixic_cleaned = clean_and_select_features_equities(df_ixic_raw, redundant_ixic)
df_ixic_cleaned.to_feather(processed_dir / "cleaned_IXIC.feather")


Shifting 21 columns by +1 day
Number of columns dropped: 34

Shifting 21 columns by +1 day
Number of columns dropped: 33


In [4]:
# Pre-process the cleaned equity datasets and write train/val/test splits
if __name__ == "__main__":
    # Folder that holds all datasets
    data_dir = "/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/"

    # Split cut-off dates, keyed by file path relative to data_dir
    split_params = {
        "Processed/cleaned_IXIC.feather": {"train_end": "2019-08-12", "val_end": "2021-07-14"},
        "Processed/cleaned_NYSE.feather": {"train_end": "2019-08-12", "val_end": "2021-07-14"},
    }

    # Files are handled one at a time; an error on one file is printed and the
    # loop carries on with the next one.
    for feather_file in ("Processed/cleaned_IXIC.feather", "Processed/cleaned_NYSE.feather"):
        feather_path = os.path.join(data_dir, feather_file)
        try:
            # A file without an entry gets no keyword overrides (→ defaults)
            params = split_params.get(feather_file, {})
            # **params expands to train_end=..., val_end=...
            train_df, val_df, test_df, scaler = preprocess_data_equities(feather_path, **params)

            # Each split is written under data_dir as "<stem>_<split>.feather"
            stem = feather_file.replace(".feather", "")
            for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
                split_df.to_feather(os.path.join(data_dir, f"{stem}_{split_name}.feather"))
            print(f"Processed data saved for {feather_file}.\n")

        except FileNotFoundError as err:
            print(f"File not found: {err}\n")
        except Exception as err:
            print(f"An error occurred while processing {feather_file}: {err}\n")
        

Beginning pre-processing of /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_IXIC.feather:
Loaded data from /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_IXIC.feather with 3470 rows and 51 columns.
Dropping 1 leading rows with unresolved NaNs.
Missing values handled.
Split data chronologically:
Training set: 2417 rows (<= 2019-08-12)
Validation set: 484 rows (until 2021-07-14)
Test set: 567 rows (after 2021-07-14)
Scaler fitted on training data.
Processed data saved for Processed/cleaned_IXIC.feather.

Beginning pre-processing of /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_NYSE.feather:
Loaded data from /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed/cleaned_NYSE.feather with 3470 rows and 50 columns.
Dropping 1 leading rows with unresolved NaNs.
Missing values handled.
Split data chronologically:
Training set: 2417 rows (<= 2019-08-12)
Validation set: 484 rows (until 2021-07-14)

In [11]:
# Pre-process the raw crypto data and write train/val/test splits
if __name__ == "__main__":
    # Folder that holds the raw input files
    data_dir = "/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/"

    # Split cut-off timestamps, keyed by file name
    split_params = {
        "BTC_USDT-5m.feather": {
            "train_end": "2021-08-13 23:59:59",
            "val_end": "2021-11-07 23:59:59",
        },
    }

    # An error on a file is printed rather than raised
    for feather_file in ("BTC_USDT-5m.feather",):
        feather_path = os.path.join(data_dir, feather_file)
        try:
            # A file without an entry gets no keyword overrides (→ defaults)
            params = split_params.get(feather_file, {})

            # Crypto-specific pre-processing; **params expands to
            # train_end=..., val_end=...
            train_df, val_df, test_df, scaler = preprocess_data_crypto(feather_path, **params)

            # Splits go to the Processed folder, which is created on demand
            save_dir = "/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Processed"
            os.makedirs(save_dir, exist_ok=True)
            stem = feather_file.replace(".feather", "")
            for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
                split_df.to_feather(os.path.join(save_dir, f"{stem}_{split_name}.feather"))
            print(f"Processed data saved for {feather_file}.\n")

        except FileNotFoundError as err:
            print(f"File not found: {err}\n")
        except Exception as err:
            print(f"An error occurred while processing {feather_file}: {err}\n")

Beginning pre-processing of /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/BTC_USDT-5m.feather:
Converting timezone-aware datetime (UTC) to UTC timezone-naive
Loaded data from /home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/BTC_USDT-5m.feather with 104923 rows and 6 columns.
Dropping 1 leading rows with unresolved NaNs.
Missing values handled.
Split data chronologically:
Training set: 64626 rows (<= 2021-08-13)
Validation set: 24744 rows (until 2021-11-07)
Test set: 15551 rows (after 2021-11-07)
Scaler fitted on training data.
Processed data saved for BTC_USDT-5m.feather.



In [12]:
# Check BTC data range to understand the timestamps: prints the frame shape,
# the first/last timestamp and their difference, five evenly spaced sample
# timestamps, and the most common gap between consecutive rows.
# NOTE(review): this inspects "BTC_USDT-1m.feather", whereas the crypto
# pre-processing cell works on "BTC_USDT-5m.feather" — confirm that checking
# the 1-minute file is intended.
import pandas as pd

btc_path = "/home/ubuntu/michael/MSc-Machine-Learning-Project/Datasets/Raw/BTC_USDT-1m.feather"
df_check = pd.read_feather(btc_path)

print("📊 BTC Dataset Time Range Analysis:")
print(f"Shape: {df_check.shape}")

# Find date column: prefer lowercase "date", otherwise assume "Date"
date_col = "date" if "date" in df_check.columns else "Date"
df_check[date_col] = pd.to_datetime(df_check[date_col])
dates = df_check[date_col]

# Compute the bounds once and reuse them for all three lines
first_ts, last_ts = dates.min(), dates.max()
print(f"\n📅 Date Range ({date_col}):")
print(f"Start: {first_ts}")
print(f"End:   {last_ts}")
print(f"Duration: {last_ts - first_ts}")

# Sample timestamps at the start, the three quartile positions and the end;
# an empty frame yields no samples instead of an IndexError
n_rows = len(df_check)
sample_positions = [0, n_rows // 4, n_rows // 2, 3 * n_rows // 4, n_rows - 1] if n_rows else []
print("\n🕐 Sample timestamps:")
for idx in sample_positions:
    print(f"  [{idx}] {dates.iloc[idx]}")

# Check frequency: the modal difference between consecutive timestamps
time_diff = dates.diff().dropna()
most_common_freq = time_diff.mode()
if len(most_common_freq) > 0:
    print(f"\n⏱️  Most common frequency: {most_common_freq.iloc[0]}")

📊 BTC Dataset Time Range Analysis:
Shape: (524607, 6)

📅 Date Range (date):
Start: 2021-01-01 00:00:00+00:00
End:   2021-12-31 23:59:00+00:00
Duration: 364 days 23:59:00

🕐 Sample timestamps:
  [0] 2021-01-01 00:00:00+00:00
  [131151] 2021-04-02 04:40:00+00:00
  [262303] 2021-07-02 13:46:00+00:00
  [393455] 2021-10-01 22:08:00+00:00
  [524606] 2021-12-31 23:59:00+00:00

⏱️  Most common frequency: 0 days 00:01:00
