In [None]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd
from fastdtw import fastdtw
from scipy.spatial.distance import cdist
from tqdm import tqdm  # For progress tracking
from tslearn.metrics import dtw

# Ticker symbols to process; each symbol is interpolated into the per-year
# input and output CSV file names by the processing loop.
tickers = [
    'XLF', 'XLU', 'QQQ',
    'SPY', 'XLP', 'EWZ',
    'EWH', 'XLY', 'XLE',
]

In [None]:
k = 100  # Number of most similar training windows

# Create the output folder before any work starts, so the to_csv call at the end
# of each iteration cannot fail on a missing directory after the pairwise
# distance loops have already run.
Path('./data/mdtw').mkdir(parents=True, exist_ok=True)

for ticker in tickers:
    for year in range(2019, 2024+1):
        # Read in data
        df = pd.read_csv(f'./data/test_years/{ticker}_{year}.csv')

        # Every column except Date/Label holds a stringified list; parse each
        # cell back into a Python list.
        for col in df.columns:
            if col not in ['Date', 'Label']:
                df[col] = df[col].apply(ast.literal_eval)

        # Parse the calendar year (first four characters of Date) once and reuse
        # it for both masks. Training rows are those dated before `year` (the
        # original note describes them as years t-5..t-1); test rows are those
        # dated in `year` itself.
        row_years = df['Date'].str[:4].astype(int)
        train_df = df[row_years < year]
        test_df = df[row_years == year]

        # Drop non-numeric columns
        train_features = train_df.drop(columns=['Label', 'Date']).values
        test_features = test_df.drop(columns=['Label', 'Date']).values

        # Stack each row's per-column lists into one 2-D window
        # (15x15 according to the original comment -- TODO confirm).
        train_features = np.array([np.stack(row) for row in train_features])
        test_features = np.array([np.stack(row) for row in test_features])

        n_train = train_features.shape[0]
        n_test = test_features.shape[0]

        # Flatten every window exactly once. The distance loops below reuse these
        # 1-D views instead of re-flattening the same window for every pair.
        # NOTE(review): flattening turns each window into a single univariate
        # series, so fastdtw (default dist) compares scalars point by point. If a
        # true multivariate DTW over the window's rows was intended, the 2-D
        # windows would have to be passed with a vector distance -- confirm.
        train_flat_all = [window.ravel() for window in train_features]
        test_flat_all = [window.ravel() for window in test_features]

        # Preallocate distance matrices. The train matrix starts at +inf and only
        # its strict lower triangle (j < i) is ever filled in.
        train_distance_matrix = np.full((n_train, n_train), np.inf)
        test_distance_matrix = np.zeros((n_test, n_train))

        # Compute DTW distances (Train-to-Train): each training window is compared
        # only against training windows that precede it in row order.
        for i in tqdm(range(n_train), desc=f"Processing {ticker}_{year} Train-to-Train"):
            for j in range(i):
                train_distance_matrix[i, j] = fastdtw(train_flat_all[i], train_flat_all[j])[0]

        # Compute DTW distances (Test-to-Train): every test window against every
        # training window.
        for i in tqdm(range(n_test), desc=f"Processing {ticker}_{year} Test-to-Train"):
            for j in range(n_train):
                test_distance_matrix[i, j] = fastdtw(test_flat_all[i], train_flat_all[j])[0]

        # For each training window, the positions (within the training rows) of up
        # to k nearest earlier windows. Row 0 has no predecessors; argsort of its
        # empty slice already yields [], so no special case is needed.
        most_similar_train = [np.argsort(train_distance_matrix[i, :i])[:k].tolist()
                              for i in range(n_train)]

        # For each test window, the positions of its k nearest training windows.
        most_similar_test = np.argsort(test_distance_matrix, axis=1)[:, :k].tolist()

        # Store results on copies so the filtered frames themselves stay untouched.
        # Assigning a plain list fills the column positionally.
        train_df_copy = train_df.copy()
        test_df_copy = test_df.copy()
        train_df_copy['Most_Similar_Train_Windows'] = most_similar_train
        test_df_copy['Most_Similar_Train_Windows'] = most_similar_test

        # Combine (training rows first, then test rows) and save results
        df_combined = pd.concat([train_df_copy, test_df_copy], axis=0)
        df_combined.to_csv(f'./data/mdtw/{ticker}_{year}.csv', index=False)

print("All processing completed successfully!")

All processing completed successfully!
