In [None]:
import ast
import os

import numpy as np
import pandas as pd
from fastdtw import fastdtw
from scipy.spatial.distance import cdist
from tqdm import tqdm  # For progress tracking
from tslearn.metrics import dtw

# Full list of tickers this notebook is meant to cover.
ALL_TICKERS = ['XLF', 'XLU', 'QQQ', 'SPY', 'XLP', 'EWZ', 'EWH', 'XLY', 'XLE']

# The similarity search takes very long, so only SPY is processed for now.
# Set `tickers = ALL_TICKERS` to run the full list.
tickers = ['SPY']

In [None]:
k = 100  # Number of most similar training windows


def _window_distance(window_a, window_b):
    """Return the FastDTW distance between two feature windows.

    Both windows are flattened to 1-D before the comparison, so each matrix is
    compared as a single long sequence.
    NOTE(review): tslearn's `dtw` is imported by this notebook but never used;
    confirm that flattening is the intended "MDTW" metric.
    """
    distance, _ = fastdtw(window_a.flatten(), window_b.flatten())
    return distance


def _k_nearest(distance_row, k, exclude=None):
    """Return the positions of the `k` smallest distances, nearest first.

    Parameters
    ----------
    distance_row : 1-D numpy array of distances to every training window.
    k : maximum number of positions to return (fewer if not enough candidates).
    exclude : optional position to drop from the ranking (the window itself).

    Returns
    -------
    list of int
        Positions into the training set; ties keep ascending position order
        because a stable sort is used.
    """
    order = np.argsort(distance_row, kind='stable')
    if exclude is not None:
        order = order[order != exclude]
    # .tolist() yields plain Python ints so the CSV cell reads "[1, 2, ...]".
    return order[:k].tolist()


# Create the output directory up front so a missing folder fails fast instead
# of after the (very long) distance computation.
os.makedirs('./data/mdtw', exist_ok=True)

for ticker in tickers:
    for year in range(2019, 2021):
        # Read in data
        df = pd.read_csv(f'./data/test_years/{ticker}_{year}.csv')

        # Convert string representations of lists back to Python lists
        for col in df.columns:
            if col not in ['Date', 'Label']:
                df[col] = df[col].apply(ast.literal_eval)

        # Split into training (all rows dated before year t) and testing (year t) sets.
        # The year is parsed once from the first four characters of 'Date'.
        row_year = df['Date'].str[:4].astype(int)
        train_df = df[row_year < year]
        test_df = df[row_year == year]

        # Remove 'Label' and 'Date' columns
        train_rows = train_df.drop(columns=['Label', 'Date']).values
        test_rows = test_df.drop(columns=['Label', 'Date']).values

        # Stack each row's per-column lists into one matrix per sample
        # (original author's note: 15x15 matrices).
        train_features = np.array([np.stack(row) for row in train_rows])  # Shape: (train_samples, 15, 15)
        test_features = np.array([np.stack(row) for row in test_rows])    # Shape: (test_samples, 15, 15)

        n_train = train_features.shape[0]
        n_test = test_features.shape[0]

        # Train-to-train: compute each unordered pair once and mirror it, so
        # every window is ranked against ALL other training windows rather
        # than only the ones that come after it.
        # The i->j value is reused for j->i, i.e. the FastDTW distance is
        # treated as symmetric here.
        train_distances = np.zeros((n_train, n_train))
        for i in tqdm(range(n_train), desc=f"Processing {ticker}_{year} Train-to-Train"):
            for j in range(i + 1, n_train):
                pair_distance = _window_distance(train_features[i], train_features[j])
                train_distances[i, j] = pair_distance
                train_distances[j, i] = pair_distance

        # For each training window, the k nearest other training windows
        # (positions into the training set; the window itself is excluded).
        most_similar_indices_train = [
            _k_nearest(train_distances[i], k, exclude=i) for i in range(n_train)
        ]

        # Test-to-train: every test window is compared with all training windows.
        most_similar_indices_test_to_train = []
        for i in tqdm(range(n_test), desc=f"Processing {ticker}_{year} Test-to-Train"):
            test_distances = np.array(
                [_window_distance(test_features[i], train_features[j]) for j in range(n_train)],
                dtype=float,
            )
            most_similar_indices_test_to_train.append(_k_nearest(test_distances, k))

        # Attach the neighbour lists to copies so the slices of `df` are not mutated.
        train_df_copy = train_df.copy()
        test_df_copy = test_df.copy()

        train_df_copy['Most_Similar_Windows'] = most_similar_indices_train
        test_df_copy['Most_Similar_Windows'] = most_similar_indices_test_to_train

        # Combine train and test data back into a single DataFrame
        df_combined = pd.concat([train_df_copy, test_df_copy], axis=0)

        # Save the updated DataFrame with the most similar windows information
        df_combined.to_csv(f'./data/mdtw/{ticker}_{year}.csv', index=False)

Processing XLF_2019 Train-to-Train:   0%|          | 2/1243 [00:41<7:12:35, 20.92s/it]


KeyboardInterrupt: 