In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Define Attention Mechanism
class Attention(nn.Module):
    """Additive attention scoring every encoder step against one decoder state.

    A linear layer followed by ``tanh`` maps each concatenated
    ``[hidden, encoder_output]`` pair to ``hidden_size`` features, which are
    then projected onto the learned vector ``v`` to give one score per step.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(2 * hidden_size, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.init_weights()

    def init_weights(self):
        """Re-initialise the projection (Xavier, zero bias) and ``v`` (U[-0.1, 0.1])."""
        nn.init.xavier_uniform_(self.attn.weight)
        nn.init.constant_(self.attn.bias, 0)
        nn.init.uniform_(self.v, -0.1, 0.1)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights over the encoder steps.

        Args:
            hidden: Tensor of shape ``(batch, hidden_size)``.
            encoder_outputs: Tensor of shape ``(batch, seq_len, hidden_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len)`` whose rows sum to 1.
        """
        batch_size, n_steps = encoder_outputs.size(0), encoder_outputs.size(1)
        # Pair the single decoder state with every encoder time step.
        tiled_hidden = hidden.unsqueeze(1).expand(-1, n_steps, -1)
        paired = torch.cat((tiled_hidden, encoder_outputs), dim=2)
        # Transposed to (batch, hidden_size, seq_len) so bmm with v gives one score per step.
        energy = torch.tanh(self.attn(paired)).transpose(1, 2)
        v_rows = self.v.unsqueeze(0).unsqueeze(1).expand(batch_size, 1, -1)
        scores = torch.bmm(v_rows, energy).squeeze(1)
        return torch.softmax(scores, dim=1)

# Define LSTM Encoder
class LSTMEncoder(nn.Module):
    """LSTM encoder returning per-step outputs and the final recurrent state."""

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(LSTMEncoder, self).__init__()
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero rate is combined with a single layer, so disable it there.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=inter_layer_dropout)

    def forward(self, x):
        """Encode a batch-first sequence.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tuple ``(outputs, hidden, cell)`` exactly as produced by ``nn.LSTM``:
            ``outputs`` is ``(batch, seq_len, hidden_size)``; ``hidden`` and
            ``cell`` are ``(num_layers, batch, hidden_size)``.
        """
        outputs, (hidden, cell) = self.lstm(x)
        return outputs, hidden, cell

# Define LSTM Decoder with Attention
class LSTMDecoderWithAttention(nn.Module):
    """Single-step LSTM decoder driven by an attention summary plus context features."""

    def __init__(self, hidden_size, context_size, output_size, num_layers, dropout):
        super(LSTMDecoderWithAttention, self).__init__()
        self.attention = Attention(hidden_size)
        # The decoder input is [attended encoder summary, context_data], i.e.
        # hidden_size + context_size features. The previous hidden_size * 2 +
        # context_size did not match what forward() builds and made the LSTM
        # reject its input.
        # Dropout only acts between stacked layers; nn.LSTM warns otherwise.
        self.lstm = nn.LSTM(
            hidden_size + context_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, hidden, cell, context_data):
        """Produce one prediction from the encoder outputs and context features.

        Args:
            encoder_outputs: ``(batch, seq_len, hidden_size)`` encoder outputs.
            hidden: ``(num_layers, batch, hidden_size)`` initial hidden state.
            cell: ``(num_layers, batch, hidden_size)`` initial cell state.
            context_data: ``(batch, context_size)`` context features; must be
                2-D because it is concatenated with the 2-D attention summary.

        Returns:
            Tuple ``(prediction, hidden, cell, attn_weights)`` where
            ``prediction`` is ``(batch, output_size)`` and ``attn_weights`` is
            ``(batch, seq_len)``.
        """
        # Score the encoder steps against the top layer's hidden state.
        attn_weights = self.attention(hidden[-1], encoder_outputs)
        # Weighted sum of encoder outputs -> (batch, hidden_size).
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        context_combined = torch.cat((context, context_data), dim=1).unsqueeze(1)  # Add sequence dimension
        outputs, (hidden, cell) = self.lstm(context_combined, (hidden, cell))
        prediction = self.fc(outputs.squeeze(1))
        return prediction, hidden, cell, attn_weights

# Define the Hybrid Model
class HybridModel(nn.Module):
    """Encoder/decoder pair: an LSTM encoder feeding an attention-based decoder."""

    def __init__(self, input_size, hidden_size, context_size, output_size, num_layers, dropout):
        super().__init__()
        self.encoder = LSTMEncoder(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.decoder = LSTMDecoderWithAttention(
            hidden_size=hidden_size,
            context_size=context_size,
            output_size=output_size,
            num_layers=num_layers,
            dropout=dropout,
        )

    def forward(self, past_data, context_data):
        """Encode ``past_data`` and decode one prediction using ``context_data``.

        Returns:
            Tuple ``(prediction, attn_weights)``; the decoder's updated hidden
            and cell states are discarded.
        """
        enc_out, h_n, c_n = self.encoder(past_data)
        decoded = self.decoder(enc_out, h_n, c_n, context_data)
        return decoded[0], decoded[3]

def load_and_preprocess_data(file_path, features, target, context_features):
    """Read a CSV and min-max scale its feature, target and context columns.

    Args:
        file_path: Path of the CSV file to load.
        features: Column names used as model inputs.
        target: Column names to predict (may overlap with ``features``).
        context_features: Column names used as decoder context.

    Returns:
        Tuple ``(df, scaler_target, scaler_context)``: the scaled DataFrame and
        the fitted target/context scalers. The feature scaler is not returned.
    """
    df = pd.read_csv(file_path)

    # Fill gaps forward, then backward to cover any leading NaNs.
    if df.isnull().values.any():
        df = df.ffill().bfill()

    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()
    scaler_context = MinMaxScaler()

    # Fit every scaler on the RAW columns before writing anything back.
    # Target columns can also be feature columns; scaling the features in place
    # first would fit scaler_target on already-scaled values, and its
    # inverse_transform could no longer recover the original units.
    scaled_features = scaler_features.fit_transform(df[features])
    scaled_target = scaler_target.fit_transform(df[target])
    scaled_context = scaler_context.fit_transform(df[context_features])

    df[features] = scaled_features
    df[target] = scaled_target
    df[context_features] = scaled_context

    return df, scaler_target, scaler_context

def create_sequences(data, target_data, context_data, n_timesteps):
    """Build sliding input windows with the target/context row that follows each.

    For every start offset ``s`` the input window is rows ``s .. s+n_timesteps-1``
    of ``data``; the matching target and context are taken from row
    ``s + n_timesteps`` of ``target_data`` and ``context_data``.

    Returns:
        Tuple ``(X, y, context)`` of NumPy arrays, one entry per window.
    """
    window_starts = range(len(data) - n_timesteps)
    windows = [data[start:start + n_timesteps].values for start in window_starts]
    targets = [target_data.iloc[start + n_timesteps].values for start in window_starts]
    contexts = [context_data.iloc[start + n_timesteps].values for start in window_starts]
    return np.array(windows), np.array(targets), np.array(contexts)

def prepare_data(df, features, target, context_features, n_timesteps, test_size=0.2):
    """Window the DataFrame, split it into train/test and convert to tensors.

    NOTE: ``train_test_split`` shuffles by default, so the split is random
    rather than chronological.

    Returns:
        ``(X_train, X_test, y_train, y_test, context_train, context_test)`` as
        ``float32`` tensors.
    """
    windows, targets, contexts = create_sequences(
        df[features], df[target], df[context_features], n_timesteps)
    splits = train_test_split(
        windows, targets, contexts, test_size=test_size, random_state=42)

    # train_test_split yields a (train, test) pair per input array, in order.
    X_train, X_test, y_train, y_test, context_train, context_test = (
        torch.tensor(part, dtype=torch.float32) for part in splits
    )
    return X_train, X_test, y_train, y_test, context_train, context_test

def train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience,
                checkpoint_path='best_model.pth'):
    """Train ``model`` with validation-based checkpointing and early stopping.

    Args:
        model: Module called as ``model(past_data, context_data)`` and returning
            a tuple whose first element is the prediction.
        train_loader: Iterable of ``(past_data, y_batch, context_data)`` batches.
        val_loader: Same batch layout, used for validation.
        n_epochs: Maximum number of epochs.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, y_batch)``.
        scheduler: Scheduler stepped once per epoch with the validation loss.
        patience: Epochs without improvement before stopping early.
        checkpoint_path: Where the best state dict is saved.

    Returns:
        The best (lowest) mean validation loss observed.

    Raises:
        ValueError: If either loader yields no batches.
    """
    # Fail early instead of dividing by zero after a full epoch.
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each yield at least one batch")

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0.0
        for past_data, y_batch, context_data in train_loader:
            optimizer.zero_grad()
            outputs, _ = model(past_data, context_data)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for past_data, y_batch, context_data in val_loader:
                outputs, _ = model(past_data, context_data)
                val_loss += criterion(outputs, y_batch).item()

        # Mean of per-batch losses.
        val_loss /= len(val_loader)
        scheduler.step(val_loss)

        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    return best_val_loss

def evaluate_model(model, test_loader, scaler_target, checkpoint_path='best_model.pth'):
    """Load a checkpoint, predict on ``test_loader`` and report MSE / R^2.

    Args:
        model: Module whose state dict is replaced by the checkpoint.
        test_loader: Iterable of ``(past_data, y_batch, context_data)`` batches.
        scaler_target: Fitted scaler whose ``inverse_transform`` maps
            predictions and targets back to original units.
        checkpoint_path: State-dict file to load.

    Returns:
        Tuple ``(predictions, true_values, attention_weights)``; the first two
        are inverse-transformed, the last is the concatenated attention output.
    """
    model.load_state_dict(torch.load(checkpoint_path))
    model.eval()
    predictions, true_values, attention_weights = [], [], []
    with torch.no_grad():
        for past_data, y_batch, context_data in test_loader:
            outputs, attn_weights = model(past_data, context_data)
            predictions.append(outputs.cpu().numpy())
            true_values.append(y_batch.cpu().numpy())
            attention_weights.append(attn_weights.cpu().numpy())

    # Stack the per-batch arrays along the sample axis.
    predictions = np.concatenate(predictions, axis=0)
    true_values = np.concatenate(true_values, axis=0)
    attention_weights = np.concatenate(attention_weights, axis=0)

    predictions_original_scale = scaler_target.inverse_transform(predictions)
    true_values_original_scale = scaler_target.inverse_transform(true_values)

    mse = mean_squared_error(true_values_original_scale, predictions_original_scale)
    r2 = r2_score(true_values_original_scale, predictions_original_scale)

    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')

    return predictions_original_scale, true_values_original_scale, attention_weights

def plot_results(y_test, y_pred):
    """Plot ground-truth values and model predictions on one shared axis."""
    _, ax = plt.subplots(figsize=(10, 6))
    for series, label in ((y_test, 'True Values'), (y_pred, 'Predictions')):
        ax.plot(series, label=label)
    ax.legend()
    ax.set_xlabel('Samples')
    ax.set_ylabel('Values')
    ax.set_title('True Values vs Predictions')
    plt.show()

def plot_attention_weights(attention_weights, sample_idx=0):
    """Show the attention weights of one sample as a heatmap.

    Args:
        attention_weights: Indexable collection of per-sample attention weights.
        sample_idx: Index of the sample to display.
    """
    # sns.heatmap needs 2-D data. One sample's weights are a 1-D vector over
    # encoder steps, so lift it to a single-row matrix; 2-D input is unchanged.
    sample_weights = np.atleast_2d(attention_weights[sample_idx])

    plt.figure(figsize=(10, 6))
    sns.heatmap(sample_weights, cmap='viridis')
    plt.xlabel('Encoder Time Steps')
    plt.ylabel('Attention Weights')
    plt.title('Attention Weights for Sample Index {}'.format(sample_idx))
    plt.show()

def main(file_path, features, target, context_features, n_timesteps, n_epochs, batch_size, learning_rate, patience):
    """Train the hybrid model on one CSV, then evaluate it and plot the results.

    Args:
        file_path: CSV passed to ``load_and_preprocess_data``.
        features: Column names fed to the encoder.
        target: Column names to predict.
        context_features: Column names passed to the decoder as context.
        n_timesteps: Length of each encoder input window.
        n_epochs: Maximum epochs per fold.
        batch_size: Batch size for every ``DataLoader``.
        learning_rate: Initial AdamW learning rate.
        patience: Early-stopping patience forwarded to ``train_model``.
    """
    # Load and preprocess data
    df, scaler_target, scaler_context = load_and_preprocess_data(file_path, features, target, context_features)

    # Prepare data
    X_train, X_test, y_train, y_test, context_train, context_test = prepare_data(df, features, target, context_features, n_timesteps)

    # Datasets; the per-fold train/validation loaders are built in the loop below.
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train, context_train)
    test_dataset = torch.utils.data.TensorDataset(X_test, y_test, context_test)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, optimizer, criterion, and scheduler
    input_size = X_train.size(2)
    # Context holds one row per sample, so it is 2-D (samples, context
    # features); its width is dim 1. size(2) raised IndexError.
    context_size = context_train.size(1)
    output_size = y_train.size(1)
    hidden_size = 128
    num_layers = 2
    dropout = 0.3

    model = HybridModel(input_size, hidden_size, context_size, output_size, num_layers, dropout)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)  # L2 regularization
    criterion = nn.MSELoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    # Train model with early stopping.
    # NOTE(review): the same model, optimizer and scheduler are reused across
    # folds, so later folds validate on samples already trained on earlier.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        print(f'Fold {fold+1}')
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler)
        val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=val_subsampler)
        train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience)

    # Evaluate the model
    y_pred_original_scale, y_test_original_scale, attention_weights = evaluate_model(model, test_loader, scaler_target)

    # Plot results
    plot_results(y_test_original_scale, y_pred_original_scale)

    # Plot attention weights
    plot_attention_weights(attention_weights)

# Experiment configuration
file_path = 'time_series_data.csv'
features = [
    'rate_level_1',
    'rate_level_2',
    'days_to_end_of_month',
    'days_to_ECB_meeting',
    'days_to_Fed_meeting',
    'ois_sofr_rate',
]
target = ['rate_level_1', 'rate_level_2']
context_features = ['stock_price', 'fx_rate', 'commodity_price']
n_timesteps = 12
n_epochs = 50
batch_size = 64
learning_rate = 0.001
patience = 10

# Launch training, evaluation and plotting
main(
    file_path=file_path,
    features=features,
    target=target,
    context_features=context_features,
    n_timesteps=n_timesteps,
    n_epochs=n_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    patience=patience,
)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Define Attention Mechanism
class Attention(nn.Module):
    """Additive attention scoring every encoder step against one decoder state.

    A linear layer followed by ``tanh`` maps each concatenated
    ``[hidden, encoder_output]`` pair to ``hidden_size`` features, which are
    then projected onto the learned vector ``v`` to give one score per step.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(2 * hidden_size, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.init_weights()

    def init_weights(self):
        """Re-initialise the projection (Xavier, zero bias) and ``v`` (U[-0.1, 0.1])."""
        nn.init.xavier_uniform_(self.attn.weight)
        nn.init.constant_(self.attn.bias, 0)
        nn.init.uniform_(self.v, -0.1, 0.1)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights over the encoder steps.

        Args:
            hidden: Tensor of shape ``(batch, hidden_size)``.
            encoder_outputs: Tensor of shape ``(batch, seq_len, hidden_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len)`` whose rows sum to 1.
        """
        batch_size, n_steps = encoder_outputs.size(0), encoder_outputs.size(1)
        # Pair the single decoder state with every encoder time step.
        tiled_hidden = hidden.unsqueeze(1).expand(-1, n_steps, -1)
        paired = torch.cat((tiled_hidden, encoder_outputs), dim=2)
        # Transposed to (batch, hidden_size, seq_len) so bmm with v gives one score per step.
        energy = torch.tanh(self.attn(paired)).transpose(1, 2)
        v_rows = self.v.unsqueeze(0).unsqueeze(1).expand(batch_size, 1, -1)
        scores = torch.bmm(v_rows, energy).squeeze(1)
        return torch.softmax(scores, dim=1)

# Define LSTM Encoder
class LSTMEncoder(nn.Module):
    """LSTM encoder returning per-step outputs and the final recurrent state."""

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(LSTMEncoder, self).__init__()
        # nn.LSTM applies dropout only between stacked layers and warns when a
        # non-zero rate is combined with a single layer, so disable it there.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=inter_layer_dropout)

    def forward(self, x):
        """Encode a batch-first sequence.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tuple ``(outputs, hidden, cell)`` exactly as produced by ``nn.LSTM``:
            ``outputs`` is ``(batch, seq_len, hidden_size)``; ``hidden`` and
            ``cell`` are ``(num_layers, batch, hidden_size)``.
        """
        outputs, (hidden, cell) = self.lstm(x)
        return outputs, hidden, cell

# Define LSTM Decoder with Attention
class LSTMDecoderWithAttention(nn.Module):
    """Single-step LSTM decoder driven by an attention summary plus context features."""

    def __init__(self, hidden_size, context_size, output_size, num_layers, dropout):
        super(LSTMDecoderWithAttention, self).__init__()
        self.attention = Attention(hidden_size)
        # The decoder input is [attended encoder summary, context_data], i.e.
        # hidden_size + context_size features. The previous hidden_size * 2 +
        # context_size did not match what forward() builds and made the LSTM
        # reject its input.
        # Dropout only acts between stacked layers; nn.LSTM warns otherwise.
        self.lstm = nn.LSTM(
            hidden_size + context_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, hidden, cell, context_data):
        """Produce one prediction from the encoder outputs and context features.

        Args:
            encoder_outputs: ``(batch, seq_len, hidden_size)`` encoder outputs.
            hidden: ``(num_layers, batch, hidden_size)`` initial hidden state.
            cell: ``(num_layers, batch, hidden_size)`` initial cell state.
            context_data: ``(batch, context_size)`` context features; must be
                2-D because it is concatenated with the 2-D attention summary.

        Returns:
            Tuple ``(prediction, hidden, cell, attn_weights)`` where
            ``prediction`` is ``(batch, output_size)`` and ``attn_weights`` is
            ``(batch, seq_len)``.
        """
        # Score the encoder steps against the top layer's hidden state.
        attn_weights = self.attention(hidden[-1], encoder_outputs)
        # Weighted sum of encoder outputs -> (batch, hidden_size).
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        context_combined = torch.cat((context, context_data), dim=1).unsqueeze(1)  # Add sequence dimension
        outputs, (hidden, cell) = self.lstm(context_combined, (hidden, cell))
        prediction = self.fc(outputs.squeeze(1))
        return prediction, hidden, cell, attn_weights

# Define the Hybrid Model
class HybridModel(nn.Module):
    """Encoder/decoder pair: an LSTM encoder feeding an attention-based decoder."""

    def __init__(self, input_size, hidden_size, context_size, output_size, num_layers, dropout):
        super().__init__()
        self.encoder = LSTMEncoder(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.decoder = LSTMDecoderWithAttention(
            hidden_size=hidden_size,
            context_size=context_size,
            output_size=output_size,
            num_layers=num_layers,
            dropout=dropout,
        )

    def forward(self, past_data, context_data):
        """Encode ``past_data`` and decode one prediction using ``context_data``.

        Returns:
            Tuple ``(prediction, attn_weights)``; the decoder's updated hidden
            and cell states are discarded.
        """
        enc_out, h_n, c_n = self.encoder(past_data)
        decoded = self.decoder(enc_out, h_n, c_n, context_data)
        return decoded[0], decoded[3]

def load_and_preprocess_data(file_path, features, target, context_features):
    """Read a CSV and min-max scale its feature, target and context columns.

    Args:
        file_path: Path of the CSV file to load.
        features: Column names used as model inputs.
        target: Column names to predict (may overlap with ``features``).
        context_features: Column names used as decoder context.

    Returns:
        Tuple ``(df, scaler_target, scaler_context)``: the scaled DataFrame and
        the fitted target/context scalers. The feature scaler is not returned.
    """
    df = pd.read_csv(file_path)

    # Fill gaps forward, then backward to cover any leading NaNs.
    if df.isnull().values.any():
        df = df.ffill().bfill()

    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()
    scaler_context = MinMaxScaler()

    # Fit every scaler on the RAW columns before writing anything back.
    # Target columns can also be feature columns; scaling the features in place
    # first would fit scaler_target on already-scaled values, and its
    # inverse_transform could no longer recover the original units.
    scaled_features = scaler_features.fit_transform(df[features])
    scaled_target = scaler_target.fit_transform(df[target])
    scaled_context = scaler_context.fit_transform(df[context_features])

    df[features] = scaled_features
    df[target] = scaled_target
    df[context_features] = scaled_context

    return df, scaler_target, scaler_context

def create_sequences(data, target_data, context_data, n_timesteps):
    """Build sliding input windows with the target/context row that follows each.

    For every start offset ``s`` the input window is rows ``s .. s+n_timesteps-1``
    of ``data``; the matching target and context are taken from row
    ``s + n_timesteps`` of ``target_data`` and ``context_data``.

    Returns:
        Tuple ``(X, y, context)`` of NumPy arrays, one entry per window.
    """
    window_starts = range(len(data) - n_timesteps)
    windows = [data[start:start + n_timesteps].values for start in window_starts]
    targets = [target_data.iloc[start + n_timesteps].values for start in window_starts]
    contexts = [context_data.iloc[start + n_timesteps].values for start in window_starts]
    return np.array(windows), np.array(targets), np.array(contexts)

def augment_data(X, y, context, augmentation_factor=2):
    """Create ``augmentation_factor`` noisy copies of ``X`` with matching labels.

    Each copy adds independent Gaussian noise (mean 0, std 0.01) to ``X``;
    ``y`` and ``context`` are repeated unchanged. For a factor of 1 or more
    the result contains only the noisy copies, not the clean ``X``.

    Args:
        X: Input array to perturb.
        y: Targets aligned with ``X`` along the first axis.
        context: Context rows aligned with ``X`` along the first axis.
        augmentation_factor: Number of noisy copies. Values below 1 return the
            inputs unchanged.

    Returns:
        Tuple ``(X_aug, y_aug, context_aug)`` of NumPy arrays.
    """
    if augmentation_factor < 1:
        # Nothing to generate; np.concatenate on an empty list would raise.
        return np.asarray(X), np.asarray(y), np.asarray(context)

    noisy_copies = [X + np.random.normal(0, 0.01, X.shape) for _ in range(augmentation_factor)]
    return (
        np.concatenate(noisy_copies),
        np.concatenate([y] * augmentation_factor),
        np.concatenate([context] * augmentation_factor),
    )

def prepare_data(df, features, target, context_features, n_timesteps, test_size=0.2, augmentation_factor=2):
    """Window the DataFrame, split it, augment the training part, return tensors.

    Args:
        df: Preprocessed DataFrame holding all referenced columns.
        features: Column names used as encoder inputs.
        target: Column names to predict.
        context_features: Column names used as decoder context.
        n_timesteps: Length of each input window.
        test_size: Fraction of windows held out for testing.
        augmentation_factor: Forwarded to ``augment_data`` for the training set.

    Returns:
        ``(X_train, X_test, y_train, y_test, context_train, context_test)`` as
        ``float32`` tensors.
    """
    X, y, context = create_sequences(df[features], df[target], df[context_features], n_timesteps)
    X_train, X_test, y_train, y_test, context_train, context_test = train_test_split(
        X, y, context, test_size=test_size, random_state=42)

    # Augment AFTER splitting and only on the training partition. Augmenting
    # first put noisy copies of one window on both sides of the split, leaking
    # test samples into training and perturbing the test inputs.
    X_train, y_train, context_train = augment_data(X_train, y_train, context_train, augmentation_factor)

    # Convert to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)
    context_train = torch.tensor(context_train, dtype=torch.float32)
    context_test = torch.tensor(context_test, dtype=torch.float32)

    return X_train, X_test, y_train, y_test, context_train, context_test

def train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience,
                checkpoint_path='best_model.pth'):
    """Train ``model`` with validation-based checkpointing and early stopping.

    Args:
        model: Module called as ``model(past_data, context_data)`` and returning
            a tuple whose first element is the prediction.
        train_loader: Iterable of ``(past_data, y_batch, context_data)`` batches.
        val_loader: Same batch layout, used for validation.
        n_epochs: Maximum number of epochs.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(outputs, y_batch)``.
        scheduler: Scheduler stepped once per epoch with the validation loss.
        patience: Epochs without improvement before stopping early.
        checkpoint_path: Where the best state dict is saved.

    Returns:
        The best (lowest) mean validation loss observed.

    Raises:
        ValueError: If either loader yields no batches.
    """
    # Fail early instead of dividing by zero after a full epoch.
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each yield at least one batch")

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0.0
        for past_data, y_batch, context_data in train_loader:
            optimizer.zero_grad()
            outputs, _ = model(past_data, context_data)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for past_data, y_batch, context_data in val_loader:
                outputs, _ = model(past_data, context_data)
                val_loss += criterion(outputs, y_batch).item()

        # Mean of per-batch losses.
        val_loss /= len(val_loader)
        scheduler.step(val_loss)

        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    return best_val_loss

def evaluate_model(model, test_loader, scaler_target, checkpoint_path='best_model.pth'):
    """Load a checkpoint, predict on ``test_loader`` and report MSE / R^2.

    Args:
        model: Module whose state dict is replaced by the checkpoint.
        test_loader: Iterable of ``(past_data, y_batch, context_data)`` batches.
        scaler_target: Fitted scaler whose ``inverse_transform`` maps
            predictions and targets back to original units.
        checkpoint_path: State-dict file to load.

    Returns:
        Tuple ``(predictions, true_values, attention_weights)``; the first two
        are inverse-transformed, the last is the concatenated attention output.
    """
    model.load_state_dict(torch.load(checkpoint_path))
    model.eval()
    predictions, true_values, attention_weights = [], [], []
    with torch.no_grad():
        for past_data, y_batch, context_data in test_loader:
            outputs, attn_weights = model(past_data, context_data)
            predictions.append(outputs.cpu().numpy())
            true_values.append(y_batch.cpu().numpy())
            attention_weights.append(attn_weights.cpu().numpy())

    # Stack the per-batch arrays along the sample axis.
    predictions = np.concatenate(predictions, axis=0)
    true_values = np.concatenate(true_values, axis=0)
    attention_weights = np.concatenate(attention_weights, axis=0)

    predictions_original_scale = scaler_target.inverse_transform(predictions)
    true_values_original_scale = scaler_target.inverse_transform(true_values)

    mse = mean_squared_error(true_values_original_scale, predictions_original_scale)
    r2 = r2_score(true_values_original_scale, predictions_original_scale)

    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')

    return predictions_original_scale, true_values_original_scale, attention_weights

def plot_results(y_test, y_pred):
    """Plot ground-truth values and model predictions on one shared axis."""
    _, ax = plt.subplots(figsize=(10, 6))
    for series, label in ((y_test, 'True Values'), (y_pred, 'Predictions')):
        ax.plot(series, label=label)
    ax.legend()
    ax.set_xlabel('Samples')
    ax.set_ylabel('Values')
    ax.set_title('True Values vs Predictions')
    plt.show()

def plot_attention_weights(attention_weights, sample_idx=0):
    """Show the attention weights of one sample as a heatmap.

    Args:
        attention_weights: Indexable collection of per-sample attention weights.
        sample_idx: Index of the sample to display.
    """
    # sns.heatmap needs 2-D data. One sample's weights are a 1-D vector over
    # encoder steps, so lift it to a single-row matrix; 2-D input is unchanged.
    sample_weights = np.atleast_2d(attention_weights[sample_idx])

    plt.figure(figsize=(10, 6))
    sns.heatmap(sample_weights, cmap='viridis')
    plt.xlabel('Encoder Time Steps')
    plt.ylabel('Attention Weights')
    plt.title('Attention Weights for Sample Index {}'.format(sample_idx))
    plt.show()

def main(file_path, features, target, context_features, n_timesteps, n_epochs, batch_size, learning_rate, patience):
    """Train the simplified hybrid model on one CSV, then evaluate and plot.

    Args:
        file_path: CSV passed to ``load_and_preprocess_data``.
        features: Column names fed to the encoder.
        target: Column names to predict.
        context_features: Column names passed to the decoder as context.
        n_timesteps: Length of each encoder input window.
        n_epochs: Maximum epochs per fold.
        batch_size: Batch size for every ``DataLoader``.
        learning_rate: Initial AdamW learning rate.
        patience: Early-stopping patience forwarded to ``train_model``.
    """
    # Load and preprocess data
    df, scaler_target, scaler_context = load_and_preprocess_data(file_path, features, target, context_features)

    # Prepare data
    X_train, X_test, y_train, y_test, context_train, context_test = prepare_data(df, features, target, context_features, n_timesteps)

    # Datasets; the per-fold train/validation loaders are built in the loop below.
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train, context_train)
    test_dataset = torch.utils.data.TensorDataset(X_test, y_test, context_test)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, optimizer, criterion, and scheduler
    input_size = X_train.size(2)
    # Context holds one row per sample, so it is 2-D (samples, context
    # features); its width is dim 1. size(2) raised IndexError.
    context_size = context_train.size(1)
    output_size = y_train.size(1)
    hidden_size = 64  # Reduced hidden size to simplify the model
    num_layers = 1  # Reduced number of layers to simplify the model
    dropout = 0.3

    model = HybridModel(input_size, hidden_size, context_size, output_size, num_layers, dropout)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)  # L2 regularization
    criterion = nn.MSELoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    # Train model with early stopping.
    # NOTE(review): the same model, optimizer and scheduler are reused across
    # folds, so later folds validate on samples already trained on earlier.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        print(f'Fold {fold+1}')
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler)
        val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=val_subsampler)
        train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience)

    # Evaluate the model
    y_pred_original_scale, y_test_original_scale, attention_weights = evaluate_model(model, test_loader, scaler_target)

    # Plot results
    plot_results(y_test_original_scale, y_pred_original_scale)

    # Plot attention weights
    plot_attention_weights(attention_weights)

# Experiment configuration
file_path = 'time_series_data.csv'
features = [
    'rate_level_1',
    'rate_level_2',
    'days_to_end_of_month',
    'days_to_ECB_meeting',
    'days_to_Fed_meeting',
    'ois_sofr_rate',
]
target = ['rate_level_1', 'rate_level_2']
context_features = ['stock_price', 'fx_rate', 'commodity_price']
n_timesteps = 12
n_epochs = 50
batch_size = 64
learning_rate = 0.001
patience = 10

# Launch training, evaluation and plotting
main(
    file_path=file_path,
    features=features,
    target=target,
    context_features=context_features,
    n_timesteps=n_timesteps,
    n_epochs=n_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    patience=patience,
)

In [None]:
# Path of the larger, related dataset used for pre-training (assumed to exist on disk).
large_dataset_file_path = 'large_related_dataset.csv'

# Pre-train the model on the large dataset
def pretrain_model(file_path, features, target, context_features, n_timesteps, n_epochs, batch_size, learning_rate, patience):
    """Pre-train the hybrid model on a CSV and save it to ``pretrained_model.pth``.

    The saved weights are those the model holds when the last fold finishes,
    which is not necessarily the best checkpoint written by ``train_model``.

    Args:
        file_path: CSV passed to ``load_and_preprocess_data``.
        features: Column names fed to the encoder.
        target: Column names to predict.
        context_features: Column names passed to the decoder as context.
        n_timesteps: Length of each encoder input window.
        n_epochs: Maximum epochs per fold.
        batch_size: Batch size for every ``DataLoader``.
        learning_rate: Initial AdamW learning rate.
        patience: Early-stopping patience forwarded to ``train_model``.
    """
    # Load and preprocess data
    df, scaler_target, scaler_context = load_and_preprocess_data(file_path, features, target, context_features)

    # Prepare data; the held-out test split is not used during pre-training.
    X_train, _, y_train, _, context_train, _ = prepare_data(df, features, target, context_features, n_timesteps)

    # Dataset; the per-fold train/validation loaders are built in the loop below.
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train, context_train)

    # Initialize model, optimizer, criterion, and scheduler
    input_size = X_train.size(2)
    # Context holds one row per sample, so it is 2-D (samples, context
    # features); its width is dim 1. size(2) raised IndexError.
    context_size = context_train.size(1)
    output_size = y_train.size(1)
    hidden_size = 128
    num_layers = 2
    dropout = 0.3

    model = HybridModel(input_size, hidden_size, context_size, output_size, num_layers, dropout)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)  # L2 regularization
    criterion = nn.MSELoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    # Train model with early stopping.
    # NOTE(review): the same model, optimizer and scheduler are reused across
    # folds, so later folds validate on samples already trained on earlier.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        print(f'Fold {fold+1}')
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler)
        val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=val_subsampler)
        train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience)

    # Save the pre-trained model
    torch.save(model.state_dict(), 'pretrained_model.pth')

# Pre-training configuration
large_dataset_file_path = 'large_related_dataset.csv'
features = [
    'rate_level_1',
    'rate_level_2',
    'days_to_end_of_month',
    'days_to_ECB_meeting',
    'days_to_Fed_meeting',
    'ois_sofr_rate',
]
target = ['rate_level_1', 'rate_level_2']
context_features = ['stock_price', 'fx_rate', 'commodity_price']
n_timesteps = 12
n_epochs = 50
batch_size = 64
learning_rate = 0.001
patience = 10

# Launch pre-training on the large dataset
pretrain_model(
    file_path=large_dataset_file_path,
    features=features,
    target=target,
    context_features=context_features,
    n_timesteps=n_timesteps,
    n_epochs=n_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    patience=patience,
)

In [None]:
def main(file_path, features, target, context_features, n_timesteps, n_epochs, batch_size, learning_rate, patience):
    """Fine-tune the pre-trained hybrid model on one CSV, then evaluate and plot.

    Weights are initialised from ``pretrained_model.pth`` before training.

    Args:
        file_path: CSV passed to ``load_and_preprocess_data``.
        features: Column names fed to the encoder.
        target: Column names to predict.
        context_features: Column names passed to the decoder as context.
        n_timesteps: Length of each encoder input window.
        n_epochs: Maximum epochs per fold.
        batch_size: Batch size for every ``DataLoader``.
        learning_rate: Initial AdamW learning rate.
        patience: Early-stopping patience forwarded to ``train_model``.
    """
    # Load and preprocess data
    df, scaler_target, scaler_context = load_and_preprocess_data(file_path, features, target, context_features)

    # Prepare data
    X_train, X_test, y_train, y_test, context_train, context_test = prepare_data(df, features, target, context_features, n_timesteps)

    # Datasets; the per-fold train/validation loaders are built in the loop below.
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train, context_train)
    test_dataset = torch.utils.data.TensorDataset(X_test, y_test, context_test)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, optimizer, criterion, and scheduler
    input_size = X_train.size(2)
    # Context holds one row per sample, so it is 2-D (samples, context
    # features); its width is dim 1. size(2) raised IndexError.
    context_size = context_train.size(1)
    output_size = y_train.size(1)
    hidden_size = 128
    num_layers = 2
    dropout = 0.3

    model = HybridModel(input_size, hidden_size, context_size, output_size, num_layers, dropout)

    # Load pre-trained weights; strict=False tolerates missing or unexpected keys.
    model.load_state_dict(torch.load('pretrained_model.pth'), strict=False)

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)  # L2 regularization
    criterion = nn.MSELoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    # Train model with early stopping.
    # NOTE(review): the same model, optimizer and scheduler are reused across
    # folds, so later folds validate on samples already trained on earlier.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        print(f'Fold {fold+1}')
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler)
        val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=val_subsampler)
        train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience)

    # Evaluate the model
    y_pred_original_scale, y_test_original_scale, attention_weights = evaluate_model(model, test_loader, scaler_target)

    # Plot results
    plot_results(y_test_original_scale, y_pred_original_scale)

    # Plot attention weights
    plot_attention_weights(attention_weights)

# Fine-tuning configuration
file_path = 'time_series_data.csv'
features = [
    'rate_level_1',
    'rate_level_2',
    'days_to_end_of_month',
    'days_to_ECB_meeting',
    'days_to_Fed_meeting',
    'ois_sofr_rate',
]
target = ['rate_level_1', 'rate_level_2']
context_features = ['stock_price', 'fx_rate', 'commodity_price']
n_timesteps = 12
n_epochs = 50
batch_size = 64
learning_rate = 0.001
patience = 10

# Launch fine-tuning, evaluation and plotting
main(
    file_path=file_path,
    features=features,
    target=target,
    context_features=context_features,
    n_timesteps=n_timesteps,
    n_epochs=n_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    patience=patience,
)

In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Define the function to load and preprocess today's context data
def load_context_data(context_file_path, context_features, scaler_context):
    """Load the most recent context row from a CSV and scale it for the model.

    Args:
        context_file_path: CSV file containing the context columns.
        context_features: Names of the context columns to use.
        scaler_context: Fitted scaler exposing ``transform``.

    Returns:
        ``float32`` tensor of shape ``(1, len(context_features))``. The decoder
        concatenates context with a 2-D attention summary along dim 1, so the
        context must be 2-D as well.

    Raises:
        ValueError: If the CSV contains no rows.
    """
    context_df = pd.read_csv(context_file_path)
    if context_df.empty:
        raise ValueError(f"no context rows found in {context_file_path!r}")

    # Scale only the last row, kept as a one-row frame so column names survive.
    latest_row = context_df[context_features].iloc[[-1]]
    scaled = scaler_context.transform(latest_row)
    context_today = torch.tensor(scaled, dtype=torch.float32).reshape(1, len(context_features))
    return context_today

# Define the function to extract the last sequence from the existing data
def extract_last_sequence(df, features, n_timesteps, scaler_features):
    """Return the scaled last ``n_timesteps`` feature rows as one model input.

    ``df`` is left untouched. Scaling it in place would change the caller's
    data and double-scale it on a second call.

    Args:
        df: DataFrame containing the feature columns in their original units.
        features: Names of the feature columns.
        n_timesteps: Number of trailing rows forming the input window.
        scaler_features: Fitted scaler exposing ``transform``.

    Returns:
        ``float32`` tensor of shape ``(1, n_timesteps, len(features))``.

    Raises:
        ValueError: If ``df`` has fewer than ``n_timesteps`` rows.
    """
    if len(df) < n_timesteps:
        raise ValueError(f"need at least {n_timesteps} rows to build a sequence, got {len(df)}")

    # Take rows from len(df) - n_timesteps onward; this also yields an empty
    # window when n_timesteps is 0.
    window = df[features].iloc[len(df) - n_timesteps:]
    scaled = np.asarray(scaler_features.transform(window))
    X_last_sequence = scaled.reshape(1, n_timesteps, len(features))
    return torch.tensor(X_last_sequence, dtype=torch.float32)

# Define the function to forecast today's data
def forecast_today(model, X_last_sequence, context_today, scaler_target):
    """Run one forward pass in eval mode and return the prediction in original units.

    Args:
        model: Module called as ``model(X_last_sequence, context_today)`` and
            returning a ``(prediction, attention)`` pair.
        X_last_sequence: Encoder input tensor.
        context_today: Context tensor passed to the model.
        scaler_target: Scaler whose ``inverse_transform`` undoes target scaling.

    Returns:
        The inverse-transformed prediction as returned by ``scaler_target``.
    """
    model.eval()
    with torch.no_grad():
        scaled_prediction, _attn = model(X_last_sequence, context_today)
        scaled_array = scaled_prediction.cpu().numpy()

    # Map the scaled model output back to the original target scale.
    return scaler_target.inverse_transform(scaled_array)

# Define the main function for forecasting today's data
def main_forecast_today(file_path, context_file_path, features, target, context_features, n_timesteps, model_path,
                        scaler_features, scaler_target, scaler_context,
                        hidden_size=128, num_layers=2, dropout=0.3):
    """Load a trained model and forecast the next target values.

    Args:
        file_path: CSV with the historical feature columns.
        context_file_path: CSV whose last row holds today's context features.
        features: Feature column names (sets the encoder input size).
        target: Target column names (sets the output size).
        context_features: Context column names (sets the context size).
        n_timesteps: Length of the encoder input window.
        model_path: State-dict file to load.
        scaler_features: Fitted scaler for the feature columns.
        scaler_target: Fitted scaler for the target columns.
        scaler_context: Fitted scaler for the context columns.
        hidden_size: Hidden size the checkpoint was trained with.
        num_layers: Number of recurrent layers the checkpoint was trained with.
        dropout: Dropout rate used to construct the model.

    Returns:
        The forecast in original units, which is also printed.
    """
    # Rebuild the architecture, then load the trained weights into it.
    model = HybridModel(len(features), hidden_size, len(context_features), len(target), num_layers, dropout)
    model.load_state_dict(torch.load(model_path))

    # Load and preprocess today's context data
    context_today = load_context_data(context_file_path, context_features, scaler_context)

    # Load the existing data and extract the last sequence
    df = pd.read_csv(file_path)
    X_last_sequence = extract_last_sequence(df, features, n_timesteps, scaler_features)

    # Forecast today's data
    prediction_today = forecast_today(model, X_last_sequence, context_today, scaler_target)

    print("Today's Forecast:", prediction_today)
    return prediction_today

# Define parameters for forecasting today's data
file_path = 'time_series_data.csv'
context_file_path = 'context_data.csv'
features = ['rate_level_1', 'rate_level_2', 'days_to_end_of_month', 'days_to_ECB_meeting', 'days_to_Fed_meeting', 'ois_sofr_rate']
target = ['rate_level_1', 'rate_level_2']
context_features = ['stock_price', 'fx_rate', 'commodity_price']
n_timesteps = 12
model_path = 'best_model.pth'

# Re-create the scalers used during training. A freshly constructed
# MinMaxScaler is unfitted and raises NotFittedError on transform() /
# inverse_transform(), so each one is fitted on the raw columns of the
# training CSV, mirroring the per-group min-max fit of the training step.
training_df = pd.read_csv(file_path)
scaler_features = MinMaxScaler().fit(training_df[features])
scaler_target = MinMaxScaler().fit(training_df[target])
scaler_context = MinMaxScaler().fit(training_df[context_features])

# If the training run persisted its scalers (e.g. with joblib), load those
# instead so forecasting uses exactly the same scaling:
# scaler_features = joblib.load('scaler_features.pkl')
# scaler_target = joblib.load('scaler_target.pkl')
# scaler_context = joblib.load('scaler_context.pkl')

# Run the main function for forecasting today's data
main_forecast_today(file_path, context_file_path, features, target, context_features, n_timesteps, model_path, scaler_features, scaler_target, scaler_context)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Additive attention over the encoder time steps
class Attention(nn.Module):
    """Score every encoder step against a query vector and softmax the scores.

    ``attn`` projects the concatenation [query; encoder step] (2 * hidden_size)
    down to ``hidden_size``; ``v`` reduces that projection to a scalar score.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise the projection, zero its bias, draw ``v`` from U(-0.1, 0.1)."""
        nn.init.xavier_uniform_(self.attn.weight)
        nn.init.constant_(self.attn.bias, 0)
        nn.init.uniform_(self.v, -0.1, 0.1)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape (batch, seq_len).

        ``hidden`` is (batch, hidden_size); ``encoder_outputs`` is
        (batch, seq_len, hidden_size). Each row of the result sums to 1.
        """
        n_batch = encoder_outputs.size(0)
        n_steps = encoder_outputs.size(1)

        # Copy the query along the time axis so it can be paired with each step.
        tiled_query = hidden.unsqueeze(1).repeat(1, n_steps, 1)
        paired = torch.cat((tiled_query, encoder_outputs), dim=2)

        # (batch, hidden, seq_len) so that v can contract the hidden axis.
        energy = torch.tanh(self.attn(paired)).transpose(1, 2)
        score_vector = self.v.repeat(n_batch, 1).unsqueeze(1)
        raw_scores = torch.bmm(score_vector, energy).squeeze(1)
        return torch.softmax(raw_scores, dim=1)

# GRU encoder for the historical window
class GRUEncoder(nn.Module):
    """Thin wrapper around a batch-first, multi-layer ``nn.GRU``."""

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.gru = nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )

    def forward(self, x):
        """Return ``(per-step outputs, final hidden state of every layer)``."""
        step_outputs, final_hidden = self.gru(x)
        return step_outputs, final_hidden

# Define GRU Decoder with Attention
class GRUDecoderWithAttention(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    The GRU is declared with ``hidden_size * 2 + context_size`` input features.
    The step input is therefore the concatenation of
    [query state (hidden_size); attention summary (hidden_size); context
    (context_size)]. Feeding only the last two parts, as the code did before,
    does not match the declared input width and makes ``nn.GRU`` raise.
    """

    def __init__(self, hidden_size, context_size, output_size, num_layers, dropout):
        super(GRUDecoderWithAttention, self).__init__()
        self.attention = Attention(hidden_size)
        self.gru = nn.GRU(hidden_size * 2 + context_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, hidden, context_data):
        """Run one decoding step.

        Args:
            encoder_outputs: (batch, seq_len, hidden_size) encoder states.
            hidden: (num_layers, batch, hidden_size) initial decoder state.
            context_data: (batch, context_size) external context vector.

        Returns:
            ``(prediction, hidden, attn_weights)`` where ``prediction`` is
            (batch, output_size), ``hidden`` is the updated GRU state and
            ``attn_weights`` is (batch, seq_len).
        """
        # The top layer's state is the attention query.
        query = hidden[-1]
        attn_weights = self.attention(query, encoder_outputs)
        # Weighted sum of encoder states: (batch, 1, seq) x (batch, seq, hidden).
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        # Width is hidden + hidden + context_size, matching self.gru's input size.
        step_input = torch.cat((query, context, context_data), dim=1).unsqueeze(1)  # Add sequence dimension
        outputs, hidden = self.gru(step_input, hidden)
        prediction = self.fc(outputs.squeeze(1))
        return prediction, hidden, attn_weights

# Encoder/decoder wrapper used by the training and evaluation code
class HybridModel(nn.Module):
    """GRU encoder followed by the attention decoder."""

    def __init__(self, input_size, hidden_size, context_size, output_size, num_layers, dropout):
        super().__init__()
        self.encoder = GRUEncoder(input_size, hidden_size, num_layers, dropout)
        self.decoder = GRUDecoderWithAttention(hidden_size, context_size, output_size, num_layers, dropout)

    def forward(self, past_data, context_data):
        """Encode ``past_data``, decode one step, return ``(prediction, attn_weights)``."""
        memory, state = self.encoder(past_data)
        decoded = self.decoder(memory, state, context_data)
        # decoded is (prediction, hidden, attn_weights); the new state is not needed.
        return decoded[0], decoded[2]

def load_and_preprocess_data(file_path, features, target, context_features):
    """Read the CSV, fill gaps and min-max scale the three column groups.

    Args:
        file_path: Path of the CSV file.
        features: Column names of the model inputs.
        target: Column names of the forecast targets (may overlap ``features``).
        context_features: Column names of the context inputs.

    Returns:
        ``(df, scaler_target, scaler_context)``: the scaled frame plus the
        fitted target and context scalers. The feature scaler is not returned.
    """
    df = pd.read_csv(file_path)

    # Forward-fill, then back-fill so leading gaps are covered too.
    # (DataFrame.fillna(method=...) is deprecated in recent pandas.)
    df = df.ffill().bfill()

    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()
    scaler_context = MinMaxScaler()

    # Fit every scaler on the raw values BEFORE writing anything back. When the
    # column groups overlap (targets that are also features), fitting after an
    # earlier write-back would fit on already-normalised data, and
    # inverse_transform would no longer recover the original units.
    scaled_features = scaler_features.fit_transform(df[features])
    scaled_target = scaler_target.fit_transform(df[target])
    scaled_context = scaler_context.fit_transform(df[context_features])

    df[features] = scaled_features
    df[target] = scaled_target
    df[context_features] = scaled_context

    # NOTE(review): the scalers are fitted on the full data set, i.e. before any
    # train/test split is made by the callers.
    return df, scaler_target, scaler_context

def create_sequences(data, target_data, context_data, n_timesteps):
    """Build sliding windows and the target/context rows that follow them.

    For every start row ``s`` the window is rows ``s .. s + n_timesteps - 1`` of
    ``data``; the paired target and context are row ``s + n_timesteps`` of
    ``target_data`` and ``context_data``.

    Returns:
        ``(X, y, context)`` as NumPy arrays, one entry per window.
    """
    starts = range(len(data) - n_timesteps)
    windows = [data[s:s + n_timesteps].values for s in starts]
    next_targets = [target_data.iloc[s + n_timesteps].values for s in starts]
    next_context = [context_data.iloc[s + n_timesteps].values for s in starts]
    return np.array(windows), np.array(next_targets), np.array(next_context)

def prepare_data(df, features, target, context_features, n_timesteps, test_size=0.2,
                 random_state=42, shuffle=True):
    """Window the frame, split it into train/test parts and convert to tensors.

    Args:
        df: Pre-processed frame containing all referenced columns.
        features, target, context_features: Column name lists.
        n_timesteps: Window length passed to ``create_sequences``.
        test_size: Fraction of windows held out for testing.
        random_state: Seed forwarded to ``train_test_split``.
        shuffle: Forwarded to ``train_test_split``. The default (True) keeps the
            original behaviour. Consecutive windows overlap, so a shuffled split
            puts near-duplicate windows on both sides; pass ``shuffle=False``
            for a chronological hold-out.

    Returns:
        ``(X_train, X_test, y_train, y_test, context_train, context_test)`` as
        float32 tensors.
    """
    X, y, context = create_sequences(df[features], df[target], df[context_features], n_timesteps)
    splits = train_test_split(
        X, y, context, test_size=test_size, random_state=random_state, shuffle=shuffle)

    # train_test_split yields train/test pairs in argument order; convert each.
    return tuple(torch.tensor(part, dtype=torch.float32) for part in splits)

def train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience):
    """Train ``model`` with validation-based checkpointing and early stopping.

    Each epoch runs one optimisation pass over ``train_loader`` and one
    gradient-free pass over ``val_loader``. The mean validation batch loss is
    fed to ``scheduler.step``; whenever it improves, the weights are written to
    'best_model.pth'. Training stops after ``patience`` consecutive epochs
    without improvement.
    """
    lowest_val_loss = float('inf')
    epochs_without_gain = 0

    for epoch in range(n_epochs):
        # Optimisation pass
        model.train()
        total_loss = 0
        for past_data, y_batch, context_data in train_loader:
            optimizer.zero_grad()
            preds, _ = model(past_data, context_data)
            batch_loss = criterion(preds, y_batch)
            batch_loss.backward()
            optimizer.step()
            total_loss += batch_loss.item()

        # Validation pass
        model.eval()
        with torch.no_grad():
            val_loss = sum(
                criterion(model(past_data, context_data)[0], y_batch).item()
                for past_data, y_batch, context_data in val_loader
            )
        val_loss /= len(val_loader)
        scheduler.step(val_loss)

        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}')

        if val_loss < lowest_val_loss:
            # New best: checkpoint and reset the patience counter.
            lowest_val_loss = val_loss
            epochs_without_gain = 0
            torch.save(model.state_dict(), 'best_model.pth')
            continue

        epochs_without_gain += 1
        if epochs_without_gain >= patience:
            print("Early stopping triggered")
            break

def evaluate_model(model, test_loader, scaler_target):
    """Score the checkpointed model on ``test_loader``.

    Weights are reloaded from 'best_model.pth', predictions and targets are
    mapped back through ``scaler_target.inverse_transform`` and the MSE and R^2
    of the rescaled values are printed.

    Returns:
        ``(predictions, true_values, attention_weights)``; the first two are in
        the scaler's original units, the last is the stacked attention output.
    """
    model.load_state_dict(torch.load('best_model.pth'))
    model.eval()

    pred_batches = []
    truth_batches = []
    attn_batches = []
    with torch.no_grad():
        for past_data, y_batch, context_data in test_loader:
            batch_pred, batch_attn = model(past_data, context_data)
            pred_batches.append(batch_pred.cpu().numpy())
            truth_batches.append(y_batch.cpu().numpy())
            attn_batches.append(batch_attn.cpu().numpy())

    # Stack the per-batch arrays along the sample axis.
    stacked_pred = np.concatenate(pred_batches, axis=0)
    stacked_truth = np.concatenate(truth_batches, axis=0)
    stacked_attn = np.concatenate(attn_batches, axis=0)

    pred_rescaled = scaler_target.inverse_transform(stacked_pred)
    truth_rescaled = scaler_target.inverse_transform(stacked_truth)

    mse = mean_squared_error(truth_rescaled, pred_rescaled)
    r2 = r2_score(truth_rescaled, pred_rescaled)
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')

    return pred_rescaled, truth_rescaled, stacked_attn

def plot_results(y_test, y_pred):
    """Draw the true values and the predictions on one 10x6 figure and show it."""
    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(y_test, label='True Values')
    ax.plot(y_pred, label='Predictions')
    ax.set_xlabel('Samples')
    ax.set_ylabel('Values')
    ax.set_title('True Values vs Predictions')
    ax.legend()
    plt.show()

def plot_attention_weights(attention_weights, sample_idx=0):
    """Show the attention weights of one sample as a heatmap.

    Args:
        attention_weights: Array indexed by sample first; each entry holds that
            sample's weights over the encoder time steps.
        sample_idx: Index of the sample to display.
    """
    # One sample's weights are a 1-D vector, but sns.heatmap needs 2-D data.
    # Promote to a single row so the time steps run along the x axis, as the
    # axis label says. Entries that are already 2-D pass through unchanged.
    sample_weights = np.atleast_2d(np.asarray(attention_weights)[sample_idx])

    plt.figure(figsize=(10, 6))
    sns.heatmap(sample_weights, cmap='viridis')
    plt.xlabel('Encoder Time Steps')
    plt.ylabel('Attention Weights')
    plt.title('Attention Weights for Sample Index {}'.format(sample_idx))
    plt.show()

def main(file_path, features, target, context_features, n_timesteps, n_epochs, batch_size, learning_rate, patience):
    """Run the full pipeline: load, window, train over 5 folds, evaluate, plot.

    Args:
        file_path: CSV with the time series.
        features, target, context_features: Column name lists.
        n_timesteps: Input window length.
        n_epochs: Maximum epochs per fold.
        batch_size: Mini-batch size for all loaders.
        learning_rate: AdamW learning rate.
        patience: Early-stopping patience passed to ``train_model``.
    """
    # Load and preprocess data
    df, scaler_target, scaler_context = load_and_preprocess_data(file_path, features, target, context_features)

    # Prepare data
    X_train, X_test, y_train, y_test, context_train, context_test = prepare_data(df, features, target, context_features, n_timesteps)

    # Datasets; the per-fold training loaders are built inside the loop below.
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train, context_train)
    test_dataset = torch.utils.data.TensorDataset(X_test, y_test, context_test)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, optimizer, criterion, and scheduler
    input_size = X_train.size(2)
    # Context holds one row per sample, i.e. a 2-D (samples, context features)
    # tensor, so the feature count is dimension 1 (dimension 2 does not exist).
    context_size = context_train.size(1)
    output_size = y_train.size(1)
    hidden_size = 128
    num_layers = 2
    dropout = 0.3

    model = HybridModel(input_size, hidden_size, context_size, output_size, num_layers, dropout)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)  # L2 regularization
    criterion = nn.MSELoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    # Train model with early stopping.
    # NOTE(review): the same model/optimizer/scheduler is carried across folds,
    # so from the second fold on the validation samples have already been
    # trained on, and each fold overwrites 'best_model.pth'.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        print(f'Fold {fold+1}')
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler)
        val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=val_subsampler)
        train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience)

    # Evaluate the model
    y_pred_original_scale, y_test_original_scale, attention_weights = evaluate_model(model, test_loader, scaler_target)

    # Plot results
    plot_results(y_test_original_scale, y_pred_original_scale)

    # Plot attention weights
    plot_attention_weights(attention_weights)

# Data source and column roles
file_path = 'time_series_data.csv'
target = ['rate_level_1', 'rate_level_2']
context_features = ['stock_price', 'fx_rate', 'commodity_price']
features = ['rate_level_1', 'rate_level_2', 'days_to_end_of_month', 'days_to_ECB_meeting', 'days_to_Fed_meeting', 'ois_sofr_rate']

# Windowing and optimisation settings
n_timesteps = 12
batch_size = 64
learning_rate = 0.001
n_epochs = 50
patience = 10

# Train, evaluate and plot with the settings above
main(
    file_path=file_path,
    features=features,
    target=target,
    context_features=context_features,
    n_timesteps=n_timesteps,
    n_epochs=n_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    patience=patience,
)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Additive attention over the encoder time steps
class Attention(nn.Module):
    """Score every encoder step against a query vector and softmax the scores.

    ``attn`` projects the concatenation [query; encoder step] (2 * hidden_size)
    down to ``hidden_size``; ``v`` reduces that projection to a scalar score.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise the projection, zero its bias, draw ``v`` from U(-0.1, 0.1)."""
        nn.init.xavier_uniform_(self.attn.weight)
        nn.init.constant_(self.attn.bias, 0)
        nn.init.uniform_(self.v, -0.1, 0.1)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape (batch, seq_len).

        ``hidden`` is (batch, hidden_size); ``encoder_outputs`` is
        (batch, seq_len, hidden_size). Each row of the result sums to 1.
        """
        n_batch = encoder_outputs.size(0)
        n_steps = encoder_outputs.size(1)

        # Copy the query along the time axis so it can be paired with each step.
        tiled_query = hidden.unsqueeze(1).repeat(1, n_steps, 1)
        paired = torch.cat((tiled_query, encoder_outputs), dim=2)

        # (batch, hidden, seq_len) so that v can contract the hidden axis.
        energy = torch.tanh(self.attn(paired)).transpose(1, 2)
        score_vector = self.v.repeat(n_batch, 1).unsqueeze(1)
        raw_scores = torch.bmm(score_vector, energy).squeeze(1)
        return torch.softmax(raw_scores, dim=1)

# LSTM encoder for the historical window
class LSTMEncoder(nn.Module):
    """Thin wrapper around a batch-first, multi-layer ``nn.LSTM``."""

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )

    def forward(self, x):
        """Return ``(per-step outputs, final hidden state, final cell state)``."""
        step_outputs, final_state = self.lstm(x)
        final_hidden, final_cell = final_state
        return step_outputs, final_hidden, final_cell

# Define LSTM Decoder with Attention
class LSTMDecoderWithAttention(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    The LSTM is declared with ``hidden_size * 2 + context_size`` input features.
    The step input is therefore the concatenation of
    [query state (hidden_size); attention summary (hidden_size); context
    (context_size)]. Feeding only the last two parts, as the code did before,
    does not match the declared input width and makes ``nn.LSTM`` raise.
    """

    def __init__(self, hidden_size, context_size, output_size, num_layers, dropout):
        super(LSTMDecoderWithAttention, self).__init__()
        self.attention = Attention(hidden_size)
        self.lstm = nn.LSTM(hidden_size * 2 + context_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, hidden, cell, context_data):
        """Run one decoding step.

        Args:
            encoder_outputs: (batch, seq_len, hidden_size) encoder states.
            hidden: (num_layers, batch, hidden_size) initial hidden state.
            cell: (num_layers, batch, hidden_size) initial cell state.
            context_data: (batch, context_size) external context vector.

        Returns:
            ``(prediction, hidden, cell, attn_weights)`` where ``prediction``
            is (batch, output_size) and ``attn_weights`` is (batch, seq_len).
        """
        # The top layer's hidden state is the attention query.
        query = hidden[-1]
        attn_weights = self.attention(query, encoder_outputs)
        # Weighted sum of encoder states: (batch, 1, seq) x (batch, seq, hidden).
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        # Width is hidden + hidden + context_size, matching self.lstm's input size.
        step_input = torch.cat((query, context, context_data), dim=1).unsqueeze(1)  # Add sequence dimension
        outputs, (hidden, cell) = self.lstm(step_input, (hidden, cell))
        prediction = self.fc(outputs.squeeze(1))
        return prediction, hidden, cell, attn_weights

# Encoder/decoder wrapper used by the training and evaluation code
class HybridModel(nn.Module):
    """LSTM encoder followed by the attention decoder."""

    def __init__(self, input_size, hidden_size, context_size, output_size, num_layers, dropout):
        super().__init__()
        self.encoder = LSTMEncoder(input_size, hidden_size, num_layers, dropout)
        self.decoder = LSTMDecoderWithAttention(hidden_size, context_size, output_size, num_layers, dropout)

    def forward(self, past_data, context_data):
        """Encode ``past_data``, decode one step, return ``(prediction, attn_weights)``."""
        memory, h_state, c_state = self.encoder(past_data)
        decoded = self.decoder(memory, h_state, c_state, context_data)
        # decoded is (prediction, hidden, cell, attn_weights); states are dropped.
        return decoded[0], decoded[3]

def load_and_preprocess_data(file_path, features, target, context_features):
    """Read the CSV, fill gaps and min-max scale the three column groups.

    Args:
        file_path: Path of the CSV file.
        features: Column names of the model inputs.
        target: Column names of the forecast targets (may overlap ``features``).
        context_features: Column names of the context inputs.

    Returns:
        ``(df, scaler_target, scaler_context)``: the scaled frame plus the
        fitted target and context scalers. The feature scaler is not returned.
    """
    df = pd.read_csv(file_path)

    # Forward-fill, then back-fill so leading gaps are covered too.
    # (DataFrame.fillna(method=...) is deprecated in recent pandas.)
    df = df.ffill().bfill()

    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()
    scaler_context = MinMaxScaler()

    # Fit every scaler on the raw values BEFORE writing anything back. When the
    # column groups overlap (targets that are also features), fitting after an
    # earlier write-back would fit on already-normalised data, and
    # inverse_transform would no longer recover the original units.
    scaled_features = scaler_features.fit_transform(df[features])
    scaled_target = scaler_target.fit_transform(df[target])
    scaled_context = scaler_context.fit_transform(df[context_features])

    df[features] = scaled_features
    df[target] = scaled_target
    df[context_features] = scaled_context

    # NOTE(review): the scalers are fitted on the full data set, i.e. before any
    # train/test split is made by the callers.
    return df, scaler_target, scaler_context

def apply_moving_average(data, window_size=3):
    """Return the trailing rolling mean of ``data``.

    The window covers the current row and up to ``window_size - 1`` preceding
    rows; with ``min_periods=1`` the first rows average over what is available
    instead of producing NaN.
    """
    trailing_window = data.rolling(window=window_size, min_periods=1)
    return trailing_window.mean()

def create_sequences(data, target_data, context_data, n_timesteps):
    """Build sliding windows and the target/context rows that follow them.

    For every start row ``s`` the window is rows ``s .. s + n_timesteps - 1`` of
    ``data``; the paired target and context are row ``s + n_timesteps`` of
    ``target_data`` and ``context_data``.

    Returns:
        ``(X, y, context)`` as NumPy arrays, one entry per window.
    """
    starts = range(len(data) - n_timesteps)
    windows = [data[s:s + n_timesteps].values for s in starts]
    next_targets = [target_data.iloc[s + n_timesteps].values for s in starts]
    next_context = [context_data.iloc[s + n_timesteps].values for s in starts]
    return np.array(windows), np.array(next_targets), np.array(next_context)

def prepare_data(df, features, target, context_features, n_timesteps, test_size=0.2,
                 random_state=42, shuffle=True):
    """Smooth the inputs, window the frame, split it and convert to tensors.

    Args:
        df: Pre-processed frame containing all referenced columns. It is not
            modified.
        features, target, context_features: Column name lists.
        n_timesteps: Window length passed to ``create_sequences``.
        test_size: Fraction of windows held out for testing.
        random_state: Seed forwarded to ``train_test_split``.
        shuffle: Forwarded to ``train_test_split``. The default (True) keeps the
            original behaviour. Consecutive windows overlap, so a shuffled split
            puts near-duplicate windows on both sides; pass ``shuffle=False``
            for a chronological hold-out.

    Returns:
        ``(X_train, X_test, y_train, y_test, context_train, context_test)`` as
        float32 tensors.
    """
    # Denoise only the model inputs. The smoothed values are kept in a separate
    # frame: writing them back into df would also overwrite any target column
    # that is listed in features, so the model would be trained and scored
    # against smoothed targets, and the caller's frame would be mutated.
    smoothed_features = apply_moving_average(df[features])

    X, y, context = create_sequences(smoothed_features, df[target], df[context_features], n_timesteps)
    splits = train_test_split(
        X, y, context, test_size=test_size, random_state=random_state, shuffle=shuffle)

    # train_test_split yields train/test pairs in argument order; convert each.
    return tuple(torch.tensor(part, dtype=torch.float32) for part in splits)

def train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience):
    """Train ``model`` with validation-based checkpointing and early stopping.

    Each epoch runs one optimisation pass over ``train_loader`` and one
    gradient-free pass over ``val_loader``. The mean validation batch loss is
    fed to ``scheduler.step``; whenever it improves, the weights are written to
    'best_model.pth'. Training stops after ``patience`` consecutive epochs
    without improvement.
    """
    lowest_val_loss = float('inf')
    epochs_without_gain = 0

    for epoch in range(n_epochs):
        # Optimisation pass
        model.train()
        total_loss = 0
        for past_data, y_batch, context_data in train_loader:
            optimizer.zero_grad()
            preds, _ = model(past_data, context_data)
            batch_loss = criterion(preds, y_batch)
            batch_loss.backward()
            optimizer.step()
            total_loss += batch_loss.item()

        # Validation pass
        model.eval()
        with torch.no_grad():
            val_loss = sum(
                criterion(model(past_data, context_data)[0], y_batch).item()
                for past_data, y_batch, context_data in val_loader
            )
        val_loss /= len(val_loader)
        scheduler.step(val_loss)

        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}')

        if val_loss < lowest_val_loss:
            # New best: checkpoint and reset the patience counter.
            lowest_val_loss = val_loss
            epochs_without_gain = 0
            torch.save(model.state_dict(), 'best_model.pth')
            continue

        epochs_without_gain += 1
        if epochs_without_gain >= patience:
            print("Early stopping triggered")
            break

def evaluate_model(model, test_loader, scaler_target):
    """Score the checkpointed model on ``test_loader``.

    Weights are reloaded from 'best_model.pth', predictions and targets are
    mapped back through ``scaler_target.inverse_transform`` and the MSE and R^2
    of the rescaled values are printed.

    Returns:
        ``(predictions, true_values, attention_weights)``; the first two are in
        the scaler's original units, the last is the stacked attention output.
    """
    model.load_state_dict(torch.load('best_model.pth'))
    model.eval()

    pred_batches = []
    truth_batches = []
    attn_batches = []
    with torch.no_grad():
        for past_data, y_batch, context_data in test_loader:
            batch_pred, batch_attn = model(past_data, context_data)
            pred_batches.append(batch_pred.cpu().numpy())
            truth_batches.append(y_batch.cpu().numpy())
            attn_batches.append(batch_attn.cpu().numpy())

    # Stack the per-batch arrays along the sample axis.
    stacked_pred = np.concatenate(pred_batches, axis=0)
    stacked_truth = np.concatenate(truth_batches, axis=0)
    stacked_attn = np.concatenate(attn_batches, axis=0)

    pred_rescaled = scaler_target.inverse_transform(stacked_pred)
    truth_rescaled = scaler_target.inverse_transform(stacked_truth)

    mse = mean_squared_error(truth_rescaled, pred_rescaled)
    r2 = r2_score(truth_rescaled, pred_rescaled)
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')

    return pred_rescaled, truth_rescaled, stacked_attn

def plot_results(y_test, y_pred):
    """Draw the true values and the predictions on one 10x6 figure and show it."""
    _, ax = plt.subplots(figsize=(10, 6))
    ax.plot(y_test, label='True Values')
    ax.plot(y_pred, label='Predictions')
    ax.set_xlabel('Samples')
    ax.set_ylabel('Values')
    ax.set_title('True Values vs Predictions')
    ax.legend()
    plt.show()

def plot_attention_weights(attention_weights, sample_idx=0):
    """Show the attention weights of one sample as a heatmap.

    Args:
        attention_weights: Array indexed by sample first; each entry holds that
            sample's weights over the encoder time steps.
        sample_idx: Index of the sample to display.
    """
    # One sample's weights are a 1-D vector, but sns.heatmap needs 2-D data.
    # Promote to a single row so the time steps run along the x axis, as the
    # axis label says. Entries that are already 2-D pass through unchanged.
    sample_weights = np.atleast_2d(np.asarray(attention_weights)[sample_idx])

    plt.figure(figsize=(10, 6))
    sns.heatmap(sample_weights, cmap='viridis')
    plt.xlabel('Encoder Time Steps')
    plt.ylabel('Attention Weights')
    plt.title('Attention Weights for Sample Index {}'.format(sample_idx))
    plt.show()

def main(file_path, features, target, context_features, n_timesteps, n_epochs, batch_size, learning_rate, patience):
    """Run the full pipeline: load, window, train over 5 folds, evaluate, plot.

    Args:
        file_path: CSV with the time series.
        features, target, context_features: Column name lists.
        n_timesteps: Input window length.
        n_epochs: Maximum epochs per fold.
        batch_size: Mini-batch size for all loaders.
        learning_rate: AdamW learning rate.
        patience: Early-stopping patience passed to ``train_model``.
    """
    # Load and preprocess data
    df, scaler_target, scaler_context = load_and_preprocess_data(file_path, features, target, context_features)

    # Prepare data
    X_train, X_test, y_train, y_test, context_train, context_test = prepare_data(df, features, target, context_features, n_timesteps)

    # Datasets; the per-fold training loaders are built inside the loop below.
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train, context_train)
    test_dataset = torch.utils.data.TensorDataset(X_test, y_test, context_test)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, optimizer, criterion, and scheduler
    input_size = X_train.size(2)
    # Context holds one row per sample, i.e. a 2-D (samples, context features)
    # tensor, so the feature count is dimension 1 (dimension 2 does not exist).
    context_size = context_train.size(1)
    output_size = y_train.size(1)
    hidden_size = 128
    num_layers = 2
    dropout = 0.3

    model = HybridModel(input_size, hidden_size, context_size, output_size, num_layers, dropout)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)  # L2 regularization
    criterion = nn.MSELoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    # Train model with early stopping.
    # NOTE(review): the same model/optimizer/scheduler is carried across folds,
    # so from the second fold on the validation samples have already been
    # trained on, and each fold overwrites 'best_model.pth'.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        print(f'Fold {fold+1}')
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler)
        val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=val_subsampler)
        train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience)

    # Evaluate the model
    y_pred_original_scale, y_test_original_scale, attention_weights = evaluate_model(model, test_loader, scaler_target)

    # Plot results
    plot_results(y_test_original_scale, y_pred_original_scale)

    # Plot attention weights
    plot_attention_weights(attention_weights)

# Data source and column roles
file_path = 'time_series_data.csv'
target = ['rate_level_1', 'rate_level_2']
context_features = ['stock_price', 'fx_rate', 'commodity_price']
features = ['rate_level_1', 'rate_level_2', 'days_to_end_of_month', 'days_to_ECB_meeting', 'days_to_Fed_meeting', 'ois_sofr_rate']

# Windowing and optimisation settings
n_timesteps = 12
batch_size = 64
learning_rate = 0.001
n_epochs = 50
patience = 10

# Train, evaluate and plot with the settings above
main(
    file_path=file_path,
    features=features,
    target=target,
    context_features=context_features,
    n_timesteps=n_timesteps,
    n_epochs=n_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    patience=patience,
)

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder applied to one feature vector at a time.

    The encoder maps ``input_dim -> hidden_dim -> encoding_dim`` with ReLU
    activations; the decoder mirrors it and ends in a sigmoid, so every
    reconstructed value lies in (0, 1).
    """

    def __init__(self, input_dim, encoding_dim, hidden_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the input batch ``x``."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded


# The two helpers below are module-level functions (they are called as such by
# main); they were previously indented as if nested, which made the cell a
# syntax error.
def train_autoencoder(autoencoder, train_loader, n_epochs, optimizer, criterion):
    """Fit ``autoencoder`` to reconstruct the first tensor of every batch.

    Prints the mean batch loss after each epoch.
    """
    for epoch in range(n_epochs):
        autoencoder.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            X_batch = batch[0]
            _, decoded = autoencoder(X_batch)
            # Reconstruction loss: the input is its own target.
            loss = criterion(decoded, X_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Autoencoder Epoch [{epoch+1}/{n_epochs}], Loss: {total_loss/len(train_loader):.4f}')


def denoise_data(autoencoder, data_loader):
    """Return the reconstructions of every batch in ``data_loader`` as one array.

    Rows come back in the order the loader yields them, so pass a non-shuffling
    loader when the result must stay aligned with other arrays.
    """
    autoencoder.eval()
    denoised_data = []
    with torch.no_grad():
        for batch in data_loader:
            X_batch = batch[0]
            _, decoded = autoencoder(X_batch)
            denoised_data.append(decoded.cpu().numpy())
    return np.concatenate(denoised_data, axis=0)

def main(file_path, features, target, context_features, n_timesteps, n_epochs, batch_size, learning_rate, patience, ae_n_epochs):
    """Run the pipeline with an autoencoder denoising stage before the forecaster.

    Args:
        file_path: CSV with the time series.
        features, target, context_features: Column name lists.
        n_timesteps: Input window length.
        n_epochs: Maximum epochs per fold for the forecaster.
        batch_size: Mini-batch size for all loaders.
        learning_rate: AdamW learning rate for the forecaster.
        patience: Early-stopping patience passed to ``train_model``.
        ae_n_epochs: Number of epochs for autoencoder training.
    """
    # Load and preprocess data
    df, scaler_target, scaler_context = load_and_preprocess_data(file_path, features, target, context_features)

    # Prepare data
    X, y, context = create_sequences(df[features], df[target], df[context_features], n_timesteps)
    X_train, X_test, y_train, y_test, context_train, context_test = train_test_split(
        X, y, context, test_size=0.2, random_state=42)

    # Convert to PyTorch tensors
    X_train = torch.tensor(X_train, dtype=torch.float32)
    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.float32)
    y_test = torch.tensor(y_test, dtype=torch.float32)
    context_train = torch.tensor(context_train, dtype=torch.float32)
    context_test = torch.tensor(context_test, dtype=torch.float32)

    # Train Autoencoder on individual time steps (windows flattened to rows)
    input_dim = X_train.size(2)
    encoding_dim = 20
    hidden_dim = 128
    autoencoder = Autoencoder(input_dim, encoding_dim, hidden_dim)
    ae_optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)
    ae_criterion = nn.MSELoss()

    ae_train_dataset = torch.utils.data.TensorDataset(X_train.view(-1, input_dim))
    ae_test_dataset = torch.utils.data.TensorDataset(X_test.view(-1, input_dim))
    ae_train_loader = torch.utils.data.DataLoader(ae_train_dataset, batch_size=batch_size, shuffle=True)
    train_autoencoder(autoencoder, ae_train_loader, ae_n_epochs, ae_optimizer, ae_criterion)

    # Denoise with order-preserving loaders. Reusing the shuffled training
    # loader would return the rows in random order, so the reshaped windows
    # would no longer line up with y_train / context_train.
    ae_train_eval_loader = torch.utils.data.DataLoader(ae_train_dataset, batch_size=batch_size, shuffle=False)
    ae_test_eval_loader = torch.utils.data.DataLoader(ae_test_dataset, batch_size=batch_size, shuffle=False)
    denoised_train = denoise_data(autoencoder, ae_train_eval_loader)
    denoised_train = denoised_train.reshape(X_train.size(0), X_train.size(1), -1)
    denoised_test = denoise_data(autoencoder, ae_test_eval_loader)
    denoised_test = denoised_test.reshape(X_test.size(0), X_test.size(1), -1)

    # Datasets for the forecaster; per-fold loaders are built in the loop below.
    train_dataset = torch.utils.data.TensorDataset(torch.tensor(denoised_train, dtype=torch.float32), y_train, context_train)
    test_dataset = torch.utils.data.TensorDataset(torch.tensor(denoised_test, dtype=torch.float32), y_test, context_test)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize LSTM model, optimizer, criterion, and scheduler
    input_size = denoised_train.shape[2]
    # Context holds one row per sample, i.e. a 2-D (samples, context features)
    # tensor, so the feature count is dimension 1 (dimension 2 does not exist).
    context_size = context_train.size(1)
    output_size = y_train.size(1)
    hidden_size = 128
    num_layers = 2
    dropout = 0.3

    model = HybridModel(input_size, hidden_size, context_size, output_size, num_layers, dropout)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-5)  # L2 regularization
    criterion = nn.MSELoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)

    # Train LSTM model with early stopping.
    # NOTE(review): the same model/optimizer/scheduler is carried across folds,
    # so from the second fold on the validation samples have already been
    # trained on, and each fold overwrites 'best_model.pth'.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(denoised_train)):
        print(f'Fold {fold+1}')
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler)
        val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=val_subsampler)
        train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience)

    # Evaluate the model
    y_pred_original_scale, y_test_original_scale, attention_weights = evaluate_model(model, test_loader, scaler_target)

    # Plot results
    plot_results(y_test_original_scale, y_pred_original_scale)

    # Plot attention weights
    plot_attention_weights(attention_weights)

# Data source and column roles
file_path = 'time_series_data.csv'
target = ['rate_level_1', 'rate_level_2']
context_features = ['stock_price', 'fx_rate', 'commodity_price']
features = ['rate_level_1', 'rate_level_2', 'days_to_end_of_month', 'days_to_ECB_meeting', 'days_to_Fed_meeting', 'ois_sofr_rate']

# Windowing and optimisation settings
n_timesteps = 12
batch_size = 64
learning_rate = 0.001
n_epochs = 50
patience = 10
ae_n_epochs = 50  # Number of epochs for autoencoder training

# Train the autoencoder and the forecaster, then evaluate and plot
main(
    file_path=file_path,
    features=features,
    target=target,
    context_features=context_features,
    n_timesteps=n_timesteps,
    n_epochs=n_epochs,
    batch_size=batch_size,
    learning_rate=learning_rate,
    patience=patience,
    ae_n_epochs=ae_n_epochs,
)