# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Define Transformer Encoder
class TransformerEncoder(nn.Module):
    """Embed a batch-first sequence and run it through a Transformer encoder.

    Args:
        input_size: Number of features per time step in the raw input.
        hidden_size: Model width used by the embedding and encoder layers.
        num_layers: Number of stacked encoder layers.
        num_heads: Number of attention heads per layer.
        dropout: Dropout probability inside each encoder layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_heads, dropout):
        super().__init__()
        self.embedding = nn.Linear(input_size, hidden_size)
        # Learned positional table, initialised to zero; supports sequences of
        # up to 1000 steps.
        self.positional_encoding = nn.Parameter(torch.zeros(1, 1000, hidden_size))
        # batch_first=True: forward() receives (batch, seq, feature). With the
        # default (seq-first) layout the encoder would attend across the batch
        # dimension and mix unrelated samples together.
        # The template layer stays registered under ``encoder_layers`` so that
        # the state_dict keys remain unchanged.
        self.encoder_layers = nn.TransformerEncoderLayer(
            hidden_size, num_heads, hidden_size * 4, dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layers, num_layers)

    def forward(self, x):
        """Encode ``x`` of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (batch, seq_len, hidden_size).

        Raises:
            ValueError: If ``seq_len`` exceeds the positional table length.
        """
        seq_len = x.size(1)
        max_len = self.positional_encoding.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum supported length {max_len}"
            )
        x = self.embedding(x) + self.positional_encoding[:, :seq_len, :]
        return self.transformer_encoder(x)

# Define Transformer Decoder with Attention
class TransformerDecoderWithAttention(nn.Module):
    """Attend over encoder outputs with a context query, then decode with an LSTM.

    Args:
        hidden_size: Width of the encoder outputs and of the LSTM state.
        context_size: Number of context features per sample.
        output_size: Number of values predicted per sample.
        num_layers: Number of stacked LSTM layers.
        num_heads: Number of attention heads.
        dropout: Dropout probability for the attention and between LSTM layers.
    """

    def __init__(self, hidden_size, context_size, output_size, num_layers, num_heads, dropout):
        super().__init__()
        # The attention query must have ``hidden_size`` features, while the raw
        # context only has ``context_size``; project it before attending.
        self.context_proj = nn.Linear(context_size, hidden_size)
        # batch_first=True keeps the attention layout consistent with the
        # batch-first LSTM below and with the (batch, seq, feature) inputs.
        self.attention = nn.MultiheadAttention(
            hidden_size, num_heads, dropout=dropout, batch_first=True
        )
        # Inter-layer LSTM dropout is a no-op with a single layer and only
        # triggers a warning, so it is disabled in that case.
        self.lstm = nn.LSTM(
            hidden_size + context_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, context_data):
        """Predict from encoder outputs and context features.

        Args:
            encoder_outputs: Tensor of shape (batch, seq_len, hidden_size).
            context_data: Tensor of shape (batch, context_size) or
                (batch, n_queries, context_size).

        Returns:
            Tensor of shape (batch, output_size), taken from the last LSTM step.
        """
        if context_data.dim() == 2:
            # One context vector per sample -> a single query step.
            context_data = context_data.unsqueeze(1)
        query = self.context_proj(context_data)
        attn_output, _ = self.attention(query, encoder_outputs, encoder_outputs)
        combined = torch.cat((attn_output, context_data), dim=2)
        outputs, _ = self.lstm(combined)
        return self.fc(outputs[:, -1, :])

# Define the Hybrid Model
class HybridModel(nn.Module):
    """Chain the sequence encoder and the attention/LSTM decoder into one model.

    The constructor forwards its arguments to ``TransformerEncoder`` and
    ``TransformerDecoderWithAttention``; ``num_layers``, ``num_heads`` and
    ``dropout`` are shared by both sub-modules.
    """

    def __init__(self, input_size, hidden_size, context_size, output_size, num_layers, num_heads, dropout):
        super().__init__()
        encoder = TransformerEncoder(input_size, hidden_size, num_layers, num_heads, dropout)
        decoder = TransformerDecoderWithAttention(
            hidden_size, context_size, output_size, num_layers, num_heads, dropout
        )
        self.transformer_encoder = encoder
        self.transformer_decoder = decoder

    def forward(self, past_data, context_data):
        """Encode ``past_data`` and decode it together with ``context_data``."""
        return self.transformer_decoder(self.transformer_encoder(past_data), context_data)

def load_and_preprocess_data(file_path, features, target, context_features):
    """Read a CSV file, fill missing values and min-max scale the model columns.

    Args:
        file_path: Path of the CSV file to read.
        features: List of input feature column names.
        target: List of target column names (may overlap with ``features``).
        context_features: List of context column names.

    Returns:
        A tuple ``(df, scaler_target, scaler_context)``: the scaled DataFrame
        and the fitted scalers for the target and context columns.
    """
    df = pd.read_csv(file_path)

    # Fill gaps forward first, then backward to cover leading NaNs.
    if df.isnull().values.any():
        df = df.ffill().bfill()

    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()
    scaler_context = MinMaxScaler()

    # Fit every scaler on the raw values BEFORE overwriting any column.
    # Target columns can also be feature columns; fitting the target scaler
    # after the features were already scaled would make it learn a [0, 1]
    # range, so its inverse_transform could no longer restore original units.
    scaled_features = scaler_features.fit_transform(df[features])
    scaled_target = scaler_target.fit_transform(df[target])
    scaled_context = scaler_context.fit_transform(df[context_features])

    # NOTE(review): the scalers see the whole file, including rows that later
    # end up in the test split -- consider fitting on training rows only.
    df[features] = scaled_features
    df[target] = scaled_target
    df[context_features] = scaled_context

    return df, scaler_target, scaler_context

def create_sequences(data, target_data, context_data, n_timesteps):
    """Cut sliding windows out of ``data`` and pair each with its next-step rows.

    For every start position a window of ``n_timesteps`` consecutive rows of
    ``data`` is taken; the matching label and context are the rows of
    ``target_data`` and ``context_data`` located right after that window.

    Returns:
        Three NumPy arrays: the windows, the labels and the context rows.
    """
    window_starts = range(len(data) - n_timesteps)

    windows = [data[start:start + n_timesteps].values for start in window_starts]
    # Row index ``start + n_timesteps`` is the first row after the window.
    labels = [target_data.iloc[start + n_timesteps].values for start in window_starts]
    contexts = [context_data.iloc[start + n_timesteps].values for start in window_starts]

    return np.array(windows), np.array(labels), np.array(contexts)

def prepare_data(df, features, target, context_features, n_timesteps, test_size=0.2):
    """Build windowed samples from ``df`` and split them into train/test tensors.

    Returns:
        ``(X_train, X_test, y_train, y_test, context_train, context_test)``,
        each a ``torch.float32`` tensor.
    """
    windows, labels, contexts = create_sequences(
        df[features], df[target], df[context_features], n_timesteps
    )

    # NOTE(review): train_test_split shuffles by default, so overlapping
    # windows can land on both sides of the split -- confirm this is intended
    # for time-series data.
    splits = train_test_split(windows, labels, contexts, test_size=test_size, random_state=42)

    # Same order as returned by train_test_split:
    # X_train, X_test, y_train, y_test, context_train, context_test.
    return tuple(torch.tensor(part, dtype=torch.float32) for part in splits)

def train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience):
    """Train ``model`` with per-epoch validation, checkpointing and early stopping.

    Each batch is expected to unpack as ``(past_data, y_batch, context_data)``.
    After every epoch the mean validation loss is passed to ``scheduler.step``;
    whenever it improves, the model weights are saved to ``best_model.pth``.
    Training stops once ``patience`` consecutive epochs bring no improvement.
    """
    lowest_val_loss = float('inf')
    stale_epochs = 0

    for epoch_idx in range(n_epochs):
        # ---- optimisation pass ----
        model.train()
        running_loss = 0
        for past_data, y_batch, context_data in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(past_data, context_data), y_batch)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        # ---- validation pass (no gradients) ----
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for past_data, y_batch, context_data in val_loader:
                val_loss += criterion(model(past_data, context_data), y_batch).item()
        val_loss /= len(val_loader)
        scheduler.step(val_loss)

        train_loss = running_loss / len(train_loader)
        print(f'Epoch [{epoch_idx + 1}/{n_epochs}], Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        if val_loss < lowest_val_loss:
            # New best epoch: remember it, reset the counter, checkpoint.
            lowest_val_loss = val_loss
            stale_epochs = 0
            torch.save(model.state_dict(), 'best_model.pth')
            continue

        stale_epochs += 1
        if stale_epochs >= patience:
            print("Early stopping triggered")
            break

def evaluate_model(model, test_loader, scaler_target):
    """Score the checkpointed model on ``test_loader`` and print MSE and R^2.

    The weights stored in ``best_model.pth`` are loaded into ``model`` first.
    Predictions and labels are passed through ``scaler_target.inverse_transform``
    before the metrics are computed.

    Returns:
        ``(predictions, true_values)`` after the inverse transform.
    """
    best_weights = torch.load('best_model.pth')
    model.load_state_dict(best_weights)
    model.eval()

    pred_chunks = []
    true_chunks = []
    with torch.no_grad():
        for past_data, y_batch, context_data in test_loader:
            batch_pred = model(past_data, context_data)
            pred_chunks.append(batch_pred.cpu().numpy())
            true_chunks.append(y_batch.cpu().numpy())

    # Stitch the per-batch arrays together along the sample axis.
    stacked_preds = np.concatenate(pred_chunks, axis=0)
    stacked_truth = np.concatenate(true_chunks, axis=0)

    preds_unscaled = scaler_target.inverse_transform(stacked_preds)
    truth_unscaled = scaler_target.inverse_transform(stacked_truth)

    mse = mean_squared_error(truth_unscaled, preds_unscaled)
    r2 = r2_score(truth_unscaled, preds_unscaled)
    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')

    return preds_unscaled, truth_unscaled

def plot_results(y_test, y_pred):
    """Show true values and predictions as line plots on one figure."""
    plt.figure(figsize=(10, 6))

    # Draw the two series in a fixed order: ground truth first, then the model.
    for series, series_label in ((y_test, 'True Values'), (y_pred, 'Predictions')):
        plt.plot(series, label=series_label)

    plt.legend()
    plt.xlabel('Samples')
    plt.ylabel('Values')
    plt.title('True Values vs Predictions')
    plt.show()

def main(file_path, features, target, context_features, n_timesteps, n_epochs, batch_size, learning_rate, patience):
    """Run the whole pipeline: preprocess, train over 5 folds, evaluate, plot.

    Args:
        file_path: CSV file with the time series.
        features: Input feature column names.
        target: Target column names.
        context_features: Context column names.
        n_timesteps: Window length used to build the input sequences.
        n_epochs: Maximum number of epochs per fold.
        batch_size: Batch size for all data loaders.
        learning_rate: Initial AdamW learning rate.
        patience: Early-stopping patience (epochs) passed to ``train_model``.
    """
    # Load and preprocess data (the context scaler is not needed afterwards)
    df, scaler_target, _ = load_and_preprocess_data(file_path, features, target, context_features)

    # Prepare data
    X_train, X_test, y_train, y_test, context_train, context_test = prepare_data(
        df, features, target, context_features, n_timesteps)

    # Create datasets; the training loaders are built per fold further down
    train_dataset = torch.utils.data.TensorDataset(X_train, y_train, context_train)
    test_dataset = torch.utils.data.TensorDataset(X_test, y_test, context_test)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, optimizer, criterion, and scheduler
    input_size = X_train.size(2)
    # The context tensor built by create_sequences is 2-D (samples, features),
    # so take the last dimension instead of a hard-coded index 2.
    context_size = context_train.size(-1)
    output_size = y_train.size(1)
    hidden_size = 128
    num_layers = 2
    num_heads = 4
    dropout = 0.3

    model = HybridModel(input_size, hidden_size, context_size, output_size, num_layers, num_heads, dropout)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()
    # ``verbose`` is deprecated in recent PyTorch releases, so it is not passed.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

    # Train model with early stopping.
    # NOTE(review): the same model, optimizer and scheduler are carried across
    # folds, so from the second fold on the validation samples were already
    # seen during training -- confirm whether a per-fold reset is wanted.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        print(f'Fold {fold+1}')
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_subsampler)
        val_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=val_subsampler)
        train_model(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler, patience)

    # Evaluate the model
    y_pred_original_scale, y_test_original_scale = evaluate_model(model, test_loader, scaler_target)

    # Plot results
    plot_results(y_test_original_scale, y_pred_original_scale)

# Define parameters
file_path = 'time_series_data.csv'
features = ['rate_level_1', 'rate_level_2', 'days_to_end_of_month', 'days_to_ECB_meeting', 'days_to_Fed_meeting', 'ois_sofr_rate']
target = ['rate_level_1', 'rate_level_2']
context_features = ['stock_price', 'fx_rate', 'commodity_price']
n_timesteps = 12
n_epochs = 50
batch_size = 64
learning_rate = 0.001
patience = 10

# Run the main function only when executed directly (script or notebook cell),
# so that importing this module does not start a training run.
if __name__ == "__main__":
    main(file_path, features, target, context_features, n_timesteps, n_epochs, batch_size, learning_rate, patience)