In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import DataLoader, TensorDataset

# Define Attention Mechanism
class Attention(nn.Module):
    """Additive attention that scores encoder time steps against a query state.

    Every encoder output is concatenated with the query state, passed through
    a linear layer followed by ``tanh``, and projected onto the learned vector
    ``v`` to obtain one score per time step. Scores are softmax-normalised
    over the sequence dimension.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise ``attn``, zero its bias, draw ``v`` from U(-0.1, 0.1)."""
        nn.init.xavier_uniform_(self.attn.weight)
        nn.init.constant_(self.attn.bias, 0)
        nn.init.uniform_(self.v, -0.1, 0.1)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape ``(batch, seq_len)``.

        Args:
            hidden: Query state; it is broadcast along a new dimension 1, so
                it is expected to be ``(batch, hidden_size)``.
            encoder_outputs: Tensor indexed as ``(batch, seq_len, hidden_size)``.
        """
        batch_size = encoder_outputs.size(0)
        n_steps = encoder_outputs.size(1)

        # Broadcast the query over every time step (a view, no data copy).
        query = hidden.unsqueeze(1).expand(-1, n_steps, -1)
        combined = torch.cat((query, encoder_outputs), dim=2)

        # (batch, seq_len, hidden) -> (batch, hidden, seq_len) for the bmm below.
        energy = torch.tanh(self.attn(combined)).transpose(1, 2)

        # One copy of the scoring vector per batch element: (batch, 1, hidden).
        scorer = self.v.unsqueeze(0).unsqueeze(1).expand(batch_size, 1, -1)
        scores = torch.bmm(scorer, energy).squeeze(1)
        return torch.softmax(scores, dim=1)

# Define Encoder
class LSTMEncoder(nn.Module):
    """Batch-first multi-layer LSTM that encodes an input sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout value forwarded to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise every LSTM weight and zero every LSTM bias."""
        for param_name, tensor in self.lstm.named_parameters():
            if 'weight' in param_name:
                nn.init.xavier_uniform_(tensor)
                continue
            if 'bias' in param_name:
                nn.init.constant_(tensor, 0)

    def forward(self, x):
        """Run the LSTM and return ``(outputs, hidden, cell)``."""
        outputs, final_state = self.lstm(x)
        hidden, cell = final_state
        return outputs, hidden, cell

# Define Decoder with Attention
class LSTMDecoderWithAttention(nn.Module):
    """Single-step LSTM decoder that attends over the encoder outputs.

    The decoder input is the attention context vector concatenated with the
    temporal features of the step being predicted.

    Args:
        hidden_size: Size of the encoder/decoder hidden state.
        output_size: Number of values predicted per sample.
        num_layers: Number of stacked LSTM layers (the encoder state passed
            to ``forward`` is fed straight into this LSTM).
        dropout: Dropout value forwarded to ``nn.LSTM``.
        temporal_feature_size: Width of the last dimension of
            ``current_temporal_features`` passed to ``forward``. Defaults to
            4, the width that was previously hard-coded.
    """

    def __init__(self, hidden_size, output_size, num_layers, dropout, temporal_feature_size=4):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.temporal_feature_size = temporal_feature_size
        self.attention = Attention(hidden_size)
        # The LSTM consumes [context ; temporal features] at every step.
        self.lstm = nn.LSTM(
            hidden_size + temporal_feature_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(hidden_size, output_size)
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise LSTM and output weights; zero all biases."""
        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)

    def forward(self, hidden, cell, encoder_outputs, current_temporal_features):
        """Decode one step.

        Args:
            hidden: LSTM hidden state; its last entry along dim 0
                (``hidden[-1]``) is used as the attention query.
            cell: LSTM cell state matching ``hidden``.
            encoder_outputs: Encoder outputs, ``(batch, seq_len, hidden_size)``.
            current_temporal_features: Features concatenated to the context on
                dim 2, so they must be ``(batch, 1, temporal_feature_size)``.

        Returns:
            ``(prediction, hidden, cell, attention_weights)`` where
            ``prediction`` has its step dimension squeezed out and
            ``attention_weights`` is ``(batch, seq_len)``.
        """
        attention_weights = self.attention(hidden[-1], encoder_outputs)
        # (batch, seq_len) -> (batch, 1, seq_len) so bmm yields one context row.
        attention_weights = attention_weights.unsqueeze(1)
        context = torch.bmm(attention_weights, encoder_outputs)
        decoder_input = torch.cat([context, current_temporal_features], dim=2)
        output, (hidden, cell) = self.lstm(decoder_input, (hidden, cell))
        prediction = self.fc(output).squeeze(1)
        return prediction, hidden, cell, attention_weights.squeeze(1)

# Define full Encoder-Decoder Model
class LSTMEncoderDecoderWithAttention(nn.Module):
    """Encoder/decoder pair: encode the sequence, then decode a single step.

    Args:
        input_size: Number of features per encoder time step.
        hidden_size: Hidden size shared by encoder and decoder.
        output_size: Number of values predicted per sample.
        num_layers: Number of stacked LSTM layers in both halves.
        dropout: Dropout value passed to both LSTMs.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout):
        super().__init__()
        self.encoder = LSTMEncoder(input_size, hidden_size, num_layers, dropout)
        self.decoder = LSTMDecoderWithAttention(hidden_size, output_size, num_layers, dropout)

    def forward(self, encoder_input, current_temporal_features):
        """Return ``(prediction, attention_weights)`` for one decoding step."""
        encoder_outputs, hidden, cell = self.encoder(encoder_input)
        # The decoder also returns its updated hidden/cell state; drop them.
        prediction, _hidden, _cell, attention_weights = self.decoder(
            hidden, cell, encoder_outputs, current_temporal_features
        )
        return prediction, attention_weights

# Define Autoencoder class
class Autoencoder(nn.Module):
    """Two-layer fully connected autoencoder.

    The encoder maps ``input_dim -> hidden_dim -> encoding_dim`` with ReLU
    after each layer; the decoder mirrors it and finishes with a sigmoid.

    Args:
        input_dim: Size of the last dimension of the input.
        encoding_dim: Size of the encoded representation.
        hidden_dim: Size of the intermediate layer in both halves.
    """

    def __init__(self, input_dim, encoding_dim, hidden_dim):
        super().__init__()
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoding_dim),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        decoder_layers = [
            nn.Linear(encoding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise every linear layer and zero its bias."""
        # Encoder layers first, then decoder layers.
        for layer in (*self.encoder, *self.decoder):
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return ``(encoded, decoded)`` for input ``x``."""
        encoded = self.encoder(x)
        return encoded, self.decoder(encoded)

def load_and_preprocess_data(file_path, features, target, current_temporal_features):
    """Load a CSV, fill missing values and min-max scale features and targets.

    Args:
        file_path: Path of the CSV file to read.
        features: List of feature column names to scale.
        target: List of target column names to scale.
        current_temporal_features: Unused here; kept so the signature stays
            compatible with existing callers.

    Returns:
        ``(df, scaler_target)``: the DataFrame with scaled feature/target
        columns, and the ``MinMaxScaler`` fitted on the *raw* target columns
        so that ``scaler_target.inverse_transform`` maps predictions back to
        the original scale.
    """
    df = pd.read_csv(file_path)

    # Forward-fill, then back-fill so leading gaps are covered as well.
    # (`fillna(method=...)` is deprecated in pandas; use the dedicated methods.)
    if df.isnull().values.any():
        df = df.ffill().bfill()

    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()

    # Fit BOTH scalers on the raw values before writing anything back.
    # Target columns may also be feature columns; scaling the features in
    # place first would fit the target scaler on data already in [0, 1],
    # turning its inverse_transform into an identity mapping.
    scaled_features = scaler_features.fit_transform(df[features])
    scaled_target = scaler_target.fit_transform(df[target])

    df[features] = scaled_features
    df[target] = scaled_target

    return df, scaler_target

def create_sequences(data, target_data, current_temporal_features, n_timesteps):
    """Build sliding-window samples for one-step-ahead forecasting.

    For every start row ``i`` the sample consists of the ``n_timesteps`` rows
    ``data[i : i + n_timesteps]``, the target row at ``i + n_timesteps`` and
    the ``current_temporal_features`` columns of ``data`` at that same row.

    Args:
        data: DataFrame of input features (row order is the time order).
        target_data: DataFrame of target columns, row-aligned with ``data``.
        current_temporal_features: Column names of ``data`` to extract for
            the predicted row.
        n_timesteps: Window length.

    Returns:
        ``(X, y, current_time_features)`` as NumPy arrays. When ``data`` has
        no more than ``n_timesteps`` rows, all three arrays are empty.
    """
    # Convert once up front: the column selection and per-row pandas indexing
    # are loop-invariant and far slower than slicing a NumPy array.
    feature_values = data.values
    target_values = target_data.values
    temporal_values = data[current_temporal_features].values

    window_starts = range(len(data) - n_timesteps)
    X = [feature_values[start:start + n_timesteps] for start in window_starts]
    y = [target_values[start + n_timesteps] for start in window_starts]
    current_time_features = [temporal_values[start + n_timesteps] for start in window_starts]
    return np.array(X), np.array(y), np.array(current_time_features)

def prepare_data(df, features, target, current_temporal_features, n_timesteps, test_size=0.2,
                 shuffle=True, random_state=42):
    """Window the data, split it into train/test and convert to tensors.

    Args:
        df: Preprocessed DataFrame containing feature and target columns.
        features: List of feature column names.
        target: List of target column names.
        current_temporal_features: Feature columns extracted for the
            predicted row of each window.
        n_timesteps: Window length passed to ``create_sequences``.
        test_size: Fraction (or count) of samples held out for testing.
        shuffle: Forwarded to ``train_test_split``. The default ``True``
            keeps the previous behaviour. Consecutive windows overlap, so a
            shuffled split places near-identical windows on both sides of
            the split; pass ``False`` for a chronological split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test, current_temporal_train,
        current_temporal_test)`` as float32 tensors. The temporal tensors get
        an extra dimension at position 1.
    """
    X, y, current_temporal = create_sequences(df[features], df[target], current_temporal_features, n_timesteps)
    X_train, X_test, y_train, y_test, current_temporal_train, current_temporal_test = train_test_split(
        X, y, current_temporal, test_size=test_size, random_state=random_state, shuffle=shuffle)

    def as_float_tensor(array):
        return torch.tensor(array, dtype=torch.float32)

    X_train = as_float_tensor(X_train)
    X_test = as_float_tensor(X_test)
    y_train = as_float_tensor(y_train)
    y_test = as_float_tensor(y_test)
    # Add a length-1 step dimension: (n, k) -> (n, 1, k).
    current_temporal_train = as_float_tensor(current_temporal_train).unsqueeze(1)
    current_temporal_test = as_float_tensor(current_temporal_test).unsqueeze(1)

    return X_train, X_test, y_train, y_test, current_temporal_train, current_temporal_test

def train_autoencoder(autoencoder, train_loader, n_epochs, ae_optimizer, ae_criterion):
    """Train ``autoencoder`` to reconstruct its input.

    Args:
        autoencoder: Module whose call returns ``(encoded, decoded)``.
        train_loader: Iterable of batches; element 0 of each batch is the input.
        n_epochs: Number of passes over ``train_loader``.
        ae_optimizer: Optimizer over the autoencoder's parameters.
        ae_criterion: Loss called as ``ae_criterion(decoded, input)``.

    The mean batch loss is printed every 10th epoch. Gradients are clipped to
    a total norm of 1.0 before each optimizer step.
    """
    for epoch_number in range(1, n_epochs + 1):
        autoencoder.train()
        running_loss = 0
        for batch in train_loader:
            inputs = batch[0]
            ae_optimizer.zero_grad()
            _, reconstruction = autoencoder(inputs)
            batch_loss = ae_criterion(reconstruction, inputs)
            batch_loss.backward()
            torch.nn.utils.clip_grad_norm_(autoencoder.parameters(), max_norm=1.0)
            ae_optimizer.step()
            running_loss += batch_loss.item()

        if epoch_number % 10 != 0:
            continue
        print(f'Autoencoder Epoch [{epoch_number}/{n_epochs}], Loss: {running_loss/len(train_loader):.4f}')

def encode_data(autoencoder, data):
    """Return the encoder output for ``data`` without tracking gradients.

    Args:
        autoencoder: Module exposing an ``encoder`` sub-module; it is switched
            to evaluation mode and left there.
        data: Input tensor passed to ``autoencoder.encoder``.

    Returns:
        The tensor produced by ``autoencoder.encoder(data)``.
    """
    autoencoder.eval()
    with torch.no_grad():
        # `encoder` returns a single tensor, not an (encoded, decoded) pair.
        # Tuple-unpacking it would iterate over the batch dimension instead.
        encoded_data = autoencoder.encoder(data)
    return encoded_data

def train_lstm(model, train_loader, val_loader, n_epochs, optimizer, criterion, scheduler=None, patience=10):
    """Train ``model`` with validation-based checkpointing and early stopping.

    Args:
        model: Module called as ``model(X, current_temporal)`` and returning
            ``(output, attention_weights)``.
        train_loader: Iterable of ``(X, y, current_temporal)`` training batches.
        val_loader: Iterable of ``(X, y, current_temporal)`` validation batches.
        n_epochs: Maximum number of epochs.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss called as ``criterion(output, y)``.
        scheduler: Optional learning-rate scheduler stepped once per epoch.
        patience: Number of consecutive epochs without validation improvement
            after which training stops.

    Side effects:
        Writes the best state dict to ``'best_model.pth'`` in the current
        directory. When training ends (early or not) the model is reloaded
        from that checkpoint, so it always ends with the best validated
        weights rather than the weights of the last epoch.
    """
    checkpoint_path = 'best_model.pth'
    best_val_loss = float('inf')
    epochs_no_improve = 0
    # Track whether THIS run wrote a checkpoint so a stale file is never loaded.
    checkpoint_saved = False

    for epoch in range(n_epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch, current_temporal_batch in train_loader:
            optimizer.zero_grad()
            output, _ = model(X_batch, current_temporal_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()

        if scheduler is not None:
            scheduler.step()

        # Evaluate on validation set
        val_loss = 0
        model.eval()
        with torch.no_grad():
            for X_batch, y_batch, current_temporal_batch in val_loader:
                output, _ = model(X_batch, current_temporal_batch)
                val_loss += criterion(output, y_batch).item()

        val_loss /= len(val_loader)
        print(f'Epoch [{epoch+1}/{n_epochs}], Train Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}')

        # Checkpoint on improvement, otherwise count towards early stopping.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print('Early stopping!')
                break

    # Restore the best weights whether or not early stopping fired; otherwise
    # a run that uses all epochs would keep the (possibly worse) final weights.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path))

def evaluate_lstm(model, test_loader, scaler_target):
    """Run ``model`` over ``test_loader`` and collect forecasts and attention.

    Args:
        model: Module called as ``model(X, current_temporal)`` and returning
            ``(forecast, attention_weights)``; it is put in evaluation mode.
        test_loader: Iterable of ``(X, y, current_temporal)`` batches; ``y``
            is ignored.
        scaler_target: Object whose ``inverse_transform`` maps the stacked
            forecasts back to the original scale.

    Returns:
        ``(forecast_original_scale, attention_weights_array)`` with batches
        concatenated along axis 0.
    """
    model.eval()
    per_batch = []  # (forecast, attention) NumPy pairs, one per batch

    with torch.no_grad():
        for inputs, _, temporal in test_loader:
            forecast, weights = model(inputs, temporal)
            per_batch.append((forecast.cpu().numpy(), weights.cpu().numpy()))

    forecast_array = np.concatenate([forecast for forecast, _ in per_batch], axis=0)
    attention_weights_array = np.concatenate([weights for _, weights in per_batch], axis=0)

    # Undo the target scaling so forecasts are in the original units.
    return scaler_target.inverse_transform(forecast_array), attention_weights_array

def visualize_attention_weights(attention_weights_array, sample_index=0):
    """Show a one-row annotated heatmap of one sample's attention weights.

    Args:
        attention_weights_array: Array indexed by sample along its first axis.
        sample_index: Index of the sample to plot.
    """
    # Flatten the selected sample into a single heatmap row.
    heatmap_row = attention_weights_array[sample_index].reshape(1, -1)

    plt.figure(figsize=(10, 6))
    sns.heatmap(heatmap_row, cmap='viridis', annot=True)
    plt.title(f'Attention Weights for Sample Index {sample_index}')
    plt.xlabel('Input Sequence Index')
    plt.ylabel('Attention Weight')
    plt.show()

def main(file_path, features, target, current_temporal_features, n_timesteps, encoding_dim, hidden_dim,
         lstm_hidden_size, lstm_num_layers, lstm_dropout, lstm_learning_rate, lstm_n_epochs, ae_n_epochs,
         batch_size, patience):
    """Run the full pipeline: preprocess, train autoencoder, train LSTM, evaluate.

    Steps:
        1. Load and scale the CSV, build windows and split into train/test.
        2. Split the training part again into train/validation.
        3. Train the autoencoder on the training windows and encode all splits.
        4. Train the attention LSTM on the encoded windows with early stopping.
        5. Print forecasts and true values on the original scale, then plot
           the attention weights of the first test sample.
    """
    df, scaler_target = load_and_preprocess_data(file_path, features, target, current_temporal_features)

    # Train/test split of the windowed data.
    (X_train, X_test, y_train, y_test,
     current_temporal_train, current_temporal_test) = prepare_data(
        df, features, target, current_temporal_features, n_timesteps)

    # Carve a validation set out of the training data.
    (X_train, X_val, y_train, y_val,
     current_temporal_train, current_temporal_val) = train_test_split(
        X_train, y_train, current_temporal_train, test_size=0.2, random_state=42)

    # --- Autoencoder stage -------------------------------------------------
    ae_loader = DataLoader(TensorDataset(X_train), batch_size=batch_size, shuffle=True)
    input_dim = X_train.size(2)
    autoencoder = Autoencoder(input_dim, encoding_dim, hidden_dim)
    ae_optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)
    ae_criterion = nn.MSELoss()
    train_autoencoder(autoencoder, ae_loader, ae_n_epochs, ae_optimizer, ae_criterion)

    def encode_windows(windows):
        # Flatten (samples, steps, features) to rows, encode, restore the shape.
        flat_rows = windows.view(-1, windows.size(2))
        encoded_rows = encode_data(autoencoder, flat_rows)
        return encoded_rows.view(windows.size(0), windows.size(1), -1)

    encoded_train = encode_windows(X_train)
    encoded_val = encode_windows(X_val)
    encoded_test = encode_windows(X_test)

    # --- LSTM stage --------------------------------------------------------
    train_loader = DataLoader(
        TensorDataset(encoded_train, y_train, current_temporal_train),
        batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(
        TensorDataset(encoded_val, y_val, current_temporal_val),
        batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(
        TensorDataset(encoded_test, y_test, current_temporal_test),
        batch_size=batch_size, shuffle=False)

    input_size = encoded_train.size(2)  # width of the encoded features
    output_size = y_train.size(1)  # number of target columns
    model = LSTMEncoderDecoderWithAttention(input_size, lstm_hidden_size, output_size, lstm_num_layers, lstm_dropout)
    optimizer = optim.Adam(model.parameters(), lr=lstm_learning_rate)
    criterion = nn.MSELoss()
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    train_lstm(model, train_loader, val_loader, lstm_n_epochs, optimizer, criterion, scheduler, patience)

    # --- Evaluation --------------------------------------------------------
    forecast_original_scale, attention_weights_array = evaluate_lstm(model, test_loader, scaler_target)

    y_test_original_scale = scaler_target.inverse_transform(y_test.cpu().numpy())
    print("Predictions on original scale:", forecast_original_scale)
    print("True values on original scale:", y_test_original_scale)

    visualize_attention_weights(attention_weights_array)

# Pipeline configuration
file_path = 'time_series_data.csv'
features = [
    'rate_level_1',
    'rate_level_2',
    'days_to_end_of_month',
    'days_to_ECB_meeting',
    'days_to_Fed_meeting',
    'ois_sofr_rate',
]
target = ['rate_level_1', 'rate_level_2']
current_temporal_features = [
    'days_to_end_of_month',
    'days_to_ECB_meeting',
    'days_to_Fed_meeting',
    'ois_sofr_rate',
]
n_timesteps = 12  # window length fed to the LSTM encoder

# Autoencoder settings
encoding_dim = 20
hidden_dim = 128
ae_n_epochs = 50

# LSTM settings
lstm_hidden_size = 128
lstm_num_layers = 2
lstm_dropout = 0.3
lstm_learning_rate = 0.001
lstm_n_epochs = 50

# Shared training settings
batch_size = 64
patience = 10  # early-stopping patience, in epochs

# Run the pipeline
main(
    file_path=file_path,
    features=features,
    target=target,
    current_temporal_features=current_temporal_features,
    n_timesteps=n_timesteps,
    encoding_dim=encoding_dim,
    hidden_dim=hidden_dim,
    lstm_hidden_size=lstm_hidden_size,
    lstm_num_layers=lstm_num_layers,
    lstm_dropout=lstm_dropout,
    lstm_learning_rate=lstm_learning_rate,
    lstm_n_epochs=lstm_n_epochs,
    ae_n_epochs=ae_n_epochs,
    batch_size=batch_size,
    patience=patience,
)