In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset
import joblib

# Define LSTM Encoder-Decoder Model with Attention
class Attention(nn.Module):
    """Additive attention over encoder time steps.

    Each encoder output is scored against a single decoder state as
    ``v . tanh(W [hidden; encoder_output])`` and the scores are normalised
    with a softmax across the sequence dimension.

    Args:
        hidden_size: Width shared by the decoder state and the encoder outputs.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Maps the concatenated [hidden; encoder_output] pair back to hidden_size.
        self.attn = nn.Linear(2 * hidden_size, hidden_size)
        # Learned scoring vector; its values are overwritten by init_weights().
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.init_weights()

    def init_weights(self):
        """Re-initialise the projection (Xavier, zero bias) and the scoring vector."""
        nn.init.xavier_uniform_(self.attn.weight)
        nn.init.constant_(self.attn.bias, 0)
        nn.init.uniform_(self.v, -0.1, 0.1)

    def forward(self, hidden, encoder_outputs):
        """Return softmax attention weights over the encoder sequence.

        Args:
            hidden: Decoder state of shape (batch, hidden_size).
            encoder_outputs: Encoder outputs of shape (batch, seq_len, hidden_size).

        Returns:
            Tensor of shape (batch, seq_len) whose rows sum to 1.
        """
        batch_size, seq_len = encoder_outputs.size(0), encoder_outputs.size(1)

        # Copy the single decoder state alongside every encoder time step.
        tiled_hidden = hidden.unsqueeze(1).repeat(1, seq_len, 1)
        combined = torch.cat((tiled_hidden, encoder_outputs), dim=2)

        # (batch, hidden_size, seq_len) so it can be left-multiplied by v.
        energy = torch.tanh(self.attn(combined)).transpose(1, 2)

        scoring_vector = self.v.repeat(batch_size, 1).unsqueeze(1)
        scores = torch.bmm(scoring_vector, energy).squeeze(1)
        return torch.softmax(scores, dim=1)

class LSTMEncoder(nn.Module):
    """Batch-first multi-layer LSTM that encodes an input window.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise every LSTM weight matrix and zero every bias."""
        for param_name, tensor in self.lstm.named_parameters():
            if 'weight' in param_name:
                nn.init.xavier_uniform_(tensor)
                continue
            if 'bias' in param_name:
                nn.init.constant_(tensor, 0)

    def forward(self, x):
        """Encode ``x`` of shape (batch, seq_len, input_size).

        Returns:
            Tuple ``(outputs, hidden, cell)`` exactly as produced by ``nn.LSTM``:
            per-step outputs plus the final hidden and cell states of each layer.
        """
        outputs, final_state = self.lstm(x)
        hidden, cell = final_state
        return outputs, hidden, cell

class LSTMDecoderWithAttention(nn.Module):
    """Single-step LSTM decoder driven by an attention context vector.

    The decoder attends over the encoder outputs using the top-layer hidden
    state, concatenates the resulting context with the temporal features of
    the step being predicted, runs one LSTM step, and projects the output to
    the target dimension.

    Args:
        hidden_size: Width of the encoder/decoder hidden state.
        output_size: Number of values predicted per sample.
        num_layers: Number of stacked LSTM layers (must match the encoder so
            its final states can seed this LSTM).
        dropout: Dropout probability passed to ``nn.LSTM``.
        temporal_feature_size: Number of temporal features concatenated to the
            context vector. Defaults to 4, the value that was previously
            hard-coded.
    """

    def __init__(self, hidden_size, output_size, num_layers, dropout, temporal_feature_size=4):
        super(LSTMDecoderWithAttention, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.temporal_feature_size = temporal_feature_size
        self.attention = Attention(hidden_size)
        # LSTM input is [context ; temporal features] along the feature axis.
        self.lstm = nn.LSTM(
            hidden_size + temporal_feature_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(hidden_size, output_size)
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise LSTM and output weights; zero all biases."""
        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                nn.init.xavier_uniform_(param)
            elif 'bias' in name:
                nn.init.constant_(param, 0)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)

    def forward(self, hidden, cell, encoder_outputs, current_temporal_features):
        """Run one decoding step.

        Args:
            hidden: Hidden states of shape (num_layers, batch, hidden_size).
            cell: Cell states of shape (num_layers, batch, hidden_size).
            encoder_outputs: Encoder outputs, (batch, seq_len, hidden_size).
            current_temporal_features: Features for the predicted step,
                (batch, 1, temporal_feature_size).

        Returns:
            Tuple ``(prediction, hidden, cell, attention_weights)`` where
            ``prediction`` is (batch, output_size) and ``attention_weights``
            is (batch, seq_len).
        """
        # Only the top layer's hidden state is used as the attention query.
        attention_weights = self.attention(hidden[-1], encoder_outputs)
        attention_weights = attention_weights.unsqueeze(1)
        # Weighted sum of encoder outputs -> (batch, 1, hidden_size).
        context = torch.bmm(attention_weights, encoder_outputs)
        decoder_input = torch.cat([context, current_temporal_features], dim=2)
        output, (hidden, cell) = self.lstm(decoder_input, (hidden, cell))
        # Drop the length-1 sequence axis left over from the single step.
        prediction = self.fc(output).squeeze(1)
        return prediction, hidden, cell, attention_weights.squeeze(1)

class LSTMEncoderDecoderWithAttention(nn.Module):
    """Encoder-decoder wrapper producing a one-step forecast with attention.

    Args:
        input_size: Number of features per encoder time step.
        hidden_size: Hidden width shared by encoder and decoder.
        output_size: Number of values predicted per sample.
        num_layers: Number of LSTM layers in both encoder and decoder.
        dropout: Dropout probability for both LSTMs.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout):
        super().__init__()
        self.encoder = LSTMEncoder(input_size, hidden_size, num_layers, dropout)
        self.decoder = LSTMDecoderWithAttention(hidden_size, output_size, num_layers, dropout)

    def forward(self, encoder_input, current_temporal_features):
        """Encode the input window, then decode a single step.

        Returns:
            Tuple ``(prediction, attention_weights)``; the decoder's updated
            hidden and cell states are discarded.
        """
        encoder_outputs, hidden, cell = self.encoder(encoder_input)
        decoded = self.decoder(hidden, cell, encoder_outputs, current_temporal_features)
        prediction, attention_weights = decoded[0], decoded[3]
        return prediction, attention_weights

def load_and_preprocess_data(file_path, features, target, current_temporal_features):
    """Load a CSV, fill gaps, min-max scale features and targets, and persist the scalers.

    Args:
        file_path: Path of the CSV file to read.
        features: Column names used as model inputs.
        target: Column names to predict (may overlap with ``features``).
        current_temporal_features: Unused here; kept for interface compatibility.

    Returns:
        Tuple ``(df, scaler_target)``: the DataFrame with the feature and target
        columns scaled, and the fitted target scaler for inverting predictions.

    Side effects:
        Writes ``scaler_features.pkl`` and ``scaler_target.pkl`` to the current
        working directory.
    """
    df = pd.read_csv(file_path)

    # Forward-fill then back-fill so leading gaps are covered as well.
    # DataFrame.ffill()/bfill() replace the deprecated fillna(method=...).
    if df.isnull().values.any():
        df = df.ffill().bfill()

    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()

    # Fit BOTH scalers on the raw values before writing anything back.
    # When target columns are also feature columns, scaling the features in
    # place first would leave the target scaler fitted on data already in
    # [0, 1], turning it into an identity map whose inverse_transform cannot
    # restore the original scale.
    scaled_features = scaler_features.fit_transform(df[features])
    scaled_target = scaler_target.fit_transform(df[target])

    df[features] = scaled_features
    df[target] = scaled_target

    # Save the scalers for later use (e.g., inverse transforming predictions)
    joblib.dump(scaler_features, 'scaler_features.pkl')
    joblib.dump(scaler_target, 'scaler_target.pkl')

    return df, scaler_target

def prepare_data(df, features, target, current_temporal_features, n_timesteps):
    """Window the DataFrame and return the result as float32 tensors.

    Args:
        df: Preprocessed DataFrame holding the feature and target columns.
        features: Column names that form each input window.
        target: Column names to predict.
        current_temporal_features: Columns read at the predicted time step.
        n_timesteps: Length of each input window.

    Returns:
        Tuple ``(X, y, current_temporal)``; ``current_temporal`` gets an extra
        axis at position 1 so it can act as a length-1 sequence.
    """
    sequences = create_sequences(df[features], df[target], current_temporal_features, n_timesteps)

    # Same float32 conversion for all three arrays returned by create_sequences.
    inputs, targets, temporal = (
        torch.tensor(array, dtype=torch.float32) for array in sequences
    )
    return inputs, targets, temporal.unsqueeze(1)

def create_sequences(data, target_data, current_temporal_features, n_timesteps):
    """Build sliding windows and their one-step-ahead targets.

    For every start row ``i`` the window is rows ``i .. i + n_timesteps - 1``
    of ``data``; the target and the temporal features are read from row
    ``i + n_timesteps``.

    Args:
        data: DataFrame of input features.
        target_data: DataFrame of target columns, row-aligned with ``data``.
        current_temporal_features: Columns of ``data`` to read at the
            predicted row.
        n_timesteps: Window length.

    Returns:
        Tuple of NumPy arrays ``(X, y, current_time_features)`` with one entry
        per window. All three are empty when ``data`` has no more than
        ``n_timesteps`` rows.
    """
    # Convert once up front: selecting columns and slicing the DataFrame
    # inside the loop rebuilt pandas objects on every iteration.
    feature_values = data.values
    target_values = target_data.values
    temporal_values = data[current_temporal_features].values

    window_starts = range(len(data) - n_timesteps)
    X = [feature_values[i:i + n_timesteps] for i in window_starts]
    y = [target_values[i + n_timesteps] for i in window_starts]
    current_time_features = [temporal_values[i + n_timesteps] for i in window_starts]
    return np.array(X), np.array(y), np.array(current_time_features)

def train_lstm(model, train_loader, n_epochs, optimizer, criterion, scheduler=None):
    """Train ``model`` in place for ``n_epochs`` passes over ``train_loader``.

    Args:
        model: Module called as ``model(X, current_temporal)`` and returning
            ``(output, attention_weights)``.
        train_loader: Iterable of ``(X, y, current_temporal)`` batches.
        n_epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer bound to the model's parameters.
        criterion: Loss applied to ``(output, y)``.
        scheduler: Optional learning-rate scheduler, stepped once per epoch.

    The mean batch loss is printed every 10th epoch.
    """
    for epoch_number in range(1, n_epochs + 1):
        model.train()
        running_loss = 0
        for X_batch, y_batch, current_temporal_batch in train_loader:
            optimizer.zero_grad()
            predictions, _ = model(X_batch, current_temporal_batch)
            batch_loss = criterion(predictions, y_batch)
            batch_loss.backward()
            # Cap the global gradient norm before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            running_loss += batch_loss.item()

        if scheduler:
            scheduler.step()

        if epoch_number % 10 == 0:
            print(f'Epoch [{epoch_number}/{n_epochs}], Loss: {running_loss/len(train_loader):.4f}')

def evaluate_lstm(model, test_loader, scaler_target):
    """Run the model over every batch and return forecasts in the original scale.

    Args:
        model: Module called as ``model(X, current_temporal)`` and returning
            ``(forecast, attention_weights)``.
        test_loader: Iterable of ``(X, y, current_temporal)`` batches; the
            targets are ignored.
        scaler_target: Object whose ``inverse_transform`` maps scaled forecasts
            back to the original units.

    Returns:
        Tuple ``(forecast_original_scale, attention_weights_array)``, each with
        one row per sample across all batches, in loader order.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()
    forecast_list = []
    attention_weights_list = []
    with torch.no_grad():
        for X_batch, _, current_temporal_batch in test_loader:
            forecast, attention_weights = model(X_batch, current_temporal_batch)
            # Keep every batch's forecast: previously only the last batch was
            # returned while attention weights covered the whole loader.
            forecast_list.append(forecast.cpu().numpy())
            attention_weights_list.append(attention_weights.cpu().numpy())

    if not forecast_list:
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    # Stack per-batch results along the sample axis.
    forecast_array = np.concatenate(forecast_list, axis=0)
    attention_weights_array = np.concatenate(attention_weights_list, axis=0)

    # Inverse-transform the predictions to the original scale
    forecast_original_scale = scaler_target.inverse_transform(forecast_array)
    return forecast_original_scale, attention_weights_array

def visualize_attention_weights(attention_weights_array, sample_index=0):
    """Show one sample's attention weights as a single-row annotated heatmap.

    Args:
        attention_weights_array: Array indexed by sample; the selected entry
            is flattened into one heatmap row.
        sample_index: Which sample to plot.

    NOTE(review): this relies on module-level names ``plt`` and ``sns`` that
    are not imported in the visible part of this file (presumably
    ``matplotlib.pyplot`` and ``seaborn``) -- confirm they are imported
    before calling.
    """
    selected_weights = attention_weights_array[sample_index]

    plt.figure(figsize=(10, 6))
    # Reshape to (1, n) so the heatmap draws one row with a cell per entry.
    sns.heatmap(selected_weights.reshape(1, -1), cmap='viridis', annot=True)
    plt.title(f'Attention Weights for Sample Index {sample_index}')
    plt.xlabel('Input Sequence Index')
    plt.ylabel('Attention Weight')
    plt.show()

def main_with_cross_validation(file_path, features, target, current_temporal_features, n_timesteps, hidden_dim, num_layers, dropout, learning_rate, n_epochs, batch_size, save_model_path, n_splits=5):
    """Train and validate a fresh model on each K-fold split, then save the last one.

    Args:
        file_path: CSV file to load.
        features: Input column names.
        target: Target column names.
        current_temporal_features: Columns read at the predicted step.
        n_timesteps: Length of each input window.
        hidden_dim: Hidden width of the encoder/decoder.
        num_layers: Number of LSTM layers.
        dropout: Dropout probability.
        learning_rate: Adam learning rate.
        n_epochs: Training epochs per fold.
        batch_size: Batch size for both loaders.
        save_model_path: Destination for the saved ``state_dict``.
        n_splits: Number of folds.

    Windows are assigned to folds with shuffling (fixed seed 42). A new model
    is built for every fold, so the weights written to ``save_model_path`` are
    those of the final fold only.
    """
    df, scaler_target = load_and_preprocess_data(file_path, features, target, current_temporal_features)
    X, y, current_temporal = prepare_data(df, features, target, current_temporal_features, n_timesteps)

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for fold, (train_index, val_index) in enumerate(splitter.split(X), start=1):
        print(f"Fold {fold}")

        # Slice tensors for this fold.
        X_train, y_train, current_temporal_train = X[train_index], y[train_index], current_temporal[train_index]
        X_val, y_val, current_temporal_val = X[val_index], y[val_index], current_temporal[val_index]

        # Fresh model per fold; sizes are read from the data itself.
        model = LSTMEncoderDecoderWithAttention(
            X_train.size(2),
            hidden_dim,
            y_train.size(1),
            num_layers,
            dropout,
        )

        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
        criterion = nn.MSELoss()
        # Learning rate is multiplied by 0.1 every 10 epochs.
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

        train_loader = DataLoader(
            TensorDataset(X_train, y_train, current_temporal_train),
            batch_size=batch_size,
            shuffle=True,
        )
        val_loader = DataLoader(
            TensorDataset(X_val, y_val, current_temporal_val),
            batch_size=batch_size,
            shuffle=False,
        )

        train_lstm(model, train_loader, n_epochs, optimizer, criterion, scheduler)

        # Validation MSE in the original target units.
        # NOTE(review): assumes evaluate_lstm returns one forecast row per
        # validation sample -- confirm.
        val_forecast, _ = evaluate_lstm(model, val_loader, scaler_target)
        y_val_original_scale = scaler_target.inverse_transform(y_val.cpu().numpy())
        val_loss = np.mean(np.square(val_forecast - y_val_original_scale))
        print(f"Validation Loss: {val_loss}")

    # Persist the model left over from the last fold.
    torch.save(model.state_dict(), save_model_path)
    print(f"Model saved to {save_model_path}")

# Run configuration: data source and column roles
file_path = 'time_series_data.csv'
features = ['rate_level_1', 'rate_level_2', 'days_to_end_of_month', 'days_to_ECB_meeting', 'days_to_Fed_meeting', 'ois_sofr_rate']
target = ['rate_level_1', 'rate_level_2']
current_temporal_features = ['days_to_end_of_month', 'days_to_ECB_meeting', 'days_to_Fed_meeting', 'ois_sofr_rate']

# Windowing and model size
n_timesteps = 12
hidden_dim = 128
num_layers = 2
dropout = 0.3

# Optimisation settings
learning_rate = 0.001
n_epochs = 50
batch_size = 64

# Output location and cross-validation folds
save_model_path = 'pretrained_model.pth'
n_splits = 5

# Train with K-fold cross-validation; arguments are passed by keyword so the
# long parameter list cannot be silently mis-ordered.
main_with_cross_validation(
    file_path=file_path,
    features=features,
    target=target,
    current_temporal_features=current_temporal_features,
    n_timesteps=n_timesteps,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout=dropout,
    learning_rate=learning_rate,
    n_epochs=n_epochs,
    batch_size=batch_size,
    save_model_path=save_model_path,
    n_splits=n_splits,
)