# In [None]:
import torch
import torch.nn as nn

class LSTMTransformerEncoderDecoder(nn.Module):
    """Sequence forecaster: LSTM encoder -> Transformer encoder -> LSTM decoder.

    The historical sequence is encoded by an LSTM, projected to ``embed_dim``
    and offset by a learned positional table. A single token embedding
    today's macro data is appended (without a positional offset), and the
    resulting sequence is passed through a stack of Transformer encoder
    layers. The Transformer output at the last position (the macro token)
    seeds an LSTM decoder, which starts from the encoder LSTM's final
    hidden/cell state and is unrolled for ``output_seq_len`` steps.

    Args:
        input_dim: Number of features per time step; also the width of each
            forecast step.
        macro_dim: Number of features in the macro vector.
        embed_dim: Model width used by the Transformer layers.
        lstm_hidden_dim: Hidden size shared by the encoder and decoder LSTMs.
        lstm_layers: Number of stacked layers in each LSTM.
        num_heads: Attention heads per Transformer layer.
        num_layers: Number of Transformer encoder layers.
        dropout: Dropout rate passed to the Transformer layers and to
            ``self.dropout``.
        output_seq_len: Number of forecast steps produced by ``forward``.
            Must be at least 1. The decoder feedback module is chosen from
            this value at construction time.
        max_seq_len: Number of rows in the learned positional table, i.e.
            the longest ``past_sequence`` that ``forward`` accepts.

    Raises:
        ValueError: If ``output_seq_len`` is smaller than 1.
    """

    def __init__(
        self,
        input_dim: int,
        macro_dim: int,
        embed_dim: int,
        lstm_hidden_dim: int,
        lstm_layers: int,
        num_heads: int,
        num_layers: int,
        dropout: float = 0.1,
        output_seq_len: int = 1,
        max_seq_len: int = 1000,
    ) -> None:
        super().__init__()

        if output_seq_len < 1:
            raise ValueError(
                f"output_seq_len must be >= 1, got {output_seq_len}"
            )

        # LSTM encoder for the past sequence data.
        self.lstm_encoder = nn.LSTM(input_dim, lstm_hidden_dim, lstm_layers, batch_first=True)

        # Projects LSTM outputs to the Transformer width.
        self.past_embedding = nn.Linear(lstm_hidden_dim, embed_dim)

        # Projects the macro vector to the Transformer width.
        self.macro_embedding = nn.Linear(macro_dim, embed_dim)

        # Learned positional table for the past sequence, initialised to zeros.
        self.positional_encoding = nn.Parameter(torch.zeros(1, max_seq_len, embed_dim))

        # Transformer encoder layers (sequence-first layout, the layer default).
        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])

        # LSTM decoder: consumes embed_dim-wide inputs, emits lstm_hidden_dim.
        self.lstm_decoder = nn.LSTM(embed_dim, lstm_hidden_dim, lstm_layers, batch_first=True)

        # Maps each decoder step to a forecast of width input_dim.
        self.output_layer = nn.Linear(lstm_hidden_dim, input_dim)

        # Registered but not applied anywhere in forward().
        self.dropout = nn.Dropout(dropout)

        # Number of autoregressive decoder steps.
        self.output_seq_len = output_seq_len

        # The decoder emits lstm_hidden_dim-wide vectors but consumes
        # embed_dim-wide ones, so feeding its output back requires a
        # projection whenever the two widths differ. The projection is only
        # created for configurations that actually feed back (more than one
        # step) and need it; all other configurations keep exactly the same
        # parameter set as before. It is built last so the initialisation
        # order of the other modules is unaffected.
        if lstm_hidden_dim != embed_dim and output_seq_len > 1:
            self.decoder_feedback = nn.Linear(lstm_hidden_dim, embed_dim)
        else:
            self.decoder_feedback = nn.Identity()

    def forward(self, past_sequence: torch.Tensor, macro_today: torch.Tensor) -> torch.Tensor:
        """Forecast ``output_seq_len`` steps.

        Args:
            past_sequence: (batch_size, sequence_length, input_dim) historical
                sequence data.
            macro_today: (batch_size, macro_dim) today's macro data.

        Returns:
            Tensor of shape (batch_size, output_seq_len, input_dim).

        Raises:
            ValueError: If ``sequence_length`` exceeds the size of the
                positional table.
        """
        seq_length = past_sequence.size(1)
        max_seq_len = self.positional_encoding.size(1)
        if seq_length > max_seq_len:
            raise ValueError(
                f"past_sequence has {seq_length} steps but the positional "
                f"encoding only covers {max_seq_len}"
            )

        # Encode the history; keep the final states to initialise the decoder.
        lstm_out, (h_n, c_n) = self.lstm_encoder(past_sequence)

        # (batch_size, sequence_length, embed_dim), with positional offsets.
        past_embedded = self.past_embedding(lstm_out)
        past_embedded = past_embedded + self.positional_encoding[:, :seq_length, :]

        # Macro data becomes one extra token: (batch_size, 1, embed_dim).
        macro_token = self.macro_embedding(macro_today).unsqueeze(1)

        # (batch_size, sequence_length + 1, embed_dim)
        x = torch.cat([past_embedded, macro_token], dim=1)

        # The Transformer layers expect (sequence, batch, embed).
        x = x.permute(1, 0, 2)
        for layer in self.transformer_layers:
            x = layer(x)
        x = x.permute(1, 0, 2)

        # Seed the decoder with the Transformer output at the last position.
        decoder_input = x[:, -1, :].unsqueeze(1)  # (batch_size, 1, embed_dim)
        state = (h_n, c_n)

        step_outputs = []
        last_step = self.output_seq_len - 1
        for step in range(self.output_seq_len):
            # One decoder step: (batch_size, 1, lstm_hidden_dim).
            decoder_output, state = self.lstm_decoder(decoder_input, state)
            step_outputs.append(self.output_layer(decoder_output.squeeze(1)))

            # Autoregressive feedback, brought back to the decoder's input
            # width; skipped after the final step since nothing consumes it.
            if step < last_step:
                decoder_input = self.decoder_feedback(decoder_output)

        # (batch_size, output_seq_len, input_dim)
        return torch.stack(step_outputs, dim=1)

# Hyperparameters for the demo model
input_dim, macro_dim = 5, 3                  # features per time step / macro variables
embed_dim, num_heads, num_layers = 64, 4, 2  # Transformer width, attention heads, layers
lstm_hidden_dim, lstm_layers = 128, 2        # LSTM hidden width, stacked LSTM layers
dropout = 0.1                                # dropout rate
output_seq_len = 5                           # number of forecasted steps

# Build the model, naming each argument explicitly
model = LSTMTransformerEncoderDecoder(
    input_dim=input_dim,
    macro_dim=macro_dim,
    embed_dim=embed_dim,
    lstm_hidden_dim=lstm_hidden_dim,
    lstm_layers=lstm_layers,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout,
    output_seq_len=output_seq_len,
)

# Random stand-in inputs: 32 samples, each with 10 historical steps
past_sequence = torch.randn(32, 10, input_dim)
macro_today = torch.randn(32, macro_dim)

# Run the forward pass; intended output shape is (32, output_seq_len, input_dim)
output = model(past_sequence, macro_today)
print(output.shape)
