In [None]:
import torch
import torch.nn as nn

class LSTMTransformerEncoderDecoder(nn.Module):
    """Hybrid LSTM/Transformer forecaster conditioned on same-day macro data.

    Pipeline implemented by ``forward``:
      1. An LSTM encodes the historical sequence.
      2. The per-step LSTM outputs are projected to ``embed_dim`` and a learned
         positional encoding is added.
      3. The embedded macro vector is appended as one extra token and the
         tokens are passed through a stack of Transformer encoder layers.
      4. An LSTM decoder, seeded with the encoder LSTM's final state, is
         unrolled for ``output_seq_len`` steps; every step is mapped back to
         ``input_dim``.

    Notes:
      * The learned positional table holds 1000 positions, so the historical
        sequence must not be longer than 1000 steps.
      * The decoder is initialised with the encoder's ``(h_n, c_n)``, which is
        why both LSTMs share ``lstm_hidden_dim`` and ``lstm_layers``.
    """

    def __init__(self, input_dim, macro_dim, embed_dim, lstm_hidden_dim, lstm_layers, num_heads, num_layers, dropout=0.1, output_seq_len=1):
        """Build all sub-modules.

        Args:
            input_dim: Number of features per step of the historical sequence
                (also the width of every forecast step).
            macro_dim: Number of macro features supplied for "today".
            embed_dim: Token width used by the Transformer layers.
            lstm_hidden_dim: Hidden size of the encoder and decoder LSTMs.
            lstm_layers: Number of stacked layers in each LSTM.
            num_heads: Attention heads per Transformer layer.
            num_layers: Number of Transformer encoder layers.
            dropout: Dropout probability used inside the Transformer layers.
            output_seq_len: Number of forecast steps produced by ``forward``.
        """
        super(LSTMTransformerEncoderDecoder, self).__init__()

        # LSTM encoder for the past sequence data
        self.lstm_encoder = nn.LSTM(input_dim, lstm_hidden_dim, lstm_layers, batch_first=True)

        # Projects LSTM outputs to the Transformer token width
        self.past_embedding = nn.Linear(lstm_hidden_dim, embed_dim)

        # Macro data embedding
        self.macro_embedding = nn.Linear(macro_dim, embed_dim)

        # Learned positional encoding for the past sequence (max length 1000)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 1000, embed_dim))

        # Transformer encoder layers (sequence-first layout, the layer default)
        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])

        # LSTM decoder; consumes embed_dim-wide inputs
        self.lstm_decoder = nn.LSTM(embed_dim, lstm_hidden_dim, lstm_layers, batch_first=True)

        # Maps a decoder step to a forecast in the input feature space
        self.output_layer = nn.Linear(lstm_hidden_dim, input_dim)

        # NOTE: this module is registered but not applied anywhere in forward().
        self.dropout = nn.Dropout(dropout)

        # Output sequence length for forecasting multiple steps
        self.output_seq_len = output_seq_len

        # The decoder emits lstm_hidden_dim-wide vectors but consumes
        # embed_dim-wide ones, so its output must be projected before it can be
        # fed back as the next input. It is only created when more than one
        # step is unrolled, so a single-step model carries no parameter that
        # never receives a gradient. Created last so the initialisation of the
        # layers above is unaffected for a given RNG seed.
        self.decoder_feedback = nn.Linear(lstm_hidden_dim, embed_dim) if output_seq_len > 1 else None

    def forward(self, past_sequence, macro_today):
        """Forecast ``output_seq_len`` future steps.

        Args:
            past_sequence: (batch_size, sequence_length, input_dim) historical data.
            macro_today: (batch_size, macro_dim) macro features for today.

        Returns:
            Tensor of shape (batch_size, output_seq_len, input_dim).
        """
        # LSTM encoder: lstm_out is (batch_size, sequence_length, lstm_hidden_dim)
        lstm_out, (h_n, c_n) = self.lstm_encoder(past_sequence)

        # Project to the Transformer width and add positional information
        past_embedded = self.past_embedding(lstm_out)  # (batch_size, sequence_length, embed_dim)
        seq_length = past_embedded.size(1)
        past_embedded = past_embedded + self.positional_encoding[:, :seq_length, :]

        # Today's macro data becomes one extra token: (batch_size, 1, embed_dim)
        macro_today_embedded = self.macro_embedding(macro_today).unsqueeze(1)

        # (batch_size, sequence_length + 1, embed_dim)
        x = torch.cat([past_embedded, macro_today_embedded], dim=1)

        # The Transformer layers expect (sequence_length + 1, batch_size, embed_dim)
        x = x.permute(1, 0, 2)
        for layer in self.transformer_layers:
            x = layer(x)
        x = x.permute(1, 0, 2)

        # The decoder starts from the Transformer output at the macro token
        decoder_input = x[:, -1, :].unsqueeze(1)  # (batch_size, 1, embed_dim)

        # Initialise the decoder with the encoder LSTM's final states
        hidden_state, cell_state = h_n, c_n

        decoder_outputs = []
        last_step = self.output_seq_len - 1
        for step in range(self.output_seq_len):
            # One decoder step: decoder_output is (batch_size, 1, lstm_hidden_dim)
            decoder_output, (hidden_state, cell_state) = self.lstm_decoder(decoder_input, (hidden_state, cell_state))

            # Forecast for this step: (batch_size, input_dim)
            decoder_outputs.append(self.output_layer(decoder_output.squeeze(1)))

            # Autoregressive feedback: project back to the decoder input width.
            # Skipped after the final step because nothing consumes it.
            if step < last_step:
                decoder_input = self.decoder_feedback(decoder_output)

        # (batch_size, output_seq_len, input_dim)
        return torch.stack(decoder_outputs, dim=1)

# --- Demo hyperparameters ---------------------------------------------------
input_dim = 5          # features per time step of the historical sequence
macro_dim = 3          # macroeconomic variables observed "today"
embed_dim = 64         # token width used by the Transformer layers
lstm_hidden_dim = 128  # hidden size of the encoder and decoder LSTMs
lstm_layers = 2        # stacked layers in each LSTM
num_heads = 4          # attention heads per Transformer layer
num_layers = 2         # number of Transformer encoder layers
dropout = 0.1          # dropout probability
output_seq_len = 5     # forecast horizon (number of steps)

# --- Build the model --------------------------------------------------------
model = LSTMTransformerEncoderDecoder(
    input_dim=input_dim,
    macro_dim=macro_dim,
    embed_dim=embed_dim,
    lstm_hidden_dim=lstm_hidden_dim,
    lstm_layers=lstm_layers,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout,
    output_seq_len=output_seq_len,
)

# --- Smoke test on random data: 32 windows of 10 steps each ------------------
past_sequence = torch.randn((32, 10, input_dim))  # historical sequence data
macro_today = torch.randn((32, macro_dim))        # today's macroeconomic data

# Run one forward pass and report the forecast shape
output = model(past_sequence, macro_today)
print(output.shape)  # Expected: (32, output_seq_len, input_dim)


In [None]:
import torch
import os
import time
import math
from torch.utils.data import DataLoader
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

def train_model(model, dataset, optimizer, criterion, config):
    """Train ``model`` on ``dataset``, optionally under DistributedDataParallel.

    DDP is enabled when the ``RANK`` environment variable is set (as done by
    ``torchrun``); ``LOCAL_RANK`` then selects the CUDA device.

    Args:
        model: The ``nn.Module`` to train. It is moved to the target device in
            place, so an optimizer created beforehand keeps valid references.
        dataset: Dataset whose samples are ``(*inputs, target)``. A sample of
            ``(x, y)`` calls ``model(x)``; ``(a, b, y)`` or ``((a, b), y)``
            calls ``model(a, b)``.
        optimizer: Optimizer built over ``model``'s parameters.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar loss.
        config: Dict with keys ``batch_size``, ``num_epochs``, ``log_interval``,
            ``eval_interval`` (``<= 0`` disables evaluation/checkpointing),
            ``device`` (non-DDP runs), ``backend`` (DDP runs), optional
            ``grad_clip``, plus whatever ``evaluate_and_checkpoint`` reads.

    Returns:
        float: Mean over epochs of the per-epoch mean batch loss, or ``0.0``
        when no batch was processed.
    """
    # Only needed for DDP runs; imported locally so this cell stays self-contained.
    from torch.utils.data.distributed import DistributedSampler

    ddp = int(os.environ.get('RANK', -1)) != -1
    if ddp:
        init_process_group(backend=config['backend'])

    try:
        if ddp:
            ddp_rank = int(os.environ['RANK'])
            ddp_local_rank = int(os.environ['LOCAL_RANK'])
            device = f'cuda:{ddp_local_rank}'
            torch.cuda.set_device(device)
        else:
            ddp_rank = 0
            device = config['device']
        is_master = ddp_rank == 0  # only this rank logs and writes checkpoints

        # The module must already live on its GPU when DDP wraps it.
        model.to(device)
        if hasattr(criterion, 'to'):
            criterion.to(device)

        raw_model = model  # unwrapped module, used for evaluation/checkpointing
        if ddp:
            model = DDP(raw_model, device_ids=[ddp_local_rank])

        # Under DDP every rank must see a distinct shard of the data.
        sampler = DistributedSampler(dataset, shuffle=True) if ddp else None
        dataloader = DataLoader(
            dataset,
            batch_size=config['batch_size'],
            shuffle=sampler is None,
            sampler=sampler,
            pin_memory=torch.device(device).type == 'cuda',  # pinning only helps GPU transfers
        )

        epoch_mean_losses = []
        model.train()
        for epoch in range(config['num_epochs']):
            if sampler is not None:
                sampler.set_epoch(epoch)  # reshuffle differently each epoch

            running_loss = 0.0  # window since the last log line
            epoch_loss = 0.0    # whole epoch, never reset by logging
            num_batches = 0
            for i, batch in enumerate(dataloader):
                # Last element is the target; everything before it is model input.
                *inputs, targets = batch
                if len(inputs) == 1 and isinstance(inputs[0], (list, tuple)):
                    inputs = list(inputs[0])
                inputs = [tensor.to(device) for tensor in inputs]
                targets = targets.to(device)

                # Forward pass
                optimizer.zero_grad()
                outputs = model(*inputs)
                loss = criterion(outputs, targets)

                # Backward pass
                loss.backward()

                # Gradient clipping (optional)
                if config.get('grad_clip', 0.0) > 0.0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config['grad_clip'])

                optimizer.step()

                loss_value = loss.item()
                running_loss += loss_value
                epoch_loss += loss_value
                num_batches += 1

                if (i + 1) % config['log_interval'] == 0:
                    if is_master:
                        print(f"Epoch [{epoch + 1}/{config['num_epochs']}], Step [{i + 1}/{len(dataloader)}], "
                              f"Loss: {running_loss / config['log_interval']:.4f}")
                    running_loss = 0.0

                # Evaluate/checkpoint after every eval_interval completed steps.
                # A copy of config carries the 1-based epoch so the caller's
                # dict is not mutated.
                if is_master and config['eval_interval'] > 0 and (i + 1) % config['eval_interval'] == 0:
                    evaluate_and_checkpoint(raw_model, optimizer, device, dict(config, current_epoch=epoch + 1))

            if num_batches:
                epoch_mean_losses.append(epoch_loss / num_batches)

        if not epoch_mean_losses:
            return 0.0
        return sum(epoch_mean_losses) / len(epoch_mean_losses)
    finally:
        # Always release the process group, even when training raised.
        if ddp:
            destroy_process_group()


def evaluate_and_checkpoint(model, optimizer, device, config):
    """Run the (placeholder) evaluation loop and save a checkpoint.

    Args:
        model: Module to evaluate and whose ``state_dict`` is saved.
        optimizer: Optimizer whose ``state_dict`` is saved.
        device: Target device; unused by the placeholder evaluation loop.
        config: Dict with ``eval_iters`` (number of evaluation iterations),
            ``out_dir`` (checkpoint directory, created if missing) and an
            optional ``current_epoch`` used in the file name (defaults to 0).

    The checkpoint is written to
    ``<out_dir>/checkpoint_epoch_<current_epoch>.pt`` with the keys
    ``model_state_dict`` and ``optimizer_state_dict``.
    """
    was_training = model.training
    model.eval()
    eval_loss = 0.0
    try:
        # Dummy evaluation loop (replace with actual validation data and logic)
        with torch.no_grad():
            for _ in range(config['eval_iters']):
                # Perform evaluation here; set eval_loss appropriately
                pass
    finally:
        # Restore whichever mode the caller had, instead of forcing train mode.
        model.train(was_training)

    print(f"Eval loss: {eval_loss}")

    # torch.save does not create missing directories.
    out_dir = config['out_dir']
    os.makedirs(out_dir, exist_ok=True)

    # Callers that do not track the epoch still get a valid file name.
    epoch_tag = config.get('current_epoch', 0)
    checkpoint_path = os.path.join(out_dir, f"checkpoint_epoch_{epoch_tag}.pt")
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")


In [None]:
import inspect

class LSTMTransformerEncoderDecoder(nn.Module):
    """Hybrid LSTM/Transformer forecaster with an AdamW optimizer factory.

    ``forward`` encodes the historical sequence with an LSTM, mixes it with
    today's macro token through Transformer encoder layers, and unrolls an
    LSTM decoder for ``output_seq_len`` steps. ``configure_optimizers`` builds
    an AdamW optimizer that applies weight decay only to tensors with two or
    more dimensions.

    The learned positional table holds 1000 positions, so the historical
    sequence must not be longer than 1000 steps.
    """

    def __init__(self, input_dim, macro_dim, embed_dim, lstm_hidden_dim, lstm_layers, num_heads, num_layers, dropout=0.1, output_seq_len=1):
        """Build all sub-modules.

        Args:
            input_dim: Features per step of the historical sequence (and of each forecast step).
            macro_dim: Number of macro features supplied for "today".
            embed_dim: Token width used by the Transformer layers.
            lstm_hidden_dim: Hidden size of the encoder and decoder LSTMs.
            lstm_layers: Number of stacked layers in each LSTM.
            num_heads: Attention heads per Transformer layer.
            num_layers: Number of Transformer encoder layers.
            dropout: Dropout probability used inside the Transformer layers.
            output_seq_len: Number of forecast steps produced by ``forward``.
        """
        super(LSTMTransformerEncoderDecoder, self).__init__()

        self.lstm_encoder = nn.LSTM(input_dim, lstm_hidden_dim, lstm_layers, batch_first=True)
        self.past_embedding = nn.Linear(lstm_hidden_dim, embed_dim)
        self.macro_embedding = nn.Linear(macro_dim, embed_dim)

        # Learned positional encoding for the past sequence (max length 1000)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 1000, embed_dim))

        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])

        self.lstm_decoder = nn.LSTM(embed_dim, lstm_hidden_dim, lstm_layers, batch_first=True)
        self.output_layer = nn.Linear(lstm_hidden_dim, input_dim)

        # NOTE: this module is registered but not applied anywhere in forward().
        self.dropout = nn.Dropout(dropout)

        self.output_seq_len = output_seq_len

        # Projects a decoder output (lstm_hidden_dim) back to the decoder input
        # width (embed_dim) for autoregressive feedback. Only created when more
        # than one step is unrolled so that no parameter is left without a
        # gradient in single-step models.
        self.decoder_feedback = nn.Linear(lstm_hidden_dim, embed_dim) if output_seq_len > 1 else None

    def forward(self, past_sequence, macro_today):
        """Forecast ``output_seq_len`` future steps.

        Args:
            past_sequence: (batch_size, sequence_length, input_dim) historical data.
            macro_today: (batch_size, macro_dim) macro features for today.

        Returns:
            Tensor of shape (batch_size, output_seq_len, input_dim).
        """
        lstm_out, (h_n, c_n) = self.lstm_encoder(past_sequence)

        past_embedded = self.past_embedding(lstm_out)  # (batch_size, sequence_length, embed_dim)
        seq_length = past_embedded.size(1)
        past_embedded = past_embedded + self.positional_encoding[:, :seq_length, :]

        # Today's macro data becomes one extra token: (batch_size, 1, embed_dim)
        macro_today_embedded = self.macro_embedding(macro_today).unsqueeze(1)
        x = torch.cat([past_embedded, macro_today_embedded], dim=1)

        # The Transformer layers expect (sequence_length + 1, batch_size, embed_dim)
        x = x.permute(1, 0, 2)
        for layer in self.transformer_layers:
            x = layer(x)
        x = x.permute(1, 0, 2)

        # Start decoding from the Transformer output at the macro token
        decoder_input = x[:, -1, :].unsqueeze(1)
        hidden_state, cell_state = h_n, c_n

        decoder_outputs = []
        last_step = self.output_seq_len - 1
        for step in range(self.output_seq_len):
            decoder_output, (hidden_state, cell_state) = self.lstm_decoder(decoder_input, (hidden_state, cell_state))
            decoder_outputs.append(self.output_layer(decoder_output.squeeze(1)))
            # Feed the projected output back; skipped after the final step.
            if step < last_step:
                decoder_input = self.decoder_feedback(decoder_output)

        return torch.stack(decoder_outputs, dim=1)

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        """Create an AdamW optimizer with per-group weight decay.

        Trainable tensors with ``dim() >= 2`` are decayed with ``weight_decay``;
        trainable tensors with fewer dimensions get no decay.

        Args:
            weight_decay: Decay coefficient for the decayed group.
            learning_rate: AdamW learning rate.
            betas: AdamW ``(beta1, beta2)`` tuple.
            device_type: ``'cuda'`` to request the fused AdamW kernel; it is
                only enabled when this PyTorch build exposes ``fused`` and
                the trainable parameters are already on a CUDA device.

        Returns:
            torch.optim.AdamW: The configured optimizer.
        """
        # Only parameters that require gradients are optimised
        param_dict = {pn: p for pn, p in self.named_parameters() if p.requires_grad}

        # Separate parameters into decay and no-decay groups
        decay_params = [p for p in param_dict.values() if p.dim() >= 2]
        nodecay_params = [p for p in param_dict.values() if p.dim() < 2]

        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]

        # Print parameter count information
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")

        # Fused AdamW is requested only when the parameters really are on CUDA;
        # asking for it while the model is still on the CPU can make the
        # optimizer constructor fail.
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        params_on_cuda = bool(param_dict) and all(p.is_cuda for p in param_dict.values())
        use_fused = fused_available and device_type == 'cuda' and params_on_cuda
        extra_args = dict(fused=True) if use_fused else dict()

        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)

        print(f"Using fused AdamW: {use_fused}")

        return optimizer

# Example usage
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the model to the target device BEFORE building the optimizer: when
# device_type is 'cuda', configure_optimizers asks for fused AdamW, and that
# request must match where the parameters actually live.
model = LSTMTransformerEncoderDecoder(input_dim, macro_dim, embed_dim, lstm_hidden_dim, lstm_layers, num_heads, num_layers, dropout, output_seq_len).to(device_type)
optimizer = model.configure_optimizers(weight_decay=0.01, learning_rate=1e-4, betas=(0.9, 0.999), device_type=device_type)


In [None]:
import torch
import os
from torch.utils.data import DataLoader
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

def train_model(model, dataset, optimizer, criterion, config):
    """Run the training loop, with DistributedDataParallel when launched under DDP.

    DDP is detected through the ``RANK`` environment variable; ``LOCAL_RANK``
    then selects the CUDA device used by this process.

    Args:
        model: The ``nn.Module`` to train. It is moved to the target device in
            place, so an optimizer created beforehand keeps valid references.
        dataset: Dataset whose samples are ``(*inputs, target)``. A sample of
            ``(x, y)`` calls ``model(x)``; ``(a, b, y)`` or ``((a, b), y)``
            calls ``model(a, b)``.
        optimizer: Optimizer built over ``model``'s parameters.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar loss.
        config: Dict with keys ``batch_size``, ``num_epochs``, ``log_interval``,
            ``eval_interval`` (``<= 0`` disables evaluation/checkpointing),
            ``device`` (non-DDP runs), ``backend`` (DDP runs), optional
            ``grad_clip``, plus whatever ``evaluate_and_checkpoint`` reads.

    Returns:
        float: Mean over epochs of the per-epoch mean batch loss, or ``0.0``
        when no batch was processed.
    """
    # Only needed for DDP runs; imported locally so this cell stays self-contained.
    from torch.utils.data.distributed import DistributedSampler

    # Distributed Data Parallel (DDP) initialization
    ddp = int(os.environ.get('RANK', -1)) != -1
    if ddp:
        init_process_group(backend=config['backend'])

    try:
        if ddp:
            ddp_rank = int(os.environ['RANK'])
            ddp_local_rank = int(os.environ['LOCAL_RANK'])
            device = f'cuda:{ddp_local_rank}'
            torch.cuda.set_device(device)
        else:
            ddp_rank = 0
            device = config['device']
        is_master = ddp_rank == 0  # only this rank logs and writes checkpoints

        # DDP requires the module to be on its GPU before it is wrapped.
        model.to(device)
        if hasattr(criterion, 'to'):
            criterion.to(device)

        raw_model = model  # unwrapped module, used for evaluation/checkpointing
        if ddp:
            model = DDP(raw_model, device_ids=[ddp_local_rank])

        # Each rank gets its own shard of the time-series windows under DDP.
        sampler = DistributedSampler(dataset, shuffle=True) if ddp else None
        dataloader = DataLoader(
            dataset,
            batch_size=config['batch_size'],
            shuffle=sampler is None,
            sampler=sampler,
            pin_memory=torch.device(device).type == 'cuda',  # pinning only helps GPU transfers
        )

        epoch_mean_losses = []
        model.train()
        for epoch in range(config['num_epochs']):
            if sampler is not None:
                sampler.set_epoch(epoch)  # reshuffle differently each epoch

            running_loss = 0.0  # window since the last log line
            epoch_loss = 0.0    # whole epoch, never reset by logging
            num_batches = 0
            for i, batch in enumerate(dataloader):
                # Last element is the target; everything before it is model input.
                *inputs, targets = batch
                if len(inputs) == 1 and isinstance(inputs[0], (list, tuple)):
                    inputs = list(inputs[0])
                inputs = [tensor.to(device) for tensor in inputs]
                targets = targets.to(device)

                # Forward pass for the hybrid model
                optimizer.zero_grad()
                outputs = model(*inputs)
                loss = criterion(outputs, targets)

                # Backward pass and optimization
                loss.backward()

                # Optional: gradient clipping for stability
                if config.get('grad_clip', 0.0) > 0.0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config['grad_clip'])

                optimizer.step()

                loss_value = loss.item()
                running_loss += loss_value
                epoch_loss += loss_value
                num_batches += 1

                if (i + 1) % config['log_interval'] == 0:
                    if is_master:
                        print(f"Epoch [{epoch + 1}/{config['num_epochs']}], Step [{i + 1}/{len(dataloader)}], "
                              f"Loss: {running_loss / config['log_interval']:.4f}")
                    running_loss = 0.0

                # Optional: evaluation and checkpointing after every
                # eval_interval completed steps. A copy of config carries the
                # 1-based epoch so the caller's dict is not mutated.
                if is_master and config['eval_interval'] > 0 and (i + 1) % config['eval_interval'] == 0:
                    evaluate_and_checkpoint(raw_model, optimizer, device, dict(config, current_epoch=epoch + 1))

            if num_batches:
                epoch_mean_losses.append(epoch_loss / num_batches)

        if not epoch_mean_losses:
            return 0.0
        return sum(epoch_mean_losses) / len(epoch_mean_losses)
    finally:
        # Always release the process group, even when training raised.
        if ddp:
            destroy_process_group()


def evaluate_and_checkpoint(model, optimizer, device, config):
    """Run the (placeholder) evaluation loop and write a checkpoint to disk.

    Args:
        model: Module to evaluate and whose ``state_dict`` is saved.
        optimizer: Optimizer whose ``state_dict`` is saved.
        device: Target device; unused by the placeholder evaluation loop.
        config: Dict with ``eval_iters`` (number of evaluation iterations),
            ``out_dir`` (checkpoint directory, created if missing) and an
            optional ``current_epoch`` used in the file name (defaults to 0).

    The checkpoint is written to
    ``<out_dir>/checkpoint_epoch_<current_epoch>.pt`` with the keys
    ``model_state_dict`` and ``optimizer_state_dict``.
    """
    was_training = model.training
    model.eval()
    eval_loss = 0.0
    try:
        # Dummy evaluation loop; replace with actual validation set and metrics
        with torch.no_grad():
            for _ in range(config['eval_iters']):
                # Perform evaluation on validation data here
                pass
    finally:
        # Put the model back in the mode the caller had it in.
        model.train(was_training)

    print(f"Eval loss: {eval_loss}")

    # torch.save does not create missing directories.
    out_dir = config['out_dir']
    os.makedirs(out_dir, exist_ok=True)

    # Callers that do not track the epoch still get a valid file name.
    epoch_tag = config.get('current_epoch', 0)
    checkpoint_path = os.path.join(out_dir, f"checkpoint_epoch_{epoch_tag}.pt")
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)
    print(f"Checkpoint saved at {checkpoint_path}")


In [None]:
import torch
import torch.nn as nn

class LSTMTransformerEncoderDecoder(nn.Module):
    """Hybrid LSTM/Transformer forecaster using sinusoidal positional encodings.

    Pipeline implemented by ``forward``:
      1. An LSTM encodes the historical sequence.
      2. The per-step LSTM outputs are projected to ``embed_dim`` and a
         sinusoidal positional encoding, generated for the actual sequence
         length, is added.
      3. The embedded macro vector is appended as one extra token and the
         tokens are passed through a stack of Transformer encoder layers.
      4. An LSTM decoder, seeded with the encoder LSTM's final state, is
         unrolled for ``output_seq_len`` steps; every step is mapped back to
         ``input_dim``.

    The decoder is initialised with the encoder's ``(h_n, c_n)``, which is why
    both LSTMs share ``lstm_hidden_dim`` and ``lstm_layers``.
    """

    def __init__(self, input_dim, macro_dim, embed_dim, lstm_hidden_dim, lstm_layers, num_heads, num_layers, dropout=0.1, output_seq_len=1):
        """Build all sub-modules.

        Args:
            input_dim: Features per step of the historical sequence (and of each forecast step).
            macro_dim: Number of macro features supplied for "today".
            embed_dim: Token width used by the Transformer layers.
            lstm_hidden_dim: Hidden size of the encoder and decoder LSTMs.
            lstm_layers: Number of stacked layers in each LSTM.
            num_heads: Attention heads per Transformer layer.
            num_layers: Number of Transformer encoder layers.
            dropout: Dropout probability used inside the Transformer layers.
            output_seq_len: Number of forecast steps produced by ``forward``.
        """
        super(LSTMTransformerEncoderDecoder, self).__init__()

        # LSTM encoder for the past sequence data
        self.lstm_encoder = nn.LSTM(input_dim, lstm_hidden_dim, lstm_layers, batch_first=True)

        # Projects LSTM outputs to the Transformer token width
        self.past_embedding = nn.Linear(lstm_hidden_dim, embed_dim)

        # Macro data embedding
        self.macro_embedding = nn.Linear(macro_dim, embed_dim)

        # Transformer encoder layers (sequence-first layout, the layer default)
        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout)
            for _ in range(num_layers)
        ])

        # LSTM decoder; consumes embed_dim-wide inputs
        self.lstm_decoder = nn.LSTM(embed_dim, lstm_hidden_dim, lstm_layers, batch_first=True)

        # Maps a decoder step to a forecast in the input feature space
        self.output_layer = nn.Linear(lstm_hidden_dim, input_dim)

        # NOTE: this module is registered but not applied anywhere in forward().
        self.dropout = nn.Dropout(dropout)

        # Output sequence length for forecasting multiple steps
        self.output_seq_len = output_seq_len

        # The decoder emits lstm_hidden_dim-wide vectors but consumes
        # embed_dim-wide ones, so its output must be projected before it can be
        # fed back. Only created when more than one step is unrolled so that a
        # single-step model has no parameter that never receives a gradient.
        # Created last so the initialisation of the layers above is unaffected
        # for a given RNG seed.
        self.decoder_feedback = nn.Linear(lstm_hidden_dim, embed_dim) if output_seq_len > 1 else None

    def get_positional_encoding(self, seq_len, embed_dim, device):
        """Return sinusoidal positional encodings of shape (1, seq_len, embed_dim).

        Even channels hold ``sin(pos * freq)`` and odd channels hold
        ``cos(pos * freq)``. Odd ``embed_dim`` values are supported: the last
        sine channel simply has no cosine partner.
        """
        # Build everything directly on the target device (no host-to-device copies).
        position = torch.arange(0, seq_len, dtype=torch.float, device=device).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, embed_dim, 2, dtype=torch.float, device=device)
            * -(torch.log(torch.tensor(10000.0, device=device)) / embed_dim)
        )
        angles = position * div_term  # (seq_len, ceil(embed_dim / 2))

        pe = torch.zeros(seq_len, embed_dim, device=device)
        pe[:, 0::2] = torch.sin(angles)
        # There are only embed_dim // 2 odd channels; trim for odd embed_dim.
        pe[:, 1::2] = torch.cos(angles[:, : embed_dim // 2])

        return pe.unsqueeze(0)  # Shape: (1, seq_len, embed_dim)

    def forward(self, past_sequence, macro_today):
        """Forecast ``output_seq_len`` future steps.

        Args:
            past_sequence: (batch_size, sequence_length, input_dim) historical data.
            macro_today: (batch_size, macro_dim) macro features for today.

        Returns:
            Tensor of shape (batch_size, output_seq_len, input_dim).
        """
        device = past_sequence.device

        # LSTM encoder: lstm_out is (batch_size, sequence_length, lstm_hidden_dim)
        lstm_out, (h_n, c_n) = self.lstm_encoder(past_sequence)

        # Project to the Transformer width
        past_embedded = self.past_embedding(lstm_out)  # (batch_size, sequence_length, embed_dim)

        # Positional encoding sized to the actual sequence length; cast so a
        # reduced-precision model is not silently promoted to float32.
        seq_length = past_embedded.size(1)
        positional_encoding = self.get_positional_encoding(seq_length, past_embedded.size(2), device)
        past_embedded = past_embedded + positional_encoding.to(dtype=past_embedded.dtype)

        # Today's macro data becomes one extra token: (batch_size, 1, embed_dim)
        macro_today_embedded = self.macro_embedding(macro_today).unsqueeze(1)

        # (batch_size, sequence_length + 1, embed_dim)
        x = torch.cat([past_embedded, macro_today_embedded], dim=1)

        # The Transformer layers expect (sequence_length + 1, batch_size, embed_dim)
        x = x.permute(1, 0, 2)
        for layer in self.transformer_layers:
            x = layer(x)
        x = x.permute(1, 0, 2)

        # The decoder starts from the Transformer output at the macro token
        decoder_input = x[:, -1, :].unsqueeze(1)  # (batch_size, 1, embed_dim)

        # Initialise the decoder with the encoder LSTM's final states
        hidden_state, cell_state = h_n, c_n

        decoder_outputs = []
        last_step = self.output_seq_len - 1
        for step in range(self.output_seq_len):
            # One decoder step: decoder_output is (batch_size, 1, lstm_hidden_dim)
            decoder_output, (hidden_state, cell_state) = self.lstm_decoder(decoder_input, (hidden_state, cell_state))

            # Forecast for this step: (batch_size, input_dim)
            decoder_outputs.append(self.output_layer(decoder_output.squeeze(1)))

            # Autoregressive feedback: project back to the decoder input width.
            # Skipped after the final step because nothing consumes it.
            if step < last_step:
                decoder_input = self.decoder_feedback(decoder_output)

        # (batch_size, output_seq_len, input_dim)
        return torch.stack(decoder_outputs, dim=1)

# --- Demo hyperparameters ---------------------------------------------------
input_dim = 5          # features per time step of the historical sequence
macro_dim = 3          # macroeconomic variables observed "today"
embed_dim = 64         # token width used by the Transformer layers
lstm_hidden_dim = 128  # hidden size of the encoder and decoder LSTMs
lstm_layers = 2        # stacked layers in each LSTM
num_heads = 4          # attention heads per Transformer layer
num_layers = 2         # number of Transformer encoder layers
dropout = 0.1          # dropout probability
output_seq_len = 5     # forecast horizon (number of steps)

# --- Build the model --------------------------------------------------------
model = LSTMTransformerEncoderDecoder(
    input_dim=input_dim,
    macro_dim=macro_dim,
    embed_dim=embed_dim,
    lstm_hidden_dim=lstm_hidden_dim,
    lstm_layers=lstm_layers,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout,
    output_seq_len=output_seq_len,
)

# --- Smoke test on random data: 32 windows of 10 steps each ------------------
past_sequence = torch.randn((32, 10, input_dim))  # historical sequence data
macro_today = torch.randn((32, macro_dim))        # today's macroeconomic data

# Run one forward pass and report the forecast shape
output = model(past_sequence, macro_today)
print(output.shape)  # Expected: (32, output_seq_len, input_dim)
