In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Define Attention Mechanism
class Attention(nn.Module):
    """Additive attention that scores every encoder step against one decoder state.

    The decoder state is paired with each encoder output, passed through a
    linear layer and ``tanh``, and reduced to a scalar score by the learned
    vector ``v``. Scores are normalised with a softmax over the time axis.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Input is the concatenation [decoder state; encoder output].
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.init_weights()

    def init_weights(self):
        """Re-initialise the projection (Xavier, zero bias) and the score vector."""
        nn.init.xavier_uniform_(self.attn.weight)
        nn.init.constant_(self.attn.bias, 0)
        nn.init.uniform_(self.v, -0.1, 0.1)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape (batch, seq_len).

        Args:
            hidden: decoder state, (batch, hidden_size).
            encoder_outputs: encoder states, (batch, seq_len, hidden_size).
        """
        n_batch = encoder_outputs.size(0)
        n_steps = encoder_outputs.size(1)

        # Pair the single decoder state with every encoder step.
        tiled_state = hidden.unsqueeze(1).expand(-1, n_steps, -1)
        paired = torch.cat((tiled_state, encoder_outputs), dim=2)

        # (batch, hidden, seq_len) so that v can reduce the hidden axis via bmm.
        energy = torch.tanh(self.attn(paired)).transpose(1, 2)
        score_vec = self.v.repeat(n_batch, 1).unsqueeze(1)
        scores = torch.bmm(score_vec, energy).squeeze(1)
        return torch.softmax(scores, dim=1)

# Define LSTM Encoder
class LSTMEncoder(nn.Module):
    """Batch-first multi-layer LSTM that encodes the input sequence."""

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )

    def forward(self, x):
        """Return (per-step outputs, final hidden state, final cell state)."""
        step_outputs, final_state = self.lstm(x)
        h_n, c_n = final_state
        return step_outputs, h_n, c_n

# Define LSTM Decoder with Attention
class LSTMDecoderWithAttention(nn.Module):
    """Autoregressive LSTM decoder that attends over the encoder outputs.

    At every step the decoder concatenates an attention context vector
    (hidden_size) with a step input (hidden_size) and feeds the result to the
    LSTM. The first step input is the encoder's last output; later step inputs
    are the previous prediction (or the ground-truth target under teacher
    forcing) projected from output_size to hidden_size, so the LSTM always
    receives ``2 * hidden_size`` features regardless of ``output_size``.
    """

    def __init__(self, hidden_size, output_size, num_layers, dropout):
        super(LSTMDecoderWithAttention, self).__init__()
        self.attention = Attention(hidden_size)
        self.lstm = nn.LSTM(hidden_size * 2, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        # Lifts a fed-back prediction/target into the hidden space expected by the LSTM input.
        self.input_proj = nn.Linear(output_size, hidden_size)

    def forward(self, encoder_outputs, hidden, cell, targets=None, teacher_forcing_ratio=0.5):
        """Decode one prediction per encoder time step.

        Args:
            encoder_outputs: (batch, seq_len, hidden_size) encoder states.
            hidden: initial LSTM hidden state, (num_layers, batch, hidden_size).
            cell: initial LSTM cell state, same shape as ``hidden``.
            targets: optional (batch, >=seq_len, output_size) ground truth used
                for teacher forcing.
            teacher_forcing_ratio: per-step probability of feeding the target
                instead of the model's own prediction.

        Returns:
            Tuple of predictions (batch, seq_len, output_size) and the
            attention weights (batch, seq_len) of the final decoding step.

        Raises:
            ValueError: if ``targets`` is given with an incompatible shape.
        """
        batch_size = encoder_outputs.size(0)
        seq_len = encoder_outputs.size(1)
        output_size = self.fc.out_features

        if targets is not None and (
            targets.dim() != 3
            or targets.size(1) < seq_len
            or targets.size(2) != output_size
        ):
            raise ValueError(
                "targets must have shape (batch, >=%d, %d), got %s"
                % (seq_len, output_size, tuple(targets.shape))
            )

        outputs = encoder_outputs.new_zeros(batch_size, seq_len, output_size)
        # First input to the decoder is the last output of the encoder.
        step_input = encoder_outputs[:, -1, :]

        for t in range(seq_len):
            attn_weights = self.attention(hidden[-1], encoder_outputs)
            context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
            lstm_input = torch.cat([context, step_input], dim=1).unsqueeze(1)  # Add sequence dimension
            output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
            output = self.fc(output.squeeze(1))
            outputs[:, t, :] = output

            teacher_force = targets is not None and torch.rand(1).item() < teacher_forcing_ratio
            next_value = targets[:, t, :] if teacher_force else output
            step_input = self.input_proj(next_value)

        return outputs, attn_weights

# Define the Hybrid Model
class HybridModel(nn.Module):
    """Sequence-to-sequence model: LSTM encoder followed by an attention decoder."""

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout):
        super().__init__()
        self.encoder = LSTMEncoder(input_size, hidden_size, num_layers, dropout)
        self.decoder = LSTMDecoderWithAttention(hidden_size, output_size, num_layers, dropout)

    def forward(self, past_data, targets=None, teacher_forcing_ratio=0.5):
        """Encode ``past_data`` and decode it; returns (prediction, attn_weights)."""
        memory, h_n, c_n = self.encoder(past_data)
        decoded = self.decoder(memory, h_n, c_n, targets, teacher_forcing_ratio)
        prediction, attn_weights = decoded
        return prediction, attn_weights

def moving_average(data, window_size):
    """Return the simple moving average of a 1-D series.

    Only fully covered windows are kept ('valid' convolution), so the result
    has ``len(data) - window_size + 1`` entries.
    """
    kernel = np.ones(window_size)
    kernel = kernel / window_size  # uniform weights summing to 1
    return np.convolve(data, kernel, mode='valid')

# Example usage
# Hyperparameters for the synthetic smoke test
input_size, hidden_size, output_size = 10, 128, 1
num_layers, dropout = 2, 0.5
batch_size, seq_len = 32, 20
window_size = 3  # Window size for moving average

# Network, loss and optimiser
model = HybridModel(input_size, hidden_size, output_size, num_layers, dropout)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Random tensors stand in for a real series and its targets
past_data = torch.randn(batch_size, seq_len, input_size)
targets = torch.randn(batch_size, seq_len, output_size)

# Smooth every feature along the time axis (axis 1)
past_data_np = past_data.numpy()
denoised_data = np.apply_along_axis(moving_average, 1, past_data_np, window_size)
# The 'valid' average is window_size - 1 steps shorter; repeat the first
# smoothed value at the front to restore the original length.
time_padding = (window_size - 1, 0)
denoised_data = np.pad(denoised_data, ((0, 0), time_padding, (0, 0)), mode='edge')
past_data = torch.tensor(denoised_data, dtype=torch.float32)

# One forward/backward pass and a single optimiser step
outputs, attn_weights = model(past_data, targets, teacher_forcing_ratio=0.5)
loss = criterion(outputs, targets)
loss.backward()
optimizer.step()

print("Output shape:", outputs.shape)
print("Attention weights shape:", attn_weights.shape)

Improving the performance of an LSTM model to match or exceed that of a momentum strategy involves several steps, including hyperparameter tuning, feature engineering, and model optimization. Here are some strategies to enhance the performance of your LSTM model:

1. Hyperparameter Tuning
Learning Rate: Experiment with different learning rates to find the optimal value.
Batch Size: Adjust the batch size to improve training stability and performance.
Number of Layers: Experiment with different numbers of LSTM layers.
Hidden Units: Adjust the number of hidden units in each LSTM layer.
Dropout: Tune the dropout rate; dropout layers reduce overfitting, but too high a rate causes underfitting.
2. Feature Engineering
Lag Features: Create lag features to capture temporal dependencies.
Rolling Statistics: Add rolling mean, rolling standard deviation, and other statistical features.
Seasonal Features: Include seasonal indicators such as day of the week, month, etc.
3. Model Optimization
Regularization: Use L2 regularization to prevent overfitting.
Early Stopping: Implement early stopping to prevent overfitting and reduce training time.
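Ensemble Methods: Combine multiple LSTM models, for example by bagging (averaging models trained on bootstrap resamples of the training set) or boosting (training each new model on the errors left by the previous ones).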
4. Data Preprocessing
Normalization: Normalize the input data to improve training stability.
Denoising: Apply denoising techniques to remove noise from the data.

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import trange

# Define Attention Mechanism
class Attention(nn.Module):
    """Additive attention that scores every encoder step against one decoder state.

    The decoder state is paired with each encoder output, passed through a
    linear layer and ``tanh``, and reduced to a scalar score by the learned
    vector ``v``. Scores are normalised with a softmax over the time axis.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Input is the concatenation [decoder state; encoder output].
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.init_weights()

    def init_weights(self):
        """Re-initialise the projection (Xavier, zero bias) and the score vector."""
        nn.init.xavier_uniform_(self.attn.weight)
        nn.init.constant_(self.attn.bias, 0)
        nn.init.uniform_(self.v, -0.1, 0.1)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape (batch, seq_len).

        Args:
            hidden: decoder state, (batch, hidden_size).
            encoder_outputs: encoder states, (batch, seq_len, hidden_size).
        """
        n_batch = encoder_outputs.size(0)
        n_steps = encoder_outputs.size(1)

        # Pair the single decoder state with every encoder step.
        tiled_state = hidden.unsqueeze(1).expand(-1, n_steps, -1)
        paired = torch.cat((tiled_state, encoder_outputs), dim=2)

        # (batch, hidden, seq_len) so that v can reduce the hidden axis via bmm.
        energy = torch.tanh(self.attn(paired)).transpose(1, 2)
        score_vec = self.v.repeat(n_batch, 1).unsqueeze(1)
        scores = torch.bmm(score_vec, energy).squeeze(1)
        return torch.softmax(scores, dim=1)

# Define LSTM Encoder
class LSTMEncoder(nn.Module):
    """Batch-first multi-layer LSTM encoder with dropout on its per-step outputs."""

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return (dropped-out per-step outputs, final hidden state, final cell state)."""
        step_outputs, final_state = self.lstm(x)
        h_n, c_n = final_state
        # Only the output sequence is regularised; the states pass through untouched.
        return self.dropout(step_outputs), h_n, c_n

# Define LSTM Decoder with Attention
class LSTMDecoderWithAttention(nn.Module):
    """Autoregressive LSTM decoder that attends over the encoder outputs.

    At every step the decoder concatenates an attention context vector
    (hidden_size) with a step input (hidden_size) and feeds the result to the
    LSTM. The first step input is the encoder's last output; later step inputs
    are the previous prediction (or the ground-truth target under teacher
    forcing) projected from output_size to hidden_size, so the LSTM always
    receives ``2 * hidden_size`` features regardless of ``output_size``.
    """

    def __init__(self, hidden_size, output_size, num_layers, dropout):
        super(LSTMDecoderWithAttention, self).__init__()
        self.attention = Attention(hidden_size)
        self.lstm = nn.LSTM(hidden_size * 2, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
        # Lifts a fed-back prediction/target into the hidden space expected by the LSTM input.
        self.input_proj = nn.Linear(output_size, hidden_size)

    def forward(self, encoder_outputs, hidden, cell, targets=None, teacher_forcing_ratio=0.5):
        """Decode one prediction per encoder time step.

        Args:
            encoder_outputs: (batch, seq_len, hidden_size) encoder states.
            hidden: initial LSTM hidden state, (num_layers, batch, hidden_size).
            cell: initial LSTM cell state, same shape as ``hidden``.
            targets: optional (batch, >=seq_len, output_size) ground truth used
                for teacher forcing.
            teacher_forcing_ratio: per-step probability of feeding the target
                instead of the model's own prediction.

        Returns:
            Tuple of predictions (batch, seq_len, output_size) and the
            attention weights (batch, seq_len) of the final decoding step.

        Raises:
            ValueError: if ``targets`` is given with an incompatible shape.
        """
        batch_size = encoder_outputs.size(0)
        seq_len = encoder_outputs.size(1)
        output_size = self.fc.out_features

        if targets is not None and (
            targets.dim() != 3
            or targets.size(1) < seq_len
            or targets.size(2) != output_size
        ):
            raise ValueError(
                "targets must have shape (batch, >=%d, %d), got %s"
                % (seq_len, output_size, tuple(targets.shape))
            )

        outputs = encoder_outputs.new_zeros(batch_size, seq_len, output_size)
        # First input to the decoder is the last output of the encoder.
        step_input = encoder_outputs[:, -1, :]

        for t in range(seq_len):
            attn_weights = self.attention(hidden[-1], encoder_outputs)
            context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
            lstm_input = torch.cat([context, step_input], dim=1).unsqueeze(1)  # Add sequence dimension
            output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
            output = self.fc(self.dropout(output.squeeze(1)))
            outputs[:, t, :] = output

            teacher_force = targets is not None and torch.rand(1).item() < teacher_forcing_ratio
            next_value = targets[:, t, :] if teacher_force else output
            step_input = self.input_proj(next_value)

        return outputs, attn_weights

# Define the Hybrid Model
class HybridModel(nn.Module):
    """Sequence-to-sequence model: LSTM encoder followed by an attention decoder."""

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout):
        super().__init__()
        self.encoder = LSTMEncoder(input_size, hidden_size, num_layers, dropout)
        self.decoder = LSTMDecoderWithAttention(hidden_size, output_size, num_layers, dropout)

    def forward(self, past_data, targets=None, teacher_forcing_ratio=0.5):
        """Encode ``past_data`` and decode it; returns (prediction, attn_weights)."""
        memory, h_n, c_n = self.encoder(past_data)
        decoded = self.decoder(memory, h_n, c_n, targets, teacher_forcing_ratio)
        prediction, attn_weights = decoded
        return prediction, attn_weights

def moving_average(data, window_size):
    """Return the simple moving average of a 1-D series.

    Only fully covered windows are kept ('valid' convolution), so the result
    has ``len(data) - window_size + 1`` entries.
    """
    kernel = np.ones(window_size)
    kernel = kernel / window_size  # uniform weights summing to 1
    return np.convolve(data, kernel, mode='valid')

def add_lag_features(data, max_lag):
    """Append time-lagged copies of every feature along the last axis.

    Args:
        data: array of shape (n_series, n_steps, n_features).
        max_lag: largest lag to add; lag 0 is the data itself.

    Returns:
        Float array of shape (n_series, n_steps, n_features * (max_lag + 1)).
        Feature block ``k`` holds the data shifted ``k`` steps later in time;
        positions without history (the first ``k`` steps) are zero.
    """
    n_series, n_steps, n_features = data.shape
    lagged_data = np.zeros((n_series, n_steps, n_features * (max_lag + 1)))
    # Lags >= n_steps have no source rows, so their blocks stay zero. Capping
    # the loop also keeps ``n_steps - lag`` from going negative, which would
    # select rows from the wrong end of ``data``.
    for lag in range(min(max_lag, n_steps - 1) + 1):
        block = slice(lag * n_features, (lag + 1) * n_features)
        lagged_data[:, lag:, block] = data[:, :n_steps - lag, :]
    return lagged_data

# Read data from CSV
data = pd.read_csv('time_series_data.csv', index_col='date', parse_dates=True)
features = data.columns.tolist()
input_size = len(features)

# Convert to numpy array
data_np = data.values

# Denoise data using moving average (applied per column, along time)
window_size = 3  # Window size for moving average
denoised_data = np.apply_along_axis(moving_average, 0, data_np, window_size)
# Pad the denoised data at the front to match the original length
denoised_data = np.pad(denoised_data, ((window_size-1, 0), (0, 0)), mode='edge')

# Add lag features
max_lag = 3  # Maximum lag for lag features
lagged_data = add_lag_features(denoised_data.reshape(1, -1, input_size), max_lag)
lagged_data = lagged_data.reshape(-1, lagged_data.shape[2])

# Prepare data for PyTorch
seq_len = 20
# The model emits one prediction per input time step and teacher forcing
# indexes targets by step, so each sample's target is the seq_len raw rows
# that follow its input window (not a single row).
n_samples = len(lagged_data) - 2 * seq_len + 1
if n_samples <= 0:
    raise ValueError(
        f'Need at least {2 * seq_len} rows to build one sample, got {len(lagged_data)}'
    )
X = np.stack([lagged_data[i:i + seq_len] for i in range(n_samples)])
y = np.stack([data_np[i + seq_len:i + 2 * seq_len] for i in range(n_samples)])

# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)  # (n_samples, seq_len, input_size * (max_lag + 1))
y_tensor = torch.tensor(y, dtype=torch.float32)  # (n_samples, seq_len, input_size)

# Create DataLoader
dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize model
hidden_size = 128
output_size = input_size
num_layers = 2
dropout = 0.5
model = HybridModel(input_size * (max_lag + 1), hidden_size, output_size, num_layers, dropout)

# Train model
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
n_epochs = 100
teacher_forcing_ratio = 0.5

for epoch in trange(n_epochs, desc="Epochs"):
    model.train()
    total_loss = 0.0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs, _ = model(inputs, targets, teacher_forcing_ratio)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    total_loss /= len(train_loader)
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {total_loss:.4f}')

# Predict (no targets are passed, so the decoder feeds back its own outputs)
model.eval()
with torch.no_grad():
    test_input = X_tensor[:1]
    prediction, _ = model(test_input)
    print(prediction)

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import trange

# Define Attention Mechanism
class Attention(nn.Module):
    """Additive attention that scores every encoder step against one decoder state.

    The decoder state is paired with each encoder output, passed through a
    linear layer and ``tanh``, and reduced to a scalar score by the learned
    vector ``v``. Scores are normalised with a softmax over the time axis.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Input is the concatenation [decoder state; encoder output].
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        self.init_weights()

    def init_weights(self):
        """Re-initialise the projection (Xavier, zero bias) and the score vector."""
        nn.init.xavier_uniform_(self.attn.weight)
        nn.init.constant_(self.attn.bias, 0)
        nn.init.uniform_(self.v, -0.1, 0.1)

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape (batch, seq_len).

        Args:
            hidden: decoder state, (batch, hidden_size).
            encoder_outputs: encoder states, (batch, seq_len, hidden_size).
        """
        n_batch = encoder_outputs.size(0)
        n_steps = encoder_outputs.size(1)

        # Pair the single decoder state with every encoder step.
        tiled_state = hidden.unsqueeze(1).expand(-1, n_steps, -1)
        paired = torch.cat((tiled_state, encoder_outputs), dim=2)

        # (batch, hidden, seq_len) so that v can reduce the hidden axis via bmm.
        energy = torch.tanh(self.attn(paired)).transpose(1, 2)
        score_vec = self.v.repeat(n_batch, 1).unsqueeze(1)
        scores = torch.bmm(score_vec, energy).squeeze(1)
        return torch.softmax(scores, dim=1)

# Define LSTM Encoder
class LSTMEncoder(nn.Module):
    """Batch-first multi-layer LSTM encoder with dropout on its per-step outputs."""

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return (dropped-out per-step outputs, final hidden state, final cell state)."""
        step_outputs, final_state = self.lstm(x)
        h_n, c_n = final_state
        # Only the output sequence is regularised; the states pass through untouched.
        return self.dropout(step_outputs), h_n, c_n

# Define LSTM Decoder with Attention
class LSTMDecoderWithAttention(nn.Module):
    """Autoregressive LSTM decoder that attends over the encoder outputs.

    At every step the decoder concatenates an attention context vector
    (hidden_size) with a step input (hidden_size) and feeds the result to the
    LSTM. The first step input is the encoder's last output; later step inputs
    are the previous prediction (or the ground-truth target under teacher
    forcing) projected from output_size to hidden_size, so the LSTM always
    receives ``2 * hidden_size`` features regardless of ``output_size``.
    """

    def __init__(self, hidden_size, output_size, num_layers, dropout):
        super(LSTMDecoderWithAttention, self).__init__()
        self.attention = Attention(hidden_size)
        self.lstm = nn.LSTM(hidden_size * 2, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)
        # Lifts a fed-back prediction/target into the hidden space expected by the LSTM input.
        self.input_proj = nn.Linear(output_size, hidden_size)

    def forward(self, encoder_outputs, hidden, cell, targets=None, teacher_forcing_ratio=0.5):
        """Decode one prediction per encoder time step.

        Args:
            encoder_outputs: (batch, seq_len, hidden_size) encoder states.
            hidden: initial LSTM hidden state, (num_layers, batch, hidden_size).
            cell: initial LSTM cell state, same shape as ``hidden``.
            targets: optional (batch, >=seq_len, output_size) ground truth used
                for teacher forcing.
            teacher_forcing_ratio: per-step probability of feeding the target
                instead of the model's own prediction.

        Returns:
            Tuple of predictions (batch, seq_len, output_size) and the
            attention weights (batch, seq_len) of the final decoding step.

        Raises:
            ValueError: if ``targets`` is given with an incompatible shape.
        """
        batch_size = encoder_outputs.size(0)
        seq_len = encoder_outputs.size(1)
        output_size = self.fc.out_features

        if targets is not None and (
            targets.dim() != 3
            or targets.size(1) < seq_len
            or targets.size(2) != output_size
        ):
            raise ValueError(
                "targets must have shape (batch, >=%d, %d), got %s"
                % (seq_len, output_size, tuple(targets.shape))
            )

        outputs = encoder_outputs.new_zeros(batch_size, seq_len, output_size)
        # First input to the decoder is the last output of the encoder.
        step_input = encoder_outputs[:, -1, :]

        for t in range(seq_len):
            attn_weights = self.attention(hidden[-1], encoder_outputs)
            context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
            lstm_input = torch.cat([context, step_input], dim=1).unsqueeze(1)  # Add sequence dimension
            output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
            output = self.fc(self.dropout(output.squeeze(1)))
            outputs[:, t, :] = output

            teacher_force = targets is not None and torch.rand(1).item() < teacher_forcing_ratio
            next_value = targets[:, t, :] if teacher_force else output
            step_input = self.input_proj(next_value)

        return outputs, attn_weights

# Define the Hybrid Model
class HybridModel(nn.Module):
    """Sequence-to-sequence model: LSTM encoder followed by an attention decoder."""

    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout):
        super().__init__()
        self.encoder = LSTMEncoder(input_size, hidden_size, num_layers, dropout)
        self.decoder = LSTMDecoderWithAttention(hidden_size, output_size, num_layers, dropout)

    def forward(self, past_data, targets=None, teacher_forcing_ratio=0.5):
        """Encode ``past_data`` and decode it; returns (prediction, attn_weights)."""
        memory, h_n, c_n = self.encoder(past_data)
        decoded = self.decoder(memory, h_n, c_n, targets, teacher_forcing_ratio)
        prediction, attn_weights = decoded
        return prediction, attn_weights

def moving_average(data, window_size):
    """Return the simple moving average of a 1-D series.

    Only fully covered windows are kept ('valid' convolution), so the result
    has ``len(data) - window_size + 1`` entries.
    """
    kernel = np.ones(window_size)
    kernel = kernel / window_size  # uniform weights summing to 1
    return np.convolve(data, kernel, mode='valid')

def add_lag_features(data, max_lag):
    """Append time-lagged copies of every feature along the last axis.

    Args:
        data: array of shape (n_series, n_steps, n_features).
        max_lag: largest lag to add; lag 0 is the data itself.

    Returns:
        Float array of shape (n_series, n_steps, n_features * (max_lag + 1)).
        Feature block ``k`` holds the data shifted ``k`` steps later in time;
        positions without history (the first ``k`` steps) are zero.
    """
    n_series, n_steps, n_features = data.shape
    lagged_data = np.zeros((n_series, n_steps, n_features * (max_lag + 1)))
    # Lags >= n_steps have no source rows, so their blocks stay zero. Capping
    # the loop also keeps ``n_steps - lag`` from going negative, which would
    # select rows from the wrong end of ``data``.
    for lag in range(min(max_lag, n_steps - 1) + 1):
        block = slice(lag * n_features, (lag + 1) * n_features)
        lagged_data[:, lag:, block] = data[:, :n_steps - lag, :]
    return lagged_data

# Read data from CSV
data = pd.read_csv('time_series_data.csv', index_col='date', parse_dates=True)
features = data.columns.tolist()
input_size = len(features)

# Convert to numpy array
data_np = data.values

# Denoise data using moving average (applied per column, along time)
window_size = 3  # Window size for moving average
denoised_data = np.apply_along_axis(moving_average, 0, data_np, window_size)
# Pad the denoised data at the front to match the original length
denoised_data = np.pad(denoised_data, ((window_size-1, 0), (0, 0)), mode='edge')

# Add lag features
max_lag = 3  # Maximum lag for lag features
lagged_data = add_lag_features(denoised_data.reshape(1, -1, input_size), max_lag)
lagged_data = lagged_data.reshape(-1, lagged_data.shape[2])

# Prepare data for PyTorch
seq_len = 20
# The model emits one prediction per input time step and teacher forcing
# indexes targets by step, so each sample's target is the seq_len raw rows
# that follow its input window (not a single row).
n_samples = len(lagged_data) - 2 * seq_len + 1
if n_samples <= 0:
    raise ValueError(
        f'Need at least {2 * seq_len} rows to build one sample, got {len(lagged_data)}'
    )
X = np.stack([lagged_data[i:i + seq_len] for i in range(n_samples)])
y = np.stack([data_np[i + seq_len:i + 2 * seq_len] for i in range(n_samples)])

# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)  # (n_samples, seq_len, input_size * (max_lag + 1))
y_tensor = torch.tensor(y, dtype=torch.float32)  # (n_samples, seq_len, input_size)

# Create DataLoader
dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# Held-out split; nothing below consumes it.
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Initialize model
hidden_size = 128
output_size = input_size
num_layers = 2
dropout = 0.5

# Bagging
num_models = 5
models = [HybridModel(input_size * (max_lag + 1), hidden_size, output_size, num_layers, dropout) for _ in range(num_models)]
criterions = [nn.MSELoss() for _ in range(num_models)]
optimizers = [optim.Adam(model.parameters(), lr=0.001) for model in models]

# Each ensemble member trains on its own bootstrap resample (drawn once, with
# replacement, same size as the training split); without this the members
# would differ only by weight initialisation and batch order.
bagging_loaders = []
for _ in range(num_models):
    bootstrap_indices = torch.randint(len(train_dataset), (len(train_dataset),)).tolist()
    bootstrap_subset = torch.utils.data.Subset(train_dataset, bootstrap_indices)
    bagging_loaders.append(DataLoader(bootstrap_subset, batch_size=32, shuffle=True))

# Train models
n_epochs = 100
teacher_forcing_ratio = 0.5

for epoch in trange(n_epochs, desc="Epochs"):
    for model, optimizer, criterion, bagging_loader in zip(models, optimizers, criterions, bagging_loaders):
        model.train()
        total_loss = 0.0

        for inputs, targets in bagging_loader:
            optimizer.zero_grad()
            outputs, _ = model(inputs, targets, teacher_forcing_ratio)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        total_loss /= len(bagging_loader)
        print(f'Epoch {epoch+1}/{n_epochs}, Loss: {total_loss:.4f}')

# Predict with bagging
# Every member must leave train mode; otherwise dropout stays active in all
# but the last model and the averaged forecast becomes stochastic.
for model in models:
    model.eval()
with torch.no_grad():
    test_input = X_tensor[:1]
    predictions = [model(test_input)[0] for model in models]
    avg_prediction = torch.mean(torch.stack(predictions), dim=0)
    print(avg_prediction)

# Boosting (simple example with sequential training)
# NOTE(review): this stage fits one more independent model on train_loader; it
# does not use the ensemble's residuals, so it is not boosting in the usual sense.
boosted_model = HybridModel(input_size * (max_lag + 1), hidden_size, output_size, num_layers, dropout)
criterion = nn.MSELoss()
optimizer = optim.Adam(boosted_model.parameters(), lr=0.001)

for epoch in trange(n_epochs, desc="Epochs"):
    boosted_model.train()
    total_loss = 0.0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs, _ = boosted_model(inputs, targets, teacher_forcing_ratio)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    total_loss /= len(train_loader)
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {total_loss:.4f}')

# Predict with boosting
boosted_model.eval()
with torch.no_grad():
    test_input = X_tensor[:1]
    prediction, _ = boosted_model(test_input)
    print(prediction)