# Description

To demonstrate how transformers work, let's set up a simple transformer model for sequence-to-sequence generation using PyTorch. We will implement a basic version that focuses on the core components without diving into advanced customizations.

### First, ensure PyTorch is installed (the pip package is named `torch`)

In [None]:
pip install torch

Basic Python demo illustrating a transformer in action:

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import Transformer
import math

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hyperparameters for the demo.
# input_dim is used both as the vocabulary size and as the model width.
input_dim = 512
n_heads, num_layers = 8, 3        # attention heads / encoder and decoder layers
seq_length, batch_size = 10, 16   # tokens per sequence / sequences per batch
dropout = 0.1                     # dropout rate handed to the Transformer

# Positional Encoding for providing position information to sequences
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    A table of shape ``(max_len, 1, d_model)`` is built once and stored as the
    non-trainable buffer ``pe``. Even feature columns hold sines and odd
    feature columns hold cosines of the scaled position index.

    Args:
        d_model: Size of the last (feature) dimension of the inputs.
        max_len: Number of positions to precompute. Inputs with more
            positions than this are not supported.
    """

    def __init__(self, d_model, max_len=seq_length):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the angles to match instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Insert a singleton middle axis so the table broadcasts over it.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(0)`` positions.

        ``x`` is indexed by position along dimension 0 and by feature along
        the last dimension; the table is broadcast across the middle one.
        """
        return x + self.pe[:x.size(0), :]

# Define the transformer model
class SimpleTransformer(nn.Module):
    """Minimal encoder-decoder model built around ``torch.nn.Transformer``.

    Source and target token ids share one embedding table. Embeddings are
    scaled by ``sqrt(input_dim)``, given positional encodings, passed through
    the Transformer, and projected by a final linear layer.

    Args:
        input_dim: Used as the vocabulary size, the model width, and the size
            of the output layer.
        seq_length: Number of positions precomputed by the positional encoding.
        n_heads: Number of attention heads.
        num_layers: Number of encoder layers and of decoder layers.
        dropout: Dropout rate passed to the Transformer.

    Note:
        No attention or padding masks are passed to the Transformer, so this
        is a demonstration model only.
    """

    def __init__(self, input_dim, seq_length, n_heads, num_layers, dropout):
        super().__init__()
        # Keep the scale on the instance so forward() does not depend on a
        # module-level constant that may differ from this model's width.
        self.embed_scale = math.sqrt(input_dim)
        self.embedding = nn.Embedding(input_dim, input_dim)
        # Honour the seq_length argument instead of the encoder's default.
        self.positional_encoding = PositionalEncoding(input_dim, max_len=seq_length)
        self.transformer = Transformer(d_model=input_dim, nhead=n_heads, num_encoder_layers=num_layers,
                                       num_decoder_layers=num_layers, dropout=dropout)
        self.fc_out = nn.Linear(input_dim, input_dim)

    def _embed(self, tokens):
        """Embed token ids, scale them, and add positional encodings."""
        return self.positional_encoding(self.embedding(tokens) * self.embed_scale)

    def forward(self, src, tgt):
        """Run the model on integer token-id tensors ``src`` and ``tgt``.

        Both tensors are indexed by position along dimension 0. Returns the
        output of the final linear layer, with ``input_dim`` features per
        target position.
        """
        output = self.transformer(self._embed(src), self._embed(tgt))
        return self.fc_out(output)

# Toy batch of random token ids, shaped (seq_length, batch_size).
src = torch.randint(0, input_dim, (seq_length, batch_size)).to(device)
tgt = torch.randint(0, input_dim, (seq_length, batch_size)).to(device)

# Model, loss function and optimizer.
model = SimpleTransformer(input_dim, seq_length, n_heads, num_layers, dropout)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# A handful of optimisation steps on the same random batch (demo only).
model.train()
flat_targets = tgt.view(-1)  # one class index per (position, batch) entry
for epoch in range(5):
    optimizer.zero_grad()
    output = model(src, tgt)
    # Flatten to (positions * batch, input_dim) to line up with flat_targets.
    loss = criterion(output.view(-1, input_dim), flat_targets)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

print("Demo complete. This shows the basics of using a Transformer model for sequence transformation.")


ModuleNotFoundError: No module named 'torch'

### Explanation:

- **Positional Encoding:** Adds positional information to input embeddings, critical since transformers have no inherent sequential bias.
- **Simple Transformer:** Wraps PyTorch's `nn.Transformer` (an encoder-decoder stack of attention layers) with a shared token-embedding layer, positional encoding, and a final linear output layer.
- **Training:** The model is trained on random data for demonstration purposes.

This is a basic demonstration, and production-level models involve significantly more sophistication, such as masking, padding, and optimized training routines.