# Transformer

<img src='https://machinelearningmastery.com/wp-content/uploads/2021/08/attention_research_1.png' width=800/>

## Content
- [Self- & Multi-Head-Attention](#Self--&-Multi-Head-Attention)
- [Transformer Encoder](#Transformer-Encoder)
- [Input Embedding](#Input-Embedding)
- [Train a Vision Transformer](#Train-a-Vision-Transformer)
  - Positional Encoding

In [1]:
import sys
import torch
import torch.nn.functional as F
from torch import nn, optim, Tensor
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from einops import einsum, rearrange

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from pathlib import Path
from PIL import Image

from tqdm import tqdm

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Using {device} device")

Using cuda:0 device


## Self- & Multi-Head Attention

Aus dem Paper: [Attention is all you need](https://arxiv.org/pdf/1706.03762.pdf) (2017)

$$ \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V $$

In [2]:
class Head(nn.Module):
    def __init__(self, dim: int, head_size: int):
        """
        One head of self-attention.

        Holds three bias-free linear layers that project the input into
        query, key and value, and applies scaled dot-product attention.

        Args:
            dim: Size of the last (embedding) axis of the input.
            head_size: Size of the query/key/value projections (d_k).
        """
        super().__init__()
        self.q = nn.Linear(dim, head_size, bias=False) # query
        self.k = nn.Linear(dim, head_size, bias=False) # key
        self.v = nn.Linear(dim, head_size, bias=False) # value

        # Each score is a dot product over head_size terms, so dividing by
        # sqrt(head_size) -- d_k in the paper, not the input dim -- is what
        # keeps the scores at unit variance for unit-variance q and k.
        self.scale = head_size ** -0.5

    def forward(self, x: Tensor) -> Tensor:
        """
        Inputs:
            x: Tensor of shape [B, N, C]

        Returns: Tensor of shape [B, N, head_size]
        """
        q = self.q(x)  # [B, N, head_size]
        k = self.k(x)  # [B, N, head_size]
        v = self.v(x)  # [B, N, head_size]

        # Similarity of every query token with every key token
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale  # [B, N, N]
        # Normalise over the key axis so each query's weights sum to 1
        weights = scores.softmax(dim=-1)
        # Weighted average of the value vectors
        context = torch.matmul(weights, v)  # [B, N, head_size]
        return context

In [3]:
class MultiHeadAttention(nn.Module):
    """
    Multi-head self-attention.

    Runs `heads` independent `Head` modules on the same input, joins their
    outputs along the channel axis, mixes them with a linear projection and
    applies dropout.
    """
    def __init__(self, dim: int, heads: int, dropout: float = 0.2):
        super().__init__()
        assert dim % heads == 0, "dim must be a multiple of heads"
        # Every head gets an equal slice of the model width
        per_head = dim // heads
        self.heads = nn.ModuleList(Head(dim, per_head) for _ in range(heads))
        self.proj = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: Tensor) -> Tensor:
        """
        Inputs:
            x: Tensor of shape [B, N, C]

        Returns: Tensor of shape [B, N, C]
        """
        per_head_outputs = [head(x) for head in self.heads]
        # heads * (dim // heads) channels add back up to dim
        merged = torch.cat(per_head_outputs, dim=2)
        projected = self.proj(merged)
        return self.dropout(projected)

## Transformer Encoder

In [4]:
class Block(nn.Module):
    """
    One Transformer encoder block.

    Each sub-layer (multi-head attention, then feed-forward) is applied to a
    layer-normalised copy of its input and the result is added back to that
    input as a residual.
    """
    def __init__(self, dim: int, heads: int, ff_dim: int = None, dropout: float = 0.2):
        super().__init__()
        self.attn = MultiHeadAttention(dim, heads, dropout)
        self.ffwd = FeedForward(dim, ff_dim, dropout)
        self.ln1 = nn.LayerNorm(dim)
        self.ln2 = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Attention sub-layer with residual connection
        attended = self.attn(self.ln1(x))
        x = x + attended
        # Feed-forward sub-layer with residual connection
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [5]:
class FeedForward(nn.Module):
    """
    Position-wise feed-forward network: Linear -> ReLU -> Linear -> Dropout.

    The hidden width is `ff_dim`, or four times `dim` when `ff_dim` is not
    given (or is 0).
    """
    def __init__(self, dim: int, ff_dim: int = None, dropout: float = 0.2):
        super().__init__()
        hidden = ff_dim if ff_dim else 4 * dim
        expand = nn.Linear(dim, hidden)
        contract = nn.Linear(hidden, dim)
        self.net = nn.Sequential(
            expand,
            nn.ReLU(inplace=True),
            contract,
            nn.Dropout(dropout),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = self.net(x)
        return out

In [6]:
class TransformerEncoder(nn.Module):
    """
    Stack of `depth` Transformer blocks followed by one final LayerNorm.

    Input and output both have shape [B, N, dim].
    """
    def __init__(self, dim: int, depth: int, heads: int, ff_dim: int = None, dropout: float = 0.2):
        super().__init__()
        blocks = [Block(dim, heads, ff_dim, dropout) for _ in range(depth)]
        self.layers = nn.ModuleList(blocks)
        self.ln = nn.LayerNorm(dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        # Normalise once more after the last residual addition
        normalized = self.ln(hidden)
        return normalized

In [7]:
# Smoke test: a random batch goes through a small TransformerEncoder and the
# printed shapes show that the encoder keeps the [B, N, C] layout.

# Define dimensions
B, N, C = 4, 32, 16  # Batch size, Number of tokens, Embedding dimension

# Create random input tensor
batch = torch.randn(B, N, C)  # Shape: [Batch size, Sequence length, Embedding dimension]

# Define TransformerEncoder parameters
dim = C  # Embedding dimension
depth = 4  # Number of Transformer blocks
heads = 4  # Number of attention heads (dim must be a multiple of heads)
ff_dim = 64  # Feed-forward network dimension
dropout = 0.1  # Dropout rate

# Instantiate TransformerEncoder
encoder = TransformerEncoder(dim=dim, depth=depth, heads=heads, ff_dim=ff_dim, dropout=dropout)

# Pass the batch through the TransformerEncoder
# (no .eval() call was made, so dropout is still active for this pass)
output = encoder(batch)

# Print the output shape
print("Input shape:", batch.shape)
print("Output shape:", output.shape)

Input shape: torch.Size([4, 32, 16])
Output shape: torch.Size([4, 32, 16])


# Input Embedding

### Text Embeddings

In [8]:
# Character-level vocabulary built from a small text sample

text = "Hallo Welt des Deep Learnings abcdefg"

# Drop duplicate characters, then order the survivors by code point
chars = list(set(text))
chars.sort()

# Vocabulary size on the first line, the vocabulary itself on the second
print(len(chars), "".join(chars), sep="\n")

20
 DHLWabcdefgilnoprst


In [9]:
# Tokenizer: every character of the vocabulary `chars` gets one integer id

stoi = {ch: i for i, ch in enumerate(chars)}  # character -> token id
itos = {i: ch for i, ch in enumerate(chars)}  # token id -> character


def encode(s: str) -> list:
    """Return the token ids for the characters of `s`.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l) -> str:
    """Return the string spelled by the token ids in `l`.

    Raises KeyError for an id that is not in the vocabulary.
    """
    return "".join(itos[i] for i in l)

In [10]:
# Two overlapping five-character samples, tokenized into one [2, 5] tensor of ids
batch = torch.tensor([encode(sample) for sample in ("abcde", "cdefg")])
batch

tensor([[ 5,  6,  7,  8,  9],
        [ 7,  8,  9, 10, 11]])

In [11]:
# One learnable 16-dimensional vector per vocabulary entry. The table size is
# taken from the vocabulary so it cannot drift out of sync if `text` changes.
embedding = nn.Embedding(len(chars), 16)

In [12]:
# Look up one 16-dimensional vector per token id: [2, 5] ids -> [2, 5, 16] embeddings
emb = embedding(batch)
emb.shape

torch.Size([2, 5, 16])

# Vision Transformer (ViT) for Image Inputs
[An Image Is Worth 16X16 Words](https://arxiv.org/pdf/2010.11929.pdf)

<img src='https://production-media.paperswithcode.com/social-images/UhPqfdxgjZGSAsbC.png' width=1200/>

# Frame Predictor Transformer

In [13]:
class FramePredictor(nn.Module):
    """
    Predict a single frame from a stack of preceding frames.

    The input frames are stacked along the channel axis, cut into
    non-overlapping patches by a strided convolution, processed by a
    TransformerEncoder, and mapped back to one image by a transposed
    convolution.

    Args:
        seq_size: Number of channels the patch embedding expects, i.e.
            sequence_length * channels of the input passed to `forward`.
        img_size: Side length of the (square) frames; only used to compute
            `num_patches`.
        patch_size: Side length of one square patch.
        dim: Embedding size of each patch token.
        depth: Number of Transformer blocks.
        heads: Number of attention heads per block.

    NOTE(review): no positional encoding is added to the patch tokens, so the
    encoder sees them as an unordered set -- confirm this is intended.
    """
    def __init__(self, seq_size=5, img_size=50, patch_size=10, dim=128, depth=4, heads=4):
        super().__init__()
        self.seq_size = seq_size
        self.patch_size = patch_size
        self.num_patches = (img_size // patch_size) ** 2
        self.dim = dim

        # Patch embedding: one output position per patch_size x patch_size patch
        self.embedding = nn.Conv2d(seq_size, dim, kernel_size=patch_size, stride=patch_size)

        # Transformer encoder
        self.encoder = TransformerEncoder(dim, depth, heads)

        # Output projection: expands every token back into a patch of pixels
        self.to_image = nn.ConvTranspose2d(dim, 1, kernel_size=patch_size, stride=patch_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: [batch_size, sequence_length, channels, height, width], with
               sequence_length * channels equal to `seq_size`.

        Returns:
            Raw (un-activated) output of the final transposed convolution,
            shape [batch_size, 1, grid_h * patch_size, grid_w * patch_size].
            This equals [batch_size, 1, height, width] when height and width
            are multiples of patch_size.
        """
        batch_size, seq_length, channels, height, width = x.shape

        # Combine sequence_length into the channel dimension. reshape (rather
        # than view) also accepts non-contiguous inputs such as permuted tensors.
        x = x.reshape(batch_size, seq_length * channels, height, width)

        # Apply Conv2d patch embedding
        x = self.embedding(x)  # [batch_size, dim, grid_h, grid_w]
        # Take the patch grid from the embedding output itself
        grid_h, grid_w = x.shape[-2:]

        # Flatten patches and prepare for Transformer
        x = x.flatten(2).transpose(1, 2)  # [batch_size, grid_h * grid_w, dim]

        # Pass through Transformer encoder
        x = self.encoder(x)  # [batch_size, grid_h * grid_w, dim]

        # Put the tokens back on their 2-D patch grid and reconstruct pixels
        x = x.transpose(1, 2).reshape(batch_size, self.dim, grid_h, grid_w)
        return self.to_image(x)

In [14]:
# Initialize the model
# seq_size=10: each sample stacks 10 single-channel input frames on the channel axis
model = FramePredictor(seq_size=10, img_size=50, patch_size=10, dim=128, depth=4, heads=4)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the parameters to the device; as the cell's last expression this also
# displays the module structure
model.to(device)

FramePredictor(
  (embedding): Conv2d(10, 128, kernel_size=(10, 10), stride=(10, 10))
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x Block(
        (attn): MultiHeadAttention(
          (heads): ModuleList(
            (0-3): 4 x Head(
              (q): Linear(in_features=128, out_features=32, bias=False)
              (k): Linear(in_features=128, out_features=32, bias=False)
              (v): Linear(in_features=128, out_features=32, bias=False)
            )
          )
          (proj): Linear(in_features=128, out_features=128, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (ffwd): FeedForward(
          (net): Sequential(
            (0): Linear(in_features=128, out_features=512, bias=True)
            (1): ReLU(inplace=True)
            (2): Linear(in_features=512, out_features=128, bias=True)
            (3): Dropout(p=0.2, inplace=False)
          )
        )
        (ln1): LayerNorm((128,), eps=1e-05, elementwise_af

In [15]:
!pip install torchinfo
from torchinfo import summary # older depricated 'torchinfo' works

# Assuming the model and input_tensor are defined as shown previously
summary(model, input_sizes=(1, 10, 1, 50, 50))



Layer (type:depth-idx)                             Param #
FramePredictor                                     --
├─Conv2d: 1-1                                      128,128
├─TransformerEncoder: 1-2                          --
│    └─ModuleList: 2-1                             --
│    │    └─Block: 3-1                             197,888
│    │    └─Block: 3-2                             197,888
│    │    └─Block: 3-3                             197,888
│    │    └─Block: 3-4                             197,888
│    └─LayerNorm: 2-2                              256
├─ConvTranspose2d: 1-3                             12,801
Total params: 932,737
Trainable params: 932,737
Non-trainable params: 0

In [16]:
# Dummy input: [batch_size=24, sequence_length=10, channels=1, height=50, width=50]
dummy_input = torch.randn(24, 10, 1, 50, 50).to(device)

# Forward pass
output = model(dummy_input)
print("Output shape:", output.shape)  # Expected: [24, 1, 50, 50]

Output shape: torch.Size([24, 1, 50, 50])


# Data

In [17]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

In [18]:
# Mount Google Drive: the dataset archive is read from it and the trained
# model is written back to it
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# Copy zip file from Google Drive to local Colab env. and unzip
!cp "/content/drive/My Drive/game_frames.zip" "/content/game_frames.zip"
!unzip -q "/content/game_frames.zip" -d "/content/game_frames"
!ls "/content/game_frames"

replace /content/game_frames/game_frames/000001.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/game_frames/game_frames/000002.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: nA
replace /content/game_frames/game_frames/000003.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
game_frames


In [20]:
class ImageSequenceDataset(Dataset):
    """
    Sliding-window dataset over the PNG files of one directory.

    The files are ordered by name. Sample `idx` consists of the
    `sequence_length` consecutive images starting at file `idx`: all but the
    last form the input sequence, the last one is the prediction target.
    """
    def __init__(self, root_dir, transform=None, sequence_length=11):
        """
        Args:
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied on a sample.
                It has to return tensors, because the inputs are combined with torch.stack.
            sequence_length (int): Number of images in each sequence
                (sequence_length - 1 inputs plus 1 target).

        Raises:
            ValueError: If sequence_length is smaller than 2.
        """
        if sequence_length < 2:
            raise ValueError("sequence_length must be at least 2 (one input frame plus the target)")
        self.root_dir = root_dir
        self.transform = transform
        self.sequence_length = sequence_length
        self.image_filenames = [f for f in sorted(os.listdir(root_dir)) if f.endswith('.png')]

    def __len__(self):
        # Number of complete windows; clamped to 0 because len() rejects
        # negative values when there are fewer images than one window needs
        return max(0, len(self.image_filenames) - (self.sequence_length - 1))

    def __getitem__(self, idx):
        """
        Returns:
            (sequence, target): the stacked input images and the final image
            of the window that starts at file `idx`.

        Raises:
            IndexError: If `idx` is outside [-len(self), len(self)).
        """
        num_sequences = len(self)
        # Support Python-style negative indices instead of letting them wrap
        # around the file list and silently mix the end and start of the data
        if idx < 0:
            idx += num_sequences
        if not 0 <= idx < num_sequences:
            raise IndexError(f"sequence index out of range for dataset of length {num_sequences}")

        images = []
        for filename in self.image_filenames[idx:idx + self.sequence_length]:
            img_path = os.path.join(self.root_dir, filename)
            # convert() returns a new, fully loaded image, so the file handle
            # can be closed immediately
            with Image.open(img_path) as raw:
                image = raw.convert('L')  # Convert to grayscale
            if self.transform is not None:
                image = self.transform(image)
            images.append(image)

        # Stack images to create a sequence tensor
        # Assumes that images are transformed to tensors by `transform`
        sequence = torch.stack(images[:-1])  # All but last for input sequence
        target = images[-1]  # Last image as ground truth
        return sequence, target

# Preprocessing applied to every frame: resize, then convert to a tensor
transform = transforms.Compose([
    transforms.Resize((50, 50)),  # Resize all images to the same size
    transforms.ToTensor(),  # Convert images to tensor
    # Normalization stays disabled while training with BCEWithLogitsLoss: the
    # frames also serve as targets, and the batch statistics printed further
    # down show the un-normalized values lie in [0, 1]
    #transforms.Normalize((0.5,), (0.5,))  # Normalize images; mean and std are tuples with one value per channel
])

In [21]:
# sequence_length keeps its default of 11: 10 input frames plus 1 target per sample
dataset = ImageSequenceDataset('/content/game_frames/game_frames', transform=transform)
# shuffle=False serves the samples in file order. Frame order inside a sample
# is fixed by the dataset's __getitem__, so shuffling would only change the
# order of samples, not the order of frames within one.
dataloader = DataLoader(dataset, batch_size=24, shuffle=False)

In [22]:
from torch.utils.data import DataLoader
# Inspect only the first batch: shapes, dtypes and value ranges of inputs and targets
for sequences, targets in dataloader:
    print("Batch of sequences shape:", sequences.shape)
    print("Batch of sequences type:", sequences.dtype)
    print("Batch of targets shape:", targets.shape)
    print("Batch of targets type:", targets.dtype)
    print("")
    print("Sequence min value:", sequences.min().item())
    print("Sequence max value:", sequences.max().item())
    print("Target min value:", targets.min().item())
    print("Target max value:", targets.max().item())

    # One batch is enough for this sanity check
    break

Batch of sequences shape: torch.Size([24, 10, 1, 50, 50])
Batch of sequences type: torch.float32
Batch of targets shape: torch.Size([24, 1, 50, 50])
Batch of targets type: torch.float32

Sequence min value: 0.0
Sequence max value: 1.0
Target min value: 0.0
Target max value: 1.0


# Training

**Binary Cross Entropy (BCE)**

$\mathrm{Loss} = -\left[\, y \cdot \log(p) + (1 - y) \cdot \log(1 - p) \,\right]$

  - $y$ ground truth
  - $p$ predicted

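$\mathrm{Loss} = -\left[\, \mathrm{pos\_weight} \cdot y \cdot \log(p) + (1 - y) \cdot \log(1 - p) \,\right]$

  - $\mathrm{pos\_weight}$: multiplier applied to the positive ($y = 1$) term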

In [32]:
# Loss and optimizer for the frame predictor.
# Alternatives that were tried and left disabled:
#criterion = nn.BCEWithLogitsLoss()  # For binary prediction
#criterion = nn.MSELoss(reduction='sum')
#criterion = nn.L1Loss(reduction='sum')
# Summed BCE on the raw model outputs; the positive term is weighted 4x
criterion = nn.BCEWithLogitsLoss(
    reduction='sum',
    pos_weight=torch.tensor([4.0], device=device),
)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3, weight_decay=1e-5)


In [33]:
def train_model(model, dataloader, criterion, optimizer, num_epochs=10):
    """
    Train a PyTorch model using the provided DataLoader, criterion, and optimizer.

    The model is moved to the first CUDA device when one is available
    (otherwise the CPU) and switched to training mode. After every epoch the
    average per-batch loss is printed.

    Args:
        model (nn.Module): The PyTorch model to be trained.
        dataloader (DataLoader): Yields (sequences, targets) batches.
        criterion (nn.Module): The loss function. Its result is divided by the
            number of pixels per target image, which assumes a summing loss
            (reduction='sum') as configured in this notebook.
        optimizer (torch.optim.Optimizer): The optimizer for training.
        num_epochs (int, optional): Number of epochs to train. Defaults to 10.

    Returns:
        None
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()  # Set the model to training mode

    for epoch in range(num_epochs):
        running_loss = 0.0
        # Count batches explicitly instead of reading progress_bar.n, which
        # tqdm only advances when it redraws the bar and which therefore lags
        # behind the real step count
        num_batches = 0
        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)

        for sequences, targets in progress_bar:
            sequences = sequences.to(device)  # Shape: [batch_size, seq_length-1, channels, height, width]
            targets = targets.to(device)  # Shape: [batch_size, channels, height, width]

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(sequences)
            # Normalize the summed loss by height * width of the targets
            # (2500 for 50x50 frames) instead of a hard-coded constant
            pixels_per_image = targets.shape[-2] * targets.shape[-1]
            loss = criterion(outputs, targets) / pixels_per_image

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            # Update running loss
            running_loss += loss.item()
            num_batches += 1

            # Update the progress bar description with the running average loss
            progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs} - Loss: {running_loss / num_batches}")

        # Calculate average loss for the epoch
        epoch_loss = running_loss / num_batches
        print(f"Epoch {epoch+1}, Average Loss: {epoch_loss}")

    print('Finished Training')

In [34]:
# Train the model for 5 epochs on the frame-sequence dataloader
train_model(model, dataloader, criterion, optimizer, num_epochs=5)



Epoch 1, Average Loss: 0.3328281421005726




Epoch 2, Average Loss: 0.313997524279356




Epoch 3, Average Loss: 0.28100937019586564




Epoch 4, Average Loss: 0.25387811896800994


                                                                                      

Epoch 5, Average Loss: 0.253305420178175
Finished Training




In [35]:
# Save the entire model
# NOTE(review): this pickles the module object itself, so loading it presumably
# needs the FramePredictor class definitions from this notebook -- confirm
# against the PyTorch serialization docs before relying on this file.
torch.save(model, '/content/drive/My Drive/dx_ball_transformer_model.pt')
# Save only the state dictionary (model weights)
torch.save(model.state_dict(), '/content/drive/My Drive/dx_ball_transformer_weights.pt')