In [5]:
import torch.nn as nn
import torch



### 1. 2-D convolutional layer

In [15]:

# Create the input tensor and convert it to float.
# The element count is written as the product of the target shape
# (batch=4, channels=3, height=224, width=224) rather than a bare 602112,
# so the two cannot drift apart.
x = torch.arange(4 * 3 * 224 * 224).reshape((4, 3, 224, 224)).float()

# Define the convolutional layer: 16x16 kernels moved with stride 16, so the
# windows do not overlap and each output position covers one 16x16 patch.
cnn = nn.Conv2d(
    in_channels = 3,
    out_channels = 768,
    kernel_size = 16,
    stride = 16,
    padding = 0  # no padding; has the same effect as padding='valid'
)

# Apply the convolutional layer to the input tensor
output = cnn(x)

# Shape bookkeeping (padding = 0):
#   input:  (batch, in_channels, image_size, image_size)
#   output: (batch, out_channels, n, n)
#           where n = (image_size - kernel_size) // stride + 1
#   here:   n = (224 - 16) // 16 + 1 = 14
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")




Input shape: torch.Size([4, 3, 224, 224])
Output shape: torch.Size([4, 768, 14, 14])


### 2. tensor transpose, view, reshape, contiguous, expand, where, masked_scatter

In [64]:
# expand: append a new last axis, then repeat every entry 10 times along it
x = torch.tensor([
    [1, 1, 0],
    [1, 0, 0],
])
print(x)
x.unsqueeze(dim=-1).expand(-1, -1, 10)

tensor([[1, 1, 0],
        [1, 0, 0]])


tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

        [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]])

In [65]:
# torch.where
# tensor.masked_scatter
# NOTE(review): this cell contains no torch.where / masked_scatter example yet;
# it only evaluates the product below — TODO add the examples or drop the cell.
16*16

256

### 3. Prepare the input for the PaliGemma

In [10]:
def add_image_tokens_to_prompt(prefix_prompt, bos_token, image_seq_len, image_token):
    """Build the text prompt: image placeholders, then BOS, then the prefix, then a newline.

    Args:
        prefix_prompt: The text prompt placed after the BOS token.
        bos_token: The beginning-of-sequence token string.
        image_seq_len: How many times `image_token` is repeated at the front.
        image_token: The placeholder string standing in for one image token.

    Returns:
        `image_token` repeated `image_seq_len` times, immediately followed by
        `bos_token`, `prefix_prompt` and a single trailing newline.
    """
    # From the blog (https://huggingface.co/blog/paligemma#detailed-inference-process):
    #   - The input text is tokenized normally.
    #   - A <bos> token is added at the beginning, and an additional newline
    #     token is appended. The newline is an essential part of the prompt the
    #     model was trained with, so it is added explicitly to guarantee it is there.
    #   - The tokenized text is also prefixed with a fixed number of <image> tokens.
    # NOTE: from the paper it looks like the newline should be tokenized
    #       separately, but the HF implementation does not do this.
    #       ref to HF implementation: https://github.com/huggingface/transformers/blob/7f79a97399bb52aad8460e1da2f36577d5dccfed/src/transformers/models/paligemma/processing_paligemma.py#L55-L73
    image_placeholders = image_token * image_seq_len
    return f"{image_placeholders}{bos_token}{prefix_prompt}\n"


# Example: a question prefixed by ten image placeholders and the BOS token
prefix_prompt = "What is the person doing?"
bos_token = "<bos>"
image_seq_len = 10
image_token = "<image>"
add_image_tokens_to_prompt(
    prefix_prompt=prefix_prompt,
    bos_token=bos_token,
    image_seq_len=image_seq_len,
    image_token=image_token,
)

'<image><image><image><image><image><image><image><image><image><image><bos>What is the person doing?\n'

### 4. tokenizer

In [47]:
from transformers import DistilBertTokenizer

# Initialize the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


# Add special token
special_tokens_dict = {
    'additional_special_tokens': ['<image>', '<test>']
}
tokenizer.add_special_tokens(special_tokens_dict)

# Extend the vocabulary with new words or symbols that should be treated as standard tokens
EXTRA_TOKENS = ['<loc0000>', '<loc0001>', '<seg000>', '<seg001>']
tokenizer.add_tokens(EXTRA_TOKENS)
# Define a list of input strings
input_strings = [
    "<image><image><image>[SEP]This is a short sentence.\\n",
    "<image><image><image>[SEP]Here is a slightly longer sentence that might need truncation if it exceeds the maximum length.",
    "<image><image><image>[SEP]Another example sentence.\n",
    "\\n"
]

# Define parameters for padding and truncation
padding = "longest"  # or "max_length"
truncation = True  # or False
# max_length = 10  # Set a max length for the sequences

# Tokenize the input strings
inputs = tokenizer(
    input_strings,
    return_tensors="pt",  # Return as PyTorch tensors
    padding=padding,      # Padding method
    truncation=truncation,  # Truncation method
    add_special_tokens=False # Ignore the "[CLS]", "[SEP]" token in default
)

# Print the tokenized outputs
print("Input IDs:", inputs['input_ids'])
print("Attention Masks:", inputs['attention_mask'])


Input IDs: tensor([[30523, 30523, 30523,   102,  2023,  2003,  1037,  2460,  6251,  1012,
          1032,  1050,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [30523, 30523, 30523,   102,  2182,  2003,  1037,  3621,  2936,  6251,
          2008,  2453,  2342, 19817,  4609, 10719,  2065,  2009, 23651,  1996,
          4555,  3091,  1012],
        [30523, 30523, 30523,   102,  2178,  2742,  6251,  1012,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 1032,  1050,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]])
Attention Masks: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 

### 5. RMSNorm

In [5]:
eps = 1e-6
weight = torch.zeros(4)  # stands in for nn.Parameter(torch.zeros(dim))
x = torch.tensor([
    [[5.0, 5.0, 5.0, 5.0], [6.0, 6.0, 6.0, 6.0]],
    [[2.0, 4.0, 6.0, 8.0], [3.0, 5.0, 7.0, 9.0]],
])
print(f'x: {x}')

# Mean of the squared entries over the last axis (kept as a size-1 axis)
means = (x ** 2).mean(dim=-1, keepdim=True)
print(f'mean square: {means}')

# NOTE: despite its name, `rms` holds the reciprocal: 1 / sqrt(means + eps)
rms = (means + eps).rsqrt()
print(f'mean square root: {rms}')

# Multiplying by the reciprocal divides each vector by its root mean square
normalized_output = rms * x
print(f'normalized_output: {normalized_output}')

# The gain is (1 + weight), so an all-zero weight leaves the output unchanged
scaled_output = normalized_output * (1.0 + weight)
print(f'scaled_output: {scaled_output}')




x: tensor([[[5., 5., 5., 5.],
         [6., 6., 6., 6.]],

        [[2., 4., 6., 8.],
         [3., 5., 7., 9.]]])
mean square: tensor([[[25.],
         [36.]],

        [[30.],
         [41.]]])
mean square root: tensor([[[0.2000],
         [0.1667]],

        [[0.1826],
         [0.1562]]])
normalized_output: tensor([[[1.0000, 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000, 1.0000]],

        [[0.3651, 0.7303, 1.0954, 1.4606],
         [0.4685, 0.7809, 1.0932, 1.4056]]])
scaled_output: tensor([[[1.0000, 1.0000, 1.0000, 1.0000],
         [1.0000, 1.0000, 1.0000, 1.0000]],

        [[0.3651, 0.7303, 1.0954, 1.4606],
         [0.4685, 0.7809, 1.0932, 1.4056]]])


### 6. training mode in torch.nn.Module

In [93]:
# Use a dedicated name for the layer: binding it to `x` would overwrite the
# tensor `x` that the next cell indexes, breaking a top-to-bottom re-run.
linear = nn.Linear(2, 3)
print(linear.training)  # a freshly built module reports training mode
linear.eval()           # switch to evaluation mode
print(linear.training)
linear.train()          # switch back to training mode
print(linear.training)

True
False
True


In [9]:
# Shape after taking the last index along dim 1 and keeping dims 0 and 2 whole.
# NOTE(review): relies on `x` being a 3-D tensor defined in an earlier cell
# (the recorded output is torch.Size([2, 4])) — TODO confirm which cell.
x[:, -1, :].shape

torch.Size([2, 4])

# PaliGemma from Scratch in PyTorch

## Overview
PaliGemma[^1] is a model that excels at interpreting and understanding both images and text. 
It's used for tasks like generating descriptions of images, answering questions based on visual content, 
and analyzing complex visuals like infographics and satellite images. The input to PaliGemma can be an 
image, text, or both, and the output might be a text description, an answer to a question, or information derived 
from the image. It's designed to handle a wide range of vision-language tasks efficiently, 
even though it is smaller in size compared to some other advanced models.

The architecture of PaliGemma-3B is inspired by the PaLI-3 model and combines the SigLIP visual encoder with the
Gemma 2B language model. When PaliGemma-3B processes input, images are first converted into "soft tokens" by the SigLIP encoder. 
Simultaneously, any accompanying text, referred to as the "prefix," is tokenized by Gemma's tokenizer. 
These image tokens and text tokens are then combined and fed into the Gemma decoder, which uses full block-attention to generate 
the final output text, or "suffix," in an auto-regressive manner. 


## PaliGemma Architecture

### SigLIP (Vision Transformer)
Describe the SigLIP component, which utilizes a Vision Transformer architecture. Explain how it processes images or visual data and its role within the PaliGemma architecture.

### Gemma (Language Model)
Detail the Gemma component, focusing on its language model capabilities. Describe how it processes text data, what models it is based on, and its specific contributions to the overall project.

### PaliGemma
Discuss how the SigLIP and Gemma components integrate to form the complete PaliGemma system. Highlight any unique interaction mechanisms or features that enhance the system’s performance.

## Advanced Transformer Techniques

### KV-Cache
Explain the KV-Cache mechanism, its purpose, and how it improves the model’s efficiency or response time.

### RMS Normalization
Describe RMS Normalization, its theoretical basis, and its impact on model training and convergence.

### Grouped Query Attention
Detail the Grouped Query Attention technique, its algorithm, and its benefits for the model’s attention mechanism.

### Rotary Positional Embedding
Discuss the implementation and advantages of Rotary Positional Embedding within the transformer architecture.

## How to Use
Provide step-by-step instructions on how to set up, configure, and run the PaliGemma project. Include any prerequisites, libraries, or environments that need to be installed or prepared.

## Acknowledgements
Offer thanks to those who contributed to the project, whether through direct development, advice, or inspiration.

## References
[^1]: Beyer, L., et al. (2024). *PaliGemma: A versatile 3B VLM for transfer*. arXiv:2407.07726.
