<a href="https://colab.research.google.com/github/paryagsahni1845/deeplearning/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Transformers
What: Transformers are deep learning models based on self-attention mechanisms, designed to process sequential data like text or time series.

Why: They efficiently capture long-range dependencies and context, outperforming traditional models like RNNs for tasks like NLP and image processing.

Where: Used in applications like machine translation (e.g., Google Translate), text generation (e.g., ChatGPT), and computer vision (e.g., Vision Transformers).

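How: In each attention layer, every token is projected into Query (Q), Key (K), and Value (V) vectors; the similarity between queries and keys determines how strongly each token's value contributes to every other token's representation. Each attention layer is followed by a position-wise feed-forward network.

Impact: Transformers revolutionized AI by enabling scalable, high-performance models for diverse tasks with parallelizable computations.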



In [1]:
from datasets import load_dataset
# Fetch (or load from the local cache) the German-English configuration of
# the OPUS Books corpus from the Hugging Face Hub.
dataset = load_dataset(path="opus_books", name="de-en")

# Show the first training record to reveal the raw schema.
print(dataset["train"][0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.80M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51467 [00:00<?, ? examples/s]

{'id': '0', 'translation': {'de': 'Source: http://www.zeno.org - Contumax GmbH & Co. KG', 'en': 'Source: Project Gutenberg'}}


In [2]:
from transformers import AutoTokenizer
# Load the tokenizer published with the "t5-small" checkpoint. Only the
# tokenizer is loaded here; it supplies the token-id mapping and vocabulary
# size used by the hand-built model in the following cells.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def tokenize_batch(batch):
    """Tokenize one batch of German/English sentence pairs.

    Args:
        batch: mapping whose ``'translation'`` entry is a list of dicts, each
            holding a ``'de'`` (source) and an ``'en'`` (target) string.

    Returns:
        The tokenizer output for the German sentences, with an extra
        ``'labels'`` entry containing the token ids of the English sentences.
        Both sides are truncated and padded to exactly 128 tokens.
    """
    # Identical settings for both languages: fixed-length 128-token sequences.
    encode_options = dict(truncation=True, padding='max_length', max_length=128)
    pairs = batch['translation']

    encoded = tokenizer([pair['de'] for pair in pairs], **encode_options)
    target_encoding = tokenizer([pair['en'] for pair in pairs], **encode_options)

    encoded['labels'] = target_encoding['input_ids']
    return encoded

# Tokenize every split batch-wise; the raw text column is dropped so only the
# id column and the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=["translation"],
)
print(tokenized_dataset["train"][0])


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/51467 [00:00<?, ? examples/s]

{'id': '0', 'input_ids': [9149, 10, 2649, 1303, 1986, 5, 1847, 32, 5, 1677, 3, 18, 13228, 76, 9128, 7635, 3, 184, 638, 5, 3, 18256, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [9149, 10, 2786, 7756, 11063, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [3]:
import torch
import torch.nn as nn

# Seed PyTorch's RNG before any weights are created so that the randomly
# initialised layers (and therefore every printed result below) are
# reproducible under Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

embedding_dim = 512

# Slice the rows first and only then pick the column: selecting the column
# first can materialise the entire 'input_ids' column of the training split
# just to keep two examples.
sample_input_ids = torch.tensor(tokenized_dataset['train'][:2]['input_ids'])

# One learnable vector of size `embedding_dim` per token id in the vocabulary.
vocab_size = tokenizer.vocab_size
embedding_layer = nn.Embedding(vocab_size, embedding_dim)
embedded_inputs = embedding_layer(sample_input_ids)

print(embedded_inputs.shape)


torch.Size([2, 128, 512])


In [4]:
import math
import torch.nn as nn
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a batch of embeddings.

    A table of shape ``(1, max_len, d_model)`` is precomputed once: even
    feature indices hold ``sin(pos * w_k)`` and odd feature indices hold
    ``cos(pos * w_k)`` with ``w_k = exp(-2k * ln(10000) / d_model)``. The table
    is registered as a buffer, so it is not a trainable parameter.

    Args:
        d_model: size of the last (feature) dimension of the inputs. Both even
            and odd values are supported.
        max_len: longest sequence length the table can serve.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)  # leading axis of size 1 broadcasts over the batch
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encoding for its first ``x.size(1)`` positions.

        Args:
            x: tensor indexed as ``(batch, seq_len, d_model)``.

        Raises:
            ValueError: if ``seq_len`` exceeds the ``max_len`` the table was
                built with.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len, :]

pos_enc = PositionalEncoding(embedding_dim)

# Rebuild from the raw token embeddings instead of feeding `embedded_inputs`
# back into itself: re-running this cell then yields the same tensor rather
# than stacking the positional encoding a second time.
embedded_inputs = pos_enc(embedding_layer(sample_input_ids))

print(embedded_inputs.shape)


torch.Size([2, 128, 512])


In [5]:
import torch.nn as nn
num_heads = 8
num_layers = 2

# The encoder layer is built with the default batch_first=False, so reorder
# the axes from (batch, seq_len, d_model) to (seq_len, batch, d_model).
x = embedded_inputs.permute(1, 0, 2)

encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

# The inputs were padded to a fixed length; mark the padded positions
# (True = ignore as attention key) so real tokens do not attend to padding.
src_padding_mask = sample_input_ids.eq(tokenizer.pad_token_id)  # (batch, seq_len)

encoder_output = transformer_encoder(x, src_key_padding_mask=src_padding_mask)

print(encoder_output.shape)


torch.Size([128, 2, 512])




In [6]:
# Slice the rows first, then pick the column, to avoid materialising the whole
# 'labels' column for two examples.
sample_target_ids = torch.tensor(tokenized_dataset['train'][:2]['labels'])

# Target-side embedding + positional encoding, reordered to
# (tgt_len, batch, d_model) because the decoder uses batch_first=False.
# NOTE(review): the labels are fed to the decoder unshifted; for actual
# training the decoder input would normally be the labels shifted right by
# one position - confirm before adding a loss.
decoder_embed = nn.Embedding(vocab_size, embedding_dim)
decoder_inputs = pos_enc(decoder_embed(sample_target_ids)).permute(1, 0, 2)

decoder_layer = nn.TransformerDecoderLayer(d_model=embedding_dim, nhead=num_heads)
transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

# Causal mask: True above the diagonal blocks attention to later positions,
# so position i only sees target tokens 0..i.
tgt_len = sample_target_ids.size(1)
causal_mask = torch.triu(torch.ones(tgt_len, tgt_len, dtype=torch.bool), diagonal=1)

# Boolean padding masks (True = ignore) for the target and the encoder memory.
# All masks are boolean so their types match inside the attention layers.
pad_id = tokenizer.pad_token_id
decoder_output = transformer_decoder(
    tgt=decoder_inputs,
    memory=encoder_output,
    tgt_mask=causal_mask,
    tgt_key_padding_mask=sample_target_ids.eq(pad_id),
    memory_key_padding_mask=sample_input_ids.eq(pad_id),
)

print(decoder_output.shape)


torch.Size([128, 2, 512])


In [7]:
# Map every decoder state onto the vocabulary, then normalise the scores along
# the last axis into a probability distribution per position.
linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)
logits = linear(decoder_output)
probabilities = torch.softmax(logits, dim=-1)

print(logits.shape, probabilities.shape, sep="\n")

torch.Size([128, 2, 32100])
torch.Size([128, 2, 32100])


In [8]:
import torch

def translate_sentence(src_sentence, max_len=50):
    """Greedily decode a translation of ``src_sentence``.

    Uses the notebook-level ``tokenizer``, ``embedding_layer``, ``pos_enc``,
    ``transformer_encoder``, ``decoder_embed``, ``transformer_decoder`` and
    ``linear`` objects.

    Args:
        src_sentence: source text; it is truncated to 128 tokens.
        max_len: maximum number of tokens to generate.

    Returns:
        The decoded string with special tokens removed. Generation stops early
        when the tokenizer's EOS token is produced.
    """
    # Inference must be deterministic: switch dropout off for the duration of
    # the call and restore each module's previous mode afterwards.
    modules = (embedding_layer, transformer_encoder, decoder_embed,
               transformer_decoder, linear)
    previous_modes = [module.training for module in modules]
    for module in modules:
        module.eval()

    generated_tokens = []
    try:
        # No gradients are needed for decoding; skip building autograd graphs.
        with torch.no_grad():
            # A single sentence needs no padding. Padding to a fixed length
            # without a key-padding mask would let the encoder attend to pads.
            src_ids = tokenizer(src_sentence, return_tensors="pt",
                                truncation=True, max_length=128)["input_ids"]

            # (1, src_len, d_model) -> (src_len, 1, d_model) for batch_first=False.
            src_embed = pos_enc(embedding_layer(src_ids)).permute(1, 0, 2)
            encoder_out = transformer_encoder(src_embed)

            # The pad token doubles as the decoder start token.
            decoder_input_ids = torch.tensor([[tokenizer.pad_token_id]])

            for _ in range(max_len):
                dec_embed = pos_enc(decoder_embed(decoder_input_ids)).permute(1, 0, 2)

                # Causal mask (True = blocked): every prefix position only sees
                # earlier tokens.
                steps = dec_embed.size(0)
                causal_mask = torch.triu(
                    torch.ones(steps, steps, dtype=torch.bool), diagonal=1
                )

                dec_out = transformer_decoder(tgt=dec_embed, memory=encoder_out,
                                              tgt_mask=causal_mask)

                # Only the last position predicts the next token, so project
                # just that state onto the vocabulary.
                next_token = int(torch.argmax(linear(dec_out[-1, 0, :])))
                generated_tokens.append(next_token)

                if next_token == tokenizer.eos_token_id:
                    break

                decoder_input_ids = torch.cat(
                    [decoder_input_ids, torch.tensor([[next_token]])], dim=1
                )
    finally:
        for module, was_training in zip(modules, previous_modes):
            module.train(was_training)

    translation = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return translation

# Smoke-test the greedy decoder on a single German sentence.
# NOTE(review): no training step is visible in this notebook, so the layers
# used by translate_sentence presumably still hold their random initial
# weights and the printed text is not expected to be a real translation.
user_input = "Hallo, wie geht ?"
print("Translation:", translate_sentence(user_input))


Translation: intensity plate Sozial plate goinglasi handsmeanwhiledov bandsdefended Einfluss gramsearned grams gramsearned grams grams Ground schimbare thrust vechi Idea thinkDeutsch thinkDeutsch Markt vivid bandstool situatii cerintedov Raspberry nations verursachtwasting entering Holdermeanwhilelene Stoke cumpără Logo significancemeanwhilesymptôme vivid


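## Pretrained model

The encoder-decoder assembled above is never trained in this notebook, so its "translation" is just noise. For comparison, the cells below load a pretrained German-to-English checkpoint and translate with it directly.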

In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load a pretrained German->English seq2seq checkpoint together with its own
# tokenizer.
model_name = "Helsinki-NLP/opus-mt-de-en"
# NOTE(review): this rebinds the global `tokenizer` that translate_sentence
# reads at call time. Calling translate_sentence after this cell would
# therefore use this tokenizer, whose ids may not match the embedding tables
# built earlier - consider a distinct name if both models are used together.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [10]:
src_sentence = "Hallo, wie geht es dir?"

# Encode the sentence, let the pretrained model generate target ids, then
# decode the single returned sequence back into text.
inputs = tokenizer(src_sentence, return_tensors="pt")
translated_ids = model.generate(**inputs)
translation = tokenizer.batch_decode(translated_ids, skip_special_tokens=True)[0]

print(f"Translation: {translation}")


Translation: Hello, how are you?
