Let’s implement a basic attention mechanism in Python to reinforce the concepts.

In [1]:
import numpy as np

# Define query, keys, and values as vectors
query = np.array([1, 0, 1])  # Represents "love"
keys = np.array([[1, 0, 1],  # Represents "pizza"
                 [0, 1, 0],  # Represents "but"
                 [1, 0, -1]])  # Represents "olives"
values = np.array([[5], [0], [-3]])  # Sentiment scores for "pizza," "but," "olives"

# Calculate attention scores (dot product of query and keys)
scores = np.dot(keys, query)

# Apply softmax to get weights
def softmax(x):
    exp_x = np.exp(x - np.max(x))  # Subtract max for numerical stability
    return exp_x / np.sum(exp_x)

weights = softmax(scores)

# Weighted sum of values
attention_output = np.dot(weights, values)

print("Attention Scores:", scores)
print("Attention Weights:", weights)
print("Attention Output:", attention_output)

Attention Scores: [2 0 0]
Attention Weights: [0.78698604 0.10650698 0.10650698]
Attention Output: [3.61540927]


To improve stability for high-dimensional vectors, we scale the dot product by dividing by the square root of the key dimension

In [None]:
import numpy as np

# Define query, keys, values, and scaling factor
query = np.array([1, 0, 1])
keys = np.array([[1, 0, 1],
                 [0, 1, 0],
                 [1, 0, -1]])
values = np.array([[5], [0], [-3]])
scale = np.sqrt(query.shape[0])  # d_k = dimensionality of the query

# Calculate scaled attention scores
scores = np.dot(keys, query) / scale

# Apply softmax to get weights
weights = softmax(scores)

# Weighted sum of values
attention_output = np.dot(weights, values)

print("Scaled Attention Scores:", scores)
print("Attention Weights:", weights)
print("Attention Output:", attention_output)

Scaled Attention Scores: [1.15470054 0.         0.        ]
Attention Weights: [0.61338261 0.19330869 0.19330869]
Attention Output: [2.48698697]


## Self Attention
Let’s implement a simple self-attention mechanism for a sentence.

In [None]:
import numpy as np

# Define structured sentence embeddings (each row represents a word)
# The embeddings represent semantic roles:
# "The" (determiner), "cat" (subject noun), "sat" (verb), "on" (preposition), "mat" (object noun)
sentence_embeddings = np.array([
    [0.1, 0.1, 0.2],  # "The"  (low influence determiner)
    [0.9, 0.8, 0.7],  # "cat"  (subject noun, strong influence)
    [0.8, 0.9, 0.8],  # "sat"  (verb, central word, strong influence)
    [0.2, 0.2, 0.3],  # "on"   (preposition, weak influence but linked to "mat")
    [0.7, 0.6, 0.9]   # "mat"  (object noun, linked to "on" and "sat", but not "the")
])

# Define structured weight matrices for Query (Q), Key (K), and Value (V)
# These weights are manually structured to enhance word relationships

# Query weight matrix (W_q) (3x3)
# Controls how much influence each word has when "asking for context"
W_q = np.array([
    [0.6, 0.2, 0.1],  # Slight attention to structure words
    [0.8, 0.7, 0.6],  # "cat" has high query influence
    [0.7, 0.9, 0.8]   # "sat" is the central querying word
])

# Key weight matrix (W_k) (3x3)
# Controls how words "store" information for queries to access
W_k = np.array([
    [0.6, 0.3, 0.2],  # "The" contributes weakly to keys
    [0.8, 0.7, 0.5],  # "cat" stores important information
    [0.7, 0.9, 0.6]   # "sat" stores strong reference points
])

# Value weight matrix (W_v) (3x3)
# Controls how much information each word contributes to the final representation
W_v = np.array([
    [0.2, 0.5, 0.3],  # "The" contributes little meaning
    [0.7, 0.8, 0.6],  # "Cat" contributes strongly
    [0.8, 0.9, 0.7]   # "Sat" contributes highly
])

# Compute Queries (Q), Keys (K), and Values (V) for each word
Q = sentence_embeddings @ W_q  # Transform embeddings into Queries
K = sentence_embeddings @ W_k  # Transform embeddings into Keys
V = sentence_embeddings @ W_v  # Transform embeddings into Values

# Compute attention scores using scaled dot-product attention
scores = Q @ K.T  # Compute raw attention scores (similarity between Q and K)

# Scale scores to stabilize training (common practice in attention models)
d_k = K.shape[1]  # Dimension of keys
scores /= np.sqrt(d_k)  # Scaling factor

# Apply softmax to obtain normalized attention weights
def softmax(x):
    exp_x = np.exp(x - np.max(x))  # Subtract max value for numerical stability
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)  # Normalize rows

weights = softmax(scores)  # Compute final attention weights

# Compute weighted sum of values to get attention output
attention_output = weights @ V  # Weighted combination of V based on attention

# Print the attention matrix (how words attend to each other)
print("Attention Weights:")
print(weights)

# Print the final contextualized word representations
print("Attention Output:")
print(attention_output)

Attention Weights:
[[0.13996839 0.23453985 0.24576871 0.15175171 0.22797133]
 [0.01847211 0.30437435 0.39016773 0.02861227 0.25837354]
 [0.01474143 0.30522766 0.39994043 0.02367127 0.25641921]
 [0.10588597 0.2546468  0.27557477 0.12146871 0.24242375]
 [0.02037275 0.30283715 0.38578421 0.03107175 0.25993415]]
Attention Output:
[[1.04688219 1.35331457 0.98510099]
 [1.30097982 1.68605596 1.22570457]
 [1.31055465 1.69852508 1.23472785]
 [1.11290351 1.43982809 1.04766729]
 [1.29621874 1.67979842 1.22119359]]


## Multi-Head Attention
Let’s implement a simple version of multi-head attention.

In [None]:
import numpy as np

# Define sentence embeddings for each word in "The cat sat on the mat"
# Each word is represented as a 3-dimensional vector (for simplicity)
sentence_embeddings = np.array([
    [0.1, 0.1, 0.2],  # "The"  (low influence)
    [0.9, 0.8, 0.7],  # "cat"  (high influence)
    [0.8, 0.9, 0.8],  # "sat"  (central word)
    [0.2, 0.2, 0.3],  # "on"   (context word)
    [0.7, 0.6, 0.9]   # "mat"  (linked to "sat" and "on")
])

# Multi-head attention will have two heads for this example
num_heads = 2
head_dim = sentence_embeddings.shape[1] // num_heads  # Dimension per head

# Function to create weight matrices for each head
# We'll create separate W_q, W_k, W_v for each head
np.random.seed(42)  # Seed for reproducibility
def generate_weights(num_heads, head_dim):
    return [
        (np.random.rand(head_dim, head_dim),  # W_q for this head
         np.random.rand(head_dim, head_dim),  # W_k for this head
         np.random.rand(head_dim, head_dim))  # W_v for this head
        for _ in range(num_heads)
    ]

# Generate separate weight matrices for each head
multi_head_weights = generate_weights(num_heads, head_dim)

# Define softmax function for numerical stability
def softmax(x):
    exp_x = np.exp(x - np.max(x))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

# Perform multi-head attention
all_attention_outputs = []
for head_index, (W_q, W_k, W_v) in enumerate(multi_head_weights):
    # Slice the sentence embeddings for this head
    # Each head will process a different projection of the embeddings
    head_embeddings = sentence_embeddings[:, head_index * head_dim : (head_index + 1) * head_dim]

    # Compute Queries (Q), Keys (K), and Values (V) for this head
    Q = head_embeddings @ W_q  # Transform embeddings into Queries
    K = head_embeddings @ W_k  # Transform embeddings into Keys
    V = head_embeddings @ W_v  # Transform embeddings into Values

    # Compute attention scores using scaled dot-product attention
    scores = Q @ K.T  # Compute raw attention scores (similarity between Q and K)
    d_k = K.shape[1]  # Dimension of keys
    scores /= np.sqrt(d_k)  # Scale scores for numerical stability

    # Apply softmax to obtain normalized attention weights
    attention_weights = softmax(scores)

    # Compute weighted sum of values to get the attention output for this head
    attention_output = attention_weights @ V

    # Store the attention output for this head
    all_attention_outputs.append(attention_output)

    # Print detailed information for this head
    print(f"Head {head_index + 1}:")
    print("Attention Weights:")
    print(attention_weights)
    print("Attention Output:")
    print(attention_output)
    print("\n")

# Concatenate the outputs from all heads to form the final multi-head attention output
final_attention_output = np.concatenate(all_attention_outputs, axis=1)

# Print the final multi-head attention output
print("Final Multi-Head Attention Output:")
print(final_attention_output)

Head 1:
Attention Weights:
[[0.19687765 0.20256664 0.20184662 0.19757994 0.20112916]
 [0.1727621  0.2232503  0.21620917 0.17838832 0.20939011]
 [0.17568776 0.22065491 0.21445791 0.18076445 0.20843496]
 [0.193778   0.20513864 0.20368291 0.19516293 0.20223752]
 [0.17863977 0.21806136 0.21269322 0.18314844 0.20745723]]
Attention Output:
[[0.39804471]
 [0.41973964]
 [0.41707689]
 [0.40080164]
 [0.41439898]]


Head 2:
Attention Weights:
[[0.19921608 0.20052285 0.20071023 0.19940224 0.20014861]
 [0.19376598 0.20417066 0.20570196 0.19521925 0.20114215]
 [0.19299284 0.20468993 0.20641784 0.19462201 0.20127738]
 [0.19843347 0.20104528 0.20142119 0.1988045  0.20029556]
 [0.19531638 0.20313068 0.20427224 0.19641403 0.20086666]]
Attention Output:
[[0.08126514]
 [0.08229866]
 [0.08244593]
 [0.08141305]
 [0.08200382]]


Final Multi-Head Attention Output:
[[0.39804471 0.08126514]
 [0.41973964 0.08229866]
 [0.41707689 0.08244593]
 [0.40080164 0.08141305]
 [0.41439898 0.08200382]]


## Encoder-Only Models
Let's introduce the Bidirectional encoder representations from transformers (BERT) to create sentence embeddings.  The BERT model represents words with a 768-dimensional vector that captures the contextual meaning of the sentence.

In [11]:
%pip install transformers torch

# Import required libraries
from transformers import BertTokenizer, BertModel  # Pretrained BERT tokenizer and model
import torch  # For tensor manipulation (similar to NumPy)

# Load a pretrained BERT model and tokenizer
# BERT base uncased: lowercase version of the model trained on English text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Put the model in evaluation mode (not training)
model.eval()

# Step 1: Define a sentence for embedding
sentence = "The cat sat on the mat."

# Step 2: Tokenize the sentence
# Tokenization converts the sentence into tokens that BERT understands.
# BERT uses WordPiece tokenization, which splits words into subwords if necessary.
inputs = tokenizer(sentence, return_tensors="pt")  # "pt" indicates PyTorch tensors
print("Tokenized Input IDs:", inputs['input_ids'])
print("Attention Mask:", inputs['attention_mask'])

# Explanation of Tokenized Output:
# input_ids: Each word/subword is converted to an integer representing its vocabulary index.
# attention_mask: A binary mask indicating which tokens are real (1) and which are padding (0).

# Step 3: Pass the tokenized inputs through the BERT model
with torch.no_grad():  # Disable gradient calculation (not needed for inference)
    outputs = model(**inputs)

# Step 4: Extract the hidden states from BERT's output
# BERT returns two outputs: the last hidden state and the pooled output.
# The last hidden state contains embeddings for each token in the input sentence.
last_hidden_state = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

# Step 5: Average the token embeddings to create a sentence embedding
# This is a simple way to get a fixed-size vector representing the entire sentence.
sentence_embedding = last_hidden_state.mean(dim=1)  # Average across the sequence length dimension

# Print the sentence embedding
print("Sentence Embedding Shape:", sentence_embedding.shape)
print("Sentence Embedding Vector:", sentence_embedding)


[0m[0m[38;5;48m [39m[38;5;48m_[39m[38;5;48m_[39m[38;5;48m_[39m[38;5;48m_[39m[38;5;48m_[39m[38;5;48m_[39m[38;5;84m_[39m[38;5;83m_[39m[38;5;83m_[39m[38;5;83m_[39m[38;5;83m_[39m[38;5;83m_[39m[38;5;83m_[39m[38;5;83m_[39m[38;5;83m_[39m[38;5;83m_[39m[38;5;83m_[39m[38;5;83m_[39m[38;5;119m_[39m[38;5;118m_[39m[38;5;118m_[39m[38;5;118m_[39m[38;5;118m_[39m[38;5;118m_[39m[38;5;118m_[39m[38;5;118m_[39m[38;5;118m_[39m[38;5;118m_[39m[38;5;154m_[39m[38;5;154m_[39m[38;5;154m_[39m[38;5;154m_[39m[38;5;154m_[39m[38;5;154m [39m[0m[38;5;154m[39m[38;5;154m[39m
[0m[38;5;48m<[39m[38;5;48m [39m[38;5;48mG[39m[38;5;48mo[39m[38;5;84mo[39m[38;5;83md[39m[38;5;83m [39m[38;5;83mm[39m[38;5;83mo[39m[38;5;83mr[39m[38;5;83mn[39m[38;5;83mi[39m[38;5;83mn[39m[38;5;83mg[39m[38;5;83m,[39m[38;5;83m [39m[38;5;119ms[39m[38;5;118mu[39m[38;5;118mn[39m[38;5;118ms[39m[38;5;118mh[39m[38;5;118mi[39m[38;5;118mn[39

  from .autonotebook import tqdm as notebook_tqdm


Tokenized Input IDs: tensor([[  101,  1996,  4937,  2938,  2006,  1996, 13523,  1012,   102]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])
Sentence Embedding Shape: torch.Size([1, 768])
Sentence Embedding Vector: tensor([[-1.8180e-01, -2.6618e-01, -2.1887e-01,  2.1089e-01,  2.8473e-01,
         -1.7185e-01, -1.6588e-01,  5.0974e-01, -1.2715e-01, -1.6971e-01,
          3.0352e-02, -4.6905e-01, -3.5898e-02,  1.3398e-01, -1.1766e-01,
         -2.4077e-01,  1.2072e-01,  5.9154e-02, -3.9102e-01,  1.0781e-01,
          2.3168e-01, -2.0653e-01, -5.2181e-01,  9.9232e-02,  2.9413e-01,
         -2.4380e-01,  7.1086e-02, -1.4326e-01, -5.0724e-02, -2.2996e-02,
          2.1027e-01, -5.6707e-02, -1.4975e-01, -2.7953e-01,  5.4398e-02,
         -8.8523e-02,  2.9781e-01,  3.1730e-01, -5.4683e-01,  2.3623e-01,
         -3.6286e-01, -1.8020e-01,  2.5657e-02,  5.8190e-01,  4.0848e-01,
         -2.3206e-01,  4.0671e-01, -2.0782e-01,  6.3488e-01,  1.6949e-01,
         -6.2998e-01,  3.3577e-01, -2

## Text Classification with BERT
Let’s use the Hugging Face Transformers library to fine-tune BERT for text classification.

In [15]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load pre-trained BERT tokenizer and model for classification
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Example input text
texts = ["I love this product!", "The movie was terrible."]

# Tokenize the input text
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Perform inference
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Convert logits to probabilities and predictions
probs = torch.softmax(logits, dim=1)
predictions = torch.argmax(probs, dim=1)

print("Probabilities:", probs)
print("Predictions:", predictions)  # 0 for negative, 1 for positive


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Probabilities: tensor([[0.4816, 0.5184],
        [0.5294, 0.4706]])
Predictions: tensor([1, 0])


## Text Generation with GPT
Let’s generate text using a pre-trained GPT model.

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained GPT tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Example prompt
prompt = "Once upon a time in a faraway land,"

# Tokenize the input prompt
inputs = tokenizer.encode(prompt, return_tensors="pt")

# Generate text
output = model.generate(inputs, max_length=50, do_sample=True, temperature=0.1)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: Once upon a time in a faraway land, a man named Tiberius, who had been a soldier in the army of the Romans, was sent to the city of Rome to be a priest. He was a man of great wealth and great


## Basic Code: Text Classification with BERT.
The easiest way to use BERT is with the Hugging Face transformers library. This code shows how to load a pre-trained BERT that has already been fine-tuned for sentiment analysis and use it to make a prediction.

In [19]:
# First, you need to install the libraries:
# pip install torch transformers

import torch
# Import the "Auto" classes
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1. Load a Tokenizer and a Model
# We're loading a DistilBERT model, so we let "Auto" figure it out
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

# AutoTokenizer will load DistilBertTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# AutoModelForSequenceClassification will load DistilBertForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# 2. Define your text
text = "I hate this movie. The acting was shit!"
# Try changing this text to: "This was the worst film I have ever seen."

# 3. Tokenize the text
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

print("--- Tokenizer Output ---")
print("Input IDs:", inputs['input_ids'])
print("Attention Mask:", inputs['attention_mask'])
print("--------------------------\n")

# 4. Get the Prediction (Inference)
with torch.no_grad():
    outputs = model(**inputs)

# 5. Interpret the Output
logits = outputs.logits
print("Raw Logits:", logits)

predicted_class_id = torch.argmax(logits, dim=1).item()

# The model's config tells us what the class labels mean
# For this model: {0: 'NEGATIVE', 1: 'POSITIVE'}
predicted_label = model.config.id2label[predicted_class_id]

print(f"\nText: '{text}'")
print(f"Predicted Sentiment: {predicted_label} (ID: {predicted_class_id})")

--- Tokenizer Output ---
Input IDs: tensor([[ 101, 1045, 5223, 2023, 3185, 1012, 1996, 3772, 2001, 4485,  999,  102]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
--------------------------

Raw Logits: tensor([[ 4.3654, -3.5200]])

Text: 'I hate this movie. The acting was shit!'
Predicted Sentiment: NEGATIVE (ID: 0)


## Other Sample Examples of BERT Applications

In [20]:
# Install the necessary libraries if you haven't already
# !pip install torch transformers

import torch
from transformers import pipeline, BertModel, BertTokenizer
from IPython.display import Markdown, display

def printmd(string):
    """Helper function to print Markdown in a notebook cell."""
    display(Markdown(string))

printmd("## ✅ BERT Applications Code Examples (using Hugging Face Transformers)")
printmd("---")

## 1. ❓ Question Answering (Q&A) Example

printmd("### 1. Question Answering (Q&A)")

# Initialize the Question Answering pipeline
# Uses a model fine-tuned for Q&A (SQuAD dataset)
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
    tokenizer="distilbert-base-cased-distilled-squad"
)

# Define the context and question
context = (
    "The 2024 Summer Olympics were held in Paris, France. "
    "These games featured the introduction of breakdancing as an official sport, "
    "and the United States topped the medal count for the second consecutive time."
)
question = "Which new sport was introduced at the 2024 Olympics?"

# Get the prediction
result = qa_pipeline(question=question, context=context)

printmd(f"**Context:** {context}")
printmd(f"**Question:** *{question}*")
printmd(f"**BERT's Extracted Answer:** **{result['answer']}**")
printmd(f"**Confidence Score:** {result['score']:.4f}")

printmd("---")


## 2. 🏷️ Named Entity Recognition (NER) Example

printmd("### 2. Named Entity Recognition (NER)")

# Initialize the Named Entity Recognition pipeline
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")

# Define the input text
text = "Angela Merkel met with the CEO of Siemens in Berlin yesterday."

# Get the prediction
results = ner_pipeline(text)

printmd(f"**Input Text:** {text}")
printmd("**Identified Entities:**")

for entity in results:
    printmd(
        f"* **Entity:** **{entity['word']}** "
        f"(Type: `{entity['entity_group']}`, Score: {entity['score']:.4f})"
    )

printmd("---")


## 3. 🧠 Feature Extraction (Embeddings) Example

printmd("### 3. Feature Extraction (Sentence Embeddings)")

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Define the text
text = "BERT is a powerful language model."

# Tokenize and prepare input
inputs = tokenizer(text, return_tensors='pt')

# Get the model output
with torch.no_grad():
    outputs = model(**inputs)

# The embedding for the [CLS] token (at index 0) is used as the sentence embedding.
sentence_embedding = outputs.last_hidden_state[:, 0, :]

printmd(f"**Input Text:** '{text}'")
printmd(f"**Sentence Embedding Shape (Vector Size):** {sentence_embedding.shape}")
printmd(f"**First 5 dimensions of the 768-dim vector:**")
print(sentence_embedding[0, :5].numpy())

## ✅ BERT Applications Code Examples (using Hugging Face Transformers)

---

### 1. Question Answering (Q&A)

Device set to use mps:0


**Context:** The 2024 Summer Olympics were held in Paris, France. These games featured the introduction of breakdancing as an official sport, and the United States topped the medal count for the second consecutive time.

**Question:** *Which new sport was introduced at the 2024 Olympics?*

**BERT's Extracted Answer:** **breakdancing**

**Confidence Score:** 0.9941

---

### 2. Named Entity Recognition (NER)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


**Input Text:** Angela Merkel met with the CEO of Siemens in Berlin yesterday.

**Identified Entities:**

* **Entity:** **Angela Merkel** (Type: `PER`, Score: 0.9930)

* **Entity:** **Siemens** (Type: `ORG`, Score: 0.9987)

* **Entity:** **Berlin** (Type: `LOC`, Score: 0.9996)

---

### 3. Feature Extraction (Sentence Embeddings)

**Input Text:** 'BERT is a powerful language model.'

**Sentence Embedding Shape (Vector Size):** torch.Size([1, 768])

**First 5 dimensions of the 768-dim vector:**

[-0.4806293  -0.2720138   0.07131609 -0.16561534 -0.5366288 ]


## Language Translation with T5
Let’s translate a sentence using the T5 model.

In [None]:
# uinstall necessary libraries
!pip install sentencepiece

#load libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load pre-trained T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Input text for translation
input_text = "translate English to French: The weather is sunny."

# Tokenize the input text
inputs = tokenizer(input_text, return_tensors="pt")

# Generate translation
output = model.generate(inputs.input_ids, max_length=50)

# Decode the generated translation
translation = tokenizer.decode(output[0], skip_special_tokens=True)
print("Translation:", translation)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Translation: Le temps est ensoleillé.


Let's use T5 for text summarization

In [None]:
# Import the necessary libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pre-trained T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")  # "t5-small" is a smaller, faster version of T5
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Input text for summarization
text = """
The Mona Lisa is a half-length portrait painting by the Italian artist Leonardo da Vinci.
It is considered an archetypal masterpiece of the Italian Renaissance, and it has been described
as the most famous, most visited, most written about, and most sung about work of art in the world.
"""

# Prepare the text for the T5 model
# T5 treats all tasks as text-to-text; for summarization, we prepend "summarize: " to the input text
input_text = "summarize: " + text

# Tokenize the input text and convert it to a PyTorch tensor
inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate the summary
summary_ids = model.generate(
    inputs,
    max_length=50,          # Maximum length of the summary
    num_beams=4,            # Beam search with 4 beams for more coherent output
    early_stopping=True     # Stop once an optimal summary is found
)

# Decode the generated summary back to text
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the generated summary
print("Summary:", summary)


Summary: the painting is considered an archetypal masterpiece of the italian Renaissance. it has been described as the most famous, most visited, most written about, and most sung about work of art.
