Let’s implement a basic attention mechanism in Python to reinforce the concepts.

In [2]:
import numpy as np

# Toy inputs: a single query vector plus one key/value pair per context word
query = np.array([1, 0, -1])  # Query for "love"
keys = np.array([
    [1, 1, 0],   # Key for "pizza"
    [0, -1, 1],  # Key for "but"
    [1, 0, -1],  # Key for "olives"
])
values = np.array([
    [5],   # Sentiment score for "pizza"
    [0],   # Sentiment score for "but"
    [-3],  # Sentiment score for "olives"
])

# Raw attention scores: one dot product between the query and each key row
scores = keys @ query

# Apply softmax to get weights
def softmax(x):
    """Return the softmax of *x*, normalized over all of its elements."""
    # Shifting by the maximum leaves the result unchanged but keeps exp() from overflowing
    shifted = x - np.max(x)
    numerators = np.exp(shifted)
    return numerators / numerators.sum()

# Normalize the raw scores into attention weights that sum to 1
weights = softmax(scores)

# Blend the value rows according to their attention weights
attention_output = weights @ values

print("Attention Scores:", scores)
print("Attention Weights:", weights)
print("Attention Output:", attention_output)

Attention Scores: [ 1 -1  2]
Attention Weights: [0.25949646 0.03511903 0.70538451]
Attention Output: [-0.81867124]


To improve stability for high-dimensional vectors, we scale the dot product by dividing by the square root of the key dimension, $\sqrt{d_k}$.

In [None]:
import numpy as np

# Toy inputs for the scaled variant: one query, three keys, three scalar values
query = np.array([1, 0, 1])
keys = np.array([
    [1, 0, 1],
    [0, 1, 0],
    [1, 0, -1],
])
values = np.array([[5], [0], [-3]])
scale = np.sqrt(len(query))  # sqrt(d_k); the query and every key row have the same length

# Scaled attention scores: query-key dot products divided by sqrt(d_k)
scores = (keys @ query) / scale

# Turn the scaled scores into attention weights
# (uses the softmax helper defined in the earlier cell)
weights = softmax(scores)

# Blend the value rows according to their attention weights
attention_output = weights @ values

print("Scaled Attention Scores:", scores)
print("Attention Weights:", weights)
print("Attention Output:", attention_output)

## Self Attention
Let’s implement a simple self-attention mechanism for a sentence.

In [None]:
import numpy as np

# Hand-picked toy embeddings: one 3-dimensional row per word of
# "The cat sat on mat". Larger values are used for the content words.
sentence_embeddings = np.array([
    [0.1, 0.1, 0.2],  # "The"  (determiner)
    [0.9, 0.8, 0.7],  # "cat"  (subject noun)
    [0.8, 0.9, 0.8],  # "sat"  (verb)
    [0.2, 0.2, 0.3],  # "on"   (preposition)
    [0.7, 0.6, 0.9],  # "mat"  (object noun)
])

# Hand-picked 3x3 projection matrices. Because the projections are computed as
# `sentence_embeddings @ W`, row i of each matrix multiplies embedding feature i
# and column j produces output feature j; rows do NOT correspond to words.

# Query projection (W_q): maps each embedding to the vector it "asks" with
W_q = np.array([
    [0.6, 0.2, 0.1],
    [0.8, 0.7, 0.6],
    [0.7, 0.9, 0.8],
])

# Key projection (W_k): maps each embedding to the vector queries are matched against
W_k = np.array([
    [0.6, 0.3, 0.2],
    [0.8, 0.7, 0.5],
    [0.7, 0.9, 0.6],
])

# Value projection (W_v): maps each embedding to the content that gets mixed into the output
W_v = np.array([
    [0.2, 0.5, 0.3],
    [0.7, 0.8, 0.6],
    [0.8, 0.9, 0.7],
])

# Project every word embedding into its query, key, and value vectors (each 5x3)
Q, K, V = (sentence_embeddings @ projection for projection in (W_q, W_k, W_v))

# Scaled dot-product scores: entry (i, j) is the similarity between word i's query
# and word j's key, divided by sqrt(d_k) so magnitudes do not grow with the key size
d_k = K.shape[1]  # Dimension of the key vectors
scores = (Q @ K.T) / np.sqrt(d_k)

# Apply softmax to obtain normalized attention weights
def softmax(x):
    """Return the softmax of *x* taken along its last axis.

    For a 2-D score matrix every row is normalized independently and sums to 1;
    a 1-D vector is normalized as a whole.
    """
    # Subtract each row's own maximum (not the global maximum): a row far below the
    # global maximum would otherwise underflow to all zeros and yield 0/0 = NaN.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)  # Normalize rows

# Row i of `weights` is how strongly word i attends to every word in the sentence
weights = softmax(scores)

# Each output row is the attention-weighted mix of all value vectors
attention_output = weights @ V

# Show the attention matrix (how words attend to each other)
print("Attention Weights:", weights, sep="\n")

# Show the final contextualized word representations
print("Attention Output:", attention_output, sep="\n")

## Multi-Head Attention
Let’s implement a simple version of multi-head attention.

In [None]:
import numpy as np

# Toy sentence embeddings, one 3-dimensional row per word (for simplicity).
# Five rows are defined: "The", "cat", "sat", "on", "mat".
sentence_embeddings = np.array([
    [0.1, 0.1, 0.2],  # "The"
    [0.9, 0.8, 0.7],  # "cat"
    [0.8, 0.9, 0.8],  # "sat"
    [0.2, 0.2, 0.3],  # "on"
    [0.7, 0.6, 0.9]   # "mat"
])

# Multi-head attention will have two heads for this example
num_heads = 2
d_model = sentence_embeddings.shape[1]  # Embedding size (3)
head_dim = d_model // num_heads  # Output dimension per head (1 here)


def generate_weights(num_heads, head_dim, input_dim=None):
    """Create random (W_q, W_k, W_v) projection matrices for every head.

    Args:
        num_heads: Number of attention heads.
        head_dim: Output dimension of each head's projections.
        input_dim: Input (embedding) dimension of the projections. Defaults to
            ``head_dim``, which gives square ``head_dim x head_dim`` matrices.

    Returns:
        A list with one ``(W_q, W_k, W_v)`` tuple per head; each matrix has shape
        ``(input_dim, head_dim)`` and entries drawn from ``np.random.rand``.
    """
    if input_dim is None:
        input_dim = head_dim
    return [
        (np.random.rand(input_dim, head_dim),  # W_q for this head
         np.random.rand(input_dim, head_dim),  # W_k for this head
         np.random.rand(input_dim, head_dim))  # W_v for this head
        for _ in range(num_heads)
    ]


# Generate separate weight matrices for each head.
# Each head projects the FULL embedding down to head_dim features. Slicing the
# embedding into num_heads column chunks instead would silently drop the last
# feature whenever d_model is not divisible by num_heads (3 features, 2 heads).
np.random.seed(42)  # Seed for reproducibility
multi_head_weights = generate_weights(num_heads, head_dim, input_dim=d_model)


def softmax(x):
    """Return the softmax of *x* along its last axis (each row sums to 1)."""
    # Subtract each row's own maximum so no row can underflow to all zeros (0/0 = NaN)
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)


# Perform multi-head attention
all_attention_outputs = []
for head_index, (W_q, W_k, W_v) in enumerate(multi_head_weights):
    # Compute Queries (Q), Keys (K), and Values (V) for this head.
    # Every head sees the whole embedding but applies its own learned projection.
    Q = sentence_embeddings @ W_q  # (num_words, head_dim)
    K = sentence_embeddings @ W_k  # (num_words, head_dim)
    V = sentence_embeddings @ W_v  # (num_words, head_dim)

    # Compute attention scores using scaled dot-product attention
    scores = Q @ K.T  # Similarity between every query and every key
    d_k = K.shape[1]  # Dimension of keys
    scores /= np.sqrt(d_k)  # Scale scores for numerical stability

    # Apply softmax to obtain normalized attention weights (rows sum to 1)
    attention_weights = softmax(scores)

    # Compute weighted sum of values to get the attention output for this head
    attention_output = attention_weights @ V

    # Store the attention output for this head
    all_attention_outputs.append(attention_output)

    # Print detailed information for this head
    print(f"Head {head_index + 1}:")
    print("Attention Weights:")
    print(attention_weights)
    print("Attention Output:")
    print(attention_output)
    print("\n")

# Concatenate the outputs from all heads along the feature axis:
# the result has shape (num_words, num_heads * head_dim)
final_attention_output = np.concatenate(all_attention_outputs, axis=1)

# Print the final multi-head attention output
print("Final Multi-Head Attention Output:")
print(final_attention_output)

## Encoder-Only Models
Let's introduce Bidirectional Encoder Representations from Transformers (BERT) to create sentence embeddings. The BERT base model represents each token with a 768-dimensional vector that captures its meaning in the context of the sentence.

In [None]:
!pip install transformers torch

# Import required libraries
from transformers import BertTokenizer, BertModel  # Pretrained BERT tokenizer and model
import torch  # For tensor manipulation (similar to NumPy)

# Load a pretrained BERT model and tokenizer
# BERT base uncased: lowercase version of the model trained on English text
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Put the model in evaluation mode (not training)
model.eval()

# Step 1: Define a sentence for embedding
sentence = "The cat sat on the mat."

# Step 2: Tokenize the sentence
# Tokenization converts the sentence into tokens that BERT understands.
# BERT uses WordPiece tokenization, which splits words into subwords if necessary.
inputs = tokenizer(sentence, return_tensors="pt")  # "pt" indicates PyTorch tensors
print("Tokenized Input IDs:", inputs['input_ids'])
print("Attention Mask:", inputs['attention_mask'])

# Explanation of Tokenized Output:
# input_ids: Each word/subword is converted to an integer representing its vocabulary index.
# attention_mask: A binary mask indicating which tokens are real (1) and which are padding (0).

# Step 3: Pass the tokenized inputs through the BERT model
with torch.no_grad():  # Disable gradient calculation (not needed for inference)
    outputs = model(**inputs)

# Step 4: Extract the hidden states from BERT's output
# The last hidden state contains one embedding per token in the input sentence.
last_hidden_state = outputs.last_hidden_state  # Shape: (batch_size, sequence_length, hidden_size)

# Step 5: Mean-pool the token embeddings to create a sentence embedding
# Each token is weighted by its attention mask so that padding tokens (mask == 0)
# never dilute the average. For this single, unpadded sentence the result equals a
# plain mean over the sequence dimension, but the same code stays correct when a
# padded batch of sentences is encoded.
token_mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_state.dtype)  # (batch, seq, 1)
summed_embeddings = (last_hidden_state * token_mask).sum(dim=1)  # Sum over real tokens only
token_counts = token_mask.sum(dim=1).clamp(min=1)  # Guard against division by zero
sentence_embedding = summed_embeddings / token_counts

# Print the sentence embedding
print("Sentence Embedding Shape:", sentence_embedding.shape)
print("Sentence Embedding Vector:", sentence_embedding)


## Text Classification with BERT
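Let’s use the Hugging Face Transformers library to set up BERT for text classification. The example below loads the pretrained model and runs inference only; the classification head still has to be fine-tuned on labeled data before its predictions are meaningful.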

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load pre-trained BERT tokenizer and a BERT model with a 2-class classification head
# NOTE: "bert-base-uncased" is a pretraining checkpoint, so the classification head
# on top of it is newly initialized. The outputs below are therefore arbitrary until
# the model has been fine-tuned on labeled examples.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Make inference mode explicit (disables dropout)
model.eval()

# Example input text
texts = ["I love this product!", "The movie was terrible."]

# Tokenize the input text; padding aligns the two sentences to the same length
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Perform inference without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits  # One row of raw class scores per input text

# Convert logits to probabilities and predictions
probs = torch.softmax(logits, dim=1)
predictions = torch.argmax(probs, dim=1)

print("Probabilities:", probs)
# Predicted class index per text; 0 = negative / 1 = positive only holds once the
# model has been fine-tuned with that label mapping
print("Predictions:", predictions)


## Text Generation with GPT
Let’s generate text using a pre-trained GPT model.

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load pre-trained GPT-2 tokenizer and language model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Example prompt
prompt = "Once upon a time in a faraway land,"

# Tokenize the input prompt; calling the tokenizer returns the token IDs together
# with the matching attention mask
encoded = tokenizer(prompt, return_tensors="pt")
inputs = encoded["input_ids"]

# Generate text
output = model.generate(
    inputs,
    attention_mask=encoded["attention_mask"],  # Tell generate() which tokens are real
    pad_token_id=tokenizer.eos_token_id,       # GPT-2 has no pad token; reuse end-of-text
    max_length=50,                             # Upper bound on prompt + generated tokens
    do_sample=True,                            # Sample instead of always taking the top token
    temperature=0.1,                           # Low temperature keeps sampling close to greedy
)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:", generated_text)


## Language Translation with T5
Let’s translate a sentence using the T5 model.

In [None]:
# Install necessary libraries
!pip install sentencepiece

#load libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load pre-trained T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Input text for translation
# T5 selects the task from the text prefix ("translate English to French: ")
input_text = "translate English to French: The weather is sunny."

# Tokenize the input text (returns input_ids and attention_mask)
inputs = tokenizer(input_text, return_tensors="pt")

# Generate translation
# Forward the whole encoding so the attention mask is passed along with the IDs;
# this keeps the call correct if several padded sentences are translated at once.
output = model.generate(**inputs, max_length=50)

# Decode the generated translation
translation = tokenizer.decode(output[0], skip_special_tokens=True)
print("Translation:", translation)


Let's use T5 for text summarization.

In [None]:
# Import the necessary libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Fetch the pre-trained "t5-small" checkpoint (tokenizer and model)
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Passage to summarize
text = """
The Mona Lisa is a half-length portrait painting by the Italian artist Leonardo da Vinci.
It is considered an archetypal masterpiece of the Italian Renaissance, and it has been described
as the most famous, most visited, most written about, and most sung about work of art in the world.
"""

# T5 is text-to-text: the "summarize: " prefix tells the model which task to perform
input_text = f"summarize: {text}"

# Encode to a PyTorch tensor of token IDs, truncating anything beyond 512 tokens
inputs = tokenizer.encode(input_text, truncation=True, max_length=512, return_tensors="pt")

# Run beam search to produce the summary token IDs
summary_ids = model.generate(
    inputs,
    num_beams=4,           # Keep the 4 best partial summaries at every step
    max_length=50,         # Upper bound on the summary length, in tokens
    early_stopping=True,   # Stop as soon as num_beams finished candidates exist
)

# Turn the best sequence back into text, dropping special tokens
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Show the result
print("Summary:", summary)
