In [1]:
# -----------------------------------------------------------------------------
# NOTEBOOK 1: Installation and Basic Summarization (BART)
# -----------------------------------------------------------------------------

from transformers import pipeline
from IPython.display import Markdown, display

def printmd(string):
    """Helper function to print Markdown in a notebook cell."""
    display(Markdown(string))

printmd("# 1. üì• Installation and Basic Summarization (BART) üìù")
printmd("This notebook sets up the environment and demonstrates a core task for Encoder-Decoder models: **Abstractive Summarization**.")
printmd("---")

printmd("## üì• Installation (Run this cell once)")
printmd("```bash\n!pip install transformers torch\n```")

# Initialize the Summarization pipeline using a BART model
try:
    summarizer = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        tokenizer="facebook/bart-large-cnn"
    )
    print("Pipeline initialized successfully with bart-large-cnn.")
except Exception as e:
    print(f"Error initializing pipeline (requires download): {e}")
    summarizer = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-6-6", # Fallback to smaller model
        tokenizer="sshleifer/distilbart-cnn-6-6"
    )
    print("Falling back to distilbart-cnn-6-6.")


text_to_summarize = """
The Solar System is the gravitationally bound system of the Sun and the objects that orbit it, either directly or indirectly. 
Of the objects that orbit the Sun directly, the largest are the eight planets, with the remainder being smaller objects, 
such as the five dwarf planets and millions of small Solar System bodies. The Sun, a G-type main-sequence star, accounts 
for 99.86% of the System's known mass and is dominant by gravity.
"""

print(f"\nOriginal Text (Length: {len(text_to_summarize.split())} words):\n> {text_to_summarize.strip()}")

# Generate the summary
summary_result = summarizer(
    text_to_summarize,
    max_length=40,
    min_length=10,
    do_sample=False
)[0]['summary_text']

print(f"\nGenerated Summary:\n> {summary_result}")

printmd("## Key Insight")
printmd("Encoder-Decoder models, like BART, are ideal for **Seq2Seq (Sequence-to-Sequence)** tasks where the output (summary) is structurally different and shorter than the input (original text).")

  from .autonotebook import tqdm as notebook_tqdm


# 1. üì• Installation and Basic Summarization (BART) üìù

This notebook sets up the environment and demonstrates a core task for Encoder-Decoder models: **Abstractive Summarization**.

---

## üì• Installation (Run this cell once)

```bash
!pip install transformers torch
```

Device set to use mps:0


Pipeline initialized successfully with bart-large-cnn.

Original Text (Length: 73 words):
> The Solar System is the gravitationally bound system of the Sun and the objects that orbit it, either directly or indirectly. 
Of the objects that orbit the Sun directly, the largest are the eight planets, with the remainder being smaller objects, 
such as the five dwarf planets and millions of small Solar System bodies. The Sun, a G-type main-sequence star, accounts 
for 99.86% of the System's known mass and is dominant by gravity.

Generated Summary:
> The Solar System is the gravitationally bound system of the Sun and the objects that orbit it, either directly or indirectly. The Sun accounts for 99.86% of the System's


## Key Insight

Encoder-Decoder models, like BART, are ideal for **Seq2Seq (Sequence-to-Sequence)** tasks where the output (summary) is structurally different and shorter than the input (original text).

In [2]:
# -----------------------------------------------------------------------------
# NOTEBOOK 2: Translation (T5) - The Multi-Task Model
# -----------------------------------------------------------------------------

from transformers import pipeline
from IPython.display import Markdown, display

def printmd(string):
    display(Markdown(string))

printmd("# 2. üó£Ô∏è Translation (T5) - The Multi-Task Model üåê")
printmd("T5 (Text-to-Text Transfer Transformer) uses a unique approach: framing every task, including translation, as a **text-to-text** problem using prefixes.")
printmd("---")

# Initialize the T5 model for translation
try:
    translator = pipeline(
        "text2text-generation",
        model="t5-small",
        tokenizer="t5-small"
    )
    print("T5 model initialized successfully.")
except Exception as e:
    print(f"Error initializing T5 pipeline: {e}")

# T5 requires a task prefix
english_text = "translate English to German: The cat jumped over the fence."
print(f"Input Prompt: {english_text}")

# Translate
translation_result = translator(
    english_text,
    max_length=50
)[0]['generated_text']

print(f"\nGenerated Translation: {translation_result}")

print("\n" + "=" * 60)

# Demonstrating another T5 task (Q&A) using a different prefix
qa_prompt = "question: What is the largest planet in our solar system? context: Jupiter is the largest planet in our solar system, followed by Saturn."
print(f"Input Prompt: {qa_prompt}")

qa_result = translator(
    qa_prompt,
    max_length=20
)[0]['generated_text']

print(f"\nGenerated Answer: {qa_result}")

printmd("## Key Insight")
printmd("The T5 Encoder-Decoder architecture uses a **task-specific prefix** (e.g., 'translate English to German:') to condition the model. The Encoder processes the entire prefixed input, and the Decoder generates the corresponding output.")

# 2. üó£Ô∏è Translation (T5) - The Multi-Task Model üåê

T5 (Text-to-Text Transfer Transformer) uses a unique approach: framing every task, including translation, as a **text-to-text** problem using prefixes.

---

Device set to use mps:0


T5 model initialized successfully.
Input Prompt: translate English to German: The cat jumped over the fence.


Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Generated Translation: Die Katze sprang √ºber den Zaun.

Input Prompt: question: What is the largest planet in our solar system? context: Jupiter is the largest planet in our solar system, followed by Saturn.

Generated Answer: Jupiter


## Key Insight

The T5 Encoder-Decoder architecture uses a **task-specific prefix** (e.g., 'translate English to German:') to condition the model. The Encoder processes the entire prefixed input, and the Decoder generates the corresponding output.

In [3]:
# -----------------------------------------------------------------------------
# NOTEBOOK 3: Understanding Encoder-Decoder Inputs
# -----------------------------------------------------------------------------

from transformers import AutoTokenizer, T5ForConditionalGeneration
import torch
from IPython.display import Markdown, display

def printmd(string):
    display(Markdown(string))

# Load T5 components explicitly
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

printmd("# 3. üß± Understanding Encoder-Decoder Inputs and Outputs üîç")
printmd("We look at how the input is prepared for the Encoder and what the Decoder uses for generation.")
printmd("---")

input_text = "summarize: The Amazon rainforest is the largest tropical rainforest in the world."
target_text_prefix = "The Amazon rainforest is very large."

# 1. Prepare Encoder Input (Source)
encoder_input = tokenizer(input_text, return_tensors="pt")
encoder_input_ids = encoder_input.input_ids
encoder_attention_mask = encoder_input.attention_mask

printmd("### Encoder Input (Source Text)")
print(f"Text: '{input_text}'")
print(f"Input IDs shape: {encoder_input_ids.shape}")
print(f"Attention Mask shape: {encoder_attention_mask.shape}")
print("Encoder reads the entire input to create a rich context vector.")

# 2. Prepare Decoder Input (Target/Prefix)
decoder_input = tokenizer(target_text_prefix, return_tensors="pt")
decoder_input_ids = model._shift_right(decoder_input.input_ids)

printmd("\n### Decoder Input (Target/Output Prefix)")
print(f"Prefix Text (Training Example): '{target_text_prefix}'")
print(f"Shifted Decoder Input IDs shape: {decoder_input_ids.shape}")
print("Decoder input is shifted right for causal generation, similar to GPT.")

printmd("## Key Insight")
printmd("The **Encoder receives the full, non-causal input**. The **Decoder receives a context vector** from the Encoder *plus* the generated tokens so far (or a target prefix), and generates the next token autoregressively.")

# 3. üß± Understanding Encoder-Decoder Inputs and Outputs üîç

We look at how the input is prepared for the Encoder and what the Decoder uses for generation.

---

### Encoder Input (Source Text)

Text: 'summarize: The Amazon rainforest is the largest tropical rainforest in the world.'
Input IDs shape: torch.Size([1, 15])
Attention Mask shape: torch.Size([1, 15])
Encoder reads the entire input to create a rich context vector.



### Decoder Input (Target/Output Prefix)

Prefix Text (Training Example): 'The Amazon rainforest is very large.'
Shifted Decoder Input IDs shape: torch.Size([1, 8])
Decoder input is shifted right for causal generation, similar to GPT.


## Key Insight

The **Encoder receives the full, non-causal input**. The **Decoder receives a context vector** from the Encoder *plus* the generated tokens so far (or a target prefix), and generates the next token autoregressively.

In [4]:
# -----------------------------------------------------------------------------
# NOTEBOOK 4: Cross-Attention: The Bridge
# -----------------------------------------------------------------------------

from transformers import AutoTokenizer, T5ForConditionalGeneration
import torch
from IPython.display import Markdown, display

def printmd(string):
    display(Markdown(string))

# Load T5 components explicitly
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

printmd("# 4. ‚û°Ô∏è Cross-Attention: The Bridge Between Encoder and Decoder üåâ")
printmd("Cross-attention is the mechanism that allows the Decoder to condition its output generation on the **entire input sequence** processed by the Encoder.")
printmd("---")

input_text = "translate English to French: Today is a sunny day."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# 1. Get Encoder Output
with torch.no_grad():
    encoder_outputs = model.encoder(input_ids=input_ids)

encoder_hidden_states = encoder_outputs.last_hidden_state
printmd("### Encoder Output (Context Vector)")
print(f"Shape of Encoder's final hidden state: {encoder_hidden_states.shape}")
print("This vector contains the 'meaning' of the input, ready for the Decoder.")

printmd("\n### Cross-Attention Function (Conceptual)")
printmd("In the Decoder block, the **Query (Q)** comes from the **Decoder's self-attention output** (what it has generated so far).")
printmd("The **Key (K) and Value (V)** come from the **Encoder's final hidden state**.")
printmd("> **Cross-Attention = Attention(Q_Decoder, K_Encoder, V_Encoder)**")

printmd("\nThis operation allows the Decoder to decide which parts of the input text (K/V from Encoder) are most relevant when generating the next word (Query from Decoder).")

printmd("## Key Insight")
printmd("Unlike GPT (Decoder-only), Encoder-Decoder models use **Cross-Attention** to directly align generated output tokens with the input source tokens. This is crucial for tasks like translation and summarization.")

# 4. ‚û°Ô∏è Cross-Attention: The Bridge Between Encoder and Decoder üåâ

Cross-attention is the mechanism that allows the Decoder to condition its output generation on the **entire input sequence** processed by the Encoder.

---

### Encoder Output (Context Vector)

Shape of Encoder's final hidden state: torch.Size([1, 13, 512])
This vector contains the 'meaning' of the input, ready for the Decoder.



### Cross-Attention Function (Conceptual)

In the Decoder block, the **Query (Q)** comes from the **Decoder's self-attention output** (what it has generated so far).

The **Key (K) and Value (V)** come from the **Encoder's final hidden state**.

> **Cross-Attention = Attention(Q_Decoder, K_Encoder, V_Encoder)**


This operation allows the Decoder to decide which parts of the input text (K/V from Encoder) are most relevant when generating the next word (Query from Decoder).

## Key Insight

Unlike GPT (Decoder-only), Encoder-Decoder models use **Cross-Attention** to directly align generated output tokens with the input source tokens. This is crucial for tasks like translation and summarization.

In [None]:
# -----------------------------------------------------------------------------
# NOTEBOOK 5: The Encoder's Role (Non-Causal Attention)
# -----------------------------------------------------------------------------

import torch
from IPython.display import Markdown, display

def printmd(string):
    display(Markdown(string))

printmd("# 5. üé≠ The Encoder's Role: Bidirectional Self-Attention (Non-Causal) üîÑ")
printmd("The Encoder processes the entire input sequence simultaneously, allowing a token to attend to all other tokens, both preceding and succeeding it.")
printmd("---")

sequence_length = 5
# Conceptual mask where all tokens can see each other (non-causal)
attention_mask = torch.ones(sequence_length, sequence_length)

printmd("### Conceptual Encoder Self-Attention Mask (Non-Causal):")
printmd("`1 = can attend, 0 = cannot attend (masked)`")
print(attention_mask)

print("\nExplanation:")
print("- Row 2 (token 2): can see tokens 0, 1, 2, 3, 4.")
print("- Every token can see every other token in the input sequence (bidirectional context).")

printmd("\n### Decoder Self-Attention Mask (Recap - Causal):")
decoder_mask = torch.tril(torch.ones(sequence_length, sequence_length))
print(decoder_mask)
print("The Decoder's self-attention *still* uses a causal mask, like GPT, to ensure it only uses previously generated tokens.")


printmd("## Key Insight")
printmd("The **Encoder** uses **bidirectional (non-causal) attention** to build a comprehensive understanding of the input. This is its key difference from the GPT-style decoder.")

In [5]:
# -----------------------------------------------------------------------------
# NOTEBOOK 6: Understanding Inputs for Generation vs. Training
# -----------------------------------------------------------------------------

from transformers import AutoTokenizer, T5ForConditionalGeneration
import torch
from IPython.display import Markdown, display

def printmd(string):
    display(Markdown(string))

model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

printmd("# 6. üìè Decoder Input: Generation vs. Training Mode üîÑ")
printmd("The way the Decoder receives its initial sequence differs significantly between training (teacher forcing) and inference (autoregression).")
printmd("---")

input_text = "translate English to French: The clock struck midnight."

printmd("### Mode 1: Training (Teacher Forcing)")
printmd("During training, the **target labels (correct translation)** are shifted right to serve as the Decoder's input sequence.")

target_text = "L'horloge sonna minuit."
decoder_input_ids_training = tokenizer(target_text, return_tensors="pt").input_ids
decoder_input_ids_training = model._shift_right(decoder_input_ids_training)

print(f"Target (Labels): {tokenizer.decode(tokenizer(target_text, return_tensors='pt').input_ids[0])}")
print(f"Decoder Input (Training): {tokenizer.decode(decoder_input_ids_training[0])}")
printmd("This allows the Decoder to learn efficiently by seeing the correct answer at every step.")

printmd("\n### Mode 2: Generation (Inference)")
printmd("During generation, the Decoder starts with a single **decoder_start_token** and generates tokens one by one.")

input_ids = tokenizer(input_text, return_tensors="pt").input_ids
output = model.generate(input_ids, max_length=15)
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print(f"Input: {input_text}")
print(f"Generated Output: {generated_text}")
printmd("This is a true autoregressive process, relying on the Decoder's own output from the previous step.")

printmd("## Key Insight")
printmd("The Decoder is autoregressive in **both modes**: in training it uses the *correct* previous token, and in inference, it uses the *generated* previous token.")

# 6. üìè Decoder Input: Generation vs. Training Mode üîÑ

The way the Decoder receives its initial sequence differs significantly between training (teacher forcing) and inference (autoregression).

---

### Mode 1: Training (Teacher Forcing)

During training, the **target labels (correct translation)** are shifted right to serve as the Decoder's input sequence.

Target (Labels): L'horloge sonna minuit.</s>
Decoder Input (Training): <pad> L'horloge sonna minuit.


This allows the Decoder to learn efficiently by seeing the correct answer at every step.


### Mode 2: Generation (Inference)

During generation, the Decoder starts with a single **decoder_start_token** and generates tokens one by one.

Input: translate English to French: The clock struck midnight.
Generated Output: L'horloge a frapp√© minuit.


This is a true autoregressive process, relying on the Decoder's own output from the previous step.

## Key Insight

The Decoder is autoregressive in **both modes**: in training it uses the *correct* previous token, and in inference, it uses the *generated* previous token.

In [None]:
# -----------------------------------------------------------------------------
# NOTEBOOK 7: Conditional Generation (Summarization Parameters)
# -----------------------------------------------------------------------------

from transformers import pipeline
from IPython.display import Markdown, display

def printmd(string):
    display(Markdown(string))

# Initialize the Summarization pipeline using a smaller BART model
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-6-6",
    tokenizer="sshleifer/distilbart-cnn-6-6"
)

printmd("# 7. üß© Conditional Generation Parameters (Summarization) ‚öôÔ∏è")
printmd("Encoder-Decoder models rely heavily on generation parameters like Beam Search and length constraints to control the output sequence.")
printmd("---")

long_article = """
The Hubble Space Telescope (HST) is a space telescope that was launched into low Earth orbit in 1990 and remains in operation. 
It was not the first space telescope, but it is one of the largest and most versatile. Hubble is famous for its crucial 
role as a research tool and as a public relations boost for astronomy. It is named after astronomer Edwin Hubble. 
Hubble has provided some of the most detailed visible light images ever taken, allowing a deep view into space 
and time. Hubble's successor, the James Webb Space Telescope (JWST), was launched in December 2021.
"""
print(f"Original Text (approx. 85 words):\n> {long_article.strip()}")

printmd("\n### Example 1: Short Summary (Controlled Length)")
summary_1 = summarizer(
    long_article,
    max_length=20,
    min_length=10,
    do_sample=False,
    num_beams=4
)[0]['summary_text']
print(f"Summary (Max 20 tokens): {summary_1}")

printmd("\n### Example 2: Longer Summary (Controlled Length)")
summary_2 = summarizer(
    long_article,
    max_length=50,
    min_length=30,
    do_sample=False,
    num_beams=4
)[0]['summary_text']
print(f"Summary (Max 50 tokens): {summary_2}")

printmd("## Key Insight")
printmd("In Encoder-Decoder tasks, parameters like **Beam Search** and **length constraints** (`max_length`, `min_length`) are essential because the output sequence length is decoupled from the input sequence length.")

In [None]:
# -----------------------------------------------------------------------------
# NOTEBOOK 8: Visualizing Sequence Length Change
# -----------------------------------------------------------------------------

from transformers import pipeline
from IPython.display import Markdown, display

def printmd(string):
    display(Markdown(string))

# Initialize the Translation pipeline using a T5 model
translator = pipeline(
    "translation_en_to_de",
    model="t5-small",
    tokenizer="t5-small"
)

printmd("# 8. üìä Visualizing Sequence Length Change (Encoder vs. Decoder) üìê")
printmd("Encoder-Decoder models allow the output sequence length to be different from the input sequence length (Seq2Seq).")
printmd("---")

english_phrase = "The rapid development of large language models has transformed the landscape of artificial intelligence research."

# 1. Tokenize Input (Encoder Side)
input_token_ids = translator.tokenizer.encode(english_phrase, return_tensors="pt")[0]
input_length = input_token_ids.size(0)

print(f"English Input: {english_phrase}")
print(f"Encoder Input Token Length: {input_length}")

# 2. Generate Output (Decoder Side)
output = translator(english_phrase, max_length=100)
german_translation = output[0]['translation_text']
output_token_ids = translator.tokenizer.encode(german_translation, return_tensors="pt")[0]
output_length = output_token_ids.size(0)

print(f"\nGerman Output: {german_translation}")
print(f"Decoder Output Token Length: {output_length}")

# Conceptual Length Comparison
printmd("\n### Conceptual Length Comparison")
comparison = "Output length can vary"
if output_length > input_length:
    comparison = "Output > Input (Expansion)"
elif output_length < input_length:
    comparison = "Output < Input (Compression)"
else:
    comparison = "Output ‚âà Input"

print(f"Length Difference: {comparison}")
print(f"Input Tokens:  {'‚ñà' * input_length}")
print(f"Output Tokens: {'‚ñà' * output_length}")

printmd("## Key Insight")
printmd("Encoder-Decoder models excel at **Sequence Transformation** because the Decoder's length is determined only by when it predicts the **End-of-Sentence (EOS)** token.")

In [None]:
# -----------------------------------------------------------------------------
# NOTEBOOK 9: Encoder-Decoder Interaction: encoder_hidden_states
# -----------------------------------------------------------------------------

from transformers import AutoTokenizer, T5ForConditionalGeneration
import torch
from IPython.display import Markdown, display

def printmd(string):
    display(Markdown(string))

# Load T5 components explicitly
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

printmd("# 9. üîó Encoder-Decoder Interaction: Accessing Hidden States üß†")
printmd("The Encoder's final hidden state is the consolidated **context vector** passed to the Decoder via cross-attention.")
printmd("---")

input_text = "translate English to French: This is a beautiful day."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# 1. Forward Pass through the Encoder
with torch.no_grad():
    encoder_outputs = model.encoder(input_ids=input_ids)

# The final layer's hidden state is the context vector
context_vector = encoder_outputs.last_hidden_state

printmd("### Encoder Output (Context Vector)")
print(f"Input Text: '{input_text}'")
print(f"Input Token Length: {input_ids.shape[1]}")

print(f"\nContext Vector Shape: {context_vector.shape}")
print(f"  - Sequence Length (from input): {context_vector.shape[1]}")
print(f"  - Hidden Dimension: {context_vector.shape[2]}")
print(f"\nFirst 5 Dimensions of the First Token's Vector:")
print(context_vector[0, 0, :5].numpy())

printmd("\n### Conceptual Decoder Usage")
printmd("This `context_vector` is provided to **every layer** of the Decoder. The Decoder uses this vector as the **Key (K)** and **Value (V)** in its cross-attention to condition the output on the entire input.")

printmd("## Key Insight")
printmd("The `context_vector` is the **single piece of information** transferred from the Encoder to the Decoder, containing the full, non-causal understanding of the source text.")

In [None]:
# -----------------------------------------------------------------------------
# NOTEBOOK 10: Conceptual Comparison: T5 vs. GPT
# -----------------------------------------------------------------------------

from IPython.display import Markdown, display

def printmd(string):
    display(Markdown(string))

printmd("# 10. üí° Conceptual Comparison: Encoder-Decoder (T5) vs. Decoder-Only (GPT) üÜö")
printmd("Summary of architectural differences and use cases.")
printmd("---")

printmd("## üß± Architectural Differences")

printmd("| Feature | Encoder-Decoder (T5) | Decoder-Only (GPT) |")
printmd("|---|---|---|")
printmd("| **Architecture** | Two separate stacks (Encoder & Decoder) | Single stack (Decoder only) |")
printmd("| **Attention (Input)** | Bidirectional (Non-Causal) | Causal (Autoregressive) |")
printmd("| **Attention (Output)** | Causal + Cross-Attention | Causal (Self-Attention only) |")
printmd("| **Primary Tasks** | Seq2Seq: Translation, Summarization, Rewriting | Continuation: Generation, Prompt-based QA |")
printmd("| **Input/Output Length**| Output length can be different from input | Output is a continuation of the input |")


printmd("\n## üéØ Task Suitability")

printmd("### Encoder-Decoder (e.g., T5, BART):")
printmd("**Best for:** Tasks where the output is a **transformation or condensation** of the input. They efficiently process the entire input first before starting generation.")
printmd("E.g.: Rewriting, Abstractive Summarization, Language Translation.")

printmd("### Decoder-Only (e.g., GPT):")
printmd("**Best for:** Tasks where the output is a **continuation** of the input. They excel at creative writing, chat, and prompt-based instruction following.")
printmd("E.g.: Chatbots, Creative Writing, Code Generation.")

printmd("## Key Insight")
printmd("The presence of the **Encoder** and the **Cross-Attention** mechanism is what distinguishes the Encoder-Decoder architecture, making it the preferred choice for sequence transformation tasks.")