#  English Tokenizer & Vector Embeddings
### GPT-2 BPE · Word2Vec CBOW Training · Attention Mechanism · Embedding Arithmetic
---
**Topics Covered:**
1. Simple Regex Tokenizer V1 & V2 (from scratch)
2. GPT-2 BPE Tokenizer via tiktoken
3. Sliding Window Dataset & DataLoader
4. Token + Positional Embeddings
5. Word2Vec CBOW Training
6. Training Loss Curve
7. Cosine Similarity — many word pairs
8. Embedding Arithmetic (king − man + woman)
9. PCA 2D Visualization
10. t-SNE Visualization
11. Cosine Similarity Heatmap
12. Self-Attention from Scratch
13. Masked (Causal) Attention — GPT style
14. Random vs Trained Comparison
15. Full Summary

---
##  Cell 1 — Install & Version Check

In [None]:
# Uncomment to install
# !pip install torch tiktoken requests matplotlib scikit-learn

import sys
from importlib.metadata import version

# Report the interpreter and package versions this notebook runs against.
print("Python   :", sys.version)
for label, package in (("torch    :", "torch"), ("tiktoken :", "tiktoken")):
    # Look each package up lazily so earlier lines still print if a later one is missing.
    print(label, version(package))

---
##  Cell 2 — Download & Load English Dataset

In [None]:
import os, requests

# Fetch the sample corpus once; later runs reuse the local copy.
corpus_path = "the-verdict.txt"

if not os.path.exists(corpus_path):
    url = "".join([
        "https://raw.githubusercontent.com/rasbt/",
        "LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt",
    ])
    response = requests.get(url, timeout=30)
    # Raise on HTTP errors rather than saving an error page as the corpus.
    response.raise_for_status()
    with open(corpus_path, "wb") as out_file:
        out_file.write(response.content)
    print("Downloaded the-verdict.txt")

with open(corpus_path, "r", encoding="utf-8") as in_file:
    raw_text = in_file.read()

char_count = len(raw_text)
word_count = len(raw_text.split())
print(f"Total characters : {char_count:,}")
print(f"Total words      : {word_count:,}")
print(f"\nFirst 200 chars:\n{raw_text[:200]}")

---
##  Cell 3 — Regex Tokenizer (From Scratch)

In [None]:
import re

# Split on punctuation, double dashes and whitespace. The capture group keeps
# the delimiters in the output so punctuation becomes its own token.
raw_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [piece for piece in (p.strip() for p in raw_pieces) if piece]
print(f"Total tokens (with dupes): {len(preprocessed):,}")
print(f"Sample: {preprocessed[:15]}")

# Sorted unique tokens, followed by the two special tokens at the end.
all_words = sorted(set(preprocessed))
all_words.extend(["<|endoftext|>", "<|unk|>"])
vocab = dict(zip(all_words, range(len(all_words))))

print(f"\nVocabulary size: {len(vocab):,}")
print("\nSample vocab entries:")
for token, token_id in list(vocab.items())[:10]:
    print(f"  '{token}' -> {token_id}")

---
##  Cell 4 — SimpleTokenizerV1 & V2

In [None]:
class SimpleTokenizerV1:
    """Regex tokenizer backed by a fixed vocabulary.

    ``encode`` looks every token up directly in the vocabulary, so a token
    that is not in it raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Store the token->id mapping and build the reverse id->token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return their integer ids."""
        stripped = (piece.strip() for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text))
        # Empty pieces (whitespace delimiters, split artefacts) are dropped.
        return [self.str_to_int[piece] for piece in stripped if piece]

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, then close up punctuation."""
        joined = " ".join(self.int_to_str[idx] for idx in ids)
        # Remove the space that the join put in front of punctuation marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

class SimpleTokenizerV2:
    """Regex tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    The vocabulary must contain ``<|unk|>``. Without it, encoding an unknown
    token raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Store the token->id mapping and build the reverse id->token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return ids, using ``<|unk|>`` for unknowns."""
        known = self.str_to_int
        stripped = (piece.strip() for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text))
        return [
            known[piece if piece in known else "<|unk|>"]
            for piece in stripped
            if piece
        ]

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, then close up punctuation."""
        joined = " ".join(self.int_to_str[idx] for idx in ids)
        # Remove the space that the join put in front of punctuation marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

tok_v1, tok_v2 = SimpleTokenizerV1(vocab), SimpleTokenizerV2(vocab)


def _round_trip(title, tokenizer, sentence):
    """Encode ``sentence``, print the ids and the decoded text, and return the ids."""
    token_ids = tokenizer.encode(sentence)
    print(title)
    print(f"Input  : {sentence}")
    print(f"IDs    : {token_ids}")
    print(f"Decoded: {tokenizer.decode(token_ids)}")
    return token_ids


# V1 does a plain vocabulary lookup for every token.
s1 = "I HAD always thought Jack Gisburn rather a cheap genius."
ids = _round_trip("=== V1 ===", tok_v1, s1)

# V2 substitutes <|unk|> for any token missing from the vocabulary.
s2 = "Hello there! zxqfoo is unknown."
ids2 = _round_trip("\n=== V2 (unknown token) ===", tok_v2, s2)

---
##  Cell 5 — GPT-2 BPE Tokenizer via tiktoken

In [None]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
print(f"GPT-2 Vocab Size: {tokenizer.n_vocab:,}")
corpus_token_count = len(tokenizer.encode(raw_text))
print(f"Total tokens in dataset: {corpus_token_count:,}")

# Round-trip a sample sentence through the BPE tokenizer.
sample = "Hello, I am learning about tokenizers and word embeddings!"
enc = tokenizer.encode(sample)
print(f"\nInput   : {sample}")
print(f"IDs     : {enc}")
print(f"Decoded : {tokenizer.decode(enc)}")

print("\n--- Token-by-Token Breakdown ---")
for tid in enc:
    piece = tokenizer.decode([tid])
    print(f"  ID {tid:6d}  ->  '{piece}'")

# The special token is passed via allowed_special whenever it appears in the text to encode.
special_tokens = {"<|endoftext|>"}
eot = tokenizer.encode("<|endoftext|>", allowed_special=special_tokens)
print(f"\n<|endoftext|> ID: {eot}")

combined = "Hello! <|endoftext|> New document starts."
combined_ids = tokenizer.encode(combined, allowed_special=special_tokens)
print(f"Combined IDs: {combined_ids}")

---
##  Cell 6 — GPT Dataset & DataLoader

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    Each target window is its input window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and cut it into windows of ``max_length`` tokens.

        Args:
            txt: Text to tokenize.
            tokenizer: Object exposing ``encode(text)`` returning a list of ids.
            max_length: Number of tokens in each window.
            stride: Step between the starts of consecutive windows.
        """
        encoded = tokenizer.encode(txt)
        print(f"Total tokens: {len(encoded):,}")
        # Stop early enough that every input window has a full shifted target.
        starts = range(0, len(encoded) - max_length, stride)
        self.input_ids = [torch.tensor(encoded[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(encoded[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True,
                         drop_last=True, num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches over ``txt``.

    Args:
        txt: Text to tokenize with the GPT-2 BPE encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens in each window.
        stride: Step between the starts of consecutive windows.
        shuffle: Whether the DataLoader shuffles the windows.
        drop_last: Whether to drop a final batch smaller than ``batch_size``.
            Defaults to True, which was previously hard-coded.
        num_workers: Number of DataLoader worker processes. Defaults to 0,
            the DataLoader default.

    Returns:
        A ``torch.utils.data.DataLoader`` wrapping a ``GPTDatasetV1``.
    """
    tok = tiktoken.get_encoding("gpt2")
    ds  = GPTDatasetV1(txt, tok, max_length, stride)
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle,
                      drop_last=drop_last, num_workers=num_workers)

max_length = 4
# stride == max_length makes the windows non-overlapping. Shuffling is off so
# the first batch comes from the start of the text.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)
batch_iter = iter(dataloader)
inputs, targets = next(batch_iter)

tok = tiktoken.get_encoding("gpt2")
print("\nInput shape:", inputs.shape)
print("Input IDs:\n", inputs)
print("\nFirst sample:")
first_input_ids, first_target_ids = inputs[0].tolist(), targets[0].tolist()
print("  Input :", tok.decode(first_input_ids))
print("  Target:", tok.decode(first_target_ids))

---
##  Cell 7 — Token + Positional Embeddings

In [None]:
vocab_size, output_dim = 50257, 256

# One learnable vector per token id and one per position in the window.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer   = torch.nn.Embedding(max_length, output_dim)

positions        = torch.arange(max_length)
token_embeddings = token_embedding_layer(inputs)
pos_embeddings   = pos_embedding_layer(positions)
# The positional vectors are broadcast across the batch dimension of the token vectors.
input_embeddings = token_embeddings + pos_embeddings

print("=== Embedding Shapes ===")
print(f"inputs           : {inputs.shape}")
print(f"token_embeddings : {token_embeddings.shape}")
print(f"pos_embeddings   : {pos_embeddings.shape}")
print(f"input_embeddings : {input_embeddings.shape}")
first_vector_head = input_embeddings[0, 0, :8].detach().numpy()
print(f"\nFirst token vector (8 dims): {first_vector_head}")

print("\n--- Dimension vs Parameter Count ---")
print(f"{'Dim':<8} {'Output Shape':<22} {'Params':>12}")
print("-"*44)
for d in (64, 128, 256, 512, 768):
    shape_label = '[8,4,' + str(d) + ']'
    param_count = vocab_size * d
    print(f"{d:<8} {shape_label:<22} {param_count:>12,}")
print("\nGPT-2 Small=768 | GPT-2 Large=1280 | GPT-3=12288")

---
##  Cell 8 — Word2Vec CBOW Training
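> CBOW (Continuous Bag of Words) predicts the center word from its surrounding context words.
> Training on this objective teaches the embeddings to capture semantic meaning: words that occur in similar contexts end up with similar vectors.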

In [None]:
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import random

# Build the training vocabulary: lower-case, letters only, no one-character leftovers.
words = [
    cleaned
    for cleaned in (re.sub(r'[^a-z]', '', raw) for raw in raw_text.lower().split())
    if len(cleaned) > 1
]

freq        = Counter(words)
vocab_words = [word for word, _count in freq.most_common(500)]
w2i         = dict(zip(vocab_words, range(len(vocab_words))))
i2w         = dict(enumerate(vocab_words))
V           = len(vocab_words)
print(f"Training vocab size: {V}")

# Build CBOW pairs (context ids -> center id) over the in-vocabulary word stream.
WINDOW   = 2
filtered = [w for w in words if w in w2i]
_filtered_ids = [w2i[w] for w in filtered]
# Every word in `filtered` is in the vocabulary, so the windows need no extra check.
cbow_data = [
    (_filtered_ids[c - WINDOW:c] + _filtered_ids[c + 1:c + WINDOW + 1], _filtered_ids[c])
    for c in range(WINDOW, len(_filtered_ids) - WINDOW)
]

random.shuffle(cbow_data)
print(f"Training samples: {len(cbow_data):,}")

# CBOW Model
class CBOWModel(nn.Module):
    """CBOW network: average the context embeddings, then project to vocabulary logits."""

    def __init__(self, vocab_size, embed_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table and number of output logits.
            embed_dim: Width of each embedding vector.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.linear    = nn.Linear(embed_dim, vocab_size)

    def forward(self, ctx):
        """Return vocabulary logits for a batch of context-id rows.

        ``ctx`` is indexed as (batch, context_words). The embeddings are
        averaged over dimension 1 before the linear layer.
        """
        context_vectors = self.embedding(ctx)
        pooled = torch.mean(context_vectors, dim=1)
        return self.linear(pooled)

EMBED_DIM, EPOCHS, BATCH = 64, 8, 256

model     = CBOWModel(V, EMBED_DIM)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

print("\nTraining Word2Vec CBOW...")
loss_history = []

for epoch in range(EPOCHS):
    # Reshuffle every epoch so the mini-batches differ between passes.
    random.shuffle(cbow_data)
    batch_losses = []
    for start in range(0, len(cbow_data), BATCH):
        contexts, centers = zip(*cbow_data[start:start+BATCH])
        ctx_t = torch.tensor(contexts)
        ctr_t = torch.tensor(centers)
        optimizer.zero_grad()
        loss = criterion(model(ctx_t), ctr_t)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # Mean of the per-batch losses for this epoch.
    avg = sum(batch_losses) / len(batch_losses)
    loss_history.append(avg)
    print(f"  Epoch {epoch+1}/{EPOCHS}  Loss: {avg:.4f}")

# Detached view of the learned embedding table, used by the analysis cells.
trained_embeddings = model.embedding.weight.detach()
print("\nTraining complete!")

---
##  Cell 9 — Training Loss Curve

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot the mean loss per epoch, with epochs numbered from 1.
epoch_axis = range(1, EPOCHS+1)
curve_color = '#3498db'

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(epoch_axis, loss_history, 'o-', color=curve_color, linewidth=2.5, markersize=8)
ax.fill_between(epoch_axis, loss_history, alpha=0.15, color=curve_color)
ax.set_xlabel("Epoch", fontsize=12)
ax.set_ylabel("Cross-Entropy Loss", fontsize=12)
ax.set_title("Word2Vec CBOW Training Loss", fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("training_loss.png", dpi=150)
plt.show()
print("Saved: training_loss.png")

---
##  Cell 10 — Cosine Similarity (Many Examples)

In [None]:
import torch.nn.functional as F

def get_vec(word):
    """Return the trained embedding row for ``word``, or None if it is not in the vocabulary."""
    try:
        row = w2i[word]
    except KeyError:
        return None
    return trained_embeddings[row]

def cosine_sim(w1, w2):
    """Return the cosine similarity of two words' embeddings as a float.

    Returns None when either word is missing from the vocabulary.
    """
    first, second = get_vec(w1), get_vec(w2)
    if first is not None and second is not None:
        # cosine_similarity compares along dim 1, so give each vector a batch dimension.
        similarity = F.cosine_similarity(first.unsqueeze(0), second.unsqueeze(0))
        return similarity.item()
    return None

def top_similar(word, topn=5):
    """Return up to ``topn`` vocabulary words most similar to ``word``.

    Args:
        word: Query word.
        topn: Maximum number of neighbours to return. Values <= 0 give ``[]``.

    Returns:
        A list of ``(word, similarity)`` tuples in descending similarity, never
        including ``word`` itself. Empty if ``word`` is not in the vocabulary.
    """
    if word not in w2i:
        return []
    query_idx = w2i[word]
    vec  = trained_embeddings[query_idx].unsqueeze(0)
    sims = F.cosine_similarity(vec, trained_embeddings).detach().numpy()
    # Exclude the query by index instead of dropping whatever ranks first:
    # a tie (or a degenerate vector) could put another word at rank 0.
    ranked = [i for i in np.argsort(sims)[::-1] if i != query_idx]
    return [(i2w[i], sims[i]) for i in ranked[:max(topn, 0)]]

def _print_pair_scores(pairs):
    """Print similarity and a bar for each pair whose words are both in the vocabulary."""
    for w1, w2 in pairs:
        s = cosine_sim(w1, w2)
        # Test against None explicitly: a similarity of exactly 0.0 is a valid
        # score and must not be skipped like a missing word.
        if s is not None:
            print(f"  {w1:<10} <-> {w2:<10}  {s:+.4f}  {'|'*int((s+1)*15)}")


# Many word pair groups
print("=" * 55)
print("GROUP 1 — Synonyms / Related")
print("=" * 55)
_print_pair_scores([("good","great"),("old","new"),("work","time"),("see","look"),("know","think")])

print("\nGROUP 2 — Opposites")
print("-" * 55)
_print_pair_scores([("good","bad"),("come","go"),("old","young"),("right","wrong")])

print("\nGROUP 3 — Morphological (same root)")
print("-" * 55)
_print_pair_scores([("man","men"),("see","saw"),("go","went"),("make","made")])

print("\nGROUP 4 — Unrelated")
print("-" * 55)
_print_pair_scores([("time","eye"),("door","work"),("hand","old")])

# Most similar words
print("\n--- Top 5 Similar Words ---")
for q in ["good", "time", "man", "come"]:
    sim = top_similar(q)
    if sim:
        # Build the neighbour list outside the f-string: escaped quotes are not
        # valid inside an f-string replacement field.
        neighbours = ", ".join(f"{w}({s:.2f})" for w, s in sim)
        print(f"  '{q}' -> {neighbours}")

---
##  Cell 11 — Embedding Arithmetic (king − man + woman)
> Word2Vec analogy: vector(king) − vector(man) + vector(woman) ≈ vector(queen)
> We use words available in our small dataset.

In [None]:
def embedding_arithmetic(pos_words, neg_words, topn=5):
    """Rank vocabulary words by similarity to sum(pos_words) - sum(neg_words).

    Args:
        pos_words: Iterable of words whose vectors are added.
        neg_words: Iterable of words whose vectors are subtracted.
        topn: Maximum number of results. Values <= 0 give an empty result list.

    Returns:
        A tuple ``(used, missing, results)``: ``used`` lists the applied terms
        as ``"+word"`` / ``"-word"``, ``missing`` lists query words not in the
        vocabulary, and ``results`` is a list of ``(word, similarity)`` tuples
        in descending similarity that excludes the query words.
    """
    # Materialize once so any iterable works and the inputs can be reused below.
    pos_words, neg_words = list(pos_words), list(neg_words)

    result = torch.zeros(EMBED_DIM)
    used, missing = [], []
    for sign, group in (("+", pos_words), ("-", neg_words)):
        for w in group:
            v = get_vec(w)
            if v is None:
                missing.append(w)
                continue
            if sign == "+":
                result += v
            else:
                result -= v
            used.append(f"{sign}{w}")

    result = F.normalize(result.unsqueeze(0), dim=1)
    sims   = F.cosine_similarity(result, trained_embeddings).detach().numpy()
    # A set union avoids the list concatenation that failed on mixed sequence types.
    excl   = set(pos_words) | set(neg_words)

    res = []
    # Guard topn <= 0 up front: checking the length only after appending could
    # never match, so the loop used to run through the whole vocabulary.
    if topn > 0:
        for idx in np.argsort(sims)[::-1]:
            w = i2w[idx]
            if w in excl:
                continue
            res.append((w, sims[idx]))
            if len(res) == topn:
                break
    return used, missing, res

# Each query: (words to add, words to subtract, label shown in the output).
queries = [
    (["men", "good"],    ["man"],    "men + good - man   (gender analogy)"),
    (["good", "great"],  ["bad"],    "good + great - bad"),
    (["see", "know"],    ["think"],  "see + know - think"),
    (["work", "time"],   ["come"],   "work + time - come"),
    (["old"],            ["new"],    "old - new"),
]

banner = "=" * 60
print(banner)
print("EMBEDDING ARITHMETIC")
print(banner)
for add_terms, sub_terms, label in queries:
    used, miss, res = embedding_arithmetic(add_terms, sub_terms)
    print(f"\n  Query  : {label}")
    if miss:
        print(f"  Missing: {miss}")
    print(f"  Used   : {' '.join(used)}")
    print(f"  Results:")
    for word, score in res:
        bar = '|' * int(score * 25)
        print(f"    {word:<15} {score:.4f}  {bar}")

print("\nNote: With a larger dataset (millions of words),")
print("king - man + woman = queen would work precisely.")

---
##  Cell 12 — PCA Visualization (Trained)

In [None]:
import matplotlib.patches as mpatches
from sklearn.decomposition import PCA

word_groups = {
    "Verbs"     : ["come","go","see","know","think","make","give","take"],
    "Adjectives": ["good","great","old","new","little","own","right","long"],
    "Nouns"     : ["man","men","time","work","hand","eye","room","door"],
    "Function"  : ["the","of","and","to","in","was","he","it"],
}
colors = ["#e74c3c","#2ecc71","#3498db","#f39c12"]
all_w, all_v, all_c = [], [], []

# Collect the vectors of the words that are in the trained vocabulary,
# tagging each with its group's colour.
for wlist, col in zip(word_groups.values(), colors):
    for w in wlist:
        v = get_vec(w)
        if v is None:
            continue
        all_w.append(w)
        all_v.append(v.numpy())
        all_c.append(col)

pca    = PCA(n_components=2)
vecs2d = pca.fit_transform(np.array(all_v))

fig, ax = plt.subplots(figsize=(13, 8))
ax.set_facecolor("#f8f9fa")
for w, (px, py), col in zip(all_w, vecs2d, all_c):
    ax.scatter(px, py, c=col, s=120, zorder=3, alpha=0.85)
    ax.annotate(w, (px, py), textcoords="offset points", xytext=(6,4),
                fontsize=9, fontweight='bold')

# One legend entry per word group.
patches = [mpatches.Patch(color=c, label=g) for g, c in zip(word_groups, colors)]
ax.legend(handles=patches, fontsize=10, loc='upper right')
ax.set_title("Word Embedding Space — PCA (Trained CBOW)", fontsize=14, fontweight='bold')
pc1_ratio, pc2_ratio = pca.explained_variance_ratio_[0], pca.explained_variance_ratio_[1]
ax.set_xlabel(f"PC1 ({pc1_ratio*100:.1f}%)")
ax.set_ylabel(f"PC2 ({pc2_ratio*100:.1f}%)")
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("pca_trained.png", dpi=150, bbox_inches='tight')
plt.show()
print("Saved: pca_trained.png")

---
##  Cell 13 — t-SNE Visualization

In [None]:
from sklearn.manifold import TSNE

import inspect

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and 1.7
# removed the old name, so use whichever keyword the installed version accepts.
_tsne_iter_kw = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"

# min() keeps the perplexity below the number of plotted words.
tsne      = TSNE(n_components=2, perplexity=min(15, len(all_w)-1), random_state=42,
                 **{_tsne_iter_kw: 1000})
vecs_tsne = tsne.fit_transform(np.array(all_v))

fig, ax = plt.subplots(figsize=(13, 8))
ax.set_facecolor("#f8f9fa")
for w, coord, col in zip(all_w, vecs_tsne, all_c):
    ax.scatter(coord[0], coord[1], c=col, s=130, zorder=3, alpha=0.85)
    ax.annotate(w, coord, textcoords="offset points", xytext=(6,4),
                fontsize=9, fontweight='bold')

# Reuses the legend handles built for the PCA plot.
ax.legend(handles=patches, fontsize=10, loc='upper right')
ax.set_title("Word Embedding Space — t-SNE (Trained CBOW)", fontsize=14, fontweight='bold')
ax.set_xlabel("t-SNE Dim 1")
ax.set_ylabel("t-SNE Dim 2")
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig("tsne_trained.png", dpi=150, bbox_inches='tight')
plt.show()
print("Saved: tsne_trained.png")

---
##  Cell 14 — Cosine Similarity Heatmap

In [None]:
from sklearn.metrics.pairwise import cosine_similarity as cos_mat

# Keep only the candidate words that are in the trained vocabulary.
candidate_words = ["good","great","old","new","man","men","come","go","see","know"]
hmap_words = [w for w in candidate_words if w in w2i]
vecs       = np.array([get_vec(w).numpy() for w in hmap_words])
sim_m      = cos_mat(vecs)

fig, ax = plt.subplots(figsize=(9, 7))
im = ax.imshow(sim_m, cmap="RdYlGn", vmin=-1, vmax=1)
tick_positions = range(len(hmap_words))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(hmap_words, rotation=45, ha='right', fontsize=10)
ax.set_yticklabels(hmap_words, fontsize=10)
# Write each value into its cell, in white where the cell colour is strong.
for (row, col), value in np.ndenumerate(sim_m):
    text_color = 'white' if abs(value) > 0.5 else 'black'
    ax.text(col, row, f"{value:.2f}", ha='center', va='center',
            fontsize=8, fontweight='bold', color=text_color)
plt.colorbar(im, ax=ax, label="Cosine Similarity")
ax.set_title("Cosine Similarity Heatmap", fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig("cosine_heatmap.png", dpi=150, bbox_inches='tight')
plt.show()
print("Saved: cosine_heatmap.png")

---
##  Cell 15 — Self-Attention Mechanism (From Scratch)
> Attention lets each token look at every other token and decide which ones are relevant.
> Formula: $\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\dfrac{QK^{\top}}{\sqrt{d_k}}\right) V$

In [None]:
sentence   = "The cat sat on the mat"
words_list = sentence.lower().split()
embed_dim  = 8
seq_len    = len(words_list)

# Random stand-in embeddings, seeded so the printed weights are repeatable.
torch.manual_seed(42)
x = torch.randn(seq_len, embed_dim)   # (6, 8)

# Simplified self-attention: queries, keys and values are all the raw input.
# NOTE: this rebinds the module-level name `V`, which the CBOW cell set to the vocabulary size.
Q = K = V = x

# Step 1: scaled dot-product scores between every pair of tokens.
scores = (Q @ K.T) / (embed_dim ** 0.5)               # (6, 6)

# Step 2: normalise each row into attention weights.
attn_weights = torch.softmax(scores, dim=-1)          # (6, 6)

# Step 3: each context vector is the weighted sum of the value rows.
context = attn_weights @ V                            # (6, 8)

print(f"Words       : {words_list}")
print(f"Input  shape: {x.shape}            (seq=6, dim=8)")
print(f"Scores shape: {scores.shape}          (seq x seq)")
print(f"Weights shape:{attn_weights.shape}   (each row sums to 1)")
print(f"Context shape:{context.shape}        (same as input)")
row_sums = attn_weights.sum(dim=-1).detach().numpy().round(3)
print(f"\nRow sums (should all be 1.0): {row_sums}")
print("\nAttention Weights (rounded):")
for w, row in zip(words_list, attn_weights.detach().numpy()):
    row_str = "  ".join(f"{v:.2f}" for v in row)
    print(f"  {w:<6}  [{row_str}]")

---
##  Cell 16 — Attention Visualization

In [None]:
fig, (ax, ax2) = plt.subplots(1, 2, figsize=(14, 5))
axes = (ax, ax2)
token_ticks = range(seq_len)

# Plot 1: heatmap of the full weight matrix (rows are queries, columns are keys).
im = ax.imshow(attn_weights.detach().numpy(), cmap='Blues', vmin=0, vmax=1)
ax.set_xticks(token_ticks)
ax.set_yticks(token_ticks)
ax.set_xticklabels(words_list, rotation=45, ha='right', fontsize=10)
ax.set_yticklabels(words_list, fontsize=10)
for i, weight_row in enumerate(attn_weights.tolist()):
    for j, v in enumerate(weight_row):
        # White text on the darker (higher-weight) cells.
        ax.text(j, i, f"{v:.2f}", ha='center', va='center', fontsize=8,
                color='white' if v > 0.4 else 'black')
plt.colorbar(im, ax=ax)
ax.set_title("Self-Attention Weights", fontsize=11, fontweight='bold')
ax.set_xlabel("Keys")
ax.set_ylabel("Queries")

# Plot 2: one query token's weights as bars, with its own bar highlighted.
idx_w = 1   # 'cat'
vals  = attn_weights[idx_w].detach().numpy()
bar_colors = ['#e74c3c' if i == idx_w else '#3498db' for i in range(seq_len)]
bars  = ax2.bar(words_list, vals, color=bar_colors, alpha=0.85)
for b in bars:
    height = b.get_height()
    ax2.annotate(f"{height:.3f}",
                 xy=(b.get_x() + b.get_width() / 2, height),
                 xytext=(0,3), textcoords='offset points', ha='center', fontsize=9)
ax2.set_title(f"Attention from '{words_list[idx_w]}' to all tokens",
              fontsize=11, fontweight='bold')
ax2.set_ylabel("Weight")
ax2.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig("attention_weights.png", dpi=150, bbox_inches='tight')
plt.show()
print("Saved: attention_weights.png")

---
##  Cell 17 — Masked (Causal) Attention — GPT Style
> GPT uses causal masking so token[i] can only attend to tokens[0..i], never future ones.

In [None]:
# Boolean causal mask: True above the diagonal marks the future positions to block.
causal_mask   = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
# Build the additive mask with masked_fill. Multiplying a 0/1 matrix by -inf
# gives NaN (0 * -inf) on and below the diagonal, which makes every softmax row NaN.
mask          = torch.zeros(seq_len, seq_len).masked_fill(causal_mask, float('-inf'))
masked_scores = scores + mask
masked_w      = F.softmax(masked_scores, dim=-1)
masked_ctx    = torch.matmul(masked_w, V)

print("Causal Mask (True = masked/blocked):")
print(causal_mask.numpy())

# Side-by-side heatmaps of the unmasked and the causally masked weights.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, w, title in zip(axes,
    [attn_weights, masked_w],
    ["Full Self-Attention", "Causal Masked Attention (GPT)"]):
    im = ax.imshow(w.detach().numpy(), cmap='Blues', vmin=0, vmax=1)
    ax.set_xticks(range(seq_len)); ax.set_yticks(range(seq_len))
    ax.set_xticklabels(words_list, rotation=45, ha='right', fontsize=9)
    ax.set_yticklabels(words_list, fontsize=9)
    for i in range(seq_len):
        for j in range(seq_len):
            v = w[i,j].item()
            ax.text(j,i,f"{v:.2f}",ha='center',va='center',fontsize=7,
                    color='white' if v>0.4 else 'black')
    plt.colorbar(im, ax=ax)
    ax.set_title(title, fontsize=10, fontweight='bold')

plt.suptitle("Full vs Causal Masked Attention", fontsize=13, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig("masked_attention.png", dpi=150, bbox_inches='tight')
plt.show()
print("Saved: masked_attention.png")
print("\nIn causal attention: upper triangle = 0 (future tokens are invisible)")

---
##  Cell 18 — Random vs Trained Embedding Comparison

In [None]:
# Untrained baseline table with the same shape as the trained embeddings.
# Size it from the vocabulary mapping rather than `V`: the self-attention cell
# rebinds `V` to the value matrix, so `V` is no longer the vocabulary size here.
random_emb = torch.nn.Embedding(len(w2i), EMBED_DIM).weight.detach()

def cmp_sim(emb, w1, w2):
    """Return the cosine similarity of ``w1`` and ``w2`` in the embedding table ``emb``.

    Rows are looked up through the shared ``w2i`` mapping. Returns 0 when
    either word is missing from the vocabulary.
    """
    if not (w1 in w2i and w2 in w2i):
        return 0
    first, second = (emb[w2i[word]].unsqueeze(0) for word in (w1, w2))
    return F.cosine_similarity(first, second).item()

pairs = [("good","great"),("man","men"),("come","go"),("see","know"),("good","door"),("time","eye")]
# Score every pair in the random table first, then in the trained one.
r_sims, t_sims = (
    [cmp_sim(table, w1, w2) for w1, w2 in pairs]
    for table in (random_emb, trained_embeddings)
)
lbls = ["↔".join(pair) for pair in pairs]

# Grouped bars: one random/trained pair of bars per word pair.
x, w = np.arange(len(lbls)), 0.35
fig, ax = plt.subplots(figsize=(12, 5))
b1 = ax.bar(x - w/2, r_sims, w, label='Random',  color='#e74c3c', alpha=0.8)
b2 = ax.bar(x + w/2, t_sims, w, label='Trained', color='#2ecc71', alpha=0.8)
ax.set_xticks(x)
ax.set_xticklabels(lbls, rotation=15, ha='right', fontsize=9)
ax.set_ylabel("Cosine Similarity")
ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
ax.set_title("Random vs Trained Embeddings: Cosine Similarity", fontsize=13, fontweight='bold')
ax.legend(fontsize=10)
ax.grid(axis='y', alpha=0.3)
# Label every bar with its value.
for b in (*b1, *b2):
    height = b.get_height()
    ax.annotate(f"{height:.2f}",
                xy=(b.get_x() + b.get_width() / 2, height),
                xytext=(0,3), textcoords='offset points', ha='center', fontsize=7)
plt.tight_layout()
plt.savefig("random_vs_trained.png", dpi=150, bbox_inches='tight')
plt.show()
print("Saved: random_vs_trained.png")

---
##  Cell 19 — Full Pipeline Summary

In [None]:
heavy_rule = "=" * 65
light_rule = "-"*65

print(heavy_rule)
print("       ENGLISH TOKENIZER & VECTOR EMBEDDINGS SUMMARY")
print(heavy_rule)

# End-to-end example: text -> token ids -> token + positional embeddings.
s   = "Hello, I am building an LLM from scratch."
ids = tiktoken.get_encoding("gpt2").encode(s)
n_tokens = len(ids)
emb = torch.nn.Embedding(50257, 256)
pos = torch.nn.Embedding(n_tokens, 256)
out = emb(torch.tensor(ids)) + pos(torch.arange(n_tokens))

print(f"\nInput        : {s}")
print(f"Token IDs    : {ids}")
print(f"Num tokens   : {len(ids)}")
print(f"Embed shape  : {out.shape}")

print("\n" + light_rule)
concepts = [
    ("Regex Tokenizer",   "Splits text on whitespace + punctuation"),
    ("BPE Tokenizer",     "GPT-2 via tiktoken — 50,257 vocab"),
    ("Sliding Window",    "DataLoader for next-token prediction"),
    ("Token Embedding",   "nn.Embedding: ID -> dense vector"),
    ("Pos Embedding",     "Absolute position added to token vec"),
    ("Word2Vec CBOW",     "Train embeddings from context words"),
    ("Cosine Similarity", "Angle between vectors = semantic sim"),
    ("Embed Arithmetic",  "king - man + woman = queen"),
    ("Self-Attention",    "softmax(QKT/sqrt(d)) x V"),
    ("Causal Mask",       "GPT: no peeking at future tokens"),
    ("PCA / t-SNE",       "2D projection of embedding space"),
]
for name, desc in concepts:
    print(f"  [OK] {name:<22} {desc}")

# Check that each plot the earlier cells were meant to save is on disk.
print("\n" + light_rule)
print("SAVED PLOTS")
saved = ["training_loss.png","pca_trained.png","tsne_trained.png",
         "cosine_heatmap.png","attention_weights.png",
         "masked_attention.png","random_vs_trained.png"]
status_for = {True: "OK", False: "MISSING"}
for f in saved:
    status = status_for[os.path.exists(f)]
    print(f"  [{status}] {f}")
print(heavy_rule)