In [2]:
# 📘 Transfer Learning & Word Embedding Practice Notebook
# Works 100% on CPU with Anaconda + Jupyter + minimal setup

# ======================
# 📦 0. Setup + Imports
# ======================
!pip install torch torchvision torchaudio --quiet
!pip install transformers gensim --quiet

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from gensim.models import KeyedVectors

import numpy as np
import matplotlib.pyplot as plt

In [3]:
# ==========================
# 📘 1. Learnable Embeddings
# ==========================
print("\n🔹 Section 1: Embedding Layer Basics\n")

# Define vocab and toy tokenizer.
# Id 0 is the padding id. "[UNK]" is appended last so that the ids of the
# existing entries stay exactly as they were.
vocab = {"[PAD]": 0, "hello": 1, "world": 2, "bert": 3, "rocks": 4, "[UNK]": 5}
inv_vocab = {v: k for k, v in vocab.items()}

def tokenize(text):
    """Convert a whitespace-separated sentence into a list of vocabulary ids.

    The text is lower-cased and split on whitespace; no punctuation handling
    is performed.

    Args:
        text: The sentence to tokenize.

    Returns:
        A list with one integer id per token. Tokens missing from ``vocab``
        map to the ``[UNK]`` id instead of being aliased to the ``[PAD]`` id,
        so unknown words stay distinguishable from padding. An empty string
        yields an empty list.
    """
    unk_id = vocab["[UNK]"]
    return [vocab.get(token, unk_id) for token in text.lower().split()]

# TODO: Change this sentence to test different token combos
sentence = "hello world bert"
token_ids = tokenize(sentence)
# Nesting the id list once more gives a 2-D tensor with a leading batch axis of size 1.
tokens_tensor = torch.tensor([token_ids])

# Embedding layer: one 5-dimensional vector per vocabulary entry
embedding = nn.Embedding(len(vocab), 5)
embedded_output = embedding(tokens_tensor)

print("Token IDs:", token_ids)
print("Embeddings:", embedded_output)

# Mini challenge: Try printing the shape of `embedded_output`
# Mini challenge: Try adding another sentence and compare output shape


🔹 Section 1: Embedding Layer Basics

Token IDs: [1, 2, 3]
Embeddings: tensor([[[-0.3462, -0.9656, -0.6613, -1.1851, -0.4164],
         [-0.6087,  0.7011, -0.3392,  0.3363, -1.2427],
         [-0.3030,  1.0460,  1.0224,  0.9657, -0.8577]]],
       grad_fn=<EmbeddingBackward>)


In [4]:
# A 1-D tensor built from three integers; its shape has a single axis.
t = torch.tensor([1, 2, 3])
print(t.size())  # torch.Size([3])

torch.Size([3])


In [5]:
# Insert a new leading axis and show the resulting shape.
t.unsqueeze(dim=0).size()

torch.Size([1, 3])

In [6]:
# Show the tensor before and after adding the leading axis, one per line.
print(t, t.unsqueeze(dim=0), sep="\n")

tensor([1, 2, 3])
tensor([[1, 2, 3]])


In [None]:
# ==========================
# 📘 2. Pretrained GloVe Embeddings (via gensim)
# ==========================
print("\n🔹 Section 2: GloVe Embeddings\n")

# Fetch the pretrained vectors by name through gensim's downloader.
from gensim.downloader import load
word_vectors = load("glove-wiki-gigaword-50")  # 50d for speed

# TODO: Try checking similarity between different word pairs
print("Similarity between king and queen:", word_vectors.similarity("king", "queen"))
print("Most similar to 'neural':", word_vectors.most_similar("neural", topn=3))

# Mini challenge: Try word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
# (the loaded vectors live in `word_vectors`; there is no `glove` variable in this notebook)


In [None]:
# ==========================
# 📘 3. BERT Feature Extraction
# ==========================
print("\n🔹 Section 3: BERT as Feature Extractor\n")

# One checkpoint name drives both the tokenizer and the model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# TODO: Change the sentence to test different contexts
inputs = tokenizer("The cat sat on the mat.", return_tensors="pt")

# Pure inference: run the forward pass without autograd tracking
with torch.no_grad():
    outputs = model(**inputs)

# Keep the hidden vector at sequence position 0, i.e. the [CLS] token
cls_embedding = outputs.last_hidden_state[:, 0]
print("[CLS] embedding shape:", cls_embedding.shape)

# Mini challenge: Extract the embedding for a specific word token (e.g., "cat")
# Mini challenge: Try two different sentences and compare [CLS] embeddings


In [None]:
# ==========================
# 📘 4. Mini Fine-Tuning Example (Optional CPU-safe)
# ==========================
print("\n🔹 Section 4: Mini Fine-Tune Setup\n")

# Rebinds `model` to a two-label sequence-classification model for the same checkpoint
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# TODO: Add your own text examples below to try different classes
inputs = tokenizer(
    ["This is great!", "This is terrible!"],
    padding=True,
    return_tensors="pt",
)
labels = torch.tensor([1, 0])

# Forward pass + loss: supplying `labels` lets us read the loss off the outputs
outputs = model(labels=labels, **inputs)
loss = outputs.loss
print("Loss from dummy classification task:", loss.item())

# Mini challenge: Flip the labels and see what happens to the loss
# Mini challenge: Try freezing BERT and only training the classifier





In [None]:
# ==========================
# 📘 5. Freezing Layers + Visualizing
# ==========================
print("\n🔹 Section 5: Freezing BERT Layers\n")

# Freeze every parameter under `model.bert` in one call
# (parameters outside `model.bert` are not modified here)
model.bert.requires_grad_(False)

# TODO: Try unfreezing only first N layers instead of last 2
for layer in model.bert.encoder.layer[-2:]:
    layer.requires_grad_(True)

# Count trainable params
total = sum(p.numel() for p in model.parameters())
trainable = sum(
    p.numel() for p in filter(lambda p: p.requires_grad, model.parameters())
)
print(f"Trainable parameters: {trainable}/{total}")

# Mini challenge: Try freezing/unfreezing different blocks and track parameter count


In [None]:
# ==========================
# 📘 6. Simulate LLRD Setup
# ==========================
print("\n🔹 Section 6: Layer-wise Learning Rates (Simulated)\n")

base_lr = 2e-5
decay = 0.9
optim_groups = []

# Derive the index of the top encoder layer from the model itself instead of
# hard-coding 11, so the schedule stays correct for encoders of any depth.
encoder_layers = model.bert.encoder.layer
top_layer_index = len(encoder_layers) - 1

# TODO: Try changing decay rate or base_lr to see the impact
# The top layer gets `base_lr`; each layer below it is scaled by one more factor of `decay`.
for i, layer in enumerate(encoder_layers):
    lr = base_lr * (decay ** (top_layer_index - i))
    # Materialize the parameters: a bare `.parameters()` generator can only be
    # consumed once, so inspecting the group before building an optimizer
    # would otherwise leave it empty.
    optim_groups.append({"params": list(layer.parameters()), "lr": lr})
    # Scientific notation keeps small learning rates distinguishable
    # (fixed-point with 6 decimals rounds them to a single digit or to zero).
    print(f"Layer {i}: LR = {lr:.2e}")

# Add classifier head with higher LR
# Only the encoder layers and the classifier head are grouped in this cell.
optim_groups.append({"params": list(model.classifier.parameters()), "lr": base_lr * 2})

print("\nSimulated optimizer groups created with LLRD-style scaling.")