In [1]:
# 📘 Transfer Learning & Word Embedding Practice Notebook
# Works 100% on CPU with Anaconda + Jupyter + minimal setup

# ======================
# 📦 0. Setup + Imports
# ======================
#!pip install torch torchvision torchaudio --quiet
#!pip install transformers gensim --quiet

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from gensim.models import KeyedVectors

import numpy as np
import matplotlib.pyplot as plt

In [2]:
# ==========================
# 📘 1. Learnable Embeddings
# ==========================
print("\n🔹 Section 1: Embedding Layer Basics\n")

# Seed the RNG so the randomly initialised embedding table (and therefore the
# vectors printed below) is the same on every Restart & Run All.
torch.manual_seed(0)

# Toy vocabulary: token string -> integer id. Id 0 is reserved for padding.
vocab = {"[PAD]": 0, "hello": 1, "world": 2, "bert": 3, "rocks": 4}
# Reverse lookup (id -> token string).
inv_vocab = {v: k for k, v in vocab.items()}


def tokenize(text):
    """Convert a sentence into a list of vocabulary ids.

    The text is lower-cased and split on whitespace; no punctuation handling
    is done.

    Args:
        text: Input sentence as a string.

    Returns:
        A list of ints, one per whitespace-separated token. Tokens that are
        not in ``vocab`` fall back to the ``[PAD]`` id (0).
    """
    pad_id = vocab["[PAD]"]
    return [vocab.get(token, pad_id) for token in text.lower().split()]


# TODO: Change this sentence to test different token combos
sentence = "hello world bert"
token_ids = tokenize(sentence)
tokens_tensor = torch.tensor(token_ids).unsqueeze(0)  # batch size 1

# Embedding layer. padding_idx pins the [PAD] row to zeros and excludes it from
# gradient updates, so padding (and unknown tokens, which share id 0) do not
# get an arbitrary random vector.
embedding = nn.Embedding(
    num_embeddings=len(vocab),
    embedding_dim=5,
    padding_idx=vocab["[PAD]"],
)
embedded_output = embedding(tokens_tensor)

print("Token IDs:", token_ids)
print("Embeddings:", embedded_output)

# Mini challenge: Try printing the shape of `embedded_output`
# Mini challenge: Try adding another sentence and compare output shape


🔹 Section 1: Embedding Layer Basics

Token IDs: [1, 2, 3]
Embeddings: tensor([[[-1.6880,  1.3319, -1.2108, -0.9847, -1.5990],
         [ 0.0612,  2.6173, -3.0739,  0.7096,  0.1382],
         [ 1.2916, -0.2856,  1.2343, -0.4007,  2.3343]]],
       grad_fn=<EmbeddingBackward0>)


In [3]:
# 1-D example tensor (shape [3]) used by the unsqueeze demos in the following cells.
t = torch.tensor([3, 2, 1])

In [4]:
# unsqueeze(-2) inserts a size-1 axis just before the last dimension: [3] -> [1, 3].
print(t.shape, t.unsqueeze(-2).shape, sep="\n")

torch.Size([3])
torch.Size([1, 3])


In [5]:
# unsqueeze(-1) appends a size-1 axis after the last dimension: [3] -> [3, 1].
print(t.shape, t.unsqueeze(-1).shape, sep="\n")

torch.Size([3])
torch.Size([3, 1])


In [6]:
# unsqueeze(0) prepends a size-1 (batch-like) axis: [3] -> [1, 3].
print(t.shape, t.unsqueeze(0).shape, sep="\n")

torch.Size([3])
torch.Size([1, 3])


In [7]:
# unsqueeze(1) inserts a size-1 axis at position 1: [3] -> [3, 1].
print(t.shape, t.unsqueeze(1).shape, sep="\n")

torch.Size([3])
torch.Size([3, 1])


In [8]:
# ==========================
# 📘 2. Pretrained GloVe Embeddings (via gensim)
# ==========================
print("\n🔹 Section 2: GloVe Embeddings\n")

from gensim.downloader import load

# 50-dimensional GloVe vectors, picked for speed.
# NOTE(review): presumably downloads the model on first use, so this cell
# needs network access on a fresh machine — confirm.
word_vectors = load("glove-wiki-gigaword-50")

# TODO: Try checking similarity between different word pairs
king_queen_similarity = word_vectors.similarity("king", "queen")
print("Similarity between king and queen:", king_queen_similarity)

# Three nearest neighbours of "neural" as (word, similarity) pairs.
neural_neighbours = word_vectors.most_similar("neural", topn=3)
print("Most similar to 'neural':", neural_neighbours)

# Mini challenge: Try word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])



🔹 Section 2: GloVe Embeddings

Similarity between king and queen: 0.7839043
Most similar to 'neural': [('neuronal', 0.8451142907142639), ('differentiation', 0.8052098751068115), ('neurons', 0.781899631023407)]


In [9]:
# Word analogy via keys: woman + king - man.
# The bare last expression lets Jupyter display the returned (word, score) list.
word_vectors.most_similar(
    positive=["woman", "king"],
    negative=["man"],
)

[('queen', 0.8523604273796082),
 ('throne', 0.7664334177970886),
 ('prince', 0.759214460849762),
 ('daughter', 0.7473882436752319),
 ('elizabeth', 0.7460219860076904),
 ('princess', 0.7424570322036743),
 ('kingdom', 0.7337411642074585),
 ('monarch', 0.721449077129364),
 ('eldest', 0.7184861898422241),
 ('widow', 0.7099431157112122)]

In [12]:
# Same analogy, but computed by hand from the raw vectors and passed in as a
# single query vector. In the recorded output this variant ranks 'king' itself
# first, which the key-based query with positive/negative words does not.
analogy_vector = word_vectors["woman"] + word_vectors["king"] - word_vectors["man"]
word_vectors.most_similar(analogy_vector)

[('king', 0.8859834671020508),
 ('queen', 0.8609580993652344),
 ('daughter', 0.7684512138366699),
 ('prince', 0.7640699744224548),
 ('throne', 0.7634970545768738),
 ('princess', 0.7512727975845337),
 ('elizabeth', 0.7506488561630249),
 ('father', 0.7314496636390686),
 ('kingdom', 0.7296158671379089),
 ('mother', 0.7280009984970093)]

In [13]:
# ==========================
# 📘 3. BERT Feature Extraction
# ==========================
print("\n🔹 Section 3: BERT as Feature Extractor\n")

# One checkpoint name shared by the tokenizer and the encoder.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# TODO: Change the sentence to test different contexts
inputs = tokenizer("The cat sat on the mat.", return_tensors="pt")

# Feature extraction only: no gradients are needed for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Index 0 along the sequence axis is the [CLS] token; keep every hidden feature.
cls_embedding = outputs.last_hidden_state[:, 0]
print("[CLS] embedding shape:", cls_embedding.shape)

# Mini challenge: Extract the embedding for a specific word token (e.g., "cat")
# Mini challenge: Try two different sentences and compare [CLS] embeddings



🔹 Section 3: BERT as Feature Extractor



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[CLS] embedding shape: torch.Size([1, 768])


In [29]:
# Tokenise two unrelated sentences so their [CLS] embeddings can be compared.
# Despite the names, cls_1 / cls_2 hold tokenizer outputs, not embeddings.
# NOTE(review): the first example sentence is crude — consider replacing it
# with a neutral one before sharing this notebook.
cls_1 = tokenizer("I love women with a nice ass.", return_tensors="pt")
cls_2 = tokenizer("The political landscape of the United states is filled with lobbyist.", return_tensors="pt")

# Pure inference: run under no_grad so no autograd graph is built and the
# resulting embeddings are plain tensors without grad_fn.
# NOTE(review): relies on `model` still being the AutoModel encoder loaded in
# Section 3; Section 4 rebinds `model`, so re-running this cell afterwards
# will presumably fail on `.last_hidden_state` — confirm.
with torch.no_grad():
    outputs1 = model(**cls_1).last_hidden_state[:, 0, :]  # [CLS] of sentence 1
    outputs2 = model(**cls_2).last_hidden_state[:, 0, :]  # [CLS] of sentence 2

In [30]:
# Cosine similarity between the two [CLS] embeddings, taken along the feature axis.
cos = torch.nn.functional.cosine_similarity(outputs1, outputs2, dim=1)
similarity_value = cos.item()
print("Similarity:", similarity_value)

Similarity: 0.8238242864608765


In [43]:
# ==========================
# 📘 4. Mini Fine-Tuning Example (Optional CPU-safe)
# ==========================
print("\n🔹 Section 4: Mini Fine-Tune Setup\n")

# Same checkpoint, now wrapped with a 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Freeze the BERT backbone only; the classifier head keeps requires_grad=True.
model.bert.requires_grad_(False)

print(model.classifier)

# TODO: Add your own text examples below to try different classes
example_texts = ["This is great!", "This is terrible!"]
inputs = tokenizer(example_texts, padding=True, return_tensors="pt")
labels = torch.tensor([1, 0])

# Passing labels makes the model return a loss alongside its other outputs.
outputs = model(**inputs, labels=labels)
loss = outputs.loss
print("Loss from dummy classification task:", loss.item())

# Mini challenge: Flip the labels and see what happens to the loss
# Mini challenge: Try freezing BERT and only training the classifier






🔹 Section 4: Mini Fine-Tune Setup



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Linear(in_features=768, out_features=2, bias=True)
Loss from dummy classification task: 0.708653450012207


In [44]:
# ==========================
# 📘 5. Freezing Layers + Visualizing
# ==========================
print("\n🔹 Section 5: Freezing BERT Layers\n")

# Start from a fully frozen backbone...
model.bert.requires_grad_(False)

# ...then re-enable gradients for the last two encoder blocks.
# TODO: Try unfreezing only first N layers instead of last 2
for block in model.bert.encoder.layer[-2:]:
    block.requires_grad_(True)

# Collect (size, is_trainable) once, then derive both totals from that list.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total = sum(size for size, _ in param_sizes)
trainable = sum(size for size, is_trainable in param_sizes if is_trainable)
print(f"Trainable parameters: {trainable}/{total}")

# Mini challenge: Try freezing/unfreezing different blocks and track parameter count



🔹 Section 5: Freezing BERT Layers

Trainable parameters: 14177282/109483778


In [45]:
# ==========================
# 📘 6. Simulate LLRD Setup
# ==========================
print("\n🔹 Section 6: Layer-wise Learning Rates (Simulated)\n")

base_lr = 2e-5   # learning rate assigned to the top (last) encoder layer
decay = 0.9      # multiplicative factor applied once per layer going down
optim_groups = []

# Derive the depth from the model instead of hard-coding 12 layers, so the
# schedule stays correct for checkpoints with a different number of layers.
encoder_layers = model.bert.encoder.layer
top_layer_index = len(encoder_layers) - 1

# TODO: Try changing decay rate or base_lr to see the impact
for i, layer in enumerate(encoder_layers):
    # The top layer gets base_lr; each layer below is scaled by another `decay`.
    lr = base_lr * (decay ** (top_layer_index - i))
    # Materialise the parameters: layer.parameters() is a one-shot generator,
    # and a group whose generator was already consumed would be empty.
    optim_groups.append({"params": list(layer.parameters()), "lr": lr})
    print(f"Layer {i}: LR = {lr:.6f}")

# Add classifier head with higher LR
optim_groups.append({"params": list(model.classifier.parameters()), "lr": base_lr * 2})

# NOTE(review): only the encoder layers and the classifier are grouped here;
# any other submodules of model.bert (presumably embeddings and pooler) would
# not be updated by an optimizer built from optim_groups — confirm intended.

print("\nSimulated optimizer groups created with LLRD-style scaling.")


🔹 Section 6: Layer-wise Learning Rates (Simulated)

Layer 0: LR = 0.000006
Layer 1: LR = 0.000007
Layer 2: LR = 0.000008
Layer 3: LR = 0.000009
Layer 4: LR = 0.000010
Layer 5: LR = 0.000011
Layer 6: LR = 0.000012
Layer 7: LR = 0.000013
Layer 8: LR = 0.000015
Layer 9: LR = 0.000016
Layer 10: LR = 0.000018
Layer 11: LR = 0.000020

Simulated optimizer groups created with LLRD-style scaling.
