In [1]:
import torch
from transformers import (
    BertTokenizer,
    BertModel,
    BertForMaskedLM,
    BertForSequenceClassification,
    AdamWeightDecay,
    get_linear_schedule_with_warmup
)
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

# Fix the PyTorch and NumPy RNG seeds so repeated runs are reproducible
torch.manual_seed(seed=42)
np.random.seed(seed=42)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# ============================================================================
# TASK 1: SIMPLE BERT MODEL (Feature Extraction)
# ============================================================================

# Section banner: blank line, rule, title, rule (one print, newline-separated)
print(
    "\n" + "="*80,
    "TASK 1: SIMPLE BERT MODEL - Feature Extraction",
    "="*80,
    sep="\n",
)

# Name of the pre-trained checkpoint loaded by the cells below
model_checkpoint = "bert-base-uncased"

# Step 1.1: Load the pre-trained tokenizer
# The tokenizer turns raw text into the token IDs that BERT consumes
print("\nSTEP 1.1: Loading tokenizer...")
tokenizer: BertTokenizer = BertTokenizer.from_pretrained(
    model_checkpoint,
    cache_dir="./bert_load_hf",  # downloaded files are cached in this local directory
)

# 'bert-base-uncased': 12-layer, 768-hidden, 12-heads, 110M parameters
# 'uncased': all text is converted to lowercase

print(f"Vocabulary size: {tokenizer.vocab_size}")  # 30522 tokens
print(f"Special tokens: {tokenizer.all_special_tokens}")  # [CLS], [SEP], [PAD], [MASK], [UNK]


TASK 1: SIMPLE BERT MODEL - Feature Extraction

STEP 1.1: Loading tokenizer...
Vocabulary size: 30522
Special tokens: ['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


In [12]:
# STEP 1.2: Prepare sample text data
print("\nSTEP 1.2: Preparing sample text...")
sample_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "BERT is a powerful language model developed by Google.",
    "Natural language processing is fascinating!",
]

print(f"Sample texts: {sample_texts}")

# STEP 1.3: Tokenize the text
print("\nSTEP 1.3: Tokenizing text...")

# padding=True pads each sequence to the longest one in this batch (not to max_length);
# max_length=32 only acts as the truncation cap. Tensors are returned as PyTorch ("pt").
encoded_inputs = tokenizer(
    sample_texts,
    max_length=32,
    truncation=True,
    padding=True,
    return_attention_mask=True,
    return_tensors="pt",
)

# Show which fields the tokenizer returned
dict(encoded_inputs).keys()


STEP 1.2: Preparing sample text...
Sample texts: ['The quick brown fox jumps over the lazy dog.', 'BERT is a powerful language model developed by Google.', 'Natural language processing is fascinating!']

STEP 1.3: Tokenizing text...


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [13]:
# Display tokenization results.
# NOTE: padding=True pads to the longest sequence in the batch, so the sequence
# length here is 12 (tokens of the longest sample, including [CLS]/[SEP]),
# not max_length=32.
print(f"\nInput IDs shape: {encoded_inputs['input_ids'].shape}")  # (3, 12)
print(f"Attention mask shape: {encoded_inputs['attention_mask'].shape}")  # (3, 12)

# Inspect the first sequence: its tokens, their vocabulary IDs, and its attention mask
print(f"\nFirst sequence tokens: {tokenizer.convert_ids_to_tokens(encoded_inputs['input_ids'][0])}")
print(f"First sequence IDs: {encoded_inputs['input_ids'][0]}")
print(f"First sequence attention mask: {encoded_inputs['attention_mask'][0]}")


Input IDs shape: torch.Size([3, 12])
Attention mask shape: torch.Size([3, 12])

First sequence tokens: ['[CLS]', 'the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', '[SEP]']
First sequence IDs: tensor([  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
         1012,   102])
First sequence attention mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [17]:
# STEP 1.4: Load the pre-trained BERT encoder
print("\nSTEP 1.4: Loading pre-trained BERT model...")
model: BertModel = BertModel.from_pretrained(
    model_checkpoint,
    cache_dir="./bert_load_hf",
)
# Evaluation mode: dropout is disabled for the forward passes that follow
model.eval()
print(f"Model loaded successfully. Total parameters: {sum(param.numel() for param in model.parameters()):,}")


STEP 1.4: Loading pre-trained BERT model...
Model loaded successfully. Total parameters: 109,482,240


In [20]:
# STEP 1.5: Forward pass through BERT
print("\nSTEP 1.5: Running forward pass...")
# Inference only, so gradient tracking is switched off for the call
with torch.no_grad():
    outputs = model(
        attention_mask=encoded_inputs["attention_mask"],
        input_ids=encoded_inputs["input_ids"],
    )

# STEP 1.6: Extract outputs
print("\nSTEP 1.6: Extracting outputs...")
# List the fields available on the returned model output
dict(outputs).keys()


STEP 1.5: Running forward pass...

STEP 1.6: Extracting outputs...


dict_keys(['last_hidden_state', 'pooler_output'])

In [21]:
# Output 1: Last hidden state - contextual embeddings for ALL tokens.
# Shape is (batch_size, seq_len, hidden_size) = (3, 12, 768) for this batch:
# padding=True pads to the longest sequence (12 tokens), not to max_length=32.
last_hidden_state = outputs.last_hidden_state
print(f"Last hidden state shape: {last_hidden_state.shape}")
print("This contains contextual embeddings for every token in the sequence")

# Output 2: Pooler output - the final [CLS] hidden state passed through BERT's
# pooling head (a dense layer followed by tanh). It is derived from, but NOT equal
# to, last_hidden_state[:, 0, :]. Shape: (batch_size=3, hidden_size=768).
pooler_output = outputs.pooler_output
print(f"Pooler output shape: {pooler_output.shape}")
print("This is the [CLS] token representation, useful for sentence-level tasks")

Last hidden state shape: torch.Size([3, 12, 768])
This contains contextual embeddings for every token in the sequence
Pooler output shape: torch.Size([3, 768])
This is the [CLS] token representation, useful for sentence-level tasks


In [24]:
# STEP 1.7: Put the embeddings to use
print("\nSTEP 1.7: Using embeddings...")

# Example 1: pick out the contextual vector of one token (first sequence, position 5)
token_5_embedding = last_hidden_state[0, 5]  # (768,)
print(f"Embedding for token 5 in first sequence: shape {token_5_embedding.shape}")
print(f"Token 5 is: '{tokenizer.convert_ids_to_tokens(encoded_inputs['input_ids'][0, 5].item())}'")


STEP 1.7: Using embeddings...
Embedding for token 5 in first sequence: shape torch.Size([768])
Token 5 is: 'jumps'


In [26]:
# Example 2: Extract the raw [CLS] hidden state manually (position 0 of every sequence).
# NOTE: this does NOT equal pooler_output - the pooler additionally applies a dense
# layer followed by tanh to this vector.
cls_token_manual = last_hidden_state[:, 0, :]  # (3, 768)
print(f"\n[CLS] token (manual extraction): {cls_token_manual.shape}")
cls_token_manual


[CLS] token (manual extraction): torch.Size([3, 768])


tensor([[-0.3608,  0.2271, -0.3030,  ..., -0.4224,  0.6949,  0.6213],
        [-0.0789, -0.5309,  0.2158,  ...,  0.0842, -0.2137,  0.6038],
        [-0.0030,  0.0996, -0.2846,  ..., -0.1934,  0.0864,  0.5263]])

In [43]:
# Example 3: Sentence similarity computed from the pooled [CLS] representations.
# Slices (rather than integer indices) keep the batch axis, so the cosine is taken
# along the hidden axis (dim=1) and yields a single-element tensor.
sentence_similarity = F.cosine_similarity(pooler_output[:1], pooler_output[1:2], dim=1)
print(f"\nCosine similarity between sentence 1 and 2: {sentence_similarity.item():.4f}")


Cosine similarity between sentence 1 and 2: -0.3177


In [42]:
# Quick shape check of the pooled output
pooler_output.size()

torch.Size([3, 768])

In [52]:
# Example 4: Mean pooling (alternative to [CLS] token)
# Average all token embeddings (excluding padding)
def mean_pooling(token_embeddings, attention_mask):
    """
    Average the token embeddings of each sequence, skipping masked (padded) positions.

    Args:
        token_embeddings: (batch_size, seq_len, hidden_size)
        attention_mask: (batch_size, seq_len); non-zero marks a real token
    Returns:
        mean_pooled: (batch_size, hidden_size)
    """
    # (batch, seq_len) -> (batch, seq_len, 1) -> stretched to the full embedding shape
    mask = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()

    # Zero out padded positions, then add up what is left along the sequence axis
    masked_total = (token_embeddings * mask).sum(dim=1)

    # Per-sequence count of real tokens; the tiny floor avoids dividing by zero
    # when a row contains no real tokens at all
    token_counts = torch.clamp(mask.sum(dim=1), min=1e-9)

    return masked_total / token_counts

# Collapse each sequence's token embeddings into one vector, ignoring padding
mean_pooled_output = mean_pooling(
    token_embeddings=last_hidden_state,
    attention_mask=encoded_inputs["attention_mask"],
)
print(f"\nMean pooled output shape: {mean_pooled_output.shape}")


Mean pooled output shape: torch.Size([3, 768])


In [50]:
# Inspect the expanded attention mask of the first sequence: one row per token,
# each mask value repeated across the hidden dimension
encoded_inputs["attention_mask"].unsqueeze(-1).expand_as(last_hidden_state).float()[0]

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])