# lab 3.2 Part 1: Pre-Training

## 1. Implement the Encoder model

In [2]:
import sys
sys.path.append('./code')
from encoder import Encoder
from data import TextDataset
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader
import torch

In [5]:
# Step 0: Smoke-test the Encoder on a single sentence
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Build the encoder with the tokenizer's vocabulary size and put it in eval mode.
encoder = Encoder(vocab_size=tokenizer.vocab_size)
encoder.eval()

texts = ["This is a test sentence for the encoder."]
inputs = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=16,
    return_tensors='pt',
)

# Unpack the three tensors the encoder is called with.
input_ids = inputs['input_ids']
token_type_ids = inputs['token_type_ids']
attention_mask = inputs['attention_mask']

# Forward pass only; gradient tracking is disabled.
with torch.no_grad():
    hidden = encoder(input_ids, token_type_ids, attention_mask)

# Average over axis 1 (the token axis) -> shape: [num_sentences, hidden_dim]
hidden_np = hidden.cpu().numpy()
sentence_embeddings = hidden_np.mean(axis=1)

print("Output Shape:", hidden.shape)
print("Sentence Embedding Shape:", sentence_embeddings.shape)

Output Shape: torch.Size([1, 13, 256])
Sentence Embedding Shape: (1, 256)


In [6]:
# Step 1: Load the podcast data
import os
import pickle

# Directory holding the lab data, and the pickle file read below.
data_path = "/ocean/projects/mth240012p/shared/data"
raw_text_path = os.path.join(data_path, "raw_text.pkl")

# NOTE: unpickling can execute arbitrary code, so only load this file from a
# trusted location.
with open(raw_text_path, "rb") as f:
    raw_text = pickle.load(f)

print(f"Loaded raw_text with {len(raw_text)} stories")

Loaded raw_text with 109 stories


In [10]:
# Step 2: Encode every story with the tokenizer and the encoder
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): no checkpoint is loaded in this cell, so the embeddings below come
# from whatever weights Encoder's constructor produces -- confirm this is intended.
encoder = Encoder(vocab_size=tokenizer.vocab_size)
encoder.to(device)
encoder.eval()

# Maps story name -> array with one pooled embedding row per tokenized entry.
encoder_outputs = {}

for story_name in raw_text:
    # presumably a list of word strings; verify against the raw_text objects
    words = raw_text[story_name].data
    # Every entry of `words` is tokenized as its own sequence; padding=True pads
    # all of them to the longest sequence in this story.
    inputs = tokenizer(words, padding=True, truncation=True, return_tensors='pt',max_length=64)

    input_ids = inputs["input_ids"].to(device)
    token_type_ids = inputs["token_type_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        hidden = encoder(input_ids, token_type_ids, attention_mask)

        # Masked mean pooling: average only over real tokens. A plain
        # hidden.mean(dim=1) would also average the [PAD] positions, so each
        # embedding would depend on the longest sequence in the batch.
        token_mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for a row with no real tokens.
        token_counts = token_mask.sum(dim=1).clamp(min=1)
        pooled = (hidden * token_mask).sum(dim=1) / token_counts

    # One embedding row per tokenized entry: [num_entries, hidden_dim]
    sentence_embeddings = pooled.cpu().numpy()
    encoder_outputs[story_name] = sentence_embeddings
    print(f"{story_name} → shape: {sentence_embeddings.shape}")

sweetaspie → shape: (697, 256)
thatthingonmyarm → shape: (2073, 256)
tildeath → shape: (2297, 256)
indianapolis → shape: (1554, 256)
lawsthatchokecreativity → shape: (2084, 256)
golfclubbing → shape: (1211, 256)
jugglingandjesus → shape: (887, 256)
shoppinginchina → shape: (1731, 256)
cocoonoflove → shape: (1984, 256)
hangtime → shape: (1927, 256)
beneaththemushroomcloud → shape: (1916, 256)
dialogue4 → shape: (1692, 256)
thepostmanalwayscalls → shape: (2220, 256)
stumblinginthedark → shape: (2681, 256)
kiksuya → shape: (1699, 256)
haveyoumethimyet → shape: (2985, 256)
theinterview → shape: (1079, 256)
againstthewind → shape: (838, 256)
tetris → shape: (1350, 256)
canplanetearthfeedtenbillionpeoplepart2 → shape: (2532, 256)
alternateithicatom → shape: (2174, 256)
goldiethegoldfish → shape: (1680, 256)
seedpotatoesofleningrad → shape: (1376, 256)
onapproachtopluto → shape: (1357, 256)
canplanetearthfeedtenbillionpeoplepart1 → shape: (2341, 256)
bluehope → shape: (1941, 256)
superheroesj

In [19]:
# Step 3: Downsample, trim, and build lagged features
from preprocessing import downsample_word_vectors, make_delayed

# Step 3.1: Downsample the per-story encoder outputs
print("Downsampling encoder outputs...")
story_names = list(raw_text.keys())
downsampled_encoder = downsample_word_vectors(story_names, encoder_outputs, raw_text)

# Step 3.2: Trim -- drop the first 5 and the last 10 rows of every story
print("Trimming")
trim = slice(5, -10)
X_encoder_trimmed = {}
for story in downsampled_encoder:
    X_encoder_trimmed[story] = downsampled_encoder[story][trim, :]

# Step 3.3: Create the delayed (lagged) version using delays 1 through 4
print("Creating delayed features")
lags = range(1, 5)
X_encoder_lagged = {}
for story, trimmed in X_encoder_trimmed.items():
    X_encoder_lagged[story] = make_delayed(trimmed, delays=lags)
print("Delayed features are created")

# Show the first story as a sanity check
story_example = list(X_encoder_lagged)[0]
print(f"Sample story: {story_example}")
print("Lagged shape:", X_encoder_lagged[story_example].shape)

Downsampling encoder outputs...
Trimming
Creating delayed features
Delayed features are created
Sample story: sweetaspie
Lagged shape: (157, 1024)


# lab 3.2 Part 2: Modeling & Evaluation