In [1]:
# Uncomment to install the dependencies into this kernel's environment:
# %pip install torch transformers datasets

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch

In [4]:
# Load a small slice of IMDB: the split string "train[:500]" selects only the
# first 500 examples of the train split, which keeps the notebook small and fast.
dataset = load_dataset("imdb", split="train[:500]")
# Sanity check: row count and the first 200 characters of the first example's text.
print(f"Number of examples: {len(dataset)}")
print(f"First text preview: {dataset[0]['text'][:200]}...")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Number of examples: 500
First text preview: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ev...


In [5]:
# Load the tokenizer published under the "gpt2" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this tokenizer defines no pad token of
# its own — confirm. No cell visible in this notebook requests padding, so this
# only matters if padding is introduced later.
tokenizer.pad_token = tokenizer.eos_token
print(f"Vocab size: {len(tokenizer)}")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Vocab size: 50257


In [6]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a ``"text"`` key holding the raw strings to encode.

    Returns:
        Whatever the tokenizer call returns for those strings. Truncation is
        enabled with ``max_length=512``; no padding is requested here.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

# Tokenize the dataset in batches (tokenize_function receives a batch of rows per
# call) and remove the raw "text" column from the result.
tokenized_ds = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
# Sanity check: the first 20 token ids of the first tokenized example.
print(f"Sample tokenized output (first 20 tokens): {tokenized_ds[0]['input_ids'][:20]}")

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Sample tokenized output (first 20 tokens): [40, 26399, 314, 3001, 327, 47269, 20958, 12, 56, 23304, 3913, 422, 616, 2008, 3650, 780, 286, 477, 262, 10386]


In [7]:
block_size = 128

def group_texts(examples):
    """Pack a batch of tokenized examples into fixed-length blocks.

    All ``input_ids`` in the batch are concatenated into one token stream (and
    likewise all ``attention_mask`` values), then the stream is cut into
    consecutive chunks of ``block_size`` tokens. Tokens left over after the last
    full chunk are dropped.

    Args:
        examples: Mapping with ``"input_ids"`` and ``"attention_mask"`` keys, each
            holding a list of per-example sequences.

    Returns:
        Dict with ``"input_ids"`` and ``"attention_mask"``, each a list of chunks
        of exactly ``block_size`` items. Both lists are empty when the batch
        holds fewer than ``block_size`` tokens in total.
    """
    # Flatten in a single pass. ``sum(list_of_lists, [])`` copies the growing
    # accumulator on every addition, which is quadratic in the batch's token count.
    flat_ids = [token for sequence in examples["input_ids"] for token in sequence]
    flat_mask = [flag for sequence in examples["attention_mask"] for flag in sequence]

    # Largest multiple of block_size that fits; the trailing remainder is discarded.
    total_len = (len(flat_ids) // block_size) * block_size

    # When total_len is 0 this range is empty, so both result lists come out
    # empty without needing a separate special case.
    starts = range(0, total_len, block_size)

    return {
        "input_ids": [flat_ids[start:start + block_size] for start in starts],
        "attention_mask": [flat_mask[start:start + block_size] for start in starts],
    }

# Pack the tokenized examples into fixed-length blocks. group_texts returns a
# different number of rows than it receives, so every pre-existing column is
# removed and only the columns it returns remain.
# NOTE(review): packing happens per map batch (up to 1000 examples here), so the
# leftover tokens of each batch that do not fill a whole block are dropped.
lm_ds = tokenized_ds.map(
    group_texts, 
    batched=True, 
    batch_size=1000,
    remove_columns=tokenized_ds.column_names  # Remove all old columns
)
print(f"Number of training sequences: {len(lm_ds)}")

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Number of training sequences: 991


In [8]:
def collate_fn(batch):
    """Turn a list of examples into a batch of tensors.

    Args:
        batch: Sequence of mappings, each with an ``"input_ids"`` entry. The
            entries must all have the same length so they form a rectangular
            tensor.

    Returns:
        Dict with ``"input_ids"`` (a ``torch.long`` tensor built from the
        examples) and ``"labels"`` (an independent copy of that tensor). Any
        other keys on the examples are ignored.
    """
    rows = []
    for example in batch:
        rows.append(example["input_ids"])
    token_ids = torch.tensor(rows, dtype=torch.long)
    # Clone so the labels do not share storage with the inputs.
    # NOTE(review): labels are unshifted; presumably the model shifts them
    # internally for next-token prediction — confirm.
    labels = token_ids.clone()
    return {"input_ids": token_ids, "labels": labels}

# Shuffle with a dedicated, explicitly seeded generator so the batch order is
# reproducible after a kernel restart, without touching the global torch RNG.
train_loader = DataLoader(
    lm_ds,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
    generator=torch.Generator().manual_seed(42),
)
print(f"Total batches: {len(train_loader)}")

Total batches: 124


In [9]:
# Smoke test: pull a single batch from the loader and inspect what collate_fn produced.
for batch in train_loader:
    print(f"Input shape: {batch['input_ids'].shape}")
    print(f"Labels shape: {batch['labels'].shape}")
    # First 10 token ids of the first sequence in the batch.
    print(f"Sample tokens: {batch['input_ids'][0][:10]}")
    # One batch is enough for the check; stop iterating.
    break

Input shape: torch.Size([8, 128])
Labels shape: torch.Size([8, 128])
Sample tokens: tensor([  262,  2415,   636,   318, 21977,   618, 21769,   262, 18054,   286])
