In [4]:
"""
Tiny BERT Pretraining for QA Tasks - Simplified Version
- Uses a very small BERT model
- Processes a small subset of SQuAD
- Optimized for ~30 min runtime on a quad-core CPU
- Avoids multiprocessing to eliminate scope issues
"""

import os
import time
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    BertConfig, 
    BertForMaskedLM, 
    BertTokenizerFast,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)

print("Starting script...")

# 1. Basic setup
# Seed torch's and NumPy's global RNGs with the same fixed value.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# `device` is only reported in this section; nothing here moves data to it.
device = torch.device("cpu")
print("Using device: {}".format(device))

# 2. Load a very small dataset subset
# The split expression "train[:300]" keeps just the first 300 training
# examples of SQuAD.
print("Loading dataset...")
dataset = load_dataset("squad", split="train[:300]")
num_loaded = len(dataset)
print("Dataset loaded with " + str(num_loaded) + " examples")

# 3. Initialize tokenizer
print("Initializing tokenizer...")
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
max_length = 64  # Very short sequences; also reused later for the model's position table

# 4. Process the dataset WITHOUT multiprocessing
# Each example becomes one "Question: ... Context: ..." string. Rows are
# iterated directly instead of being looked up one index at a time.
print("Processing dataset...")
processed_texts = [
    f"Question: {example['question']} Context: {example['context']}"
    for example in dataset
]

# 5. Tokenize all texts at once (batched but not parallel)
# Every sequence is truncated/padded to exactly `max_length` tokens, so all
# rows have the same length. The special-tokens mask is requested alongside
# the ids (presumably for the MLM collator to skip special/padding tokens --
# confirm against the collator's documentation).
print("Tokenizing texts...")
tokenized_data = tokenizer(
    processed_texts,
    truncation=True,
    padding="max_length",
    max_length=max_length,
    return_special_tokens_mask=True,
)

# 6. Convert to dataset format
# Imported here, next to its only use in this section.
from datasets import Dataset
tokenized_dataset = Dataset.from_dict(tokenized_data)
print(f"Tokenized dataset created with {len(tokenized_dataset)} examples")

# 7. Create a tiny BERT config
# Architecture hyperparameters are gathered in one mapping and handed to
# BertConfig in a single call: 2 layers, 2 attention heads, 128-wide hidden
# states, a 256-wide feed-forward layer, the tokenizer's vocabulary size, and
# a position table sized to the tokenization length (`max_length`).
print("Creating model config...")
tiny_hyperparameters = {
    "vocab_size": tokenizer.vocab_size,
    "hidden_size": 128,
    "num_hidden_layers": 2,
    "num_attention_heads": 2,
    "intermediate_size": 256,
    "max_position_embeddings": max_length,
}
tiny_config = BertConfig(**tiny_hyperparameters)

# 8. Initialize model
# Built from the config alone (no pretrained checkpoint is loaded here).
print("Initializing model...")
model = BertForMaskedLM(config=tiny_config)
parameter_total = 0
for tensor in model.parameters():
    parameter_total += tensor.numel()
print("Model has {:.2f}M parameters".format(parameter_total / 1e6))

# 9. Create data collator
# Masked-language-modeling collator with a masking rate of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# 10. Set up training arguments
# NOTE: the PyTorch Trainer stack needs the `accelerate` package
# (>=0.26.0 for the transformers release this was run with); without it,
# setup aborts with an ImportError. Install it first, e.g.
# `pip install 'accelerate>=0.26.0'` or `pip install transformers[torch]`.
#
# NOTE(review): with 300 examples and a batch size of 16 there are roughly
# 19 optimizer steps per epoch (~95 over 5 epochs, assuming a single device
# and no gradient accumulation), so `save_steps=100` is never reached and
# only the explicit final save persists the model -- confirm this is intended.
training_args = TrainingArguments(
    output_dir="./tiny_bert_qa",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=100,
    save_total_limit=1,
    logging_steps=20,
    prediction_loss_only=True,
    report_to="none",
    fp16=False,
    # Train on the device the script reports at startup. Without this flag
    # the Trainer selects an available accelerator on its own, which would
    # contradict the "Using device: cpu" banner and the CPU runtime budget.
    use_cpu=device.type == "cpu",
    # Set to 0 to avoid multiprocessing issues
    dataloader_num_workers=0,
)

# 11. Create trainer
# Ties together the model, the MLM collator and the tokenized training set;
# no evaluation dataset is supplied.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# 12. Train the model with timing
# Wall-clock timestamps are taken immediately before and after the training
# call; the difference is reported in seconds and in minutes.
print("Starting training...")
start_time = time.time()
trainer.train()
finish_time = time.time()
total_time = finish_time - start_time
total_minutes = total_time / 60
print(
    "Training completed in {:.2f} seconds ({:.2f} minutes)".format(
        total_time, total_minutes
    )
)

# 13. Save the model
# Model and tokenizer are written to the same directory.
model_path = "./tiny_bert_qa_final"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print("Model saved to {}".format(model_path))

print("Pretraining complete! You can now use this model for fine-tuning on QA tasks.")

Starting script...
Using device: cpu
Loading dataset...
Dataset loaded with 300 examples
Initializing tokenizer...
Processing dataset...
Tokenizing texts...
Tokenized dataset created with 300 examples
Creating model config...
Initializing model...
Model has 4.23M parameters


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`