# In [None]:
import os
import time

import numpy as np
import psutil
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import (
    BertConfig,
    BertForMaskedLM,
    BertTokenizerFast,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)

# Pin NumPy's and PyTorch's global random generators to the same fixed seed
# so that repeated runs start from identical RNG state.
np.random.seed(42)
torch.manual_seed(42)

# Resource snapshot helper
def print_resource_usage():
    """Print this process's resident memory (MB) and the current CPU load.

    The CPU figure comes from ``psutil.cpu_percent`` sampled over a 0.1 s
    interval, so every call blocks for roughly that long.
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    memory_mb = rss_bytes / (1024 * 1024)
    cpu_percent = psutil.cpu_percent(interval=0.1)
    print(f"Memory usage: {memory_mb:.2f} MB | CPU usage: {cpu_percent:.1f}%")

# Report the intended device (CPU) and the machine's resources.
# NOTE(review): `device` is only used in the print below; nothing visible in
# this file passes it to the model or the Trainer, so it does not by itself
# force CPU execution -- confirm whether the training arguments should
# request CPU explicitly.
device = torch.device("cpu")
print(f"Using device: {device}")
print(f"CPU: {psutil.cpu_count(logical=True)} logical cores")
print(f"RAM: {psutil.virtual_memory().total / (1024**3):.1f} GB total")
print_resource_usage()

# Disable the tokenizers library's own parallelism.
# Presumably set because the dataset map and DataLoader below use worker
# processes -- TODO confirm the original motivation ("use less memory").
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load a tiny slice of SQuAD (a question-answering dataset).
# Using a very small subset for rapid pretraining (~30min)
print("Loading dataset...")
dataset = load_dataset("squad", split="train[:500]")  # First 500 training examples only
print(f"Dataset loaded with {len(dataset)} examples")
print_resource_usage()

# Initialize the tokenizer from the pretrained "bert-base-uncased" vocabulary
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Sequence length used by tokenize_function for both truncation and padding
max_length = 64  # Using very small sequence length to speed up training

def tokenize_function(examples):
    """Tokenize a batch of SQuAD rows for masked-language-model pretraining.

    Each row's question and context are joined into one string of the form
    ``"Question: ... Context: ..."`` (the answer field is not used). The
    strings are then truncated and padded to the module-level ``max_length``
    with the module-level ``tokenizer``.

    Args:
        examples: A batch mapping with parallel ``"question"`` and
            ``"context"`` sequences.

    Returns:
        The tokenizer's batch encoding, including the special-tokens mask.
    """
    question_context_pairs = zip(examples["question"], examples["context"])
    prompts = [f"Question: {q} Context: {c}" for q, c in question_context_pairs]

    encoded = tokenizer(
        prompts,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_special_tokens_mask=True,
    )
    return encoded

# Tokenize the whole dataset in batches of 32 rows using 2 worker processes,
# timing the operation. The original SQuAD columns are dropped so only the
# tokenizer outputs remain.
print("Tokenizing dataset...")
start_time = time.time()  # reused later as the training timer
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    num_proc=2,  # Using 2 processes on your i7 is fine
    remove_columns=dataset.column_names,
)
print(f"Tokenization completed in {time.time() - start_time:.2f} seconds")
print_resource_usage()

# Data collator for masked language modeling (masking probability 0.15)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)

# Configure a tiny BERT model for pretraining (for ~30min runtime)
smaller_config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=128,      # Tiny hidden size (default is 768)
    num_hidden_layers=3,  # Minimal layers (default is 12)
    num_attention_heads=2,  # Minimal attention heads (default is 12)
    intermediate_size=256,  # Tiny intermediate size (default is 3072)
    max_position_embeddings=128  # 128 positions; note this is larger than the 64-token max_length used for tokenization
)

# Build a randomly initialized masked-LM model from the small config
# (no pretrained weights are loaded here) and report its parameter count.
model = BertForMaskedLM(config=smaller_config)
print(f"Model initialized with {sum(p.numel() for p in model.parameters())/1e6:.2f}M parameters")

# Set up training arguments optimized for ~30min runtime on quad-core i7.
# With 500 examples, batch size 16 and no gradient accumulation this is
# presumably ~32 steps per epoch (~160 steps total on a single device), so
# save_steps=250 would never trigger an intermediate checkpoint -- verify.
# NOTE(review): nothing here requests CPU execution explicitly even though
# the script reports "cpu" as its device -- confirm the intended behavior on
# machines with an accelerator.
training_args = TrainingArguments(
    output_dir="./bert_pretrained_qa",
    overwrite_output_dir=True,
    num_train_epochs=5,  # More epochs since we have a tiny model and dataset
    per_device_train_batch_size=16,  # Larger batch size is fine with your 16GB RAM
    save_steps=250,
    save_total_limit=1,  # Keep at most one checkpoint on disk (limits count; does not select the "best" model)
    prediction_loss_only=True,
    logging_dir="./logs",
    logging_steps=50,
    # Optimization for speed
    fp16=False,
    dataloader_num_workers=2,  # Use 2 workers on your i7
    report_to="none",  # Disable wandb/tensorboard
    # Additional speed optimizations
    gradient_accumulation_steps=1,
    warmup_steps=50,
    weight_decay=0.01,
    # No evaluation is run during training (no eval dataset is provided).
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="no",
)

# Initialize the Trainer with the tiny model, the MLM collator and the
# tokenized training set (no evaluation dataset is passed).
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# Train the model with timing
print("Starting training...")
print_resource_usage()
# Restart the timer: this overwrites the tokenization timestamp and is the
# reference point read by the monitoring callback and the final summary.
start_time = time.time()

# Custom Trainer callback that reports timing and resource usage on every
# logging event.
class ResourceMonitorCallback(TrainerCallback):
    """Print elapsed time, an estimated remaining time and resource usage.

    Must derive from ``TrainerCallback`` (not ``TrainingArguments``) so the
    Trainer's callback handler accepts it and dispatches ``on_log`` events.
    Elapsed time is measured against the module-level ``start_time`` that is
    set just before training starts.
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Handle a Trainer logging event.

        Args:
            args: The training arguments of the running Trainer (unused).
            state: The trainer state; ``global_step``, ``max_steps`` and
                ``is_local_process_zero`` are read from it.
            control: The trainer control object (unused, left unchanged).
            logs: The logged values (unused).
            **kwargs: Additional objects passed by the Trainer (unused).
        """
        # Report from a single process only, to avoid duplicated output.
        if not state.is_local_process_zero:
            return

        elapsed = time.time() - start_time
        step = state.global_step

        # The estimate extrapolates the average time per completed step; it
        # is undefined before the first step, so avoid dividing by zero.
        if step > 0:
            remaining = (elapsed / step) * (state.max_steps - step)
            remaining_text = f"{remaining:.2f}s"
        else:
            remaining_text = "n/a"

        print(f"Step: {step} | Time elapsed: {elapsed:.2f}s | Est. remaining: {remaining_text}")
        print_resource_usage()

# Register the monitoring callback defined above, then run training.
trainer.add_callback(ResourceMonitorCallback())
trainer.train()

# Report the wall-clock training time measured from start_time.
total_time = time.time() - start_time
print(f"Training completed in {total_time:.2f} seconds ({total_time/60:.2f} minutes)")
print_resource_usage()

# Save the pretrained model and its tokenizer side by side in one directory
model_path = "./bert_pretrained_qa_final"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

# Optional next step: fine-tune for QA after pretraining.
# Nothing below is executed -- the fine-tuning snippet is only printed as a
# text template (it references placeholders such as processed_train_dataset).
print("Pretraining complete! You can now fine-tune this model for your specific QA task.")
print("Example fine-tuning code:")
print("""
from transformers import BertForQuestionAnswering, Trainer, TrainingArguments

# Load pretrained model
model = BertForQuestionAnswering.from_pretrained("./bert_pretrained_qa_final")

# Process SQuAD data for QA (with start/end positions)
# ... data processing code ...

# Set up QA training arguments
training_args = TrainingArguments(
    output_dir="./bert_qa_finetuned",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    # Add other training arguments
)

# Initialize trainer for QA
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    # Add validation dataset and metrics
)

# Fine-tune for QA
trainer.train()
""")