In [1]:
import torch
print(torch.cuda.is_available())

True


In [2]:
# ========================================
# CELL 1: Install & Import Libraries
# ========================================
# Run this cell first (only once)
!pip install transformers tqdm -q

import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
from collections import defaultdict

print("✓ All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

[0m✓ All libraries imported successfully!
PyTorch version: 2.4.1+cu121
CUDA available: True
GPU: NVIDIA A100-PCIE-40GB


In [3]:
# ========================================
# CELL 2: Dataset Class
# ========================================
class QADataset(Dataset):
    def __init__(self, data_file, tokenizer, max_len=384, mode='train'):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.mode = mode
        
        # Load data
        print(f"Loading {data_file}...")
        with open(data_file, 'r') as f:
            raw_data = json.load(f)
        
        self.data = raw_data['data'] if 'data' in raw_data else raw_data
        self.samples = []
        self.process_data()
        print(f"✓ Loaded {len(self.samples)} samples")
    
    def process_data(self):
        for article in self.data:
            for paragraph_info in article['paragraphs']:
                context = paragraph_info['context']
                
                for qa in paragraph_info['qas']:
                    question = qa['question']
                    qid = qa['id']
                    
                    if self.mode == 'train' and qa['answers']:
                        answer = qa['answers'][0]
                        answer_start = answer['answer_start']
                        answer_text = answer['text']
                        
                        self.samples.append({
                            'id': qid,
                            'question': question,
                            'context': context,
                            'answer_start': answer_start,
                            'answer_text': answer_text
                        })
                    else:
                        self.samples.append({
                            'id': qid,
                            'question': question,
                            'context': context
                        })
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        sample = self.samples[idx]
        
        # Tokenize
        encoding = self.tokenizer(
            sample['question'],
            sample['context'],
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        
        item = {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'id': sample['id']
        }
        
        # Add token_type_ids only if available (BERT has it, RoBERTa doesn't)
        if 'token_type_ids' in encoding:
            item['token_type_ids'] = encoding['token_type_ids'].squeeze()
        
        # Add answer positions for training
        if self.mode == 'train':
            # Find answer position in tokens
            answer_encoding = self.tokenizer(
                sample['question'],
                sample['context'],
                max_length=self.max_len,
                truncation=True,
                return_offsets_mapping=True
            )
            
            offsets = answer_encoding['offset_mapping']
            answer_start = sample['answer_start']
            answer_end = answer_start + len(sample['answer_text'])
            
            # Find start and end token positions
            start_position = 0
            end_position = 0
            
            for i, (offset_start, offset_end) in enumerate(offsets):
                if offset_start <= answer_start < offset_end:
                    start_position = i
                if offset_start < answer_end <= offset_end:
                    end_position = i
                    break
            
            item['start_position'] = torch.tensor(start_position)
            item['end_position'] = torch.tensor(end_position)
        
        return item

print("✓ Dataset class defined")

✓ Dataset class defined


In [4]:
# ========================================
# CELL 3: Configuration
# ========================================
CONFIG = {
    'model_name': 'roberta-base',  # Change to 'roberta-base' for better results
    'train_file': 'spoken_train-v1.1.json',
    'test_file': 'spoken_test-v1.1.json',
    'batch_size': 16,  # Reduce to 8 if you get memory errors
    'num_epochs': 3,
    'learning_rate': 3e-5,
    'max_len': 384,
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Configuration set! Using device: {device}")



Configuration set! Using device: cuda


In [6]:
# ========================================
# CELL 4: Load Model & Data
# ========================================
print("Loading model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])
model = AutoModelForQuestionAnswering.from_pretrained(CONFIG['model_name'])
model.to(device)

print("\nCreating datasets...")
train_dataset = QADataset(CONFIG['train_file'], tokenizer, CONFIG['max_len'], mode='train')
test_dataset = QADataset(CONFIG['test_file'], tokenizer, CONFIG['max_len'], mode='test')

train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

print(f"\n✓ Ready to train!")
print(f"Training batches: {len(train_loader)}")
print(f"Test batches: {len(test_loader)}")


Loading model and tokenizer...


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Creating datasets...
Loading spoken_train-v1.1.json...
✓ Loaded 37111 samples
Loading spoken_test-v1.1.json...
✓ Loaded 5351 samples

✓ Ready to train!
Training batches: 2320
Test batches: 335


In [7]:
# ========================================
# CELL 5: Training Function
# ========================================
def train_model(model, train_loader, device, num_epochs, lr):
    optimizer = AdamW(model.parameters(), lr=lr)
    
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )
    
    print(f"Starting training for {num_epochs} epochs...")
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')
        for batch in progress_bar:
            optimizer.zero_grad()
            
            # Move to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_position'].to(device)
            end_positions = batch['end_position'].to(device)
            
            # Get token_type_ids only if available (for BERT)
            token_type_ids = batch.get('token_type_ids')
            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(device)
            
            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                start_positions=start_positions,
                end_positions=end_positions
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            # Backward pass
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            
            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
        
        avg_loss = total_loss / len(train_loader)
        print(f'✓ Epoch {epoch+1} complete - Average Loss: {avg_loss:.4f}')
    
    print("\n🎉 Training complete!")
    return model

print("✓ Training function defined")



✓ Training function defined


In [8]:
# ========================================
# CELL 6: Prediction Function
# ========================================
def predict(model, test_loader, tokenizer, device):
    model.eval()
    predictions = {}
    
    print("Making predictions...")
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Predicting'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            ids = batch['id']
            
            # Get token_type_ids only if available (for BERT)
            token_type_ids = batch.get('token_type_ids')
            if token_type_ids is not None:
                token_type_ids = token_type_ids.to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )
            
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits
            
            for i in range(len(ids)):
                # Get best start and end positions
                start_idx = torch.argmax(start_logits[i]).item()
                end_idx = torch.argmax(end_logits[i]).item()
                
                # Make sure end >= start
                if end_idx < start_idx:
                    end_idx = start_idx
                
                # Limit answer length
                if end_idx - start_idx > 30:
                    end_idx = start_idx + 30
                
                # Get answer tokens
                answer_tokens = input_ids[i][start_idx:end_idx+1]
                answer_text = tokenizer.decode(answer_tokens, skip_special_tokens=True)
                
                predictions[ids[i]] = answer_text.strip() if answer_text.strip() else "unknown"
    
    print(f"✓ Generated {len(predictions)} predictions")
    return predictions

print("✓ Prediction function defined")


✓ Prediction function defined


In [9]:
# ========================================
# CELL 7: RUN TRAINING
# ========================================
# This is the main training cell - run this to start training!
trained_model = train_model(
    model,
    train_loader,
    device,
    num_epochs=CONFIG['num_epochs'],
    lr=CONFIG['learning_rate']
)




Starting training for 3 epochs...


Epoch 1/3:   0%|          | 0/2320 [00:00<?, ?it/s]

✓ Epoch 1 complete - Average Loss: 2.1688


Epoch 2/3:   0%|          | 0/2320 [00:00<?, ?it/s]

✓ Epoch 2 complete - Average Loss: 1.1783


Epoch 3/3:   0%|          | 0/2320 [00:00<?, ?it/s]

✓ Epoch 3 complete - Average Loss: 0.8260

🎉 Training complete!


In [10]:
# ========================================
# CELL 8: MAKE PREDICTIONS
# ========================================
# Run this after training completes
predictions = predict(trained_model, test_loader, tokenizer, device)

# Save predictions
with open('predictions.json', 'w') as f:
    json.dump(predictions, f, indent=2)

print("\n✅ All done! predictions.json has been created.")
print(f"Total predictions: {len(predictions)}")

# Show a few examples
print("\nFirst 5 predictions:")
for i, (qid, answer) in enumerate(list(predictions.items())[:5]):
    print(f"{i+1}. {qid}: {answer}")

Making predictions...


Predicting:   0%|          | 0/335 [00:00<?, ?it/s]

✓ Generated 5351 predictions

✅ All done! predictions.json has been created.
Total predictions: 5351

First 5 predictions:
1. 56be4db0acb8001400a502ec: denver broncos
2. 56be4db0acb8001400a502ed: denver broncos
3. 56be4db0acb8001400a502ee: levis stadium
4. 56be4db0acb8001400a502ef: denver broncos
5. 56be4db0acb8001400a502f0: roman numerals
