In [6]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from torch.optim import AdamW
from tqdm import tqdm
from accelerate import Accelerator
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import f1_score
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler

In [7]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("google-bert/bert-large-uncased-whole-word-masking-finetuned-squad")

Some weights of the model checkpoint at google-bert/bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
def load_and_encode_squad(file_path, tokenizer):
    contexts, questions, answers = [], [], []
    
    # Load and parse the SQuAD data
    with open(file_path, 'r') as file:
        squad_data = json.load(file)
    
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                answer_type = 'plausible_answers' if 'plausible_answers' in qa else 'answers'
                
                for answer in qa[answer_type]:
                    contexts.append(context)
                    questions.append(question)
                    # Track answer text and its character positions
                    answers.append({
                        'text': answer['text'],
                        'start': answer['answer_start'],
                        'end': answer['answer_start'] + len(answer['text'])
                    })

    # Tokenize contexts and questions together
    encodings = tokenizer(contexts, questions, truncation=True, padding=True, stride=100)
    
    # Initialize lists for token start and end positions
    start_positions, end_positions = [], []

    # Calculate token positions for each answer
    for i, answer in enumerate(answers):
        start_idx = encodings.char_to_token(i, answer['start']) or tokenizer.model_max_length
        end_idx = encodings.char_to_token(i, max(0, answer['end'] - 1)) or tokenizer.model_max_length
        
        # Adjust end position if out of range
        shift = 1
        while end_idx is None:
            end_idx = encodings.char_to_token(i, max(0, answer['end'] - 1 - shift))
            shift += 1
        
        start_positions.append(start_idx)
        end_positions.append(end_idx)

    # Add start and end positions to encodings and remove 'token_type_ids'
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    encodings.pop('token_type_ids', None)
    
    return encodings

# Load, encode, and add positions in one step
train_encodings = load_and_encode_squad('spokenSquad/train-v1.json', tokenizer)
test_encodings = load_and_encode_squad('spokenSquad/test-v1.json', tokenizer)

# Dataset class remains the same
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = SquadDataset(train_encodings)
test_dataset = SquadDataset(test_encodings)


In [9]:
#Sneak a peak
example = train_dataset[0]
print(example)

# Decode 
input_ids = example['input_ids']
text_decoded = tokenizer.decode(input_ids, skip_special_tokens=True)
print("Decoded context and question:", text_decoded)
print("Start position:", example['start_positions'].item())
print("End position:", example['end_positions'].item())

{'input_ids': tensor([  101,  6549,  2135,  1996,  2082,  2038,  1037,  3234,  2839,  1012,
        10234,  1996,  2364,  2311,  2082,  8514,  2003,  1996,  3585,  6231,
         1997,  1996,  6261,  2984,  1012,  3202,  1999,  2392,  1997,  1996,
         2364,  2311,  1999,  5307,  2009,  2003,  1037,  6967,  6231,  1997,
         4828,  2007,  2608, 10439, 14995,  6924,  2007,  1996,  5722,  1998,
         1996,  2919,  2033,  5004,  3415,  1012,  2279,  2000,  1996,  2364,
         2311,  2003,  1996, 13546,  1997,  1996,  6730,  2540,  1012,  3202,
         2369,  1996, 13546,  2003,  1996, 24665, 23052, 10047,  2984,  1999,
         2173,  1997,  7083,  1998,  9185,  1012,  2009,  2003,  1037, 15059,
         1997,  1996, 24665, 23052,  2012, 10223, 26371,  2605,  2073,  1996,
         6261,  2984, 22353,  2135,  2596,  2000,  2358, 16595,  9648,  4674,
         2145,  5255,  7763,  5595,  2809,  1012,  2012,  1996,  2203,  1997,
         1996,  2364,  3298,  1998,  1999,  1037, 

In [12]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Set device and enable mixed precision
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define optimizer and gradient scaler for mixed precision
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

# Set up data loader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
warmup_steps = int(0.1 * num_training_steps)

# Scheduler for learning rate warm-up
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)

# Gradient accumulation settings
accumulation_steps = 2  # Accumulate gradients for 2 batches

for epoch in range(num_epochs):
    model.train()
    loop = tqdm(train_loader, leave=True, desc=f'Epoch {epoch + 1}/{num_epochs}')
    
    for step, batch in enumerate(loop):
        # Move batch to device with asynchronous loading
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        start_positions = batch['start_positions'].to(device, non_blocking=True)
        end_positions = batch['end_positions'].to(device, non_blocking=True)
        
        # Mixed precision forward pass with gradient accumulation
        with autocast():
            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss / accumulation_steps  # Scale loss for gradient accumulation
            
        # Backward pass
        scaler.scale(loss).backward()

        # Update weights and zero gradients every 'accumulation_steps' batches
        if (step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

        # Update progress bar with current loss
        loop.set_postfix(loss=loss.item())

print("Training completed.")


Epoch 1/3: 100%|██████████| 1160/1160 [09:06<00:00,  2.12it/s, loss=0.401]
Epoch 2/3: 100%|██████████| 1160/1160 [09:05<00:00,  2.13it/s, loss=0.356] 
Epoch 3/3: 100%|██████████| 1160/1160 [09:05<00:00,  2.13it/s, loss=0.0981]

Training completed.





In [13]:
# Switch model to evaluation mode
model.eval()

# Initialize the validation data loader
val_loader = DataLoader(test_dataset, batch_size=16)

# Create arrays to hold true and predicted values
true_starts = []
true_ends = []
pred_starts = []
pred_ends = []

# Progress bar for the validation loop
loop = tqdm(val_loader, desc="Evaluating")
for batch in loop:
    with torch.no_grad():  # Disable gradient calculation
        # Move batch data to the appropriate device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Model predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        # Store true and predicted values
        true_starts.append(start_true.cpu().numpy())
        true_ends.append(end_true.cpu().numpy())
        pred_starts.append(start_pred.cpu().numpy())
        pred_ends.append(end_pred.cpu().numpy())

# Convert lists to numpy arrays
true_starts = np.concatenate(true_starts)
true_ends = np.concatenate(true_ends)
pred_starts = np.concatenate(pred_starts)
pred_ends = np.concatenate(pred_ends)

# Function to calculate F1 score
def calculate_f1(true_labels, predicted_labels):
    true_pos = np.sum(true_labels == predicted_labels)
    false_pos = np.sum((predicted_labels != true_labels) & (predicted_labels != -1))
    false_neg = np.sum((true_labels != predicted_labels) & (true_labels != -1))

    precision = true_pos / (true_pos + false_pos + 1e-9)
    recall = true_pos / (true_pos + false_neg + 1e-9)
    
    return 2 * (precision * recall) / (precision + recall + 1e-9)

# Calculate F1 scores for start and end predictions
f1_start = calculate_f1(true_starts, pred_starts)
f1_end = calculate_f1(true_ends, pred_ends)

# Average F1 score
f1_average = (f1_start + f1_end) / 2

print(f"F1 score: {f1_average:.4f}")

Evaluating: 100%|██████████| 993/993 [06:08<00:00,  2.69it/s]

F1 score: 0.6932





In [14]:
# Save the modified model
model.save_pretrained( "DLHW3/improved_model" )

# If you want to save the tokenizer as well
tokenizer.save_pretrained("DLHW3/modified_tokenizer")

('DLHW3/modified_tokenizer/tokenizer_config.json',
 'DLHW3/modified_tokenizer/special_tokens_map.json',
 'DLHW3/modified_tokenizer/vocab.txt',
 'DLHW3/modified_tokenizer/added_tokens.json',
 'DLHW3/modified_tokenizer/tokenizer.json')