In [1]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import json
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from torch.optim import AdamW
from tqdm import tqdm
from accelerate import Accelerator
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import f1_score
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
import os

# Locations handed to `from_pretrained` for the fine-tuned QA checkpoint and
# its matching tokenizer; kept as named constants so they are easy to change.
MODEL_NAME_OR_PATH = 'DLHW3/improved_model'
TOKENIZER_NAME_OR_PATH = 'DLHW3/modified_tokenizer'

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME_OR_PATH)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME_OR_PATH)

  torch.utils._pytree._register_pytree_node(


In [2]:
def load_and_encode_squad(file_path, tokenizer):
    """Load a SQuAD-format JSON file and tokenize it for extractive QA.

    Every answer of every question becomes one example, so a question with
    several reference answers is repeated once per answer. When a question
    carries a 'plausible_answers' key, those answers are used instead of
    'answers'.

    Args:
        file_path: Path to a JSON file laid out as
            data -> paragraphs -> qas -> answers.
        tokenizer: Callable invoked as ``tokenizer(contexts, questions, ...)``.
            Its result must offer ``char_to_token(batch_index, char_index)``,
            ``update`` and ``pop``; the tokenizer must expose
            ``model_max_length``.

    Returns:
        The tokenizer's encodings with 'start_positions' and 'end_positions'
        (token indices, one per example) added and 'token_type_ids' removed.
        A position that cannot be mapped to any token is stored as
        ``tokenizer.model_max_length``.
    """
    contexts, questions, answers = [], [], []

    # Load and parse the SQuAD data (JSON is UTF-8; do not rely on the locale)
    with open(file_path, 'r', encoding='utf-8') as file:
        squad_data = json.load(file)

    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                answer_type = 'plausible_answers' if 'plausible_answers' in qa else 'answers'

                for answer in qa[answer_type]:
                    contexts.append(context)
                    questions.append(question)
                    # Track answer text and its character positions
                    answers.append({
                        'text': answer['text'],
                        'start': answer['answer_start'],
                        'end': answer['answer_start'] + len(answer['text'])
                    })

    # Tokenize contexts and questions together.
    # NOTE(review): `stride` presumably only matters together with
    # return_overflowing_tokens=True, which is not requested here - confirm.
    encodings = tokenizer(contexts, questions, truncation=True, padding=True, stride=100)

    # Sentinel stored when a character position has no token (e.g. truncated)
    unmapped = tokenizer.model_max_length

    # Initialize lists for token start and end positions
    start_positions, end_positions = [], []

    # Calculate token positions for each answer
    for i, answer in enumerate(answers):
        # Compare against None explicitly: token index 0 is a valid position
        # and must not be mistaken for "not found".
        start_idx = encodings.char_to_token(i, answer['start'])

        end_char = max(0, answer['end'] - 1)
        end_idx = encodings.char_to_token(i, end_char)

        # Adjust end position if it has no token (trailing whitespace in the
        # answer, or the tail of the answer was truncated): step back one
        # character at a time, but never before the answer's own start, so the
        # search always terminates and cannot land ahead of the answer.
        while end_idx is None and end_char > answer['start']:
            end_char -= 1
            end_idx = encodings.char_to_token(i, end_char)

        start_positions.append(unmapped if start_idx is None else start_idx)
        end_positions.append(unmapped if end_idx is None else end_idx)

    # Add start and end positions to encodings and remove 'token_type_ids'
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    encodings.pop('token_type_ids', None)

    return encodings

# Build the encoded train and test splits (train first, then test); each call
# reads the JSON file, tokenizes it and attaches the answer token positions.
train_encodings, test_encodings = (
    load_and_encode_squad(split_path, tokenizer)
    for split_path in ('spokenSquad/train-v1.json', 'spokenSquad/test-v1.json')
)

# Dataset wrapper that serves the encodings one example at a time as tensors
class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over a column-oriented encodings mapping.

    `encodings` maps each field name (e.g. 'input_ids') to a sequence holding
    one entry per example; indexing returns that example's fields as tensors.
    """

    def __init__(self, encodings):
        # Stored as-is; tensor conversion happens lazily in __getitem__
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, i.e. the length of 'input_ids'."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return a dict mapping every field name to a tensor for example `idx`."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

# Wrap both encoded splits so a DataLoader can index them
train_dataset, test_dataset = (
    SquadDataset(split_encodings)
    for split_encodings in (train_encodings, test_encodings)
)


In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # evaluation mode

# Batches of the held-out split, in dataset order
val_loader = DataLoader(test_dataset, batch_size=16)

# Per-batch numpy chunks of gold and predicted token positions
true_starts, true_ends = [], []
pred_starts, pred_ends = [], []

loop = tqdm(val_loader, desc="Evaluating")
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in loop:
        # Only the model inputs have to live on the compute device
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )

        # Gold positions are only collected, so they are read straight from the batch
        true_starts.append(batch['start_positions'].numpy())
        true_ends.append(batch['end_positions'].numpy())

        # Predicted position = highest-scoring index along dim 1 of each logit tensor
        pred_starts.append(torch.argmax(outputs['start_logits'], dim=1).cpu().numpy())
        pred_ends.append(torch.argmax(outputs['end_logits'], dim=1).cpu().numpy())

# Stitch the per-batch chunks into one flat array per quantity
true_starts, true_ends, pred_starts, pred_ends = (
    np.concatenate(chunks)
    for chunks in (true_starts, true_ends, pred_starts, pred_ends)
)

# Function to calculate F1 score
def calculate_f1(true_labels, predicted_labels):
    """Index-level F1 between two equally shaped arrays of positions.

    An element counts as a true positive when prediction and label are equal.
    A mismatch counts as a false positive unless the prediction is -1, and as
    a false negative unless the label is -1. When neither array contains -1,
    precision and recall coincide and the score is (up to the epsilon terms)
    the fraction of exactly matching positions. This is not the token-overlap
    F1 of the official SQuAD evaluation.

    Args:
        true_labels: numpy array of reference positions.
        predicted_labels: numpy array of predicted positions.

    Returns:
        The harmonic mean of precision and recall; a 1e-9 term in every
        denominator guards against division by zero.
    """
    eps = 1e-9
    differs = predicted_labels != true_labels

    hits = np.sum(true_labels == predicted_labels)
    spurious = np.sum(differs & (predicted_labels != -1))
    missed = np.sum(differs & (true_labels != -1))

    precision = hits / (hits + spurious + eps)
    recall = hits / (hits + missed + eps)

    harmonic_denominator = precision + recall + eps
    return 2 * (precision * recall) / harmonic_denominator

# Score the start and end boundaries independently (start first, then end)
f1_start, f1_end = (
    calculate_f1(gold, guess)
    for gold, guess in ((true_starts, pred_starts), (true_ends, pred_ends))
)

# Report the plain mean of the two boundary scores
f1_average = (f1_start + f1_end) / 2

print(f"F1 score: {f1_average:.4f}")

Evaluating: 100%|██████████| 993/993 [06:08<00:00,  2.70it/s]

F1 score: 0.6932



