# BERT QA Model Fine-Tuning for DEFNLP

This notebook demonstrates the fine-tuning process for the BERT Question Answering model used in the DEFNLP pipeline to identify hidden-in-plain-sight data citations.

## Overview
- Load and prepare training data
- Create custom QA dataset
- Fine-tune BERT model for question answering
- Save the trained model

In [1]:
import zipfile
import os

zip_path = "/content/dataaa.zip"   # Absolute path to your zip file
extract_folder = "/content/extracted"  # folder where files will be extracted

if not os.path.exists(zip_path):
    print(f"Error: Zip file not found at {zip_path}. Please ensure it is uploaded.")
else:
    os.makedirs(extract_folder, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)

    print("Extraction completed!")

Extraction completed!


## 1. Import Required Libraries

In [2]:
import pandas as pd
import torch
import importlib
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator
)
from typing import List, Dict, Tuple
import config
import utils

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.9.0+cu126
CUDA available: True
CUDA device: Tesla T4


## 2. Define QA Dataset Class

This custom dataset class handles the tokenization and preparation of question-answer pairs for training.

In [3]:
class QADataset(Dataset):
    """Dataset for Question Answering fine-tuning."""

    def __init__(
        self,
        contexts: List[str],
        questions: List[str],
        answers: List[Dict],
        tokenizer,
        max_length: int = 512
    ):
        """
        Initialize QA dataset.

        Args:
            contexts: List of context texts
            questions: List of questions
            answers: List of answer dictionaries with 'text' and 'answer_start'
            tokenizer: Tokenizer to use
            max_length: Maximum sequence length
        """
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.contexts)

    def __getitem__(self, idx):
        context = self.contexts[idx]
        question = self.questions[idx]
        answer = self.answers[idx]

        # Tokenize with offsets mapping to align characters to tokens
        encoding = self.tokenizer(
            question,
            context,
            max_length=self.max_length,
            truncation="only_second", # Truncate context, not question
            stride=128,               # Handle overlap if using sliding window
            padding="max_length",
            return_offsets_mapping=True, # Crucial for finding answer position
            return_tensors="pt"
        )

        offset_mapping = encoding.pop("offset_mapping").squeeze().tolist()
        input_ids = encoding["input_ids"].squeeze().tolist()

        # Get char start/end
        start_char = answer['answer_start']
        end_char = start_char + len(answer['text'])

        # Find the token index that corresponds to the char index
        sequence_ids = encoding.sequence_ids()

        # Find the start and end of the context in the tokens
        idx = 0
        while sequence_ids[idx] != 1: # 1 indicates the context part
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If answer is not inside the context (due to truncation), label as 0 (CLS)
        if offset_mapping[context_start][0] > start_char or offset_mapping[context_end][1] < end_char:
            start_positions = 0
            end_positions = 0
        else:
            # Map start char to token
            idx = context_start
            while idx <= context_end and offset_mapping[idx][0] <= start_char:
                idx += 1
            start_positions = idx - 1

            # Map end char to token
            idx = context_end
            while idx >= context_start and offset_mapping[idx][1] >= end_char:
                idx -= 1
            end_positions = idx + 1

        encoding['start_positions'] = torch.tensor(start_positions)
        encoding['end_positions'] = torch.tensor(end_positions)

        # Clean up tensors
        return {k: v.squeeze(0) if v.dim() > 1 else v for k, v in encoding.items()}

print("QADataset class defined successfully!")



QADataset class defined successfully!


## 3. Load Training Data

Load the training CSV file containing publication IDs and dataset titles.

In [4]:
importlib.reload(config)
importlib.reload(utils)

<module 'utils' from '/content/utils.py'>

In [5]:
# Load training data
print("Loading training data...")
train_df = pd.read_csv(config.TRAIN_CSV)

print(f"Training data shape: {train_df.shape}")
print(f"\nFirst few rows:")
train_df.head()

Loading training data...
Training data shape: (19661, 5)

First few rows:


Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study


## 4. Initialize Model and Tokenizer

Load the pre-trained BERT model for question answering.

In [6]:
# Initialize model and tokenizer
model_name = config.QA_MODEL_NAME
print(f"Loading model: {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

print(f"Model loaded successfully!")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

Loading model: salti/bert-base-multilingual-cased-finetuned-squad


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/822 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

Model loaded successfully!
Model parameters: 177,264,386


## 5. Prepare Training Data

Convert the training DataFrame into contexts, questions, and answers for the QA model.

In [7]:
importlib.reload(config)
importlib.reload(utils)

<module 'utils' from '/content/utils.py'>

In [8]:
def prepare_training_data(train_df: pd.DataFrame) -> Tuple[List, List, List]:
    """
    Prepare training data from DataFrame.

    Args:
        train_df: Training DataFrame with text and labels

    Returns:
        Tuple of (contexts, questions, answers)
    """
    contexts = []
    questions = []
    answers = []

    # Load publication texts
    pub_texts = utils.load_json_publications(
        config.TRAIN_JSON_DIR,
        train_df['Id'].unique().tolist()
    )

    # Create training examples
    for idx, row in train_df.iterrows():
        pub_id = row['Id']
        dataset_title = row.get('dataset_title', '')

        if pub_id not in pub_texts or not dataset_title:
            continue

        context = pub_texts[pub_id]

        # Use multiple questions
        for question in config.QA_QUESTIONS:
            # Find answer in context
            answer_start = context.lower().find(dataset_title.lower())

            if answer_start != -1:
                contexts.append(context)
                questions.append(question)
                answers.append({
                    'text': dataset_title,
                    'answer_start': answer_start
                })

    print(f"Prepared {len(contexts)} training examples")
    return contexts, questions, answers

# Prepare the data
contexts, questions, answers = prepare_training_data(train_df)

# Show sample
print("\nSample training example:")
print(f"Question: {questions[0]}")
print(f"Answer: {answers[0]['text']}")
print(f"Context (first 200 chars): {contexts[0][:200]}...")

Prepared 5715 training examples

Sample training example:
Question: Which datasets are used?
Answer: National Education Longitudinal Study
Context (first 200 chars): Dropping out of high school is not necessarily the end of a student's formal education. Some students who drop out return a short time later to earn a diploma, some may pursue an alternative credentia...


In [9]:
!ls -F /content/

config.py   extracted/	  sample_data/	utils.py
dataaa.zip  __pycache__/  test.csv


In [10]:
!ls -F /content/extracted/

sample_submission.csv  test/  train/  train.csv


In [11]:
with open('/content/config.py', 'r') as f:
    config_content = f.read()
print(config_content)

"""
Configuration module for DEFNLP pipeline.
Contains all hyperparameters, file paths, and model settings.
"""

import os

# FILE PATHS
BASE_DIR = "/content/"
TRAIN_CSV = os.path.join(BASE_DIR, "extracted/train.csv")
TEST_CSV = os.path.join(BASE_DIR, "extracted/sample_submission.csv")
TRAIN_JSON_DIR = os.path.join(BASE_DIR, "extracted/train")
TEST_JSON_DIR = os.path.join(BASE_DIR, "extracted/test")
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
BIG_GOV_DATASETS = os.path.join(BASE_DIR, "big_gov_datasets.txt")

# MODEL CONFIGURATION
# BERT QA Model
QA_MODEL_NAME = "salti/bert-base-multilingual-cased-finetuned-squad"
QA_MAX_SEQ_LENGTH = 256
QA_MAX_ANSWER_LENGTH = 32
QA_BATCH_SIZE = 2
QA_LEARNING_RATE = 1e-5
QA_NUM_EPOCHS = 20
QA_DOC_STRIDE = 64
QA_WARMUP_STEPS = 0
QA_WEIGHT_DECAY = 0.0
QA_GRADIENT_ACCUMULATION = 4
QA_DROPOUT = 0.2
# SpaCy Model
SPACY_MODEL = "en_core_web_sm"

# PHASE I: DATA CLEANING & BASELINE
# Stopwords configuration
USE_STOPWORDS = True
CUSTOM_STOPWORDS = set()  # Ad

## 6. Create Dataset

Instantiate the QADataset with the prepared data.

In [12]:
# Create dataset
dataset = QADataset(
    contexts=contexts,
    questions=questions,
    answers=answers,
    tokenizer=tokenizer,
    max_length=config.QA_MAX_SEQ_LENGTH
)

print(f"Dataset created with {len(dataset)} examples")

# Test dataset
sample = dataset[0]
print(f"\nSample encoding keys: {sample.keys()}")
print(f"Input IDs shape: {sample['input_ids'].shape}")

Dataset created with 5715 examples

Sample encoding keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])
Input IDs shape: torch.Size([256])


## 7. Configure Training Arguments

Set up the training hyperparameters and output directory.

In [13]:
# Training configuration
output_dir = "./models/qa_model"
num_epochs = config.QA_NUM_EPOCHS
batch_size = config.QA_BATCH_SIZE
learning_rate = config.QA_LEARNING_RATE

print("="*60)
print("FINE-TUNING CONFIGURATION")
print("="*60)
print(f"Output directory: {output_dir}")
print(f"Number of epochs: {num_epochs}")
print(f"Batch size: {batch_size}")
print(f"Learning rate: {learning_rate}")
print(f"Max sequence length: {config.QA_MAX_SEQ_LENGTH}")
print("="*60)

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=2e-5,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    warmup_ratio=0.1,               # better than fixed warmup_steps
    logging_steps=50,
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    fp16=True,                      # if GPU supports — big speedup
)

print("\nTraining arguments configured!")

FINE-TUNING CONFIGURATION
Output directory: ./models/qa_model
Number of epochs: 20
Batch size: 2
Learning rate: 1e-05
Max sequence length: 256

Training arguments configured!


## 8. Create Trainer

Initialize the Hugging Face Trainer with the model, dataset, and training arguments.

In [14]:
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=default_data_collator,
)

print("Trainer initialized successfully!")

Trainer initialized successfully!


## 9. Fine-Tune the Model

Start the training process. This may take some time depending on your hardware and dataset size.

In [15]:
# Train the model
print("\nStarting training...")
print("This may take a while depending on your hardware.\n")

trainer.train()

print("\nTraining complete!")


Starting training...
This may take a while depending on your hardware.



  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mi211377[0m ([33mi211377-fast-nuces[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,2.1037
100,0.3699
150,0.2177
200,0.1698
250,0.1051
300,0.1128
350,0.0943
400,0.0488
450,0.0631
500,0.067



Training complete!


## 10. Save the Fine-Tuned Model

Save the trained model and tokenizer to disk for later use.

In [16]:
# Save model and tokenizer
print(f"Saving model to {output_dir}")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("\n" + "="*60)
print("MODEL SAVED SUCCESSFULLY!")
print("="*60)
print(f"Location: {output_dir}")
print("\nYou can now use this model in the DEFNLP pipeline.")

Saving model to ./models/qa_model

MODEL SAVED SUCCESSFULLY!
Location: ./models/qa_model

You can now use this model in the DEFNLP pipeline.


## 11. Test the Fine-Tuned Model (Optional)

Quick test to verify the model works correctly.

In [17]:
# Test the model
from transformers import pipeline

# Create QA pipeline
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer
)

# Test with a sample
test_context = contexts[0]
test_question = "What dataset is mentioned in this publication?"

result = qa_pipeline(
    question=test_question,
    context=test_context
)

print("Test Prediction:")
print(f"Question: {test_question}")
print(f"Answer: {result['answer']}")
print(f"Confidence: {result['score']:.4f}")

Device set to use cuda:0


Test Prediction:
Question: What dataset is mentioned in this publication?
Answer: Education Longitudinal Study
Confidence: 0.8803


## Summary

This notebook demonstrated the complete fine-tuning process for the BERT QA model:

1. ✅ Loaded and prepared training data
2. ✅ Created custom QA dataset class
3. ✅ Initialized pre-trained BERT model
4. ✅ Configured training parameters
5. ✅ Fine-tuned the model
6. ✅ Saved the trained model
7. ✅ Tested the model

The fine-tuned model is now ready to be used in the DEFNLP pipeline for identifying hidden-in-plain-sight data citations in scientific publications.

In [18]:
import re
import string
from collections import Counter

def normalize_text(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)
    def white_space_fix(text):
        return ' '.join(text.split())
    def remove_punc(text):
        return text.translate(str.maketrans('', '', string.punctuation))
    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, ground_truth):
    return int(normalize_text(prediction) == normalize_text(ground_truth))

def compute_f1_score(prediction, ground_truth):
    prediction_tokens = normalize_text(prediction).split()
    ground_truth_tokens = normalize_text(ground_truth).split()

    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0

    precision = num_same / len(prediction_tokens)
    recall = num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

print("Evaluation utility functions defined.")

Evaluation utility functions defined.


In [None]:
# Ensure the qa_pipeline from cell 'bE97mDXPTNcP' is available
# If this cell is run independently, you might need to re-create the pipeline:
# from transformers import pipeline
# qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

em_scores = []
f1_scores = []

# Using the 'contexts', 'questions', and 'answers' lists already prepared from the training data
num_samples = len(contexts)

print(f"\nEvaluating model on {num_samples} prepared training examples...")

for i in range(num_samples):
    context = contexts[i]
    question = questions[i]
    ground_truth = answers[i]['text']

    # Get prediction from the pipeline
    # The qa_pipeline function expects 'context' and 'question' as keyword arguments.
    # It returns a dictionary with 'answer', 'start', 'end', and 'score'.
    prediction_result = qa_pipeline(question=question, context=context)
    pred_text = prediction_result['answer']

    em_scores.append(compute_exact_match(pred_text, ground_truth))
    f1_scores.append(compute_f1_score(pred_text, ground_truth))

avg_em = sum(em_scores) / num_samples
avg_f1 = sum(f1_scores) / num_samples

print(f"\nEvaluation Results (on training data subset):")
print(f"  Average Exact Match (EM): {avg_em:.4f}")
print(f"  Average F1 Score: {avg_f1:.4f}")


Evaluating model on 5715 prepared training examples...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
import shutil
from google.colab import files

# 1. name_of_zip: What you want the downloaded file to be called
# 2. folder_to_download: The path to the folder you want (e.g., 'extracted' or 'output')
folder_name = 'models'  # Change this to your folder name

# Create the zip file
shutil.make_archive(folder_name, 'zip', folder_name)

# Download it
files.download(f'{folder_name}.zip')