# Financial Question Answering With FinBERT

# Importing Libraries

Imports the libraries used for data handling (Pandas, NumPy), NLP (NLTK with the `punkt` tokenizer data), and model training (PyTorch, Hugging Face Transformers). Nothing is installed in this cell; `rank_bm25` is imported later, in the answer re-ranking section.

In [None]:
# Standard Libraries
import os
import json
import pickle
import random
from collections import Counter
from statistics import mean

# Data Handling
import pandas as pd
import numpy as np

# Progress Bar
from tqdm import tqdm

# NLP & Transformers
import nltk
from nltk.tokenize import word_tokenize
from torch.optim import AdamW  # <- use this instead of transformers.AdamW
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    BertConfig,
    get_linear_schedule_with_warmup
)


# PyTorch
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import softmax

# Download NLTK tokenizer model ('punkt' data package for the NLTK tokenizers
# imported above). The call prints its progress and returns True on success.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from google.colab import drive
import os

# Make the Google Drive contents visible under /content/drive
drive.mount('/content/drive')

# Drive folder that holds the FiQA TSV files
data_dir = '/content/drive/MyDrive/FIQA/Data'

# Full paths of the answers, questions and question->document mapping files
file_answers, file_questions, file_qid_docid = (
    os.path.join(data_dir, tsv_name)
    for tsv_name in (
        'FiQA_train_doc_final.tsv',
        'FiQA_train_question_final.tsv',
        'FiQA_train_question_doc_final.tsv',
    )
)

Mounted at /content/drive


In [None]:
# Load files
#
# The FiQA TSVs start with a header row and carry extra columns (a leading
# unnamed index column and, for questions/documents, a timestamp). Reading them
# with only `names=[...]` made pandas treat the header line as a data row and
# shift the columns: the question text ended up under 'qid' and the timestamp
# under 'question' (visible in the head() output further down), and every row
# count was one too high. Parse the header explicitly and select the wanted
# columns by name instead.
# NOTE(review): the header names 'docid'/'doc' are assumed from the FiQA file
# layout; only 'qid'/'question'/'timestamp' are visible in this notebook's output.
def _read_fiqa_tsv(path, columns):
    """Read a tab-separated FiQA file and return only `columns`, in that order.

    Args:
        path (str): Path of the TSV file (first line is the header).
        columns (list[str]): Header names of the columns to keep.

    Returns:
        pd.DataFrame: Frame with exactly `columns`; raises ValueError (from
        pandas) if one of them is missing from the header.
    """
    frame = pd.read_csv(path, sep='\t', header=0, usecols=columns)
    # usecols does not guarantee column order, so reorder explicitly.
    return frame[columns]


documents_df = _read_fiqa_tsv(file_answers, ['docid', 'doc'])
questions_df = _read_fiqa_tsv(file_questions, ['qid', 'question'])
qid_docid_df = _read_fiqa_tsv(file_qid_docid, ['qid', 'docid'])

In [None]:
# --- Headline dataset statistics ---------------------------------------------

# Distinct question ids and distinct answer-passage ids
num_questions = questions_df['qid'].nunique()
num_documents = documents_df['docid'].nunique()

# One row of the mapping table == one labelled question–document pair
num_pairs = len(qid_docid_df)

# Relevant vs. irrelevant pairs: when the mapping table has a 'label' column,
# count its 1s and 0s; otherwise every listed pair is treated as relevant.
has_label_column = 'label' in qid_docid_df.columns
num_relevant = (qid_docid_df['label'] == 1).sum() if has_label_column else num_pairs
num_irrelevant = (qid_docid_df['label'] == 0).sum() if has_label_column else 0

# --- Summary -------------------------------------------------------------------
print("Dataset Statistics")
print(f"  - Unique Questions             : {num_questions}")
print(f"  - Unique Documents             : {num_documents}")
print(f"  - Total Question–Doc Pairs     : {num_pairs}")



Dataset Statistics
  - Unique Questions             : 6649
  - Unique Documents             : 57599
  - Total Question–Doc Pairs     : 17111


# Loading the Dataset

Loading the pre-built train, validation, and test splits from pickle files and reporting the size of each split.

In [None]:
# Load the sets
import pickle

def load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    handle = open(path, 'rb')
    try:
        return pickle.load(handle)
    finally:
        # Always release the file handle, even if unpickling fails.
        handle.close()

# Read the three pre-built splits from Drive
train_set, valid_set, test_set = (
    load_pickle(split_path)
    for split_path in (
        '/content/drive/MyDrive/FIQA/Data/train_set_50.pickle',
        '/content/drive/MyDrive/FIQA/Data/valid_set_50.pickle',
        '/content/drive/MyDrive/FIQA/Data/test_set_50.pickle',
    )
)

# Number of entries per split and overall
n_train, n_valid, n_test = len(train_set), len(valid_set), len(test_set)
total = n_train + n_valid + n_test

# Report each split's size and its share of the total
print("Dataset Split Statistics:")
print(f"  - Train : {n_train} samples ({n_train / total * 100:.2f}%)")
print(f"  - Valid : {n_valid} samples ({n_valid / total * 100:.2f}%)")
print(f"  - Test  : {n_test} samples ({n_test / total * 100:.2f}%)")
print(f"  - Total : {total} samples")


Dataset Split Statistics:
  - Train : 5676 samples (85.48%)
  - Valid : 631 samples (9.50%)
  - Test  : 333 samples (5.02%)
  - Total : 6640 samples


In [None]:
# Row counts of the three raw frames
for count_template, counted_frame in (
    ("Number of answers (documents): {}", documents_df),
    ("Number of questions: {}", questions_df),
    ("Number of QA pairs: {}", qid_docid_df),
):
    print(count_template.format(len(counted_frame)))

Number of answers (documents): 57639
Number of questions: 6649
Number of QA pairs: 17111


In [None]:
# Identify empty documents (missing or whitespace-only text).
# The check is done inline: the get_empty_docs helper is only defined in a
# later cell, so calling it here raises NameError on a fresh kernel
# (Restart Kernel -> Run All). The logic mirrors that helper.
stripped_docs = documents_df['doc'].str.strip()
empty_mask = stripped_docs.isna() | (stripped_docs == '')
empty_ids = documents_df.index[empty_mask.to_numpy()].tolist()   # row labels
empty_docs = documents_df.loc[empty_ids, 'docid'].tolist()       # doc ids

# Remove empty answer rows from the document set (drop returns a new frame,
# so documents_df itself is left untouched)
documents_cleaned_df = documents_df.drop(empty_ids)

# Remove QA pairs that point to empty documents
qid_docid_df = qid_docid_df[~qid_docid_df['docid'].isin(empty_docs)]

# Lowercase document text only
documents_cleaned_df['doc'] = documents_cleaned_df['doc'].str.lower()

# Lowercase question text only (re-running this cell is harmless: lowercasing
# already-lowercased text changes nothing)
questions_df['question'] = questions_df['question'].str.lower()

# Print stats after cleaning
print("Number of answers after cleaning: {}".format(len(documents_cleaned_df)))
print("Number of QA pairs after cleaning: {}".format(len(qid_docid_df)))


Number of answers after cleaning: 57639
Number of QA pairs after cleaning: 17111


In [None]:
def get_empty_docs(df):
    """
    Find the documents whose text is missing or consists only of whitespace.

    Args:
        df (pd.DataFrame): DataFrame with columns ['docid', 'doc']

    Returns:
        empty_docids (list): 'docid' values of the empty documents
        empty_indices (list): index labels of the rows holding them
    """
    stripped = df['doc'].str.strip()
    # A row is empty when stripping yields NaN (missing text) or ''.
    is_empty = stripped.isna() | stripped.eq('')
    empty_indices = df.index[is_empty.to_numpy()].tolist()
    empty_docids = df.loc[empty_indices, 'docid'].tolist()
    return empty_docids, empty_indices

In [None]:
import re

def simple_tokenize(text):
    """
    Split `text` into lowercase word tokens.

    The text is lowercased first; every maximal run of word characters
    delimited by word boundaries becomes one token.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

In [None]:
def process_questions(df):
    """
    Return a copy of `df` with three derived columns for the 'question' text:
    'q_processed' (lowercased), 'tokenized_q' (simple_tokenize output) and
    'q_len' (number of tokens). The input frame is not modified.
    """
    lowered = df['question'].str.lower()
    tokens = lowered.apply(simple_tokenize)
    return df.assign(
        q_processed=lowered,
        tokenized_q=tokens,
        q_len=tokens.apply(len),
    )

def process_answers(df):
    """
    Return a copy of `df` with three derived columns for the 'doc' text:
    'doc_processed' (lowercased), 'tokenized_ans' (simple_tokenize output) and
    'ans_len' (number of tokens). The input frame is not modified.
    """
    lowered = df['doc'].str.lower()
    tokens = lowered.apply(simple_tokenize)
    return df.assign(
        doc_processed=lowered,
        tokenized_ans=tokens,
        ans_len=tokens.apply(len),
    )


In [None]:
# Add the lowercased / tokenized / token-count columns to the question frame
# and to the cleaned document frame (each call returns a new frame).
processed_questions = process_questions(questions_df)
processed_answers = process_answers(documents_cleaned_df)

In [None]:
# Preview of the processed frames
print("Processed and tokenized questions")
print(processed_questions.head())

print("\n\nProcessed and tokenized answers")
print(processed_answers.head())

# Mean token counts (rounded for display)
avg_q_count = processed_questions['q_len'].mean()
avg_ans_count = processed_answers['ans_len'].mean()

print("\nAverage question length:", round(avg_q_count))
print("Average answer length:", round(avg_ans_count))

# How many answers exceed 512 word tokens
exceeds_512 = processed_answers['ans_len'] > 512
print("Total answers:", len(processed_answers))
print("Number of answers with length > 512:", int(exceeds_512.sum()))

Processed and tokenized questions
                                                          qid  \
NaN    qid                                           question   
0.0    0    What is considered a business expense on a bus...   
1.0    1    Claiming business expenses for a business with...   
2.0    2    Transferring money from One business checking ...   
3.0    3    Having a separate bank account for business/in...   

                       question          q_processed  \
NaN    qid            timestamp            timestamp   
0.0    0     nov 8 '11 at 15:14   nov 8 '11 at 15:14   
1.0    1    may 13 '14 at 13:17  may 13 '14 at 13:17   
2.0    2    jan 20 '16 at 20:31  jan 20 '16 at 20:31   
3.0    3          mar 1 at 0:24        mar 1 at 0:24   

                          tokenized_q  q_len  
NaN    qid                [timestamp]      1  
0.0    0     [nov, 8, 11, at, 15, 14]      6  
1.0    1    [may, 13, 14, at, 13, 17]      6  
2.0    2    [jan, 20, 16, at, 20, 31]      6  
3.0

# Loading Model

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer used by the re-ranker input demo below. This cell loads
# "bert-base-uncased"; another checkpoint name (e.g. 'yiyanghkust/finbert-qa',
# 'ProsusAI/finbert' or 'roberta-base') can be substituted here.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def prepare_reranker_input(question: str, answer: str, tokenizer, max_length=512):
    """
    Encode a question-answer pair as `[CLS] question [SEP] answer [SEP]`,
    truncated and padded to exactly `max_length` token ids.

    Truncation removes one token at a time from the end of whichever side is
    currently longer (the answer when it is strictly longer, otherwise the
    question) until the pair fits next to the three special tokens.

    Args:
        question (str): Question text.
        answer (str): Candidate answer text.
        tokenizer: Object providing `tokenize`, `convert_tokens_to_ids`,
            `cls_token`, `sep_token` and `pad_token_id`.
        max_length (int): Total length of the returned sequences; must be at
            least 3 so the special tokens fit.

    Returns:
        dict: {"input_ids": list[int], "attention_mask": list[int]}, both of
        length `max_length`. No token_type_ids are produced.

    Raises:
        ValueError: If `max_length` is smaller than 3. (Previously this case
            never terminated: with a negative budget the truncation loop kept
            "shortening" two already-empty token lists.)
    """
    # [CLS], [SEP], [SEP] always take three positions.
    n_special_tokens = 3
    if max_length < n_special_tokens:
        raise ValueError(
            f"max_length must be at least {n_special_tokens} to hold the special tokens, got {max_length}"
        )

    # Tokenize question and answer separately
    q_tokens = tokenizer.tokenize(question)
    a_tokens = tokenizer.tokenize(answer)

    max_input_tokens = max_length - n_special_tokens

    # Work out the final lengths first and slice once afterwards; the original
    # copied a whole list for every removed token.
    q_len, a_len = len(q_tokens), len(a_tokens)
    while q_len + a_len > max_input_tokens:
        if a_len > q_len:
            a_len -= 1
        else:
            q_len -= 1
    q_tokens = q_tokens[:q_len]
    a_tokens = a_tokens[:a_len]

    # Assemble the sequence with its special tokens
    tokens = [tokenizer.cls_token] + q_tokens + [tokenizer.sep_token] + a_tokens + [tokenizer.sep_token]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1] * len(input_ids)

    # Right-pad up to max_length; padded positions get attention 0
    padding_length = max_length - len(input_ids)
    input_ids += [tokenizer.pad_token_id] * padding_length
    attention_mask += [0] * padding_length

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Example pair for a quick check of the encoder
question = "What is considered a business expense on taxes?"
answer = "Business expenses typically include costs for operations, employee salaries, office supplies, utilities, travel related to work, etc."

encoded = prepare_reranker_input(question, answer, tokenizer)

# Show the first 20 positions of each returned sequence
for shown_caption, shown_key in (("Input IDs:", "input_ids"), ("Attention Mask:", "attention_mask")):
    print(shown_caption, encoded[shown_key][:20])


Input IDs: [101, 2054, 2003, 2641, 1037, 2449, 10961, 2006, 7773, 1029, 102, 2449, 11727, 4050, 2421, 5366, 2005, 3136, 1010, 7904]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
from transformers import AutoTokenizer

# Map the 20 ids printed by the previous cell back to their tokens
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sample_ids = [
    101, 2054, 2003, 2641, 1037, 2449, 10961, 2006, 7773, 1029,
    102, 2449, 11727, 4050, 2421, 5366, 2005, 3136, 1010, 7904,
]
decoded = tokenizer.convert_ids_to_tokens(sample_ids)
print(decoded)


['[CLS]', 'what', 'is', 'considered', 'a', 'business', 'expense', 'on', 'taxes', '?', '[SEP]', 'business', 'expenses', 'typically', 'include', 'costs', 'for', 'operations', ',', 'employee']


# For a FinBERT Model

# Configuration

In [None]:
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Configuration
# 'bert_model_name' is the Hugging Face checkpoint used for BOTH the tokenizer
# (loaded at the end of this cell) and the model (loaded in a later cell).
# It previously read 'finbert-domain' while the tokenizer was loaded from a
# hard-coded "ProsusAI/finbert"; keeping one value avoids the two drifting apart.
config = {
    'bert_model_name': 'ProsusAI/finbert',
    'max_seq_len': 512,        # token length every QA pair is padded/truncated to
    'batch_size': 16,
    'learning_rate': 3e-6,
    'weight_decay': 0.01,
    'n_epochs': 2,
    'num_warmup_steps': 10000  # scheduler warm-up steps
}

def load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    NOTE: unpickling can execute arbitrary code; only load files you created
    yourself or otherwise trust.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)

# Load ID-to-text dictionaries
docid_to_text = load_pickle('/content/drive/MyDrive/FIQA/Data/docid_to_text.pickle')
qid_to_text = load_pickle('/content/drive/MyDrive/FIQA/Data/qid_to_text.pickle')

# Load train, validation, and test datasets
train_set = load_pickle('/content/drive/MyDrive/FIQA/Data/train_set_50.pickle')
valid_set = load_pickle('/content/drive/MyDrive/FIQA/Data/valid_set_50.pickle')
test_set = load_pickle('/content/drive/MyDrive/FIQA/Data/test_set_50.pickle')

# Load ground-truth labels for evaluation
labels = load_pickle('/content/drive/MyDrive/FIQA/Data/labels.pickle')

# Display data info
print(f" Number of training questions: {len(train_set)}")
print(f" Number of validation questions: {len(valid_set)}")
print(f" Number of test questions: {len(test_set)}")

# Load FinBERT tokenizer (same checkpoint name as the model will use)
print("\nLoading FinBERT tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(config['bert_model_name'])

# (Later in your pipeline, load model like this)
# model = AutoModelForSequenceClassification.from_pretrained(config['bert_model_name'], num_labels=2)


 Number of training questions: 5676
 Number of validation questions: 631
 Number of test questions: 333

Loading FinBERT tokenizer...


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
from tqdm import tqdm

def get_input_data(dataset, max_seq_len):
    """
    Encode every (question, candidate answer) pair of `dataset`.

    Relies on module-level names:
        - `tokenizer`: HuggingFace tokenizer
        - `qid_to_text` / `docid_to_text`: id -> text dictionaries

    Args:
        dataset: List of [qid, [positive_doc_ids], [candidate_doc_ids]]
        max_seq_len: Length every encoded pair is padded/truncated to

    Returns:
        input_ids, token_type_ids, att_masks, labels — four parallel lists with
        one entry per pair; a label is 1 when the candidate is among the
        question's positive doc ids and 0 otherwise.
    """
    all_input_ids, all_type_ids, all_masks, all_labels = [], [], [], []

    for qid, pos_doc_ids, candidate_doc_ids in tqdm(dataset, desc="Tokenizing QA pairs"):
        question_text = qid_to_text[qid]

        for docid in candidate_doc_ids:
            # Question goes in as the first segment, candidate answer as the second.
            encoding = tokenizer.encode_plus(
                question_text,
                docid_to_text[docid],
                max_length=max_seq_len,
                padding='max_length',
                truncation=True,
                return_token_type_ids=True,
                return_attention_mask=True
            )

            all_input_ids.append(encoding['input_ids'])
            all_type_ids.append(encoding['token_type_ids'])
            all_masks.append(encoding['attention_mask'])
            all_labels.append(int(docid in pos_doc_ids))

    return all_input_ids, all_type_ids, all_masks, all_labels

# DataLoaders

In [None]:
def get_dataloader(dataset, split_type, max_seq_len, batch_size):
    """
    Build a DataLoader over the encoded QA pairs of `dataset`.

    Args:
        dataset: List in format [qid, [positive_doc_ids], [candidate_doc_ids]]
        split_type: 'train' shuffles the samples; any other value keeps them in order
        max_seq_len: Maximum sequence length
        batch_size: Batch size for loading

    Returns:
        dataloader: PyTorch DataLoader yielding
            (input_ids, token_type_ids, attention_mask, labels) batches
    """
    # Encoding uses the module-level tokenizer, qid_to_text and docid_to_text.
    encoded_columns = get_input_data(dataset, max_seq_len)

    # One long tensor per returned list, wrapped into a single dataset
    column_tensors = [torch.tensor(column, dtype=torch.long) for column in encoded_columns]
    data = TensorDataset(*column_tensors)

    # Shuffle only the training split
    if split_type == "train":
        sampler = RandomSampler(data)
    else:
        sampler = SequentialSampler(data)

    return DataLoader(data, sampler=sampler, batch_size=batch_size)

In [None]:
from tqdm import tqdm
from transformers import logging
# Only let transformers log errors while the pairs are being tokenized
logging.set_verbosity_error()

# Training batches (shuffled by get_dataloader because of the 'train' split type)
train_dataloader = get_dataloader(
    dataset=train_set,
    split_type='train',
    max_seq_len=config['max_seq_len'],
    batch_size=config['batch_size'],
)

# Validation batches (kept in order)
validation_dataloader = get_dataloader(
    dataset=valid_set,
    split_type='validation',
    max_seq_len=config['max_seq_len'],
    batch_size=config['batch_size'],
)

# Number of batches per loader
print(f"\n Size of training DataLoader: {len(train_dataloader)} batches")
print(f" Size of validation DataLoader: {len(validation_dataloader)} batches")

Tokenizing QA pairs: 100%|██████████| 5676/5676 [06:02<00:00, 15.64it/s]
Tokenizing QA pairs: 100%|██████████| 631/631 [00:35<00:00, 17.65it/s]



 Size of training DataLoader: 17738 batches
 Size of validation DataLoader: 1972 batches


# Model Implementation

In [None]:
from transformers import AutoModelForSequenceClassification

import torch

# Hyper-parameters and checkpoint name for the FinBERT run
config = {
    'bert_model_name': 'ProsusAI/finbert',
    'max_seq_len': 512,
    'batch_size': 16,
    'learning_rate': 3e-6,
    'weight_decay': 0.01,
    'n_epochs': 2,
    'num_warmup_steps': 10000
}

# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# FinBERT with a fresh 2-way classification head; ignore_mismatched_sizes lets
# the 3-way head stored in the checkpoint be replaced without a shape error.
model = AutoModelForSequenceClassification.from_pretrained(
    config['bert_model_name'],
    num_labels=2,
    ignore_mismatched_sizes=True,
)
model.to(device)

print("Loaded ProsusAI/finbert model for binary classification.")


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded ProsusAI/finbert model for binary classification.


# Accuracy, Training, and Validation

In [None]:
def get_accuracy(preds, labels):
    """
    Fraction of rows whose highest-scoring class equals the true label.

    Args:
        preds (np.ndarray): Per-class scores of shape (batch_size, num_classes).
        labels (np.ndarray): Ground-truth class ids; flattened before comparison.

    Returns:
        float: Accuracy in [0, 1].
    """
    is_correct = np.argmax(preds, axis=1) == labels.flatten()
    return np.mean(is_correct)

In [None]:
from tqdm import tqdm
import torch
import numpy as np

def train(model, train_dataloader, optimizer, scheduler, device):
    """
    Trains the model for one epoch and returns average loss and accuracy.

    Both metrics are averaged per SAMPLE: each batch's mean loss / accuracy is
    weighted by the batch size. The previous plain mean over batches gave a
    smaller final batch the same weight as a full one.

    Args:
        model (torch.nn.Module): The model to train
        train_dataloader (DataLoader): Dataloader yielding
            (input_ids, token_type_ids, attention_mask, labels) batches
        optimizer (torch.optim.Optimizer): Optimizer
        scheduler (torch.optim.lr_scheduler): Learning rate scheduler, stepped once per batch
        device (torch.device): Device to train on (CPU or GPU)

    Returns:
        avg_loss (float): Average training loss per sample
        avg_acc (float): Average training accuracy per sample

    Raises:
        ZeroDivisionError: If the dataloader yields no batches.
    """
    model.train()
    weighted_loss = 0.0
    weighted_accuracy = 0.0
    num_samples = 0

    for batch in tqdm(train_dataloader, desc="Training"):
        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        # Reset gradients left over from the previous step
        model.zero_grad()

        # Forward pass; passing labels makes the model return the loss as well
        outputs = model(
            input_ids=b_input_ids,
            token_type_ids=b_token_type_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )

        loss = outputs.loss
        logits = outputs.logits

        # Backward pass
        loss.backward()

        # Clip the gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Parameter update, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

        # Accumulate metrics weighted by the number of samples in this batch
        batch_size = b_labels.size(0)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        weighted_loss += loss.item() * batch_size
        weighted_accuracy += get_accuracy(logits, label_ids) * batch_size
        num_samples += batch_size

    avg_loss = weighted_loss / num_samples
    avg_acc = weighted_accuracy / num_samples

    return avg_loss, avg_acc

In [None]:
def validate(model, validation_dataloader, device):
    """
    Validates the model and returns average loss and accuracy.

    Both metrics are averaged per SAMPLE: each batch's mean loss / accuracy is
    weighted by the batch size. The previous plain mean over batches gave the
    smaller final batch the same weight as a full one.

    Args:
        model (torch.nn.Module): The trained model
        validation_dataloader (DataLoader): DataLoader yielding
            (input_ids, token_type_ids, attention_mask, labels) batches
        device (torch.device): Device to run evaluation on

    Returns:
        avg_loss (float): Average validation loss per sample
        avg_acc (float): Average validation accuracy per sample

    Raises:
        ZeroDivisionError: If the dataloader yields no batches.
    """
    model.eval()
    weighted_loss = 0.0
    weighted_accuracy = 0.0
    num_samples = 0

    for batch in tqdm(validation_dataloader, desc="Validating"):
        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        # No gradients are needed for evaluation
        with torch.no_grad():
            outputs = model(
                input_ids=b_input_ids,
                token_type_ids=b_token_type_ids,
                attention_mask=b_input_mask,
                labels=b_labels
            )

        loss = outputs.loss
        logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        # Accumulate metrics weighted by the number of samples in this batch
        batch_size = b_labels.size(0)
        weighted_loss += loss.item() * batch_size
        weighted_accuracy += get_accuracy(logits, label_ids) * batch_size
        num_samples += batch_size

    avg_loss = weighted_loss / num_samples
    avg_acc = weighted_accuracy / num_samples

    return avg_loss, avg_acc

In [None]:
from torch.optim import AdamW  # Use AdamW from PyTorch
from transformers import get_linear_schedule_with_warmup


# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps.
n_epochs = config['n_epochs']
total_steps = len(train_dataloader) * n_epochs

# AdamW over all model parameters
optimizer = AdamW(
    params=model.parameters(),
    lr=config['learning_rate'],
    weight_decay=config['weight_decay'],
)

# Learning rate rises linearly during warm-up, then decays linearly
# (a warm-up of about 0.1 * total_steps is typical)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=config['num_warmup_steps'],
    num_training_steps=total_steps,
)

In [None]:
import os
import torch

# Checkpoints are written to this Drive folder (created when missing)
save_dir = "/content/drive/MyDrive/FIQA/Model"
os.makedirs(save_dir, exist_ok=True)

# Lowest validation loss seen so far
best_valid_loss = float('inf')

for epoch in range(config['n_epochs']):
    print(f"\n Epoch {epoch + 1}/{config['n_epochs']}")

    # One pass over the training data, then one over the validation data
    train_loss, train_acc = train(model, train_dataloader, optimizer, scheduler, device)
    valid_loss, valid_acc = validate(model, validation_dataloader, device)

    # Keep the weights only when the validation loss improved
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        save_path = os.path.join(save_dir, f"prosusfinbert_epoch{epoch + 1}.pt")
        torch.save(model.state_dict(), save_path)
        print(f" New best model saved to: {save_path}")

    # Per-epoch report
    print(f"\n Results for Epoch {epoch + 1}:")
    print(f"     Train Loss     : {train_loss:.4f} | Accuracy: {train_acc * 100:.2f}%")
    print(f"     Validation Loss: {valid_loss:.4f} | Accuracy: {valid_acc * 100:.2f}%")



 Epoch 1/2


Training: 100%|██████████| 17738/17738 [1:29:41<00:00,  3.30it/s]
Validating: 100%|██████████| 1972/1972 [03:07<00:00, 10.51it/s]


 New best model saved to: /content/drive/MyDrive/FIQA/Model/prosusfinbert_epoch1.pt

 Results for Epoch 1:
     Train Loss     : 0.1057 | Accuracy: 97.90%
     Validation Loss: 0.0855 | Accuracy: 98.02%

 Epoch 2/2


Training: 100%|██████████| 17738/17738 [1:29:42<00:00,  3.30it/s]
Validating: 100%|██████████| 1972/1972 [03:07<00:00, 10.51it/s]


 Results for Epoch 2:
     Train Loss     : 0.0736 | Accuracy: 98.25%
     Validation Loss: 0.0898 | Accuracy: 98.07%





# Evaluation

In [None]:
import torch
from torch.nn.functional import softmax
import numpy as np

def predict(model, q_text, cands, max_seq_len):
    """
    Score every candidate answer for one question and return them best-first.

    Uses the module-level `tokenizer`, `docid_to_text` and `device`.

    Args:
        model (torch.nn.Module): Trained BERT model for classification.
        q_text (str): Question text.
        cands (list): List of candidate doc IDs.
        max_seq_len (int): Max token length for inputs.

    Returns:
        ranked_ans (list): Candidate doc IDs ordered by descending score.
        sorted_scores (list): Matching scores, rounded to 3 decimals.
    """
    model.eval()

    def relevance_probability(docid):
        """Softmax probability of class 1 for (q_text, text of docid)."""
        encoded = tokenizer.encode_plus(
            q_text,
            docid_to_text.get(docid, ""),  # unknown ids are scored against an empty answer
            max_length=max_seq_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        with torch.no_grad():
            outputs = model(
                input_ids=encoded['input_ids'].to(device),
                token_type_ids=encoded['token_type_ids'].to(device),
                attention_mask=encoded['attention_mask'].to(device)
            )
            class_probs = softmax(outputs.logits, dim=1)
        return class_probs[:, 1].item()

    scores = [relevance_probability(docid) for docid in cands]

    # Ascending argsort reversed -> indices from highest to lowest score
    order = np.argsort(scores)[::-1]
    ranked_ans = np.array(cands)[order].tolist()
    sorted_scores = np.round(np.array(scores)[order], 3).tolist()

    return ranked_ans, sorted_scores

In [None]:
from tqdm import tqdm

def get_rank(model, test_set, max_seq_len):
    """
    Produce the model's candidate ranking for every question in `test_set`.

    Args:
        model (torch.nn.Module): Trained BERT model for sequence classification.
        test_set (list): Samples in [qid, [relevant_docids], [candidate_docids]] format.
        max_seq_len (int): Maximum sequence length for input encoding.

    Returns:
        dict: qid -> candidate docids ordered by the model's score (best first).
    """
    model.eval()
    ranking_by_qid = {}

    for qid, _relevant_docids, candidate_docids in tqdm(test_set, desc=" Re-ranking"):
        # Questions missing from qid_to_text are scored with an empty question text.
        question_text = qid_to_text.get(qid, "")
        ranking_by_qid[qid] = predict(model, question_text, candidate_docids, max_seq_len)[0]

    return ranking_by_qid

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

# Checkpoint written by the training loop (adjust to your actual filename)
checkpoint_path = "/content/drive/MyDrive/FIQA/Model/prosusfinbert_epoch1.pt"  # change to your actual filename

# GPU when available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rebuild the architecture the checkpoint was saved from: FinBERT with a
# 2-label head (ignore_mismatched_sizes is needed because the published
# checkpoint ships a 3-label head)
model = AutoModelForSequenceClassification.from_pretrained(
    "ProsusAI/finbert",
    num_labels=2,
    ignore_mismatched_sizes=True,
)

# Overwrite the freshly initialised weights with the fine-tuned ones
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.to(device)

print(" ProsusFinBERT model loaded successfully from:", checkpoint_path)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 ProsusFinBERT model loaded successfully from: /content/drive/MyDrive/FIQA/Model/prosusfinbert_epoch1.pt


In [None]:
# Cast every question id (key) and every relevant doc id (value entries) of the
# ground-truth mapping to int, so lookups by the test set's qids and comparisons
# with predicted doc ids work — presumably the pickle stores them as another
# type; verify. Re-running this cell is harmless.
labels = {int(k): [int(docid) for docid in v] for k, v in labels.items()}


In [None]:
# Re-rank the candidates of every test question with the loaded model;
# the result maps each qid to its candidate doc ids, best first.
qid_pred_rank = get_rank(model, test_set, config['max_seq_len'])


 Re-ranking: 100%|██████████| 333/333 [03:09<00:00,  1.75it/s]


In [None]:
# Spot-check the first three ranked questions: top-5 predictions vs. ground truth
first_three_qids = list(qid_pred_rank)[:3]
for qid in first_three_qids:
    print(f"Query: {qid}")
    print(f"Predicted: {qid_pred_rank[qid][:5]}")
    print(f"Relevant: {labels.get(qid)}")


Query: 14
Predicted: [398960, 385949, 96910, 513658, 149820]
Relevant: [398960]
Query: 68
Predicted: [545296, 18850, 354716, 192516, 259227]
Relevant: [19183]
Query: 70
Predicted: [397608, 447231, 541809, 40257, 540395]
Relevant: [327002]


In [None]:
# Count the questions with at least one relevant doc among their top-10 predictions
hits = 0
for qid in qid_pred_rank:
    top10_docids = set(qid_pred_rank[qid][:10])
    relevant_docids = labels.get(qid, [])
    hits += int(not top10_docids.isdisjoint(relevant_docids))
print(f"\nQueries with at least one hit in top-10: {hits}/{len(qid_pred_rank)}")



Queries with at least one hit in top-10: 186/333


In [None]:
def evaluate(predictions, ground_truth, k=10):
    """
    Evaluate retrieval performance using MRR, nDCG, and Precision@1.

    Only the first `k` entries of each ranking are considered. Relevance is
    binary: a doc id counts as relevant when it is listed for that qid in
    `ground_truth`; qids missing from `ground_truth` have no relevant docs.

    Args:
        predictions (dict): {qid: [ranked_docids]}
        ground_truth (dict): {qid: [relevant_docids]}
        k (int): Cutoff for metrics

    Returns:
        mrr (float), avg_ndcg (float), precision_at_1 (float), rank_positions (list)

        `rank_positions` holds, per query, the 1-based position of the first
        relevant doc within the top k, or k + 1 when there is none.
        With no predictions at all the result is (0.0, 0.0, 0.0, []).
    """
    # Nothing to score: avoid dividing by zero below.
    if not predictions:
        return 0.0, 0.0, 0.0, []

    mrr_total, ndcg_total, precision_total = 0, 0, 0
    rank_positions = []

    for qid in predictions:
        ranked = predictions[qid][:k]
        relevant = set(ground_truth.get(qid, []))

        # MRR: reciprocal of the first relevant position (0 when none in top k)
        reciprocal_rank = 0
        for i, docid in enumerate(ranked):
            if docid in relevant:
                reciprocal_rank = 1 / (i + 1)
                rank_positions.append(i + 1)
                break
        else:
            rank_positions.append(k + 1)
        mrr_total += reciprocal_rank

        # Precision@1: an empty ranking simply counts as a miss
        # (indexing ranked[0] directly raised IndexError in that case)
        precision_total += 1 if ranked and ranked[0] in relevant else 0

        # nDCG@k with binary gains; the ideal ranking places all relevant docs first
        dcg = 0
        for i, docid in enumerate(ranked):
            if docid in relevant:
                dcg += 1 / np.log2(i + 2)
        ideal_dcg = sum([1 / np.log2(i + 2) for i in range(min(len(relevant), k))])
        ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0
        ndcg_total += ndcg

    mrr = mrr_total / len(predictions)
    avg_ndcg = ndcg_total / len(predictions)
    precision_at_1 = precision_total / len(predictions)

    return mrr, avg_ndcg, precision_at_1, rank_positions


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import os

# Fine-tuned weights to evaluate (update as needed)
trained_model_path = "/content/drive/MyDrive/FIQA/Model/prosusfinbert_epoch1.pt"  # update as needed

# GPU when available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# FinBERT with a 2-label head, then the fine-tuned weights on top
model = AutoModelForSequenceClassification.from_pretrained(
    "ProsusAI/finbert",
    num_labels=2,
    ignore_mismatched_sizes=True,
)
model.load_state_dict(torch.load(trained_model_path, map_location=device))
model.to(device)
model.eval()

print(" ProsusFinBERT model loaded and ready for evaluation.")

# --- Rank the candidates of every test question ---
print("\n Evaluating on test set...\n")
qid_pred_rank = get_rank(model, test_set, config['max_seq_len'])

# Metric cutoff and number of evaluated queries
k = 10
num_q = len(test_set)

# `labels` is the {qid: [relevant_docids]} ground truth loaded earlier
MRR, average_ndcg, precision, rank_pos = evaluate(
    predictions=qid_pred_rank,
    ground_truth=labels,
    k=k,
)

# --- Report ---
print("\n Evaluation Results:")
print(f" Average nDCG@{k} for {num_q} queries: {average_ndcg:.3f}")
print(f" MRR@{k} for {num_q} queries: {MRR:.3f}")
print(f" Precision@1 for {num_q} queries: {precision:.3f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 ProsusFinBERT model loaded and ready for evaluation.

 Evaluating on test set...



 Re-ranking: 100%|██████████| 333/333 [03:09<00:00,  1.76it/s]


 Evaluation Results:
 Average nDCG@10 for 333 queries: 0.342
 MRR@10 for 333 queries: 0.416
 Precision@1 for 333 queries: 0.354





# Answer Re-ranking

In [None]:
from rank_bm25 import BM25Okapi

# Build the BM25 corpus.
# Doc ids are sorted so that position i in the corpus (and therefore in the
# BM25 score vector) always maps back to doc_ids[i].
doc_ids = sorted(docid_to_text)
doc_texts = [docid_to_text[doc_id] for doc_id in doc_ids]

# Whitespace tokenisation; queries are tokenised the same way (`query.split()`).
tokenized_corpus = [passage.split() for passage in doc_texts]

# Index the tokenised passages with BM25.
bm25 = BM25Okapi(tokenized_corpus)


In [None]:
# Scan the test set for the first query whose BM25 top-50 candidates contain
# at least one of its labelled (relevant) documents, then report it and stop.
for idx, sample in enumerate(test_set):
    qid, label = sample[0], sample[1]
    query = qid_to_text[qid]
    query_tokens = query.split()

    # Score every corpus passage against the query and keep the 50 best.
    # `reverse=True` keeps the sort stable, so tied scores stay in ascending
    # corpus-index order.
    scores = bm25.get_scores(query_tokens)
    top_indices = sorted(
        range(len(scores)), key=lambda pos: scores[pos], reverse=True
    )[:50]
    cands = [doc_ids[pos] for pos in top_indices]

    # No labelled doc was retrieved for this query: try the next sample.
    if not any(lab in cands for lab in label):
        continue

    print(f"Found good test sample: test_set[{idx}]")
    print("QID:", qid)
    print("Label:", label)
    print("Matching candidates:", set(cands) & set(label))
    break


Found good test sample: test_set[4]
QID: 458
Label: [263485, 218858]
Matching candidates: {263485}


In [None]:
# Pick the sample reported by the search above (test_set[4]).
# Element 0 is the question id, element 1 the list of relevant doc ids.
sample = test_set[4]
qid, label = sample[0], sample[1]

# Look up the question text and tokenise it on whitespace for BM25.
query = qid_to_text[qid]
query_tokens = query.split()


In [None]:
# Retrieve the 50 highest-scoring passages for the selected query.
scores = bm25.get_scores(query_tokens)

# Rank corpus positions by descending score; the stable sort keeps tied
# scores in ascending corpus-index order.
top_indices = sorted(
    range(len(scores)), key=lambda pos: scores[pos], reverse=True
)[:50]

# Translate corpus positions back into doc ids.
cands = [doc_ids[pos] for pos in top_indices]


In [None]:
# Re-rank the BM25 candidates with the fine-tuned model.
# `predict` is defined earlier in the notebook; judging by the output printed
# below, it presumably returns the candidate doc ids re-ordered by the model
# together with their probabilities — confirm against its definition.
rank, probs = predict(
    model,
    query,
    cands,
    config['max_seq_len'],
)


In [None]:
def get_rel(relevant_docids, candidate_docids):
    """Build a binary relevance vector for a ranked list of candidates.

    Args:
        relevant_docids: Container of doc ids labelled as relevant.
        candidate_docids: Ranked doc ids; each one is converted with ``int()``
            before the membership check.

    Returns:
        A list as long as ``candidate_docids`` with 1 where the candidate is
        in ``relevant_docids`` and 0 elsewhere, in candidate order.
    """
    relevance = []
    for candidate in candidate_docids:
        is_relevant = int(candidate) in relevant_docids
        relevance.append(1 if is_relevant else 0)
    return relevance

# Relevance flags for the BM25 ordering (cand_rel) and for the re-ranked
# ordering (pred_rel), both measured against the same gold labels.
cand_rel, pred_rel = get_rel(label, cands), get_rel(label, rank)


# Retriever vs. Re-ranker

In [None]:
# Top-10 of the BM25 retriever with its relevance flags.
retriever_report = "Retriever: \n\t Ranking: {}\n\n\t Relevancy: {}\n".format(
    cands[:10], cand_rel[:10]
)
print(retriever_report)

# Top-10 of the re-ranker with its probabilities and relevance flags.
reranker_report = "Re-ranker: \n\t Ranking: {}\n\n\t Probability: {}\n\n\t Relevancy: {}".format(
    rank[:10], probs[:10], pred_rel[:10]
)
print(reranker_report)

# Gold doc ids for the query, for reference.
label_report = "\nLabel DocIDs: \n\t{}".format(label)
print(label_report)


Retriever: 
	 Ranking: [62869, 263485, 382657, 131451, 297241, 217715, 300254, 272709, 292811, 303411]

	 Relevancy: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

Re-ranker: 
	 Ranking: [263485, 62869, 160340, 131451, 357094, 297241, 300254, 2528, 284809, 292811]

	 Probability: [0.468, 0.331, 0.059, 0.049, 0.014, 0.011, 0.004, 0.002, 0.002, 0.002]

	 Relevancy: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Label DocIDs: 
	[263485, 218858]


In [None]:
# Show the question next to the top answer of each system and the first gold answer.
print("Question: \n\t{}\n".format(query))

# Best passage according to the re-ranker.
top_reranked_docid = rank[0]
print("Answer Re-ranker:\n\t{}\n".format(docid_to_text[top_reranked_docid]))

# Best passage according to the BM25 retriever.
top_retrieved_docid = cands[0]
print("Answer Retriever:\n\t{}\n".format(docid_to_text[top_retrieved_docid]))

# First labelled (relevant) passage.
first_label_docid = label[0]
print("Label (first):\n\t{}".format(docid_to_text[first_label_docid]))


Question: 
	How would IRS treat reimbursement in a later year of moving expenses?

Answer Re-ranker:
	IRS pub 521 has all the information you need. Expenses reimbursed. If you are reimbursed for your expenses and you   use the cash method of accounting, you can deduct your expenses either   in the year you paid them or in the year you received the   reimbursement. If you use the cash method of accounting, you can   choose to deduct the expenses in the year you are reimbursed even   though you paid the expenses in a different year. See Choosing when to   deduct, next. If you deduct your expenses and you receive the   reimbursement in a later year, you must include the reimbursement in   your income on Form 1040, line 21 This is not unusual. Anybody who moves near the end of the year can have this problem. The 39 week time test also can be an issue that span over 2 tax years. I would take the deduction for the expenses as soon a I could, and then count the income in the later year if the

# Error Analysis

In [None]:
import pickle

# Reload the question-id -> question-text mapping from Google Drive.
# NOTE: pickle can execute arbitrary code on load; only open files you created
# yourself or otherwise trust.
qid_to_text_path = '/content/drive/MyDrive/FIQA/Data/qid_to_text.pickle'
with open(qid_to_text_path, 'rb') as pickle_file:
    qid_to_text = pickle.load(pickle_file)


In [None]:
# Peek at the first few keys to check their type before using them for lookups.
sample_qids = list(qid_to_text)[:5]
print("Sample keys from qid_to_text:", sample_qids)


Sample keys from qid_to_text: [0, 1, 2, 3, 4]


In [None]:
# Collect every query whose top-10 predicted ranking misses at least one
# labelled (relevant) document.
false_negatives = []

for qid, ranked_docs in qid_pred_rank.items():
    relevant = set(labels.get(qid, []))
    top10_docs = ranked_docs[:10]
    missed = relevant - set(top10_docs)

    # Every relevant doc made the top 10 (or none is labelled): nothing to record.
    if not missed:
        continue

    # qid_to_text is indexed with int(qid); skip, with a notice, any query
    # whose text cannot be found.
    try:
        query_text = qid_to_text[int(qid)]
    except KeyError:
        print(f"Skipping qid '{qid}' — not found in qid_to_text")
        continue

    error_record = {
        "qid": qid,
        "query": query_text,
        "relevant": list(relevant),
        "predicted_top10": top10_docs,
        "missed_relevant": list(missed)
    }
    false_negatives.append(error_record)



In [None]:
# Print the first five recorded errors, one field per line.
for error in false_negatives[:5]:
    print(
        f"\n Query ID: {error['qid']}",
        f" Query Text: {error['query']}",
        f" Relevant DocIDs: {error['relevant']}",
        f" Top-10 Predicted: {error['predicted_top10']}",
        f" Missed Relevant Docs: {error['missed_relevant']}",
        sep="\n",
    )



 Query ID: 68
 Query Text: Intentions of Deductible Amount for Small Business
 Relevant DocIDs: [19183]
 Top-10 Predicted: [545296, 18850, 354716, 192516, 259227, 381151, 160612, 245447, 477940, 272709]
 Missed Relevant Docs: [19183]

 Query ID: 81
 Query Text: Does revenue equal gross profit for info product business?
 Relevant DocIDs: [451207]
 Top-10 Predicted: [264192, 123170, 166563, 278510, 10882, 86041, 218326, 535673, 294061, 203633]
 Missed Relevant Docs: [451207]

 Query ID: 458
 Query Text: How would IRS treat reimbursement in a later year of moving expenses?
 Relevant DocIDs: [218858, 263485]
 Top-10 Predicted: [263485, 62869, 160340, 131451, 139998, 357094, 297241, 101382, 18850, 300254]
 Missed Relevant Docs: [218858]

 Query ID: 473
 Query Text: Financially Shielded Entity Separating Individuals Behind It From Risks
 Relevant DocIDs: [466037]
 Top-10 Predicted: [103439, 27425, 389516, 18950, 112987, 291749, 163923, 361442, 540334, 531625]
 Missed Relevant Docs: [466037]

In [None]:
import pandas as pd

# Destination of the error-analysis table on Google Drive.
output_path = "/content/drive/MyDrive/FIQA/error_analysis_finbert.csv"

# One row per recorded error; the columns are the keys of the error dicts.
df_errors = pd.DataFrame(false_negatives)

# Write the table without the positional index column.
df_errors.to_csv(output_path, index=False)

print(f"Saved error analysis to: {output_path}")


Saved error analysis to: /content/drive/MyDrive/FIQA/error_analysis_finbert.csv
