# Financial Question Answering with BERT-QA

This project focuses on developing a Financial Question Answering (QA) system using the BERT-QA framework. The system integrates techniques from Information Retrieval (IR) and Natural Language Processing (NLP).

# Importing Libraries

In [1]:
!nvidia-smi

Sun Jul 20 10:24:12 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

Installs and imports all necessary libraries for data handling (Pandas, NumPy), NLP (NLTK + punkt, rank_bm25), and model training (PyTorch, Hugging Face Transformers).

In [2]:
# Standard Libraries
!pip install rank_bm25
import os
import json
import pickle
import random
from collections import Counter
from statistics import mean

# Data Handling
import pandas as pd
import numpy as np

# Progress Bar
from tqdm import tqdm

# NLP & Transformers
import nltk
from nltk.tokenize import word_tokenize
from torch.optim import AdamW  # <- use this instead of transformers.AdamW
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    BertConfig,
    get_linear_schedule_with_warmup
)


# PyTorch
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.functional import softmax

# Download NLTK tokenizer model
nltk.download('punkt')


Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [1]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU found")


CUDA available: True
Device name: Tesla T4


# Loading Datasets

This cell mounts Google Drive to access the FIQA dataset and then defines the directory and file‐path variables for the training documents, questions, and question–document mappings.

In [3]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Define file paths to dataset stored in Google Drive
data_dir = '/content/drive/MyDrive/FIQA/Data'

file_answers = os.path.join(data_dir, 'FiQA_train_doc_final.tsv')
file_questions = os.path.join(data_dir, 'FiQA_train_question_final.tsv')
file_qid_docid = os.path.join(data_dir, 'FiQA_train_question_doc_final.tsv')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load files
documents_df = pd.read_csv(file_answers, sep='\t', names=['docid', 'doc'])
questions_df = pd.read_csv(file_questions, sep='\t', names=['qid', 'question'])
qid_docid_df = pd.read_csv(file_qid_docid, sep='\t', names=['qid', 'docid'])

In [6]:
# Load question-document ID pairs
qid_docid_df = pd.read_csv(file_qid_docid, sep='\t', names=['qid', 'docid'])

# Show first 5 QA mappings
print("Question–Document ID Pairs:")
qid_docid_df.head()


Question–Document ID Pairs:


Unnamed: 0,qid,docid
,qid,docid
0.0,0,18850
1.0,1,14255
2.0,2,308938
3.0,3,296717


In [7]:
# Load questions
questions_df = pd.read_csv(file_questions, sep='\t', names=['qid', 'question','timestamp'])

# Show first 5 questions
print(" Question Id and Questions")
questions_df.head()


 Question Id and Questions


Unnamed: 0,qid,question,timestamp
,qid,question,timestamp
0.0,0,What is considered a business expense on a bus...,Nov 8 '11 at 15:14
1.0,1,Claiming business expenses for a business with...,May 13 '14 at 13:17
2.0,2,Transferring money from One business checking ...,Jan 20 '16 at 20:31
3.0,3,Having a separate bank account for business/in...,Mar 1 at 0:24


In [8]:
import pandas as pd

# Load document (answer) passages
documents_df = pd.read_csv(file_answers, sep='\t', names=['docid', 'doc', 'timestamp'])

# Show first 5 rows
print("Document Passages:")
documents_df.head()


Document Passages:


Unnamed: 0,docid,doc,timestamp
,docid,doc,timestamp
0.0,3,I'm not saying I don't like the idea of on-the...,Oct 03 '12 at 14:56
1.0,31,So nothing preventing false ratings besides ad...,Sep 01 '17 at 13:36
2.0,56,You can never use a health FSA for individual ...,Jun 9 '14 at 17:37
3.0,59,Samsung created the LCD and other flat screen ...,Dec 27 at 01:37


In [9]:
#Example
question_index = 5

# Get qid from the questions dataframe
qid = questions_df.iloc[question_index]['qid']
question = questions_df.iloc[question_index]['question']

# Get all docids linked to this qid
linked_docs = qid_docid_df[qid_docid_df['qid'] == qid]['docid'].tolist()

print(f"QID: {qid}")
print(f"Question: {question}\n")

# For each linked docid, fetch the answer (if it exists)
for docid in linked_docs:
    doc_row = documents_df[documents_df['docid'] == docid]
    if not doc_row.empty:
        doc = doc_row.iloc[0]['doc']
        print(f"DocID: {docid}")
        print(f"Answer Passage:\n{doc}\n{'-'*80}")
    else:
        print(f"DocID {docid} not found in document collection.\n{'-'*80}")


QID: 4
Question: Business Expense - Car Insurance Deductible For Accident That Occurred During a Business Trip

DocID: 196463
Answer Passage:
As a general rule, you must choose between a mileage deduction or an actual expenses deduction.  The idea is that the mileage deduction is supposed to cover all costs of using the car.  Exceptions include parking fees and tolls, which can be deducted separately under either method.  You explicitly cannot deduct insurance costs if you claim a mileage deduction.   Separately, you probably won't be able to deduct the deductible for your car as a casualty loss.  You first subtract $100 from the deductible and then divide it by your Adjusted Gross Income (AGI) from your tax return.  If your deductible is over 10% of your AGI, you can deduct it.   Note that even with a $1500 deductible, you won't be able to deduct anything if you made more than $14,000 for the year.  For most people, the insurance deductible just isn't large enough relative to income t

# Data Cleaning

We begin by printing the initial counts of documents, questions, and QA pairs to establish a baseline. Then we identify and drop any empty documents (and their QA mappings), lowercase all text for consistency, and reprint the updated counts to confirm the cleaning.

Before Cleaning

In [10]:
print("Number of answers (documents): {}".format(len(documents_df)))
print("Number of questions: {}".format(len(questions_df)))
print("Number of QA pairs: {}".format(len(qid_docid_df)))



Number of answers (documents): 57639
Number of questions: 6649
Number of QA pairs: 17111


In [11]:
def get_empty_docs(df):
    """
    Identify documents that are empty or only contain whitespace.

    Args:
        df (pd.DataFrame): DataFrame with columns ['docid', 'doc']

    Returns:
        empty_docids (list): list of docid values with empty docs
        empty_indices (list): list of row indices with empty docs
    """
    empty_indices = df[df['doc'].str.strip().isna() | (df['doc'].str.strip() == '')].index.tolist()
    empty_docids = df.loc[empty_indices, 'docid'].tolist()
    return empty_docids, empty_indices

After Cleaning

In [12]:
# Identify empty documents
empty_docs, empty_ids = get_empty_docs(documents_df)

# Remove empty answer rows from the document set
documents_cleaned_df = documents_df.drop(empty_ids)

# Remove QA pairs that point to empty documents
qid_docid_df = qid_docid_df[~qid_docid_df['docid'].isin(empty_docs)]

# Lowercase document text only
documents_cleaned_df['doc'] = documents_cleaned_df['doc'].str.lower()

# Lowercase question text only
questions_df['question'] = questions_df['question'].str.lower()

# Print stats after cleaning
print("Number of answers after cleaning: {}".format(len(documents_cleaned_df)))
print("Number of QA pairs after cleaning: {}".format(len(qid_docid_df)))


Number of answers after cleaning: 57601
Number of QA pairs after cleaning: 17073


In [None]:
save_path = '/content/drive/MyDrive/FIQA/Data'

# Save cleaned documents
documents_cleaned_df.to_csv(f'{save_path}/cleaned_documents.tsv', sep='\t', index=False)

# Save cleaned qid-docid mappings
qid_docid_df.to_csv(f'{save_path}/cleaned_qid_docid.tsv', sep='\t', index=False)

# Save cleaned questions (lowercased)
questions_df.to_csv(f'{save_path}/cleaned_questions.tsv', sep='\t', index=False)


KeyboardInterrupt: 

# Anserini

We convert the cleaned documents into JSON‐lines with `id` and `contents` fields for Anserini indexing. A simple regex tokenizer lowercases and splits text into word tokens, which we apply to both questions and answers while recording their token counts. Finally, we display sample processed entries, compute average question/answer lengths, and count how many answers exceed 512 tokens for downstream filtering.


In [None]:
import json

def convert_to_anserini_json(df, output_file):
    docs = []
    for _, row in df.iterrows():
        doc = {
            'id': str(row['docid']),
            'contents': row['doc']
        }
        docs.append(doc)

    with open(output_file, 'w') as f:
        for doc in docs:
            f.write(json.dumps(doc) + '\n')
save_path = '/content/drive/MyDrive/FIQA/Data'
# Save JSON for Anserini
convert_to_anserini_json(documents_cleaned_df, f'{save_path}/docs_cleaned.json')


In [13]:
import re

def simple_tokenize(text):
    """
    Tokenizes text into words using regular expressions.
    Lowercases and splits on word boundaries.
    """
    return re.findall(r'\b\w+\b', text.lower())


In [14]:
def process_questions(df):
    df = df.copy()
    df['q_processed'] = df['question'].str.lower()
    df['tokenized_q'] = df['q_processed'].apply(simple_tokenize)
    df['q_len'] = df['tokenized_q'].apply(len)
    return df

def process_answers(df):
    df = df.copy()
    df['doc_processed'] = df['doc'].str.lower()
    df['tokenized_ans'] = df['doc_processed'].apply(simple_tokenize)
    df['ans_len'] = df['tokenized_ans'].apply(len)
    return df


In [15]:
processed_questions = process_questions(questions_df)
processed_answers = process_answers(documents_cleaned_df)


In [16]:
print("Processed and tokenized questions")
print(processed_questions.head())

print("\n\nProcessed and tokenized answers")
print(processed_answers.head())

avg_q_count = processed_questions['q_len'].mean()
avg_ans_count = processed_answers['ans_len'].mean()

print("\nAverage question length:", round(avg_q_count))
print("Average answer length:", round(avg_ans_count))

print("Total answers:", len(processed_answers))
print("Number of answers with length > 512:", len(processed_answers[processed_answers['ans_len'] > 512]))


Processed and tokenized questions
     qid                                           question  \
NaN  qid                                           question   
0.0    0  what is considered a business expense on a bus...   
1.0    1  claiming business expenses for a business with...   
2.0    2  transferring money from one business checking ...   
3.0    3  having a separate bank account for business/in...   

               timestamp                                        q_processed  \
NaN            timestamp                                           question   
0.0   Nov 8 '11 at 15:14  what is considered a business expense on a bus...   
1.0  May 13 '14 at 13:17  claiming business expenses for a business with...   
2.0  Jan 20 '16 at 20:31  transferring money from one business checking ...   
3.0        Mar 1 at 0:24  having a separate bank account for business/in...   

                                           tokenized_q  q_len  
NaN                                         [que

# Volabulary Creation

We aggregate all tokens from both answers and questions, count their frequencies with a Counter, and assign each unique token a numeric index to build our vocabulary. Finally, we report the total vocabulary size and list the top 35 most common tokens.

In [17]:
from collections import Counter

def build_vocabulary(answer_df, question_df):
    """
    Builds a vocabulary from tokenized answer and question DataFrames.

    Returns:
        token_to_index (dict): Mapping from word to unique index
        token_frequency (Counter): Word frequency count
    """
    all_tokens = []

    # Collect tokens from answer texts
    for tokens in answer_df['tokenized_ans']:
        all_tokens.extend(tokens)

    # Collect tokens from question texts
    for tokens in question_df['tokenized_q']:
        all_tokens.extend(tokens)

    # Count token frequency
    token_frequency = Counter(all_tokens)

    # Assign unique index to each token
    token_to_index = {token: idx for idx, (token, _) in enumerate(token_frequency.items())}

    return token_to_index, token_frequency

# Create vocabulary
token_to_index, token_frequency = build_vocabulary(processed_answers, processed_questions)

# Display summary
print("Vocabulary size:", len(token_to_index))
print("Top 35 most common tokens:", token_frequency.most_common(35))


Vocabulary size: 80534
Top 35 most common tokens: [('the', 370840), ('to', 233388), ('a', 201661), ('you', 183916), ('and', 162963), ('of', 157276), ('is', 129802), ('in', 119895), ('that', 118440), ('it', 102796), ('i', 92618), ('for', 89211), ('your', 68124), ('are', 67216), ('if', 60721), ('be', 59318), ('on', 58305), ('s', 56810), ('have', 55736), ('t', 51598), ('as', 50035), ('this', 49872), ('not', 49228), ('they', 49120), ('or', 46089), ('with', 45859), ('can', 43993), ('but', 41743), ('will', 36827), ('at', 35536), ('an', 31344), ('money', 31117), ('so', 30013), ('would', 28819), ('from', 28421)]


# Saving Files

In [18]:
# Build ID → raw text maps
qid_to_question_text = dict(zip(questions_df['qid'], questions_df['question']))
docid_to_answer_text = dict(zip(documents_cleaned_df['docid'], documents_cleaned_df['doc']))

# Build ID → tokenized text maps
qid_to_question_tokens = dict(zip(processed_questions['qid'], processed_questions['tokenized_q']))
docid_to_answer_tokens = dict(zip(processed_answers['docid'], processed_answers['tokenized_ans']))








---



In [None]:
# Define a generic save function
def save_pickle(path, obj):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

# Save all to your specified path
save_pickle(f"{save_path}/token_to_index.pickle", token_to_index)
save_pickle(f"{save_path}/token_frequency.pickle", token_frequency)
save_pickle(f"{save_path}/qid_to_text.pickle", qid_to_question_text)
save_pickle(f"{save_path}/docid_to_text.pickle", docid_to_answer_text)
save_pickle(f"{save_path}/qid_to_question_tokens.pickle", qid_to_question_tokens)
save_pickle(f"{save_path}/docid_to_answer_tokens.pickle", docid_to_answer_tokens)

# Evaluate the Performance BM25



---



We load the ground-truth labels and BM25 test data from pickle files, build a mapping of each query to its ranked candidate documents, and normalize all IDs to strings.

In [4]:
import pickle
import numpy as np

# --- 1. Load Data ---
with open("/content/drive/MyDrive/FIQA/labels.pickle", "rb") as f:
    ground_truth_labels = pickle.load(f)

with open("/content/drive/MyDrive/FIQA/Data/test_set_50.pickle", "rb") as f:
    bm25_test_data = pickle.load(f)

# --- 2. Create BM25 Rankings Dictionary ---
def create_qid_pred_rank(test_data):
    """
    Converts test set into {qid: [top_50_candidate_docids]}
    """
    qid_to_ranked_docids = {}
    for example in test_data:
        qid = example[0]
        top_candidate_docids = example[2]
        qid_to_ranked_docids[qid] = top_candidate_docids
    return qid_to_ranked_docids

bm25_predictions = create_qid_pred_rank(bm25_test_data)

# --- 3. Normalize IDs to strings ---
bm25_rankings = {
    str(qid): [str(docid) for docid in docids]
    for qid, docids in bm25_predictions.items()
}

ground_truth_labels = {
    str(qid): {str(docid) for docid in docids}
    for qid, docids in ground_truth_labels.items()
}

# --- 4. Evaluation Functions ---
def evaluate(ranked_docs, relevant_docs, k):
    """
    Compute nDCG@k, MRR@k, and Precision@k for each query
    """
    def dcg(rels):
        return sum((1 / np.log2(i + 2) if rel else 0) for i, rel in enumerate(rels))

    def ndcg(preds, gold):
        rels = [1 if doc in gold else 0 for doc in preds[:k]]
        ideal_rels = sorted(rels, reverse=True)
        return dcg(rels) / dcg(ideal_rels) if dcg(ideal_rels) > 0 else 0

    def mrr(preds, gold):
        for i, doc in enumerate(preds[:k]):
            if doc in gold:
                return 1 / (i + 1)
        return 0

    def precision_at_k(preds, gold):
        return sum(1 for doc in preds[:k] if doc in gold) / k

    ndcgs, mrrs, precisions = [], [], []
    for qid, pred_docs in ranked_docs.items():
        gold_docs = relevant_docs.get(qid, set())
        ndcgs.append(ndcg(pred_docs, gold_docs))
        mrrs.append(mrr(pred_docs, gold_docs))
        precisions.append(precision_at_k(pred_docs, gold_docs))

    return {
        f"nDCG@{k}": np.mean(ndcgs),
        f"MRR@{k}": np.mean(mrrs),
        f"Precision@{k}": np.mean(precisions),
    }

# --- 5. Run Evaluation ---
k = 10  # Top-k documents to evaluate
results = evaluate(bm25_rankings, ground_truth_labels, k)

# --- 6. Print Results ---
print("\n BM25 Retrieval Evaluation:")
for metric, score in results.items():
    print(f"{metric}: {score:.4f}")


 BM25 Retrieval Evaluation:
nDCG@10: 0.3348
MRR@10: 0.3046
Precision@10: 0.0664


**We evaluated the performance of the BM25 retriever by comparing its top-ranked answer candidates against ground truth labels using standard ranking metrics such as nDCG@10, MRR@10, and Precision@1.**

In [None]:
# --- Retriever Analysis (BM25) ---

total_questions = len(ground_truth_labels)

# Count questions where **no** relevant document was retrieved
no_relevant_count = sum(
    1 for qid, rel_docs in ground_truth_labels.items()
    if not any(doc in bm25_rankings.get(qid, []) for doc in rel_docs)
)
no_relevant_percent = round((no_relevant_count / total_questions) * 100)

# Count questions where **not all** relevant documents were retrieved
partial_relevant_count = sum(
    1 for qid, rel_docs in ground_truth_labels.items()
    if not all(doc in bm25_rankings.get(qid, []) for doc in rel_docs)
)
partial_relevant_percent = round((partial_relevant_count / total_questions) * 100)

# Display results
print("\nRetriever Analysis Report")
print(f"Questions with NO relevant answers retrieved: {no_relevant_count}/{total_questions} ({no_relevant_percent}%)")
print(f"Questions with MISSING some relevant answers: {partial_relevant_count}/{total_questions} ({partial_relevant_percent}%)")



Retriever Analysis Report
Questions with NO relevant answers retrieved: 6444/6648 (97%)
Questions with MISSING some relevant answers: 6550/6648 (99%)




---



# **Configuration**



---



We set BERT fine-tuning hyperparameters and define a helper to load pickled files. Then we load the FIQA train/validation/test splits and labels from Drive and initialize the BERT tokenizer.

In [None]:
# Configuration
config = {
    'bert_model_name': 'bert-qa',
    'max_seq_len': 512,
    'batch_size': 16,
    'learning_rate': 3e-6,
    'weight_decay': 0.01,
    'n_epochs': 2,
    'num_warmup_steps': 10000
}

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

#  Load ID-to-text dictionaries
docid_to_text = load_pickle('/content/drive/MyDrive/FIQA/Data/docid_to_text.pickle')
qid_to_text = load_pickle('/content/drive/MyDrive/FIQA/Data/qid_to_text.pickle')

#  Load train, validation, and test datasets
train_set = load_pickle('/content/drive/MyDrive/FIQA/Data/train_set_50.pickle')
valid_set = load_pickle('/content/drive/MyDrive/FIQA/Data/valid_set_50.pickle')
test_set = load_pickle('/content/drive/MyDrive/FIQA/Data/test_set_50.pickle')

#  Load ground-truth labels for evaluation
labels = load_pickle('/content/drive/MyDrive/FIQA/Data/labels.pickle')

#  Display data info
print(f" Number of training questions: {len(train_set)}")
print(f" Number of validation questions: {len(valid_set)}")
print(f" Number of test questions: {len(test_set)}")

#  Load BERT tokenizer
print("\n Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


 Number of training questions: 5676
 Number of validation questions: 631
 Number of test questions: 333

 Loading BERT tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

# DataLoaders

We tokenize all question–answer pairs into BERT inputs (IDs, token types, masks) with relevance labels, then wrap them in PyTorch DataLoaders using random sampling for training and sequential sampling for validation. Transformer warnings are silenced before instantiating the train and validation loaders with our config’s max sequence length and batch size. Finally, we print the number of batches in each loader to verify setup.


In [None]:
from tqdm import tqdm

def get_input_data(dataset, max_seq_len):
    """
    Creates input parameters for training and validation.

    Assumes:
        - `tokenizer` is a globally available HuggingFace tokenizer
        - `qid_to_text` and `docid_to_text` are global dictionaries

    Args:
        dataset: List of [qid, [positive_doc_ids], [candidate_doc_ids]]
        max_seq_len: Max token length for BERT input

    Returns:
        input_ids, token_type_ids, att_masks, labels
    """
    input_ids = []
    token_type_ids = []
    att_masks = []
    labels = []

    for qid, pos_doc_ids, candidate_doc_ids in tqdm(dataset, desc="Tokenizing QA pairs"):
        q_text = qid_to_text[qid]

        for docid in candidate_doc_ids:
            ans_text = docid_to_text[docid]

            # Tokenize using HuggingFace tokenizer
            encoded_seq = tokenizer.encode_plus(
                q_text,
                ans_text,
                max_length=max_seq_len,
                padding='max_length',
                truncation=True,
                return_token_type_ids=True,
                return_attention_mask=True
            )

            input_ids.append(encoded_seq['input_ids'])
            token_type_ids.append(encoded_seq['token_type_ids'])
            att_masks.append(encoded_seq['attention_mask'])
            labels.append(1 if docid in pos_doc_ids else 0)

    return input_ids, token_type_ids, att_masks, labels


In [None]:

def get_dataloader(dataset, split_type, max_seq_len, batch_size):
    """
    Creates DataLoader for training or validation.

    Args:
        dataset: List in format [qid, [positive_doc_ids], [candidate_doc_ids]]
        split_type: 'train' or 'validation'
        max_seq_len: Maximum sequence length
        batch_size: Batch size for loading

    Returns:
        dataloader: PyTorch DataLoader with encoded inputs
    """
    # Use globally defined tokenizer, qid_to_text, and docid_to_text
    input_id, token_type_id, att_mask, label = get_input_data(dataset, max_seq_len)

    # Convert all inputs to torch tensors
    input_ids = torch.tensor(input_id, dtype=torch.long)
    token_type_ids = torch.tensor(token_type_id, dtype=torch.long)
    att_masks = torch.tensor(att_mask, dtype=torch.long)
    labels = torch.tensor(label, dtype=torch.long)

    # Create TensorDataset
    data = TensorDataset(input_ids, token_type_ids, att_masks, labels)

    # Use appropriate sampler
    sampler = RandomSampler(data) if split_type == "train" else SequentialSampler(data)

    # Build DataLoader
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)

    return dataloader


In [None]:
from tqdm import tqdm
from transformers import logging
logging.set_verbosity_error()
# Generate DataLoaders
train_dataloader = get_dataloader(
    train_set,
    'train',
    config['max_seq_len'],
    config['batch_size']
)


validation_dataloader = get_dataloader(
    valid_set,
    'validation',
    config['max_seq_len'],
    config['batch_size']
)

# Output stats
print(f"\n Size of training DataLoader: {len(train_dataloader)} batches")
print(f" Size of validation DataLoader: {len(validation_dataloader)} batches")

Tokenizing QA pairs:   1%|▏         | 85/5676 [00:15<16:53,  5.52it/s]


KeyboardInterrupt: 

# **Model Implementation**

We specify BERT fine-tuning hyperparameters and load a pre-trained bert-base-uncased model for binary sequence classification from Hugging Face. The model is then moved onto GPU if available, and a confirmation message prints the loaded model name.

In [1]:
from transformers import BertForSequenceClassification

# Config
config = {
    'bert_model_name': 'bert-base-uncased',  # Public Hugging Face model
    'max_seq_len': 512,
    'batch_size': 16,
    'learning_rate': 3e-6,
    'weight_decay': 0.01,
    'n_epochs': 2,
    'num_warmup_steps': 10000
}

# Load model
model = BertForSequenceClassification.from_pretrained(
    config['bert_model_name'],
    num_labels=2  # Binary classification
)

# Move model to GPU if available
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(" Loaded model from Hugging Face:", config['bert_model_name'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Loaded model from Hugging Face: bert-base-uncased


In [None]:
from transformers import BertForSequenceClassification

# Define a function to download your model if needed
def get_model(model_name):
    if model_name == 'bert-base-uncased':
        # If using base BERT from HuggingFace, no need to download manually
        print(f"Using HuggingFace pretrained model: {model_name}")
    else:
        # If using a custom or fine-tuned model, you could implement a download here
        # For now we assume it's saved under the `model/` directory
        print(f"Using custom model from path: model/{model_name}")

# Call the function to handle model path logic
get_model(config['bert_model_name'])

# Determine model path
if config['bert_model_name'] == 'bert-base-uncased':
    model_path = config['bert_model_name']
else:
    model_path = "model/" + config['bert_model_name']  # e.g., model/bert-qa or model/finbert-task

# Load the model with 2 labels (binary classification: relevant or not)
model = BertForSequenceClassification.from_pretrained(
    model_path,
    cache_dir=None,
    num_labels=2
)

# Move model to the correct device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f" Model loaded on device: {device}")


Using HuggingFace pretrained model: bert-base-uncased
 Model loaded on device: cuda


# Accuracy, Training, Validation

We upgrade the Transformers library and import PyTorch’s AdamW optimizer along with a linear warmup‐and‐decay scheduler. The optimizer and scheduler are configured using our hyperparameters and the total number of training steps. We ensure the model save directory exists on Drive, then loop over epochs to train and validate the model. After each epoch, we save a new checkpoint if the validation loss decreases and print the training/validation loss and accuracy.

In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()


In [None]:
def get_accuracy(preds, labels):
    """
    Compute accuracy for binary classification.

    Args:
        preds (np.ndarray): Model prediction probabilities of shape (batch_size, 2).
        labels (np.ndarray): Ground-truth labels of shape (batch_size,).

    Returns:
        float: Accuracy score.
    """
    # Convert probability predictions to predicted class labels
    predicted_classes = np.argmax(preds, axis=1)

    # Ensure labels are also flattened for comparison
    true_labels = labels.flatten()

    # Calculate accuracy
    accuracy = np.mean(predicted_classes == true_labels)

    return accuracy

In [None]:
from tqdm import tqdm
import torch
import numpy as np

def train(model, train_dataloader, optimizer, scheduler, device):
    """
    Trains the model for one epoch and returns average loss and accuracy.

    Args:
        model (torch.nn.Module): The model to train
        train_dataloader (DataLoader): Dataloader for training set
        optimizer (torch.optim.Optimizer): Optimizer
        scheduler (torch.optim.lr_scheduler): Learning rate scheduler
        device (torch.device): Device to train on (CPU or GPU)

    Returns:
        avg_loss (float): Average training loss
        avg_acc (float): Average training accuracy
    """
    model.train()
    total_loss = 0.0
    total_accuracy = 0.0
    num_batches = 0

    for batch in tqdm(train_dataloader, desc="Training"):
        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        # Reset gradients
        model.zero_grad()

        # Forward pass
        outputs = model(
            input_ids=b_input_ids,
            token_type_ids=b_token_type_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )

        loss = outputs.loss
        logits = outputs.logits

        # Backward pass
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Optimizer and scheduler step
        optimizer.step()
        scheduler.step()

        # Accumulate loss
        total_loss += loss.item()

        # Compute accuracy
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        batch_accuracy = get_accuracy(logits, label_ids)
        total_accuracy += batch_accuracy
        num_batches += 1

    avg_loss = total_loss / num_batches
    avg_acc = total_accuracy / num_batches

    return avg_loss, avg_acc


In [None]:
def validate(model, validation_dataloader, device):
    """
    Validates the model and returns average loss and accuracy.

    Args:
        model (torch.nn.Module): The trained model
        validation_dataloader (DataLoader): DataLoader for validation set
        device (torch.device): Device to run evaluation on

    Returns:
        avg_loss (float): Average validation loss
        avg_acc (float): Average validation accuracy
    """
    model.eval()
    total_loss = 0.0
    total_accuracy = 0.0
    num_batches = 0

    for batch in tqdm(validation_dataloader, desc="Validating"):
        b_input_ids = batch[0].to(device)
        b_token_type_ids = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        with torch.no_grad():
            outputs = model(
                input_ids=b_input_ids,
                token_type_ids=b_token_type_ids,
                attention_mask=b_input_mask,
                labels=b_labels
            )

        loss = outputs.loss
        logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        batch_accuracy = get_accuracy(logits, label_ids)
        total_accuracy += batch_accuracy
        total_loss += loss.item()
        num_batches += 1

    avg_loss = total_loss / num_batches
    avg_acc = total_accuracy / num_batches

    return avg_loss, avg_acc


# **Fine** -**Tuning**

In [None]:
!pip install --upgrade transformers




In [None]:
from torch.optim import AdamW  # Use AdamW from PyTorch
from transformers import get_linear_schedule_with_warmup


# Optimizer: AdamW is the recommended optimizer for BERT
optimizer = AdamW(
    model.parameters(),
    lr=config['learning_rate'],
    weight_decay=config['weight_decay']
)

# Total training steps = number of batches * number of epochs
n_epochs = config['n_epochs']
total_steps = len(train_dataloader) * n_epochs

# Scheduler: linear warmup followed by linear decay
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config['num_warmup_steps'],  # Typically 0.1 * total_steps
    num_training_steps=total_steps
)


In [None]:
import os

if not os.path.exists("/content/drive/MyDrive/FIQA/Model"):
    print(" 'model/' directory does not exist.")
else:
    print(" 'model/' directory exists.")


 'model/' directory exists.


In [None]:
# Set correct save directory
save_dir = "/content/drive/MyDrive/FIQA/Model"
os.makedirs(save_dir, exist_ok=True)

# Initialize best validation loss
best_valid_loss = float('inf')

# Training loop
for epoch in range(n_epochs):
    print(f"\n Epoch {epoch + 1}/{n_epochs}")

    # Training
    train_loss, train_acc = train(model, train_dataloader, optimizer, scheduler, device)

    # Validation
    valid_loss, valid_acc = validate(model, validation_dataloader, device)

    # Save model if validation improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        save_path = os.path.join(save_dir, f"{epoch + 1}_finbert-qa.pt")
        torch.save(model.state_dict(), save_path)
        print(f"New best model saved to {save_path}")

    # Print results
    print(f"\nResults for Epoch {epoch + 1}:")
    print(f"    Train Loss: {train_loss:.4f} | Accuracy: {train_acc * 100:.2f}%")
    print(f"    Validation Loss: {valid_loss:.4f} | Accuracy: {valid_acc * 100:.2f}%")



 Epoch 1/2


Training: 100%|██████████| 17738/17738 [7:21:27<00:00,  1.49s/it]
Validating: 100%|██████████| 1972/1972 [15:54<00:00,  2.07it/s]


New best model saved to /content/drive/MyDrive/FIQA/Model/1_finbert-qa.pt

Results for Epoch 1:
    Train Loss: 0.1098 | Accuracy: 97.74%
    Validation Loss: 0.0807 | Accuracy: 98.01%

 Epoch 2/2


Training: 100%|██████████| 17738/17738 [7:21:26<00:00,  1.49s/it]
Validating: 100%|██████████| 1972/1972 [15:48<00:00,  2.08it/s]


Results for Epoch 2:
    Train Loss: 0.0733 | Accuracy: 98.26%
    Validation Loss: 0.0891 | Accuracy: 98.03%





# Evaluate

We implement `predict()` to encode each question–candidate pair, run them through the fine-tuned BERT model to obtain relevance probabilities, and then sort candidates by their score. Using `get_rank()`, we re-rank every test query’s candidates, load the best checkpoint, perform the full test-set evaluation, and report average nDCG\@10, MRR\@10, and Precision\@1 as the final retrieval metrics.


In [None]:
import torch
from torch.nn.functional import softmax
import numpy as np

def predict(model, q_text, cands, max_seq_len):
    """
    Re-ranks the candidate answers for a given question using the model.

    Args:
        model (torch.nn.Module): Trained BERT model for classification.
        q_text (str): Question text.
        cands (list): List of candidate doc IDs.
        max_seq_len (int): Max token length for inputs.

    Returns:
        ranked_ans (list): Candidate doc IDs ranked by relevance.
        sorted_scores (list): Corresponding relevance scores.
    """
    model.eval()
    scores = []
    cands_id = np.array(cands)

    for docid in cands:
        ans_text = docid_to_text.get(docid, "")

        encoded = tokenizer.encode_plus(
            q_text,
            ans_text,
            max_length=max_seq_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids = encoded['input_ids'].to(device)
        token_type_ids = encoded['token_type_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask
            )
            logits = outputs.logits
            prob = softmax(logits, dim=1)
            scores.append(prob[:, 1].item())  # Probability of relevance (label=1)

    # Sort candidate IDs based on scores
    sorted_idx = np.argsort(scores)[::-1]
    ranked_ans = cands_id[sorted_idx].tolist()
    sorted_scores = np.round(np.array(scores)[sorted_idx], 3).tolist()

    return ranked_ans, sorted_scores


In [None]:
from tqdm import tqdm

def get_rank(model, test_set, max_seq_len):
    """
    Re-ranks candidate answers for each question in the test set using the fine-tuned model.

    Args:
        model (torch.nn.Module): Trained BERT model for sequence classification.
        test_set (list): List of test samples in [qid, [relevant_docids], [candidate_docids]] format.
        max_seq_len (int): Maximum sequence length for input encoding.

    Returns:
        dict: Mapping of qid -> list of ranked candidate docids based on model scores.
    """
    qid_pred_rank = {}
    model.eval()

    for seq in tqdm(test_set, desc=" Re-ranking"):
        qid, label, cands = seq
        q_text = qid_to_text.get(qid, "")

        ranked_ans, _ = predict(model, q_text, cands, max_seq_len)
        qid_pred_rank[qid] = ranked_ans

    return qid_pred_rank


In [None]:
from transformers import BertForSequenceClassification
import torch

# Path to the saved checkpoint
checkpoint_path = "/content/drive/MyDrive/FIQA/Model/1_finbert-qa.pt"

# Initialize model architecture
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Load saved weights
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.to(device)

print(" Model loaded successfully from:", checkpoint_path)


 Model loaded successfully from: /content/drive/MyDrive/FIQA/Model/1_finbert-qa.pt


In [None]:
from transformers import BertForSequenceClassification
import torch
import os

# Define the model path (use your actual saved model)
trained_model_path = "/content/drive/MyDrive/FIQA/Model/1_finbert-qa.pt"

# Ensure the model is initialized before loading weights
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_state_dict(torch.load(trained_model_path, map_location=device))
model.to(device)
model.eval()

print("Model loaded and ready for evaluation.")

# Evaluate on test set
print("\n Evaluating on test set...\n")
qid_pred_rank = get_rank(model, test_set, config['max_seq_len'])

# Evaluation parameters
k = 10
num_q = len(test_set)

# `labels` should be a dict {qid: [relevant_docids]} prepared during preprocessing
MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, labels, k)

# Output metrics
print("\n Evaluation Results:")
print(f" Average nDCG@{k} for {num_q} queries: {average_ndcg:.3f}")
print(f" MRR@{k} for {num_q} queries: {MRR:.3f}")
print(f" Precision@1 for {num_q} queries: {precision:.3f}")


Model loaded and ready for evaluation.

 Evaluating on test set...



 Re-ranking: 100%|██████████| 333/333 [09:35<00:00,  1.73s/it]


 Evaluation Results:
 Average nDCG@10 for 333 queries: 0.344
 MRR@10 for 333 queries: 0.417
 Precision@1 for 333 queries: 0.342





In [None]:
import os
os.makedirs('/content/drive/MyDrive/FIQA/Data/json_docs', exist_ok=True)
os.rename('/content/drive/MyDrive/FIQA/Data/docs_cleaned.json',
          '/content/drive/MyDrive/FIQA/Data/json_docs/docs_cleaned.json')


# Answer Re-ranking

In [None]:
!pip install -q pyserini


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/194.6 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m120.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m126.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m105.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m56.8 MB/s[0m eta 

In [None]:
!pip install pyserini==0.20.0

# Install minimal required packages to satisfy import checks
!pip install faiss-cpu torch==2.0.1
!pip install onnxruntime

# Step 1: Set classpath before anything else
import os

os.environ['CLASSPATH'] = (
    '/usr/local/lib/python3.11/dist-packages/pyserini/resources/jars/pyserini-0.20.0-fatjar.jar:'
    '/content/lucene-backward-codecs-9.9.1.jar'
)

!wget https://repo1.maven.org/maven2/org/apache/lucene/lucene-backward-codecs/9.9.1/lucene-backward-codecs-9.9.1.jar -P /content/

!sudo apt-get update
!sudo apt-get install openjdk-21-jdk -y

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# Also re-apply CLASSPATH
os.environ["CLASSPATH"] = (
    "/usr/local/lib/python3.11/dist-packages/pyserini/resources/jars/pyserini-0.20.0-fatjar.jar:"
    "/content/lucene-backward-codecs-9.9.1.jar"
)

Collecting pyserini==0.20.0
  Downloading pyserini-0.20.0-py3-none-any.whl.metadata (4.5 kB)
Collecting pyjnius>=1.4.0 (from pyserini==0.20.0)
  Downloading pyjnius-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting nmslib>=2.1.1 (from pyserini==0.20.0)
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting onnxruntime>=1.8.1 (from pyserini==0.20.0)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting pybind11<2.6.2 (from nmslib>=2.1.1->pyserini==0.20.0)
  Using cached pybind11-2.6.1-py2.py3-none-any.whl.metadata (8.7 kB)
Collecting coloredlogs (from onnxruntime>=1.8.1->pyserini==0.20.0)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredl

Collecting onnxruntime
  Using cached onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Using cached coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Using cached humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Using cached onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.4 MB)
Using cached coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
Using cached humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
Installing collected packages: humanfriendly, coloredlogs, onnxruntime
Successfully installed coloredlogs-15.0.1 humanfriendly-10.0 onnxruntime-1.22.0
--2025-07-07 10:51:24--  https://repo1.maven.org/maven2/org/apache/lucene/lucene-backward-codecs/9.9.1/lucene-backward-codecs-9.9.1.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
C

In [None]:
# Remove existing Java
!apt remove openjdk-11-jdk -y || true
!apt remove openjdk-17-jdk -y || true

# Install Java 21
!sudo add-apt-repository ppa:openjdk-r/ppa -y
!sudo apt-get update
!sudo apt install openjdk-21-jdk -y


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following package was automatically installed and is no longer required:
  openjdk-11-jre
Use 'apt autoremove' to remove it.
The following packages will be REMOVED:
  openjdk-11-jdk
0 upgraded, 0 newly installed, 1 to remove and 38 not upgraded.
After this operation, 3,046 kB disk space will be freed.
(Reading database ... 127252 files and directories currently installed.)
Removing openjdk-11-jdk:amd64 (11.0.27+6~us1-0ubuntu1~22.04) ...
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Package 'openjdk-17-jdk' is not installed, so not removed
The following package was automatically installed and is no longer required:
  openjdk-11-jre
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.
PPA publishes dbgsym, you may need to include 'main/debug' component
Repository: 'deb https://ppa.launchpadcon

In [None]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-21-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

!java -version


openjdk version "11.0.27" 2025-04-15
OpenJDK Runtime Environment (build 11.0.27+6-post-Ubuntu-0ubuntu122.04)
OpenJDK 64-Bit Server VM (build 11.0.27+6-post-Ubuntu-0ubuntu122.04, mixed mode, sharing)


In [None]:
!python -m pyserini.index.lucene \
  --collection JsonCollection \
  --input /content/drive/MyDrive/FIQA/Data/json_docs \
  --index /content/drive/MyDrive/FIQA/Indexes/lucene-index-fiqa \
  --generator DefaultLuceneDocumentGenerator \
  --threads 4 \
  --verbose


/usr/bin/python3: Error while finding module specification for 'pyserini.index.lucene' (ModuleNotFoundError: No module named 'pyserini')


# Retriever

In [None]:
from pyserini.search.lucene import LuceneSearcher
import pickle

# Load test set and query text map
with open('/content/drive/MyDrive/FIQA/Data/test_set.pickle', 'rb') as f:
    test_set = pickle.load(f)

with open('/content/drive/MyDrive/FIQA/Data/qid_to_text.pickle', 'rb') as f:
    qid_to_text = pickle.load(f)

# Lucene index path
FIQA_INDEX = "/content/drive/MyDrive/FIQA/Indexes/lucene-index-fiqa"

# Retriever using Pyserini
searcher = LuceneSearcher(FIQA_INDEX)

# Get a sample from the test set
seq = test_set[91]
qid, label = seq  # your test_set has 2 elements per row
q_text = qid_to_text[qid]
query = q_text
print(query)

# Retrieve top-50 answer candidates
hits = searcher.search(query, k=50)
cands = []

for i in range(len(hits)):
    cands.append(int(hits[i].docid))  # store doc IDs as ints


Are credit histories/scores international?


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

model_name = 'bert-base-uncased'  # or whatever you used to train

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.load_state_dict(torch.load(trained_model_path, map_location='cpu'))
model.eval()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

# Re-Ranker

This section reloads the fine‐tuned BERT tokenizer and sequence‐classification model, loads the saved checkpoint weights, and defines a `predict()` function that encodes each query–document pair, runs them through the model to get relevance scores, and then ranks the documents by those scores. Finally, it demonstrates reranking on a sample query by loading the document texts, invoking `predict()`, and printing the top-5 most relevant answers.


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Use the correct base: bert-base-uncased (since vocab size = 30522)
model_name = 'bert-base-uncased'

# Load tokenizer and model architecture
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load the trained weights (state_dict)
trained_model_path = "/content/drive/MyDrive/FIQA/Model/1_finbert-qa.pt"
state_dict = torch.load(trained_model_path, map_location='cpu')
model.load_state_dict(state_dict)
model.eval()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
def predict(model, query, docid_list, max_len=512):
    model.eval()
    inputs = []
    for docid in docid_list:
        doc_text = docid_to_text.get(docid, "[Missing document]")
        encoded = tokenizer.encode_plus(
            query,
            doc_text,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        inputs.append(encoded)

    scores = []
    with torch.no_grad():
        for encoded in inputs:
            output = model(**encoded)
            logits = output.logits
            score = logits[0][1].item()  # Score for class 1 = relevant
            scores.append(score)

    # Rank docids by score
    ranked = [doc for _, doc in sorted(zip(scores, docid_list), reverse=True)]
    return ranked, scores


In [None]:
import pickle

with open('/content/drive/MyDrive/FIQA/Data/docid_to_text.pickle', 'rb') as f:
    docid_to_text = pickle.load(f)

# Run the reranking
rank, scores = predict(model, query, cands, max_len=512)

# Print the Top-k answers
k = 5
print("Query:\n\t{}\n".format(query))
print("Top-{} Answers: \n".format(k))
for i in range(k):
    docid = rank[i]
    print("{}. {}\n".format(i + 1, docid_to_text.get(docid, "[Text missing]")))


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Query:
	Are credit histories/scores international?

Top-5 Answers: 

1. Currently the credit history are not International but are local. Many countries don't have a concept of credit history yet.   Having said that, if you are moving to US, depending on your history in your country, you can ask the same bank to provide you with a card and then start building history. For example in India I had a card with Citi Bank and when I moved to US for a short period, I was given a card based on my India Card, with equivalent credit in USD. If you are moving often internationally, it would make sense to Bank with a leading bank that provide services in geographies of your interest [Citi, HSBC, etc] and then in a new country approach these institutions to get you some starting credit for you to build a history.

2. It's not just that credit history is local; it's that it's a private business run for profit. The "big three" credit bureaus in the US are Experian, Equifax and Transunion.  They colle

# Analysis

This analysis section defines `get_rel()` to map any ranked list of doc IDs to a binary relevance vector based on the gold labels. The `predict()` function is reused to re-rank BM25 candidates by their model-predicted relevance probabilities, and we then compare the original BM25 ranking and the re-ranked list side by side. Finally, we print both rankings, their binary relevancies, and the model’s probability scores to illustrate how the re-ranker improves on the initial retrieval.


In [None]:
def get_rel(labels, ranked_list):
    """Return a binary vector indicating if docid in ranked_list is in labels."""
    return [1 if docid in labels else 0 for docid in ranked_list]


In [None]:
import torch
import torch.nn.functional as F

def predict(model, query, docid_list, max_len=512):
    model.eval()
    probs = []

    for docid in docid_list:
        doc_text = docid_to_text.get(docid, "[Missing document]")
        encoded = tokenizer.encode_plus(
            query,
            doc_text,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        with torch.no_grad():
            logits = model(**encoded).logits  # shape [1, 2]
            prob = F.softmax(logits, dim=1)[0][1].item()  # prob of class 1 (relevant)
            probs.append(prob)

    # Sort docids by probability (descending)
    ranked_docids = [doc for _, doc in sorted(zip(probs, docid_list), reverse=True)]
    sorted_probs = sorted(probs, reverse=True)

    return ranked_docids, sorted_probs


In [None]:
# Re-rank top BM25 candidates
rank, scores = predict(model, query, cands, max_len=512)

# Compute relevance vectors
cand_rel = get_rel(label, cands)
pred_rel = get_rel(label, rank)

# Print
print("Retriever: \n\t Ranking: {}\n\n\t Relevancy: {}\n".format(cands[:10], cand_rel[:10]))
print("Re-ranker: \n\t Ranking: {}\n\n\t Probability: {}\n\n\t Relevancy: {}".format(
    rank[:10],
    [round(s, 4) for s in scores[:10]],
    pred_rel[:10]
))
print("\nLabel: \n\t{}".format(label))


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Retriever: 
	 Ranking: [111466, 509739, 166875, 206267, 304578, 267422, 293363, 192641, 336468, 82472]

	 Relevancy: [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]

Re-ranker: 
	 Ranking: [346166, 184365, 304578, 245729, 501826, 509739, 75300, 592379, 111466, 23016]

	 Probability: [0.5703, 0.3482, 0.0913, 0.0045, 0.0038, 0.0021, 0.0016, 0.0014, 0.0011, 0.001]

	 Relevancy: [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]

Label: 
	[346166, 184365, 267422, 50080]




---



---





---



---



# **Financial Roberta Model**

This project focuses on developing a Financial Question Answering (QA) system using the Financial RoBERTa-QA framework. The system integrates techniques from Information Retrieval (IR) and Natural Language Processing (NLP).

In [None]:
import torch
import torch.nn.functional as F
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pickle

# Path to saved model weights (state_dict)
trained_model_path = "/content/drive/MyDrive/FIQA/Model/finroberta-qa.pt"

# Load supporting data
with open('/content/drive/MyDrive/FIQA/Data/docid_to_text.pickle', 'rb') as f:
    docid_to_text = pickle.load(f)

with open('/content/drive/MyDrive/FIQA/Data/qid_to_text.pickle', 'rb') as f:
    qid_to_text = pickle.load(f)

with open('/content/drive/MyDrive/FIQA/Data/test_set.pickle', 'rb') as f:
    test_set = pickle.load(f)


# Configure

We load all FIQA data mappings and train/validation/test splits from pickle files, print their sizes for verification, and initialize the Financial-RoBERTa tokenizer (`soleimanian/financial-roberta-large-sentiment`) for downstream encoding. Finally, we set up a new config dict tailored to fine-tuning this Financial-RoBERTa model with updated batch size and hyperparameters.


In [31]:
from transformers import RobertaTokenizer
import pickle
import os

# Helper function to load pickle files
def load_pickle(file_path):
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

# Base path
base_path = '/content/drive/MyDrive/FIQA/Data'

# Load data
docid_to_text = load_pickle(os.path.join(base_path, 'docid_to_text.pickle'))
qid_to_text = load_pickle(os.path.join(base_path, 'qid_to_text.pickle'))

train_set = load_pickle(os.path.join(base_path, 'train_set_50.pickle'))
valid_set = load_pickle(os.path.join(base_path, 'valid_set_50.pickle'))
test_set  = load_pickle(os.path.join(base_path, 'test_set_50.pickle'))

labels = load_pickle(os.path.join(base_path, 'labels.pickle'))

# Print dataset stats
print("Number of questions in the training set: {}".format(len(train_set)))
print("Number of questions in the validation set: {}".format(len(valid_set)))
print("Number of questions in the test set: {}".format(len(test_set)))

# Load tokenizer for the same model you're using
print('\nLoading Financial-RoBERTa tokenizer...')
tokenizer = RobertaTokenizer.from_pretrained('soleimanian/financial-roberta-large-sentiment')

config = {
    'bert_model_name': 'soleimanian/financial-roberta-large-sentiment',
    'max_seq_len': 512,
    'batch_size': 4,
    'learning_rate': 3e-6,
    'weight_decay': 0.01,
    'n_epochs': 2,
    'num_warmup_steps': 10000
}




Number of questions in the training set: 5676
Number of questions in the validation set: 631
Number of questions in the test set: 333

Loading Financial-RoBERTa tokenizer...


# Data Prepare

We define `get_input_data_roberta()` to tokenize each question–document pair with the Financial-RoBERTa tokenizer, producing per-pair input IDs, attention masks, and binary relevance labels. Then `get_dataloader_roberta()` wraps these tensors in PyTorch DataLoaders—with random sampling for training and sequential for validation—before silencing verbose Transformer logs and instantiating the train/validation loaders, whose batch counts are printed.


In [None]:
from tqdm import tqdm

def get_input_data_roberta(dataset, max_seq_len, tokenizer, qid_to_text, docid_to_text):
    """
    Creates input parameters for FinRoBERTa.

    Returns:
        input_ids: list of token ID sequences
        att_masks: list of attention masks
        labels: binary labels indicating document relevance
    Args:
        dataset: list of lists in the format [qid, [pos_docids], [candidate_docids]]
    """
    input_ids = []
    att_masks = []
    labels = []

    for i, seq in enumerate(tqdm(dataset)):
        qid, ans_labels, cands = seq[0], seq[1], seq[2]
        q_text = qid_to_text[qid]

        for docid in cands:
            ans_text = docid_to_text[docid]

            encoded_seq = tokenizer.encode_plus(
                q_text,
                ans_text,
                max_length=max_seq_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_token_type_ids=False
            )

            input_id = encoded_seq['input_ids']
            att_mask = encoded_seq['attention_mask']
            label = 1 if docid in ans_labels else 0

            assert len(input_id) == max_seq_len, "Input id dimension incorrect!"
            assert len(att_mask) == max_seq_len, "Attention mask dimension incorrect!"

            input_ids.append(input_id)
            att_masks.append(att_mask)
            labels.append(label)

    return input_ids, att_masks, labels


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def get_dataloader_roberta(dataset, split_type, max_seq_len, batch_size, tokenizer, qid_to_text, docid_to_text):
    """
    Creates train/validation/test DataLoader for FinRoBERTa.

    Args:
        dataset: List of lists in the form [qid, [pos ans], [ans cands]]
        split_type: str - 'train', 'validation', or 'test'
        max_seq_len: int
        batch_size: int
        tokenizer: RobertaTokenizer
        qid_to_text: dict mapping qid to question text
        docid_to_text: dict mapping docid to document text

    Returns:
        DataLoader object
    """
    input_ids, att_masks, labels = get_input_data_roberta(
        dataset, max_seq_len, tokenizer, qid_to_text, docid_to_text
    )

    # Convert to torch tensors
    input_ids = torch.tensor(input_ids)
    att_masks = torch.tensor(att_masks)
    labels = torch.tensor(labels)

    # Create dataset and sampler
    data = TensorDataset(input_ids, att_masks, labels)
    sampler = RandomSampler(data) if split_type == "train" else SequentialSampler(data)

    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)

    return dataloader


In [None]:
import transformers
transformers.logging.set_verbosity_error()


In [None]:
# Create DataLoaders using FinRoBERTa-compatible function
train_dataloader = get_dataloader_roberta(
    train_set, 'train',
    config['max_seq_len'],
    config['batch_size'],
    tokenizer,
    qid_to_text,
    docid_to_text
)

validation_dataloader = get_dataloader_roberta(
    valid_set, 'validation',
    config['max_seq_len'],
    config['batch_size'],
    tokenizer,
    qid_to_text,
    docid_to_text
)

# Print DataLoader sizes
print("\n\nSize of the training DataLoader: {}".format(len(train_dataloader)))
print("Size of the validation DataLoader: {}".format(len(validation_dataloader)))


100%|██████████| 5676/5676 [09:24<00:00, 10.06it/s]
100%|██████████| 631/631 [01:02<00:00, 10.14it/s]



Size of the training DataLoader: 70950
Size of the validation DataLoader: 7888





# Model

We load the `soleimanian/financial-roberta-large-sentiment` tokenizer and its corresponding sequence‐classification model from Hugging Face. Setting the model to evaluation mode (`model.eval()`) prepares it for inference on financial QA pairs.


In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

model_name = "soleimanian/financial-roberta-large-sentiment"



tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)
model.eval()


pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [None]:
import torch

print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")
!nvidia-smi


CUDA available: True
GPU: NVIDIA A100-SXM4-40GB
Sun Jul  6 22:22:26 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   33C    P0             48W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+


# Training and Validation

We ensure the FinRoBERTa model save directory exists and initialize `best_valid_loss` to track improvements. For each epoch, we record the start time, train the model with `train_roberta()`, and validate it with `validate_roberta()`. If the validation loss improves, we save the model checkpoint to Drive. Finally, we print each epoch’s duration, training/validation loss, and accuracy to monitor progress.


In [None]:
import numpy as np
import torch
import torch.nn.functional as F

def get_accuracy(preds, labels):
    """
    Compute binary classification accuracy for FinRoBERTa.

    Args:
        preds (torch.Tensor or np.ndarray): Logits or probabilities of shape [batch_size, 2]
        labels (torch.Tensor or np.ndarray): Ground truth labels (0 or 1)

    Returns:
        accuracy (float): Classification accuracy
    """
    # Convert logits to probabilities if needed
    if isinstance(preds, torch.Tensor):
        preds = preds.detach().cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()

    # Convert logits to predicted class
    probs = F.softmax(torch.tensor(preds), dim=1).numpy()  # shape [batch_size, 2]
    pred_labels = np.argmax(probs, axis=1)

    # Flatten labels just in case
    labels = labels.flatten()
    pred_labels = pred_labels.flatten()

    # Calculate accuracy
    accuracy = np.mean(pred_labels == labels)

    return accuracy


In [None]:
from tqdm import tqdm
import torch
import numpy as np

def train_roberta(model, train_dataloader, optimizer, scheduler, device):
    """
    Train FinRoBERTa model and return average loss and accuracy.

    Args:
        model: Pre-trained Roberta model with classification head
        train_dataloader: DataLoader
        optimizer: Optimizer
        scheduler: Learning rate scheduler
        device: torch.device ('cuda' or 'cpu')

    Returns:
        avg_loss (float)
        avg_acc (float)
    """
    total_loss = 0
    train_accuracy = 0
    num_steps = 0

    model.train()  # Set model to training mode

    for step, batch in enumerate(tqdm(train_dataloader)):
        # FinRoBERTa batches: [input_ids, attention_mask, labels]
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        # Forward pass (FinRoBERTa does not need token_type_ids)
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )

        # Unpack loss and logits
        loss = outputs.loss
        logits = outputs.logits

        # Calculate accuracy
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_accuracy = get_accuracy(logits, label_ids)

        train_accuracy += tmp_accuracy
        total_loss += loss.item()
        num_steps += 1

        # Backward + optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_loss = total_loss / len(train_dataloader)
    avg_acc = train_accuracy / num_steps

    return avg_loss, avg_acc


In [None]:
from tqdm import tqdm
import torch
import numpy as np

def validate_roberta(model, validation_dataloader, device):
    """
    Validate FinRoBERTa model and return average loss and accuracy.

    Args:
        model: Pre-trained Roberta model with classification head
        validation_dataloader: DataLoader
        device: torch.device ('cuda' or 'cpu')

    Returns:
        avg_loss (float)
        avg_acc (float)
    """
    model.eval()  # Set model to eval mode

    total_loss = 0
    eval_accuracy = 0
    num_steps = 0

    for batch in tqdm(validation_dataloader):
        # Move inputs to device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(
                input_ids=b_input_ids,
                attention_mask=b_input_mask,
                labels=b_labels
            )

        loss = outputs.loss
        logits = outputs.logits

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.detach().cpu().numpy()

        tmp_eval_accuracy = get_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        total_loss += loss.item()
        num_steps += 1

    avg_loss = total_loss / len(validation_dataloader)
    avg_acc = eval_accuracy / num_steps

    return avg_loss, avg_acc


# Fine Tune FinRoberta

In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# Optimizer for FinRoBERTa
optimizer = AdamW(
    model.parameters(),
    lr=config['learning_rate'],
    weight_decay=config['weight_decay']
)

# Number of epochs
n_epochs = config['n_epochs']

# Total training steps
total_steps = len(train_dataloader) * n_epochs

# Linear learning rate scheduler with warm-up
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config['num_warmup_steps'],
    num_training_steps=total_steps
)


In [None]:
import torch

# Automatically use GPU if available, else fallback to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move model to the selected device
model.to(device)


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [None]:
import os
import torch
import time

save_dir = '/content/drive/MyDrive/FIQA/Model'
os.makedirs(save_dir, exist_ok=True)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    print(f"\n======== Epoch {epoch + 1} / {n_epochs} ========")
    start_time = time.time()

    # === Train ===
    train_loss, train_acc = train_roberta(
        model, train_dataloader, optimizer, scheduler, device
    )

    # === Validate ===
    valid_loss, valid_acc = validate_roberta(
        model, validation_dataloader, device
    )

    # === Save if Best ===
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        save_path = os.path.join(save_dir, f'{epoch + 1}_finroberta-qa.pt')
        torch.save(model.state_dict(), save_path)
        print(f" Saved best model to {save_path}")

    # === Epoch Summary ===
    elapsed = time.time() - start_time
    print(f"\n Epoch {epoch + 1} Summary:")
    print(f"    Duration:        {elapsed:.2f} sec")
    print(f"    Train Loss:      {train_loss:.4f} | Accuracy: {train_acc * 100:.2f}%")
    print(f"    Validation Loss: {valid_loss:.4f} | Accuracy: {valid_acc * 100:.2f}%\n")





100%|██████████| 70950/70950 [5:34:04<00:00,  3.54it/s]
100%|██████████| 7888/7888 [11:30<00:00, 11.42it/s]


 Saved best model to /content/drive/MyDrive/FIQA/Model/1_finroberta-qa.pt

 Epoch 1 Summary:
    Duration:        20757.88 sec
    Train Loss:      0.1557 | Accuracy: 97.40%
    Validation Loss: 0.1249 | Accuracy: 98.11%




100%|██████████| 70950/70950 [5:34:04<00:00,  3.54it/s]
100%|██████████| 7888/7888 [11:30<00:00, 11.42it/s]


 Epoch 2 Summary:
    Duration:        20735.54 sec
    Train Loss:      0.0858 | Accuracy: 98.56%
    Validation Loss: 0.1268 | Accuracy: 98.17%






# Evaluate

We implement `predict_roberta()` to tokenize each question–document pair, run it through the Financial‐RoBERTa model to get relevance probabilities, and sort candidates by their score. Then `get_rank_roberta()` applies this to every test query, and `evaluate()` computes MRR, nDCG\@10, and Precision\@1 over the re‐ranked results, finally printing the average metrics.


In [None]:
import pickle

import warnings
warnings.filterwarnings("ignore")

from transformers.utils import logging
logging.set_verbosity_error()

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

qid_to_text = load_pickle("/content/drive/MyDrive/FIQA/qid_to_text.pickle")
docid_to_text = load_pickle("/content/drive/MyDrive/FIQA/docid_to_text.pickle")
labels = load_pickle("/content/drive/MyDrive/FIQA/labels.pickle")
test_set = load_pickle("/content/drive/MyDrive/FIQA/test_set_50.pickle")

# Normalize all IDs to string
qid_to_text = {str(k): v for k, v in qid_to_text.items()}
docid_to_text = {str(k): v for k, v in docid_to_text.items()}
labels = {str(k): set(map(str, v)) for k, v in labels.items()}
test_set = [(str(qid), q_text, list(map(str, cands))) for qid, q_text, cands in test_set]



In [None]:
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from tqdm import tqdm
import numpy as np
from torch.nn.functional import softmax
import warnings

# --- Setup ---
checkpoint_path = "/content/drive/MyDrive/FIQA/Model/1_finroberta-qa.pt"
model_name = "soleimanian/financial-roberta-large-sentiment"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_seq_len = 512
k = 10  # Top-k cutoff for evaluation

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Load model and weights
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.to(device)
model.eval()

print("FinRoBERTa model loaded and ready for evaluation.\n")

# --- Reranking Function ---
def predict_roberta(model, q_text, cands, max_seq_len, tokenizer, docid_to_text, device):
    cands_id = np.array(cands)
    scores = []

    for docid in cands:
        ans_text = docid_to_text.get(docid, "[Missing document]")
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            encoded = tokenizer.encode_plus(
                q_text, ans_text,
                max_length=max_seq_len,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )

        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

        probs = softmax(logits, dim=1)
        score = probs[:, 1].item() if probs.shape[1] > 1 else probs[:, 0].item()
        scores.append(score)

    sorted_idx = np.argsort(scores)[::-1]
    ranked_ans = list(cands_id[sorted_idx])
    sorted_scores = list(np.around(np.array(scores)[sorted_idx], 3))
    return ranked_ans, sorted_scores

# --- Wrapper for full test set ---
def get_rank_roberta(model, test_set, max_seq_len, tokenizer, qid_to_text, docid_to_text, device):
    qid_pred_rank = {}
    model.eval()

    for seq in tqdm(test_set, desc="Re-ranking"):
        # Handle both 2-item and 3-item formats
        if len(seq) == 3:
            qid, _, cands = seq
        elif len(seq) == 2:
            qid, cands = seq
        else:
            print(f"Malformed entry: {seq}")
            continue

        q_text = qid_to_text[qid]
        ranked_ans, _ = predict_roberta(
            model, q_text, cands, max_seq_len, tokenizer, docid_to_text, device
        )
        qid_pred_rank[qid] = ranked_ans
    return qid_pred_rank

# --- Evaluation ---
def evaluate(qid_pred_rank, qrels, k=10):
    MRR = 0.0
    ndcg = 0.0
    precision = 0.0
    rank_positions = []

    for qid, ranked_docs in qid_pred_rank.items():
        rel_docs = set(qrels.get(qid, []))

        # Reciprocal Rank
        rr = 0.0
        for i, docid in enumerate(ranked_docs[:k]):
            if docid in rel_docs:
                rr = 1.0 / (i + 1)
                rank_positions.append(i + 1)
                break
        else:
            rank_positions.append(0)
        MRR += rr

        # Precision@1
        if ranked_docs and ranked_docs[0] in rel_docs:
            precision += 1

        # nDCG@k
        dcg = 0.0
        for i, docid in enumerate(ranked_docs[:k]):
            if docid in rel_docs:
                dcg += 1.0 / np.log2(i + 2)
        idcg = sum([1.0 / np.log2(i + 2) for i in range(min(len(rel_docs), k))])
        ndcg += dcg / idcg if idcg > 0 else 0.0

    num_q = len(qid_pred_rank)
    return (
        MRR / num_q,
        ndcg / num_q,
        precision / num_q,
        rank_positions
    )

# --- Re-rank and Evaluate ---
# Make sure `test_set`, `qid_to_text`, `docid_to_text`, and `labels` are already defined

qid_pred_rank = get_rank_roberta(
    model, test_set, max_seq_len, tokenizer, qid_to_text, docid_to_text, device
)

MRR, average_ndcg, precision, rank_pos = evaluate(qid_pred_rank, labels, k)

# --- Results ---
num_q = len(qid_pred_rank)
print("\nEvaluation Results:")
print(f"Average nDCG@{k} for {num_q} queries: {average_ndcg:.3f}")
print(f"MRR@{k} for {num_q} queries: {MRR:.3f}")
print(f"Precision@1 for {num_q} queries: {precision:.3f}")

pytorch_model.bin:  30%|###       | 619M/2.04G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

FinRoBERTa model loaded and ready for evaluation.



Re-ranking: 100%|██████████| 333/333 [09:01<00:00,  1.63s/it]


Evaluation Results:
Average nDCG@10 for 333 queries: 0.362
MRR@10 for 333 queries: 0.436
Precision@1 for 333 queries: 0.366





# Answer Reranking

In [30]:
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer
from tqdm import tqdm
import numpy as np
from torch.nn.functional import softmax
import warnings

# --- Setup ---
checkpoint_path = "/content/drive/MyDrive/FIQA/Model/1_finroberta-qa.pt"
model_name = "soleimanian/financial-roberta-large-sentiment"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_seq_len = 512
k = 10  # Top-k cutoff for evaluation

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Load model and weights
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)
model.load_state_dict(torch.load(checkpoint_path, map_location=device))
model.to(device)
model.eval()

print("FinRoBERTa model loaded and ready for evaluation.\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/936 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

FinRoBERTa model loaded and ready for evaluation.



In [18]:
import pickle

# Base path
base_path = "/content/drive/MyDrive/FIQA"

# Load docid_to_text and convert keys to str
with open(f"{base_path}/docid_to_text.pickle", "rb") as f:
    docid_to_text = pickle.load(f)
docid_to_text = {str(k): v for k, v in docid_to_text.items()}

# Load qid_to_text and convert keys to str
with open(f"{base_path}/qid_to_text.pickle", "rb") as f:
    qid_to_text = pickle.load(f)
qid_to_text = {str(k): v for k, v in qid_to_text.items()}

# Load test set
with open(f"{base_path}/test_set_50.pickle", "rb") as f:
    test_set = pickle.load(f)

# Load ground truth labels
with open(f"{base_path}/labels.pickle", "rb") as f:
    labels = pickle.load(f)
labels = {str(k): v for k, v in labels.items()}

In [23]:
from rank_bm25 import BM25Okapi

# Prepare corpus
doc_ids = sorted(docid_to_text.keys())
doc_texts = [docid_to_text[docid] for docid in doc_ids]
tokenized_corpus = [text.split() for text in doc_texts]
bm25 = BM25Okapi(tokenized_corpus)

# Search for one good test sample
for idx, sample in enumerate(test_set):
    qid, label = sample[0], sample[1]
    qid = str(qid)  # Fix type mismatch here
    query = qid_to_text[qid]
    query_tokens = query.split()
    scores = bm25.get_scores(query_tokens)
    top_indices = sorted(range(len(scores)), key=lambda i: -scores[i])[:50]
    cands = [doc_ids[i] for i in top_indices]

    if any(str(lab) in cands for lab in label):
        print(f"Found good test sample: test_set[{idx}]")
        print("QID:", qid)
        print("Label:", label)
        print("Matching candidates:", set(cands) & set(map(str, label)))
        break


Found good test sample: test_set[4]
QID: 458
Label: [263485, 218858]
Matching candidates: {'263485'}


In [24]:
qid, label = str(test_set[idx][0]), test_set[idx][1]
query = qid_to_text[qid]
query_tokens = query.split()

scores = bm25.get_scores(query_tokens)
top_indices = sorted(range(len(scores)), key=lambda i: -scores[i])[:50]
cands = [doc_ids[i] for i in top_indices]


In [28]:
def predict_roberta(model, q_text, cands, max_seq_len, tokenizer, docid_to_text, device):
    from torch.nn.functional import softmax
    import torch

    scores = []
    for docid in cands:
        ans_text = docid_to_text.get(str(docid), "")  # Ensure key is str
        encoded = tokenizer.encode_plus(
            q_text,
            ans_text,
            max_length=max_seq_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        with torch.no_grad():
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            prob = softmax(output.logits, dim=1)
            relevance_score = prob[0][1].item()  # Assuming label 1 = relevant
            scores.append(relevance_score)

    sorted_indices = sorted(range(len(scores)), key=lambda i: -scores[i])
    ranked_docids = [cands[i] for i in sorted_indices]
    sorted_scores = [scores[i] for i in sorted_indices]
    return ranked_docids, sorted_scores


In [None]:
def get_rel(relevant_docids, candidate_docids):
    return [1 if int(docid) in relevant_docids else 0 for docid in candidate_docids]
rank, probs = predict_roberta(
    model,
    query,
    cands,
    config['max_seq_len'],
    tokenizer,
    docid_to_text,
    device
)

cand_rel = get_rel(label, cands)
pred_rel = get_rel(label, rank)

In [None]:
print("Question: \n\t{}\n".format(query))

print("Retriever: \n\t Ranking: {}\n\n\t Relevancy: {}\n".format(cands[:10], cand_rel[:10]))

print("Re-ranker (FinRoBERTa):\n\t Ranking: {}\n\n\t Probability: {}\n\n\t Relevancy: {}\n".format(
    rank[:10], [round(p, 3) for p in probs[:10]], pred_rel[:10]))

print("Label DocIDs:\n\t{}\n".format(label))

print("Answer Re-ranker:\n\t{}\n".format(docid_to_text[str(rank[0])]))
print("Answer Retriever:\n\t{}\n".format(docid_to_text[str(cands[0])]))
print("Label (first):\n\t{}\n".format(docid_to_text[str(label[0])]))

Question: 
	How would IRS treat reimbursement in a later year of moving expenses?

Retriever: 
	 Ranking: ['62869', '263485', '382657', '131451', '297241', '217715', '300254', '272709', '292811', '303411']

	 Relevancy: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

Re-ranker (FinRoBERTa):
	 Ranking: ['263485', '62869', '357094', '131451', '160340', '42999', '300254', '2528', '97211', '21910']

	 Probability: [0.994, 0.059, 0.006, 0.005, 0.003, 0.0, 0.0, 0.0, 0.0, 0.0]

	 Relevancy: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Label DocIDs:
	[263485, 218858]

Answer Re-ranker:
	IRS pub 521 has all the information you need. Expenses reimbursed. If you are reimbursed for your expenses and you   use the cash method of accounting, you can deduct your expenses either   in the year you paid them or in the year you received the   reimbursement. If you use the cash method of accounting, you can   choose to deduct the expenses in the year you are reimbursed even   though you paid the expenses in a different year. See Choo

# Error Analysis

This section loads the FIQA test queries and labels, generates BM25 candidates, reranks them with FinRoBERTa, and then identifies any “false negative” relevant documents that didn’t appear in the top-10 predictions. It compiles these error cases into a Pandas DataFrame and saves the results as a CSV on Google Drive for detailed error analysis.


In [2]:
import pickle

# Base path
base_path = "/content/drive/MyDrive/FIQA"

# Load docid_to_text and convert keys to str
with open(f"{base_path}/docid_to_text.pickle", "rb") as f:
    docid_to_text = pickle.load(f)
docid_to_text = {str(k): v for k, v in docid_to_text.items()}

# Load qid_to_text and convert keys to str
with open(f"{base_path}/qid_to_text.pickle", "rb") as f:
    qid_to_text = pickle.load(f)
qid_to_text = {str(k): v for k, v in qid_to_text.items()}

# Load test set
with open(f"{base_path}/test_set_50.pickle", "rb") as f:
    test_set = pickle.load(f)

# Load ground truth labels
with open(f"{base_path}/labels.pickle", "rb") as f:
    labels = pickle.load(f)
labels = {str(k): v for k, v in labels.items()}

# Sample verification
print(" Loaded all data successfully!")
print(f"Sample docid: {list(docid_to_text.keys())[0]}")
print(f"Sample qid: {list(qid_to_text.keys())[0]}")
print(f"Sample test sample: {test_set[0]}")
print(f"Sample label for QID {test_set[0][0]}: {labels.get(str(test_set[0][0]), [])}")

 Loaded all data successfully!
Sample docid: 3
Sample qid: 0
Sample test sample: [14, [398960], [350497, 398960, 442533, 557838, 302722, 527120, 21219, 106185, 159660, 96910, 403501, 490170, 342073, 461526, 136857, 565827, 430120, 367391, 448405, 263464, 16646, 385949, 493939, 552163, 237215, 546115, 196683, 513658, 544172, 538860, 149820, 464560, 237800, 237911, 462831, 435404, 151442, 528034, 216494, 143247, 158864, 209754, 273381, 73283, 313590, 530110, 11148, 507829, 250873, 509659]]
Sample label for QID 14: [np.int64(398960)]


In [32]:
finroberta_results = {}

for sample in test_set:
    qid = str(sample[0])
    query = qid_to_text[int(qid)] if int(qid) in qid_to_text else qid_to_text[qid]
    query_tokens = query.split()

    # BM25 candidate docs
    scores = bm25.get_scores(query_tokens)
    top_indices = sorted(range(len(scores)), key=lambda i: -scores[i])[:50]
    cands = [doc_ids[i] for i in top_indices]

    # Rerank with FinRoBERTa
    rank, probs = predict_roberta(
        model, query, cands, config['max_seq_len'], tokenizer, docid_to_text, device
    )

    finroberta_results[qid] = rank

In [33]:
def create_ranked_docids(predictions_dict):
    return {str(qid): [str(docid) for docid in docids] for qid, docids in predictions_dict.items()}

finroberta_pred_rank = create_ranked_docids(finroberta_results)

In [34]:
false_negatives = []

for qid, ranked_docs in finroberta_pred_rank.items():
    relevant = set(labels.get(str(qid), []))
    top_k = set(ranked_docs[:10])
    missed = relevant - top_k

    if missed:
        try:
            query_text = qid_to_text[int(qid)] if int(qid) in qid_to_text else qid_to_text[str(qid)]
        except KeyError:
            print(f"Skipping qid '{qid}' — not found in qid_to_text")
            continue

        false_negatives.append({
            "qid": qid,
            "query": query_text,
            "relevant": list(relevant),
            "predicted_top10": ranked_docs[:10],
            "missed_relevant": list(missed)
        })

In [35]:
# Convert np.int64 to regular int or str for cleaner printing
for error in false_negatives:
    error["relevant"] = [int(x) for x in error["relevant"]]
    error["missed_relevant"] = [int(x) for x in error["missed_relevant"]]
    error["predicted_top10"] = [str(x) for x in error["predicted_top10"]]

In [36]:
# Pretty-print first 5 errors like example
for error in false_negatives[:5]:
    print(f"\nQuery ID: {error['qid']}")
    print(f"Query Text: {error['query']}")
    print(f"Relevant DocIDs: {error['relevant']}")
    print(f"Top-10 Predicted: {error['predicted_top10']}")
    print(f"Missed Relevant Docs: {error['missed_relevant']}")


Query ID: 14
Query Text: What are 'business fundamentals'?
Relevant DocIDs: [398960]
Top-10 Predicted: ['231254', '271150', '512151', '110394', '500034', '381103', '155074', '531626', '477208', '487094']
Missed Relevant Docs: [398960]

Query ID: 68
Query Text: Intentions of Deductible Amount for Small Business
Relevant DocIDs: [19183]
Top-10 Predicted: ['92232', '278702', '171276', '305982', '181412', '418999', '115274', '415899', '447167', '37134']
Missed Relevant Docs: [19183]

Query ID: 70
Query Text: Car as business expense, but not because of driving
Relevant DocIDs: [327002]
Top-10 Predicted: ['397608', '231279', '202315', '216077', '237827', '398141', '131334', '117274', '118280', '140714']
Missed Relevant Docs: [327002]

Query ID: 81
Query Text: Does revenue equal gross profit for info product business?
Relevant DocIDs: [451207]
Top-10 Predicted: ['324306', '447641', '503114', '288074', '505854', '240252', '430610', '58244', '34812', '260519']
Missed Relevant Docs: [451207]

Q

In [37]:
import pandas as pd

# Convert to DataFrame
df_errors = pd.DataFrame(false_negatives)

# Save to Drive with the desired filename
output_path = "/content/drive/MyDrive/FIQA/error_analysis_finRoberta.csv"
df_errors.to_csv(output_path, index=False)

print(f"Saved error analysis to: {output_path}")

Saved error analysis to: /content/drive/MyDrive/FIQA/error_analysis_finRoberta.csv
