In [1]:
import pyterrier as pt
import pickle
import pandas as pd
import shutil
import os
import re
from pathlib import Path
from pyterrier.measures import *

# Data exploration

In [2]:
dataset = pt.get_dataset('irds:beir/fiqa/test')

In [3]:
text = dataset.get_corpus_iter()
display(text)

beir/fiqa/test documents:   0%|          | 0/57638 [00:00<?, ?it/s]

<pyterrier.utils.GeneratorLen at 0x1c377daa780>

In [4]:
queries = dataset.get_topics()
display(queries)

Unnamed: 0,qid,query
0,4641,Where should I park my rainy-day / emergency f...
1,5503,Tax considerations for selling a property belo...
2,7803,Can the Delta be used to calculate the option ...
3,7017,Basic Algorithmic Trading Strategy
4,10152,What does a high operating margin but a small ...
...,...,...
643,4102,How can I determine if my rate of return is “g...
644,3566,Where can I buy stocks if I only want to inves...
645,94,Using credit card points to pay for tax deduct...
646,2551,How to find cheaper alternatives to a traditio...


In [5]:
qrels = dataset.get_qrels()
display(qrels)

Unnamed: 0,qid,docno,label,iteration
0,8,566392,1,0
1,8,65404,1,0
2,15,325273,1,0
3,18,88124,1,0
4,26,285255,1,0
...,...,...,...,...
1701,11039,330058,1,0
1702,11039,91183,1,0
1703,11054,155053,1,0
1704,11054,321015,1,0


In [6]:
def ensure_java_home():
    if os.environ.get("JAVA_HOME"):
        return
    java = shutil.which("java.exe" if os.name == "nt" else "java")
    if java:
        jdk_parent = Path(java).resolve().parents[1]
        cand = jdk_parent / ("bin/java.exe" if os.name == "nt" else "bin/java")
        if cand.exists():
            os.environ["JAVA_HOME"] = str(jdk_parent)
            return
    if not os.environ.get("JAVA_HOME"):
        raise EnvironmentError(
            "JAVA_HOME is not set. Install a JDK and set JAVA_HOME to the JDK folder."
        )

ensure_java_home()

def indexing(index_path, ds):
    index_path = str(Path(f"./{index_path}").resolve())
    if os.path.exists(index_path):
        shutil.rmtree(index_path)
    indexer = pt.index.IterDictIndexer(index_path, meta=["docno", "text"]) 
    indexref = indexer.index(ds.get_corpus_iter())
    index = pt.IndexFactory.of(indexref)
    stats = index.getCollectionStatistics()
    return indexref, stats

def rmv_problems(text: str) -> str:
    text = str(text)
    text = text.replace(":", " ")
    text = text.replace("'", " ")
    text = text.replace('"', " ")
    text = re.sub(r"\s+", " ", text).strip()

    return text



In [7]:
indexref, stats = indexing('beir-index', dataset)

Java started (triggered by TerrierIndexer.__init__) and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]




beir/fiqa/test documents: 100%|██████████| 57638/57638 [00:35<00:00, 1627.97it/s]


22:34:05.582 [main] WARN org.terrier.structures.indexing.Indexer -- Indexed 39 empty documents


In [8]:
print("Index folder:", 'beir-index')
print("Number of documents:", stats.getNumberOfDocuments())
print("Number of postings:", stats.getNumberOfPostings())
print("Number of tokens:", stats.getNumberOfTokens())
print("Number of unique terms:", stats.getNumberOfUniqueTerms())
print(f"Average document length:{stats.getAverageDocumentLength():.2f}")


Index folder: beir-index
Number of documents: 57638
Number of postings: 2714611
Number of tokens: 3783214
Number of unique terms: 51260
Average document length:65.64


# Baseline models

In [9]:
bm25 = pt.terrier.Retriever(indexref, wmodel='BM25', controls={"qtype": "matchop"})
tfidf = pt.terrier.Retriever(indexref, wmodel="TF_IDF", controls={"qtype": "matchop"})

In [10]:
queries['query'] = queries['query'].apply(rmv_problems)

In [11]:
res_bm25 = bm25.transform(queries[['qid','query']])
res_tfidf = tfidf.transform(queries[['qid','query']])

In [16]:
metrics = [P@1, P@5, P@10, R@5, R@10, nDCG@5, nDCG@10, AP]

1st baseline experiment

In [None]:

pt.Experiment(
    [tfidf, bm25],
    queries,
    qrels,
    eval_metrics=metrics,
    names=["TF_IDF", "BM25"],
    #baseline=0, 
    verbose=True,
)

pt.Experiment: 100%|██████████| 2/2 [00:38<00:00, 19.26s/system]


Unnamed: 0,name,AP,P@1,P@5,P@10,R@5,R@10,nDCG@5,nDCG@10
0,TF_IDF,0.209962,0.236111,0.108642,0.071142,0.249805,0.313278,0.231547,0.253659
1,BM25,0.210385,0.236111,0.106481,0.07037,0.247471,0.309708,0.23006,0.252589


2nd baseline experiment: custom formulas + RRF

In [None]:
import pyterrier_alpha as pta
combined = bm25 + tfidf
dph = pt.terrier.Retriever(indexref, wmodel="DPH")

rrf = pta.RRFusion(bm25, tfidf, k=60, num_results=1000)
weighted_linear = bm25 + 2 * dph

In [16]:
systems = [bm25, tfidf, combined, weighted_linear, rrf]
names   = ["BM25", "TF_IDF", "(BM25+TFIDF)", "Weighted(BM25+2*TFIDF)", "RRF"]

results = pt.Experiment(
    systems,
    queries,
    qrels,
    metrics,
    names,
    perquery=False  
)
display(results)

Unnamed: 0,name,AP,P@1,P@5,P@10,R@5,R@10,nDCG@5,nDCG@10
0,BM25,0.210385,0.236111,0.106481,0.07037,0.247471,0.309708,0.23006,0.252589
1,TF_IDF,0.209962,0.236111,0.108642,0.071142,0.249805,0.313278,0.231547,0.253659
2,(BM25+TFIDF),0.210414,0.236111,0.108025,0.070216,0.249713,0.30825,0.231324,0.252182
3,Weighted(BM25+2*TFIDF),0.208011,0.239198,0.107099,0.070216,0.242903,0.31158,0.228979,0.252283
4,RRF,0.209778,0.236111,0.108025,0.069907,0.24862,0.307249,0.23067,0.251379


3rd baseline experiment

In [18]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "microsoft/Phi-3.5-mini-instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
`torch_dtype` is deprecated! Use `dtype` instead!
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Fetching 2 files: 100%|██████████| 2/2 [18:32<00:00, 556.43s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.25it/s]
Some parameters are on the meta device because they were offloaded to the cpu and disk

In [12]:
import re

def llm_expand_query(q: str, n_terms=10) -> str:
    prompt = f"""You are generating query expansion terms for searching financial documents.

Return ONLY a comma-separated list of 12 short keywords/phrases (1-3 words each).
Rules:
- Use synonyms, accounting terms, regulatory terms, report/filing terms, and common abbreviations.
- Do NOT introduce new company names, tickers, years, currencies, numbers, or events.
- No explanations, no numbering, no quotes.

Query: {q}
Expansion:"""

    out = pipe(prompt, max_new_tokens=60, do_sample=False, return_full_text=False)[0]["generated_text"]

    out = out.strip()
    out = out.replace('"', '').replace("'", "")
    out = re.sub(r'^\s*(expansion\s*:)?\s*', '', out, flags=re.I)

    terms = [t.strip() for t in re.split(r"[,\n;]+", out) if t.strip()]
    terms = [t for t in terms if 1 <= len(t.split()) <= 4][:n_terms]

    return q + " " + " ".join(terms)


In [None]:
queries_exp = queries.copy()
queries_exp["query"] = queries_exp["query"].map(lambda x: llm_expand_query(x, n_terms=10)) 


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


KeyboardInterrupt: 

In [20]:
queries_exp = pd.read_csv('./data/expanded_llm.csv')

In [17]:
import pyterrier as pt

# Ensure both are DataFrames with qid + query
queries = queries[["qid","query"]].copy()
queries_exp = queries_exp[["qid","query"]].copy()

# map: qid -> expanded query
qid2exp = dict(zip(queries_exp["qid"], queries_exp["query"]))

# transformer: replace query text based on qid
rewrite_from_precomputed = pt.apply.generic(
    lambda df: df.assign(
        query=df["qid"].map(qid2exp).where(
            df["qid"].map(qid2exp).notna(),
            df["query"]
        ).astype(str)
    )
)
bm25_exp = rewrite_from_precomputed >> bm25

results = pt.Experiment(
    [bm25, bm25_exp],
    queries,          
    qrels,
    eval_metrics=metrics,
    names=["BM25", "BM25 + LLM expansion"],
    verbose=True
)


pt.Experiment: 100%|██████████| 2/2 [00:45<00:00, 22.71s/system]


In [18]:
results

Unnamed: 0,name,AP,P@1,P@5,P@10,R@5,R@10,nDCG@5,nDCG@10
0,BM25,0.210385,0.236111,0.106481,0.07037,0.247471,0.309708,0.23006,0.252589
1,BM25 + LLM expansion,0.210385,0.236111,0.106481,0.07037,0.247471,0.309708,0.23006,0.252589


# Advanced models

In [None]:
from transformers import pipeline

ner_pipe = pipeline(
    "token-classification",
    model="dslim/bert-base-NER",
    aggregation_strategy="simple"
)

def extract_ner_hf(text: str):
    ents = ner_pipe(text[:2000])  
    return [{"text": e["word"], "label": e["entity_group"], "score": float(e["score"])} for e in ents]


queries['ner'] = queries['query'].apply(lambda x: extract_ner_hf(x,model))



In [None]:
!pip install keybert

In [None]:
from keybert import KeyBERT

kw_model = KeyBERT()

def keyBert_extraction(query,kw_model):
  keywords_list = kw_model.extract_keywords(query, keyphrase_ngram_range=(1, 2), stop_words='english',
                              use_maxsum=False, nr_candidates=25, top_n=20)

  return [{"tag": k, "score": float(s)} for k, s in keywords_list]


queries['keybert_tags'] = queries['query'].apply(lambda x: keyBert_extraction(x,kw_model))

display(queries)

In [None]:
raw_indexref = indexing('raw_docs',dataset)
ner_indexref= indexing('entities',dataset)
tag_indexref = indexing('tags',dataset)

Advanced model 2

In [None]:
from FlagEmbedding import FlagLLMReranker

reranker = FlagLLMReranker("BAAI/bge-reranker-v2-gemma", use_fp16=True)