In [None]:
!pip install tiktoken
!pip install -U voyageai
!pip install rank-bm25

In [87]:
import os
import re
import ast
import json
import torch
import random
import tiktoken
import voyageai
import zipfile
import requests
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from rag_utils import *

In [None]:
!git clone https://github.com/RegNLP/ObliQADataset.git

# Read the 40 structured regulatory documents (1.json ... 40.json) and flatten
# their contents into one list of passage records.
documents_path = "ObliQADataset/StructuredRegulatoryDocuments"

docs = []
for doc_number in range(1, 41):
    with open(os.path.join(documents_path, f'{doc_number}.json'), 'r') as doc_file:
        # every file is iterated element by element, so extend keeps `docs` flat
        docs.extend(json.load(doc_file))

print(f'Loaded 40 documents containing {len(docs)} passages.')

In [48]:
from spacy.lang.en import English
from transformers import AutoTokenizer

def enhance(docs: list, model_name: str = 'voyageai/voyage-law-2')->list:
    """Annotate every passage dict in ``docs`` with text statistics (in place).

    For each passage the following keys are written:
    ``Combined`` (``PassageID`` + space + ``Passage``), ``char_count``,
    ``tokens_count``, ``sentences`` (list of plain strings) and
    ``sentence_count``.

    Args:
        docs: list of passage dicts holding at least ``PassageID`` and ``Passage``.
        model_name: ``"cl100k_base"`` selects the tiktoken encoding; any other
            value is loaded through ``AutoTokenizer.from_pretrained``.

    Returns:
        The same ``docs`` list, with every passage mutated.
    """
    use_tiktoken = model_name == "cl100k_base"
    if use_tiktoken:
        tokenizer = tiktoken.get_encoding("cl100k_base")
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    # blank English pipeline with only a rule-based sentence splitter
    sentence_splitter = English()
    sentence_splitter.add_pipe("sentencizer")

    for passage in tqdm(docs):
        combined = passage['PassageID'] + " " + passage['Passage']
        passage['Combined'] = combined
        passage['char_count'] = len(combined)

        if use_tiktoken:
            passage['tokens_count'] = len(tokenizer.encode(combined))
        else:
            # NOTE(review): truncation=True presumably caps this count at the
            # tokenizer's maximum length - confirm for very long passages
            passage['tokens_count'] = len(tokenizer(combined, truncation=True)[0])

        # spaCy yields span objects; store them as plain strings
        passage["sentences"] = [str(span) for span in sentence_splitter(combined).sents]
        passage["sentence_count"] = len(passage["sentences"])

    return docs

In [52]:
def token_based_split(passage: dict, model_name: str, tokenizer, max_tokens: int)->list:
    """Greedily group a passage's sentences into chunks of at most ``max_tokens``.

    Sentences are never split: a single sentence longer than ``max_tokens``
    becomes a chunk of its own. Empty chunks are never emitted, so a passage
    without sentences yields an empty list.

    Args:
        passage: dict with a ``sentences`` list of strings.
        model_name: ``"cl100k_base"`` means ``tokenizer`` exposes ``encode``
            (tiktoken style); otherwise ``tokenizer`` is called directly
            (Hugging Face style).
        tokenizer: tokenizer object matching ``model_name``.
        max_tokens: token budget per chunk.

    Returns:
        A list of chunks, each chunk being a non-empty list of sentences.
    """
    use_tiktoken = model_name == "cl100k_base"

    sentence_chunks = []
    current_chunk = []
    current_chunk_tokens = 0

    for sentence in passage['sentences']:
        if use_tiktoken:
            sentence_tokens = len(tokenizer.encode(sentence))
        else:
            sentence_tokens = len(tokenizer(sentence, truncation=True)[0])

        # Close the running chunk only when it already holds something;
        # otherwise an oversized first sentence would produce an empty chunk.
        if current_chunk and current_chunk_tokens + sentence_tokens > max_tokens:
            sentence_chunks.append(current_chunk)
            current_chunk = []
            current_chunk_tokens = 0

        current_chunk.append(sentence)
        current_chunk_tokens += sentence_tokens

    # flush the trailing chunk (skipped when the passage had no sentences)
    if current_chunk:
        sentence_chunks.append(current_chunk)
    return sentence_chunks

In [None]:
# add Combined text, token/char counts and sentence lists to every passage
docs = enhance(docs, model_name='voyageai/voyage-law-2')

In [None]:
# Group each passage's sentences into token-bounded chunks.
model_name = 'voyageai/voyage-law-2'
if model_name == "cl100k_base":
    tokenizer = tiktoken.get_encoding("cl100k_base")
else:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

for passage in tqdm(docs):
    # budget: 8191 for openai and 16000 for law and finance
    chunks = token_based_split(passage, model_name, tokenizer, max_tokens=16000)
    passage["sentence_chunks"] = chunks
    passage["num_chunks"] = len(chunks)

In [None]:
import re

# Turn every sentence chunk into its own record carrying the passage metadata.
use_tiktoken = model_name == "cl100k_base"
psg_chunks = []
for item in tqdm(docs):
    for sentence_chunk in item["sentence_chunks"]:
        # Merge the chunk's sentences into one paragraph-like string.
        # NOTE(review): sentences are concatenated without a separator and only
        # ".<Capital>" boundaries are re-spaced below; other boundaries stay glued.
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        # ".A" => ". A" (works for any capital letter)
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk)

        if use_tiktoken:
            token_count = len(tokenizer.encode(joined_sentence_chunk))
        else:
            token_count = len(tokenizer(joined_sentence_chunk, truncation=True)[0])

        psg_chunks.append({
            'ID': item['ID'],
            'DocumentID': item["DocumentID"],
            'PassageID': item['PassageID'],
            "sentence_chunk": joined_sentence_chunk,
            # basic size statistics of the chunk
            "chunk_char_count": len(joined_sentence_chunk),
            "chunk_word_count": len(joined_sentence_chunk.split(" ")),
            "chunk_token_count": token_count,
        })

len(psg_chunks)

In [None]:
# summary statistics of the numeric chunk columns
df = pd.DataFrame(data=psg_chunks)
df.describe().round(decimals=2)

# Embeddings creation

In [22]:
from openai import OpenAI

# Credentials are read from the environment so that keys never have to be
# pasted into (and saved with) the notebook. When a variable is unset the
# value falls back to an empty string, matching the previous placeholders.
open_api_key = os.environ.get("OPENAI_API_KEY", "") # your OpenAI api key

client = OpenAI(api_key=open_api_key)
GPT_MODEL = "gpt-4o-2024-08-06"
EMBEDDING_MODEL = "text-embedding-3-large"

voyage_api_key = os.environ.get("VOYAGE_API_KEY", "") # your VoyageAI api key
vo = voyageai.Client(api_key=voyage_api_key)

In [23]:
from tenacity import retry, wait_random_exponential, stop_after_attempt

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model=EMBEDDING_MODEL) -> list[float]:
    """Embed a single text with the OpenAI client and return its vector.

    The call is retried (up to 6 attempts, random exponential wait) by the
    ``retry`` decorator.
    """
    response = client.embeddings.create(input=[text], model=model)
    # one input was sent, so the first data item is the one we want
    first_item = response.data[0]
    return first_item.embedding

# Batch variant: takes a list of texts and returns one embedding per text
@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(10))
def get_embeddings(input):
    """Embed a batch of texts with ``EMBEDDING_MODEL`` via the OpenAI client.

    Returns the ``embedding`` of every item in the response, in response order.
    Retried up to 10 times with random exponential wait.
    """
    items = client.embeddings.create(model=EMBEDDING_MODEL, input=input).data
    embeddings = []
    for item in items:
        embeddings.append(item.embedding)
    return embeddings

@retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(6))
def embed_with_backoff(**kwargs):
    """Forward ``kwargs`` to ``vo.embed`` and return its result unchanged.

    Retried up to 6 times with random exponential wait.
    """
    result = vo.embed(**kwargs)
    return result

In [36]:
def create_embeddings(psg_chunks: list, content_key: str, emb_model_name: str, batch_size: int = 100, input_type="document")->list:
    """Embed ``item[content_key]`` for every item of ``psg_chunks`` in batches.

    Args:
        psg_chunks: list of dicts holding the text under ``content_key``.
        content_key: key of the text to embed.
        emb_model_name: ``'text-embedding-3-large'`` routes to ``get_embeddings``
            (OpenAI); any other name is sent to ``embed_with_backoff`` (Voyage).
        batch_size: number of texts per request.
        input_type: forwarded to the Voyage call only.

    Returns:
        A flat list with one embedding per input item, in input order.
    """
    use_openai = emb_model_name == 'text-embedding-3-large'

    all_embeddings = []
    for start in tqdm(range(0, len(psg_chunks), batch_size)):
        texts = [record[content_key] for record in psg_chunks[start:start + batch_size]]

        if use_openai:
            vectors = get_embeddings(texts)
        else:
            vectors = embed_with_backoff(texts=texts, model=emb_model_name, input_type=input_type).embeddings
        all_embeddings.extend(vectors)
    return all_embeddings

In [None]:
# batch size: 100 for openai and 128 for voyage
embeddings_list = create_embeddings(psg_chunks, content_key='sentence_chunk', emb_model_name='voyage-law-2', batch_size=128)

# attach each vector to the chunk it was computed from
for position, chunk_record in enumerate(tqdm(psg_chunks)):
    chunk_record['embedding'] = embeddings_list[position]

In [59]:
# test questions of the ObliQA dataset
with open("ObliQADataset/ObliQA_test.json") as test_file:
    data = json.loads(test_file.read())

In [None]:
# batch size: 100 for openai and 128 for voyage; questions are embedded as queries
questions_embeddings_list = create_embeddings(data, content_key='Question', emb_model_name='voyage-law-2', batch_size=128, input_type="query")

# attach each vector to its question record
for position, question_record in enumerate(tqdm(data)):
    question_record['embedding'] = questions_embeddings_list[position]

In [None]:
# Make sure the folder that receives the saved embeddings is present.
if os.path.exists('embeddings'):
    print("Directory 'embeddings' already exists.")
else:
    print("Directory 'embeddings' does not exist. Creating it...")
    os.makedirs('embeddings')

In [62]:
# Persist passage and question records (including their 'embedding' column) as
# CSV files; the prefix is the key under which they are later looked up.
embeddings_name = 'vl2'

embeddings_df_save_path = f"embeddings/{embeddings_name}_embeddings_df.csv"
psg_chunks_embeddings_df = pd.DataFrame(psg_chunks)
psg_chunks_embeddings_df.to_csv(embeddings_df_save_path, index=False)

question_embeddings_df_save_path = f"embeddings/{embeddings_name}_question_embeddings_df.csv"
question_embeddings_df = pd.DataFrame(data)
question_embeddings_df.to_csv(question_embeddings_df_save_path, index=False)

# Retrieval

In [None]:
# Make sure the folder that receives the TREC run files is present.
if os.path.exists('retrieval'):
    print("Directory 'retrieval' already exists.")
else:
    print("Directory 'retrieval' does not exist. Creating it...")
    os.makedirs('retrieval')

In [148]:
def retrieve_relevant_passages(query_embedding, embeddings_tensor, top_k):
    """Score all rows of ``embeddings_tensor`` against a query by dot product.

    Returns the result of ``topk``: a ``(values, indices)`` pair holding the
    ``top_k`` highest scores and the row positions they belong to.
    """
    similarity = torch.matmul(embeddings_tensor, query_embedding)
    return similarity.topk(k=top_k)

In [None]:
def load_embeddings(folder_path: str) -> dict:
    """
    Read every .csv file of a folder and group the contents by embedding name.

    A file whose (lower-cased) name contains "question" is treated as a query
    file, everything else as a passage file. The group name is the file name
    with the "_question_embeddings_df.csv" / "_embeddings_df.csv" suffix removed.
    Each file must provide an 'embedding' column holding a Python list literal.

    Args:
        folder_path (str): Path to the folder containing .csv files.

    Returns:
        dict: group name -> dict with (depending on which files exist)
              - 'psg_chunks': list of passage records (dicts).
              - 'embeddings_tensor': tensor of passage embeddings.
              - 'test_set': list of question records (dicts).
              - 'querie_embeddings_tensor': tensor of question embeddings.
    """
    question_suffix = "_question_embeddings_df.csv"
    passage_suffix = "_embeddings_df.csv"

    loaded = {}
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".csv"):
            continue

        frame = pd.read_csv(os.path.join(folder_path, file_name))
        # the vectors were written as text; parse them back into lists
        frame['embedding'] = frame['embedding'].apply(ast.literal_eval)
        tensor = torch.tensor(np.array(frame['embedding'].tolist()))
        records = frame.to_dict(orient='records')

        if "question" in file_name.lower():
            base_name = file_name.replace(question_suffix, "")
            records_key, tensor_key = 'test_set', 'querie_embeddings_tensor'
        else:
            base_name = file_name.replace(passage_suffix, "")
            records_key, tensor_key = 'psg_chunks', 'embeddings_tensor'

        group = loaded.setdefault(base_name, {})
        group[records_key] = records
        group[tensor_key] = tensor

    return loaded

In [None]:
# Load everything stored in the embeddings folder
folder_path = "embeddings"
embeddings_dict = load_embeddings(folder_path)

# Example to access specific embeddings:
# embeddings_dict['openai_large']['psg_chunks']
# embeddings_dict['openai_large']['embeddings_tensor']
# embeddings_dict['openai_large']['test_set']
# embeddings_dict['openai_large']['querie_embeddings_tensor']

# Report the tensor shapes of every loaded group (debugging aid)
for key, value in embeddings_dict.items():
    print(f"Embeddings for: {key}")
    for label, tensor_key in (("Passage", 'embeddings_tensor'), ("Query", 'querie_embeddings_tensor')):
        if tensor_key in value:
            print(f"  {label} Embeddings Tensor Shape: {value[tensor_key].shape}")

## Simple Retrieval

In [150]:
def simple_retrieval(query: dict, embeddings_tensor: torch.Tensor, top_k: int, passages_list: list, num_unique_passages: int = 13732)->list:
    """Return up to ``top_k`` distinct passages ranked by dot-product score.

    Several rows of ``embeddings_tensor`` can share one passage ``ID`` (a
    passage may have been split into more than one chunk), so more than
    ``top_k`` candidates are fetched and de-duplicated by ``ID``.

    Args:
        query: dict holding the query vector under ``embedding``.
        embeddings_tensor: one row per entry of ``passages_list``.
        top_k: number of distinct passages wanted.
        passages_list: records with an ``ID`` key, aligned with the tensor rows.
        num_unique_passages: number of distinct passage IDs in the corpus; the
            surplus of rows over this number is fetched on top of ``top_k``.
            Defaults to 13732, the value previously hard-coded here.

    Returns:
        A list of ``(passage ID, score)`` tuples, best first. It is shorter
        than ``top_k`` when fewer distinct passages are available.
    """
    total_rows = embeddings_tensor.shape[0]
    # never negative (corpus smaller than the default) and never above the
    # number of rows, both of which would make topk fail
    extra_passages = max(total_rows - num_unique_passages, 0)
    num_candidates = min(top_k + extra_passages, total_rows)

    query_emb = torch.tensor(np.array(query["embedding"]))
    top_scores, top_indices = torch.topk(embeddings_tensor @ query_emb, num_candidates)

    unique_results = []
    seen_docs = set()
    for score, index in zip(top_scores, top_indices):
        # once we have the top_k unique results stop
        if len(unique_results) >= top_k:
            break
        doc_id = passages_list[index]['ID']
        if doc_id not in seen_docs:
            unique_results.append((doc_id, score))
            seen_docs.add(doc_id)

    return unique_results

In [None]:
# Get the top_k most relevant passages for each query with a single dense
# retriever, then store the ranking as a TREC run file.
retriever = 'vf2'  # key of embeddings_dict, i.e. the prefix of the files in the embeddings folder
top_k = 10
retrieved = {}
for query in tqdm(embeddings_dict[retriever]['test_set']):
    # use the top_k variable (not a literal) so the run matches the file name below
    retrieved[query["QuestionID"]] = simple_retrieval(query, embeddings_dict[retriever]['embeddings_tensor'], top_k=top_k, passages_list=embeddings_dict[retriever]['psg_chunks'])

with open(f"retrieval/rankings_{retriever}_{top_k}.trec", "w") as f:
    for qid, hits in retrieved.items():
        for i, (docid, score) in enumerate(hits):
            # Format: <QueryID> Q0 <DocumentID> <Rank> <Score> <RunName>
            line = f"{qid} 0 {docid} {i+1} {score} {retriever}"
            f.write(line + "\n")

## Rank fusion on pairs

In [153]:
def rank_fusion_retrieval(query: dict, retriever_1, retriever_2, passages_list: list, top_k: int, a: float = 0.5, num_unique_passages: int = 13732) -> list:
    """Fuse the scores of two retrievers and return up to ``top_k`` distinct passages.

    A retriever is either an object exposing ``get_scores`` (BM25; scored with
    ``query["tokenized_text"]``) or a ``torch.Tensor`` of passage embeddings
    (scored by dot product with ``query["embedding"]``).

    NOTE(review): the same ``query["embedding"]`` is used for every tensor
    retriever, which assumes all tensors live in the query's embedding space -
    confirm when mixing embedding models.

    Args:
        query: dict with ``tokenized_text`` and/or ``embedding``.
        retriever_1: BM25-like object or embeddings tensor.
        retriever_2: BM25-like object or embeddings tensor.
        passages_list: records with an ``ID`` key, aligned with the score order.
        top_k: number of distinct passages wanted.
        a: passed through to ``rank_fusion`` (presumably the fusion weight).
        num_unique_passages: number of distinct passage IDs in the corpus; the
            surplus of tensor rows over it is requested on top of ``top_k``.
            Defaults to 13732, the value previously hard-coded here.

    Returns:
        A list of ``(passage ID, fused score)`` tuples, best first.

    Raises:
        ValueError: if a retriever is neither BM25-like nor a tensor.
    """
    def _scores_and_rows(retriever, label):
        # Returns (scores, number of tensor rows or None for BM25).
        if hasattr(retriever, "get_scores"):  # BM25 retriever
            return retriever.get_scores(query["tokenized_text"]), None
        if isinstance(retriever, torch.Tensor):  # dense retriever
            query_emb = torch.tensor(np.array(query["embedding"]))
            return np.array(retriever @ query_emb), retriever.shape[0]
        raise ValueError(f"Unsupported retriever type for {label}")

    retriever1_scores, rows_1 = _scores_and_rows(retriever_1, "retriever_1")
    retriever2_scores, rows_2 = _scores_and_rows(retriever_2, "retriever_2")

    # As before, the last tensor retriever decides the surplus; it is clamped
    # so a corpus smaller than num_unique_passages cannot make it negative.
    extra_passages = 0
    for rows in (rows_1, rows_2):
        if rows is not None:
            extra_passages = max(rows - num_unique_passages, 0)
    num_candidates = min(top_k + extra_passages, len(passages_list))

    # Perform rank fusion
    top_scores, top_indices = rank_fusion(retriever1_scores, retriever2_scores, a, num_candidates)

    # Extract top-k unique results, never reading past what was returned
    unique_results = []
    seen_docs = set()
    for i in range(min(num_candidates, len(top_indices))):
        # once we have the top_k unique results stop
        if len(unique_results) >= top_k:
            break
        doc_id = passages_list[top_indices[i]]['ID']
        if doc_id not in seen_docs:
            unique_results.append((doc_id, top_scores[i]))
            seen_docs.add(doc_id)

    return unique_results

In [154]:
from rank_bm25 import *
from transformers import AutoTokenizer

# Tokenise chunks and questions with the voyage-law-2 tokenizer for BM25.
tokenizer = AutoTokenizer.from_pretrained('voyageai/voyage-law-2')

# Store the tokens on each record and collect them into separate lists
tokenized_chunks = []
for chunk in embeddings_dict['vl2']['psg_chunks']:
    chunk["tokenized_text"] = tokenizer.tokenize(chunk['sentence_chunk'])
    tokenized_chunks.append(chunk["tokenized_text"])

tokenized_questions = []
for question in embeddings_dict['vl2']['test_set']:
    question["tokenized_text"] = tokenizer.tokenize(question['Question'])
    tokenized_questions.append(question["tokenized_text"])

In [None]:
bm25 = BM25Okapi(tokenized_chunks)

# Rank-fusion for BM25 and any retriever

retriever = 'vl2'
a = 0.25
top_k = 10
retrieved = {}
for query in tqdm(embeddings_dict[retriever]['test_set']):
    retrieved[query["QuestionID"]] = rank_fusion_retrieval(
        query,
        bm25,
        embeddings_dict[retriever]['embeddings_tensor'],
        embeddings_dict[retriever]['psg_chunks'],
        top_k,
        a,
    )

with open(f"retrieval/rankings_rf_bm25_{retriever}_{top_k}_{int(a*100)}.trec", "w") as f:
    for qid, hits in retrieved.items():
        for rank, (docid, score) in enumerate(hits, start=1):
            # Format: <QueryID> Q0 <DocumentID> <Rank> <Score> <RunName>
            f.write(f"{qid} 0 {docid} {rank} {score} rf_bm25_{retriever}\n")

In [None]:
# VOYAGE LAW AND FINANCE

retriever1 = 'vl2'
retriever2 = 'vf2'
a = 0.40
top_k = 10
retrieved = {}
# Everything that is not retriever2's tensor comes from retriever1, instead of
# the `retriever` variable left over from an earlier cell.
# NOTE(review): the queries carry retriever1's embedding and are scored against
# both tensors, and retriever1's chunk list labels both - confirm that the two
# models share an embedding space and the same chunk order.
for query in tqdm(embeddings_dict[retriever1]['test_set']):

    retrieved[query["QuestionID"]] = rank_fusion_retrieval(query, embeddings_dict[retriever1]['embeddings_tensor'], embeddings_dict[retriever2]['embeddings_tensor'], embeddings_dict[retriever1]['psg_chunks'], top_k, a)

with open(f"retrieval/rankings_rf_{retriever1}_{retriever2}_{top_k}_{int(a*100)}.trec", "w") as f:
    for qid, hits in retrieved.items():
        for i, (docid, score) in enumerate(hits):
            # Format: <QueryID> Q0 <DocumentID> <Rank> <Score> <RunName>
            line = f"{qid} 0 {docid} {i+1} {score} rf_{retriever1}_{retriever2}"
            f.write(line + "\n")

## Triple rank fusion

In [157]:
def triple_rank_fusion_retrieval(
    query: dict,
    retriever_1,  # BM25 retriever or neural embeddings tensor
    retriever_2,  # BM25 retriever or neural embeddings tensor
    retriever_3,  # BM25 retriever or neural embeddings tensor
    passages_list: list,  # List of passage chunks with 'ID' metadata
    top_k: int,  # Number of top unique results to return
    a: float = 0.33,  # Weight for retriever_1
    b: float = 0.33,  # Weight for retriever_2
    num_unique_passages: int = 13732  # Distinct passage IDs in the corpus
) -> list:
    """
    Perform retrieval using three retrievers and combine their results using rank fusion.

    A retriever is either an object exposing ``get_scores`` (BM25; scored with
    ``query["tokenized_text"]``) or a ``torch.Tensor`` of passage embeddings
    (scored by dot product with ``query["embedding"]``).

    NOTE(review): the same ``query["embedding"]`` is used for every tensor
    retriever, which assumes all tensors live in the query's embedding space -
    confirm when mixing embedding models.

    Args:
        query: The query dictionary containing the query embedding and/or tokenized text.
        retriever_1: BM25 retriever object or neural embeddings tensor (torch.Tensor).
        retriever_2: BM25 retriever object or neural embeddings tensor (torch.Tensor).
        retriever_3: BM25 retriever object or neural embeddings tensor (torch.Tensor).
        passages_list: List of passage chunks with metadata including 'ID'.
        top_k: The number of top unique results to return.
        a: Weight for the first retriever, passed to ``rank_fusion_on_three``.
        b: Weight for the second retriever, passed to ``rank_fusion_on_three``.
        num_unique_passages: Number of distinct passage IDs in the corpus; the
            surplus of tensor rows over it is requested on top of ``top_k``.
            Defaults to 13732, the value previously hard-coded here.

    Returns:
        A list of up to top_k unique results as (passage ID, score).

    Raises:
        ValueError: if a retriever is neither BM25-like nor a tensor.
    """
    def _scores_and_rows(retriever, label):
        # Returns (scores, number of tensor rows or None for BM25).
        if hasattr(retriever, "get_scores"):  # BM25 retriever
            return retriever.get_scores(query["tokenized_text"]), None
        if isinstance(retriever, torch.Tensor):  # Neural retriever
            query_emb = torch.tensor(np.array(query["embedding"]))
            return np.array(retriever @ query_emb), retriever.shape[0]
        raise ValueError(f"Unsupported retriever type for {label}")

    scores_1, rows_1 = _scores_and_rows(retriever_1, "retriever_1")
    scores_2, rows_2 = _scores_and_rows(retriever_2, "retriever_2")
    scores_3, rows_3 = _scores_and_rows(retriever_3, "retriever_3")

    # As before, the last tensor retriever decides the surplus; it is clamped
    # so a corpus smaller than num_unique_passages cannot make it negative.
    extra_passages = 0
    for rows in (rows_1, rows_2, rows_3):
        if rows is not None:
            extra_passages = max(rows - num_unique_passages, 0)
    num_candidates = min(top_k + extra_passages, len(passages_list))

    top_scores, top_indices = rank_fusion_on_three(scores_1, scores_2, scores_3, a, b, num_candidates)

    # Extract top-k unique results, never reading past what was returned
    unique_results = []
    seen_docs = set()
    for i in range(min(num_candidates, len(top_indices))):
        # Stop once we have the top_k unique results
        if len(unique_results) >= top_k:
            break
        doc_id = passages_list[top_indices[i]]['ID']
        if doc_id not in seen_docs:
            unique_results.append((doc_id, top_scores[i]))
            seen_docs.add(doc_id)

    return unique_results

In [159]:
from rank_bm25 import *

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')

# Re-tokenise chunks and questions with NLTK's word tokenizer; this overwrites
# any "tokenized_text" stored on the records earlier.
tokenized_chunks = []
for chunk in embeddings_dict['vl2']['psg_chunks']:
    chunk["tokenized_text"] = word_tokenize(chunk['sentence_chunk'])
    tokenized_chunks.append(chunk["tokenized_text"])

tokenized_questions = []
for question in embeddings_dict['vl2']['test_set']:
    question["tokenized_text"] = word_tokenize(question['Question'])
    tokenized_questions.append(question["tokenized_text"])

In [None]:
# Triple rank fusion: BM25 + voyage-law-2 + voyage-finance-2
bm25 = BM25Okapi(tokenized_chunks)
a = 0.25
b = 0.20
top_k = 10
retrieved = {}
for query in tqdm(embeddings_dict['vl2']['test_set']):
    retrieved[query["QuestionID"]] = triple_rank_fusion_retrieval(
        query,
        bm25,
        embeddings_dict['vl2']['embeddings_tensor'],
        embeddings_dict['vf2']['embeddings_tensor'],
        embeddings_dict['vl2']['psg_chunks'],
        top_k=top_k,
        a=a,
        b=b,
    )

with open(f"retrieval/rankings_trf_{top_k}_a{int(a*100)}_b{int(b*100)}.trec", "w") as f:
    for qid, hits in retrieved.items():
        for rank, (docid, score) in enumerate(hits, start=1):
            # Format: <QueryID> Q0 <DocumentID> <Rank> <Score> <RunName>
            f.write(f"{qid} 0 {docid} {rank} {score} triple_rank_fusion_bm25_law_finance\n")

### Adding reranking

In [None]:
# Two-stage retrieval: triple rank fusion proposes top_k_init candidates,
# the Voyage reranker then keeps the best top_k of them.
bm25 = BM25Okapi(tokenized_chunks)
a = 0.25
b = 0.20
top_k = 10
top_k_init = 60

# triple_rank_fusion_retrieval returns (passage ID, score) tuples, not list
# positions, so the candidate texts are looked up by ID. A passage that was
# split into several chunks gets its chunks joined back together.
passage_text_by_id = {}
for psg_chunk in embeddings_dict['vl2']['psg_chunks']:
    previous_text = passage_text_by_id.get(psg_chunk['ID'])
    if previous_text is None:
        passage_text_by_id[psg_chunk['ID']] = psg_chunk['sentence_chunk']
    else:
        passage_text_by_id[psg_chunk['ID']] = previous_text + " " + psg_chunk['sentence_chunk']

retrieved = {}
for query in tqdm(embeddings_dict['vl2']['test_set']):

    # first stage: fetch top_k_init candidates (the file name below reports this number)
    candidates = triple_rank_fusion_retrieval(query, bm25, embeddings_dict['vl2']['embeddings_tensor'], embeddings_dict['vf2']['embeddings_tensor'], embeddings_dict['vl2']['psg_chunks'], top_k=top_k_init, a=a, b=b)
    candidate_ids = [doc_id for doc_id, _ in candidates]

    if not candidate_ids:
        # nothing to rerank for this question
        retrieved[query["QuestionID"]] = []
        continue

    #then we use the voyage rerank-2
    passages = [passage_text_by_id[doc_id] for doc_id in candidate_ids]
    reranking = vo.rerank(query['Question'], passages, model="rerank-2", top_k=top_k)

    # r.index is the position of the passage in the list sent to the reranker
    retrieved[query["QuestionID"]] = [(candidate_ids[r.index], r.relevance_score) for r in reranking.results]

with open(f"retrieval/rankings_fusion_bm25_vl2_vf2_rerank2_{top_k_init}_{top_k}.trec", "w") as f:
    for qid, hits in retrieved.items():
        for i, (docid, score) in enumerate(hits):
            # Format: <QueryID> Q0 <DocumentID> <Rank> <Score> <RunName>
            line = f"{qid} 0 {docid} {i+1} {score} triple_rank_fusion_bm25_voyage_law2_finance2_rerank2"
            f.write(line + "\n")

# Evaluating your results

In [82]:
import os
import json
from typing import Dict

def load_qrels(docs_dir: str, fqrels: str) -> Dict[str, Dict[str, int]]:
    """Build relevance judgements (question ID -> {passage ID: 1}).

    Reads ``1.json`` ... ``40.json`` from ``docs_dir`` to map every
    (DocumentID, PassageID) pair to the passage's ``ID``, then resolves the
    passages listed for each question in ``fqrels`` through that mapping.

    Args:
        docs_dir: folder holding the 40 structured document files.
        fqrels: JSON file with entries carrying ``QuestionID`` and ``Passages``.

    Returns:
        Nested dict ``{QuestionID: {passage ID: 1}}``.
    """
    ndocs = 40
    docs = []
    for doc_number in range(1, ndocs + 1):
        doc_path = os.path.join(docs_dir, f"{doc_number}.json")
        with open(doc_path) as doc_file:
            docs.append(json.load(doc_file))

    # DocumentID -> PassageID -> ID; the first ID seen for a PassageID is kept
    did2pid2id: Dict[str, Dict[str, str]] = {}
    for psg in (p for doc in docs for p in doc):
        pid2id = did2pid2id.setdefault(psg["DocumentID"], {})
        # NOTE(review): this compares an ID against PassageID keys - confirm
        # whether a PassageID uniqueness check was intended
        assert psg["ID"] not in pid2id
        pid2id.setdefault(psg["PassageID"], psg["ID"])

    with open(fqrels) as qrels_file:
        entries = json.load(qrels_file)

    qrels = {}
    for entry in entries:
        qid = entry["QuestionID"]
        for psg in entry["Passages"]:
            judged = qrels.setdefault(qid, {})
            passage_id = did2pid2id[psg["DocumentID"]][psg["PassageID"]]
            judged[passage_id] = 1
    return qrels

In [83]:
# Export the judgements in the four-column qrels layout read by trec_eval
qrels = load_qrels("ObliQADataset/StructuredRegulatoryDocuments", "ObliQADataset/ObliQA_test.json")
with open("qrels", "w") as f:
    for qid, rels in qrels.items():
        f.writelines(f"{qid} Q0 {pid} {rel}\n" for pid, rel in rels.items())

In [None]:
!git clone https://github.com/usnistgov/trec_eval.git && cd trec_eval && make
!trec_eval/trec_eval -m recall.10 -m map_cut.10 ./qrels ./retrieval/rankings.trec #replace with your desired retrieval system results