# PART 1 - Information Retrieval

## Import Necessary Libraries

In [None]:
!pip install nltk torch faiss-cpu requests numpy neo4j sentence-transformers groq rank_bm25 pyserini==0.22.1

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting neo4j
  Downloading neo4j-5.26.0-py3-none-any.whl.metadata (5.9 kB)
Collecting groq
  Downloading groq-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pyserini==0.22.1
  Downloading pyserini-0.22.1-py3-none-any.whl.metadata (4.5 kB)
Collecting pyjnius>=1.4.0 (from pyserini==0.22.1)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting nmslib>=2.1.1 (from pyserini==0.22.1)
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting onnxruntime>=1.8.1 (from pyserini==0.22.1)
  Downloading onnxruntime-1.20.1-cp310-cp310-manyl

In [None]:
import os
import re
import json
import nltk
import time
import faiss
import torch
import pickle
import requests
import subprocess
import numpy as np
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')
from typing import List, Dict
from google.colab import files
from nltk.corpus import wordnet
from neo4j import GraphDatabase
from rank_bm25 import BM25Okapi
from google.colab import userdata
import google.generativeai as genai
from nltk.tokenize import word_tokenize
from sklearn.metrics import average_precision_score
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Service credentials and endpoints are read from the Colab secret store
# (google.colab.userdata) so that nothing sensitive is hard-coded in the notebook.
GROQ_API_KEY, NEO_USER, NEO_PASS, NEO_URL, GROQ_URL = (
    userdata.get(secret_name)
    for secret_name in ("GROQ_API_KEY", "NEO_USER", "NEO_PASS", "NEO_URL", "GROQ_URL")
)

In [None]:
!git clone https://github.com/usnistgov/trec_eval.git && cd trec_eval && make

Cloning into 'trec_eval'...
remote: Enumerating objects: 1147, done.[K
remote: Counting objects: 100% (332/332), done.[K
remote: Compressing objects: 100% (100/100), done.[K
remote: Total 1147 (delta 264), reused 277 (delta 226), pack-reused 815 (from 1)[K
Receiving objects: 100% (1147/1147), 764.18 KiB | 7.08 MiB/s, done.
Resolving deltas: 100% (769/769), done.
gcc -g -I.  -Wall -Wno-macro-redefined -DVERSIONID=\"10.0-rc2\"  -o trec_eval trec_eval.c formats.c meas_init.c meas_acc.c meas_avg.c meas_print_single.c meas_print_final.c gain_init.c get_qrels.c get_trec_results.c get_prefs.c get_qrels_prefs.c get_qrels_jg.c form_res_rels.c form_res_rels_jg.c form_prefs_counts.c utility_pool.c get_zscores.c convert_zscores.c measures.c  m_map.c m_P.c m_num_q.c m_num_ret.c m_num_rel.c m_num_rel_ret.c m_gm_map.c m_Rprec.c m_recip_rank.c m_bpref.c m_iprec_at_recall.c m_recall.c m_Rprec_mult.c m_utility.c m_11pt_avg.c m_ndcg.c m_ndcg_cut.c m_Rndcg.c m_ndcg_rel.c m_binG.c m_G.c m_rel_P.c m_suc

In [None]:
!git clone https://github.com/RegNLP/RePASs.git
%cd RePASs
!pip install -r requirements.txt

Cloning into 'RePASs'...
remote: Enumerating objects: 140, done.[K
remote: Counting objects: 100% (140/140), done.[K
remote: Compressing objects: 100% (132/132), done.[K
remote: Total 140 (delta 61), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (140/140), 306.77 KiB | 2.92 MiB/s, done.
Resolving deltas: 100% (61/61), done.
/content/RePASs
Collecting tiktoken (from -r requirements.txt (line 10))
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
os.getcwd()

'/content/RePASs'

In [None]:
os.makedirs('/content/RePASs/models/obligation-classifier-legalbert', exist_ok=True)

In [None]:
%cd ..

/content


## Load Data

In [None]:
!git clone https://github.com/RegNLP/ObliQADataset.git

Cloning into 'ObliQADataset'...
remote: Enumerating objects: 80, done.[K
remote: Counting objects: 100% (80/80), done.[K
remote: Compressing objects: 100% (76/76), done.[K
remote: Total 80 (delta 17), reused 47 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (80/80), 11.87 MiB | 14.17 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [None]:
# ObliQA test split. The encoding is pinned to UTF-8 so parsing does not depend
# on the platform's default locale encoding.
with open("/content/ObliQADataset/ObliQA_test.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

In [None]:
# ObliQA train split. The encoding is pinned to UTF-8 so parsing does not depend
# on the platform's default locale encoding.
with open("/content/ObliQADataset/ObliQA_train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

In [None]:
# Shared-task questions without gold passages. The encoding is pinned to UTF-8 so
# parsing does not depend on the platform's default locale encoding.
with open("/content/ObliQADataset/RIRAGSharedTask/RIRAG_Unseen_Questions.json", "r", encoding="utf-8") as f:
    unseen_data = json.load(f)

In [None]:
data = train_data + test_data

## Generate Embeddings

### LegalBert

In [None]:
# Local checkpoint directory for the encoder used by embed_text.
# NOTE(review): an earlier cell only creates this directory with os.makedirs; the
# tokenizer/weight files presumably have to be placed there manually before this
# cell can run — confirm.
model_path = "/content/RePASs/models/obligation-classifier-legalbert"

# Tokenizer and encoder are both loaded from the same directory.
legal_bert_tokenizer = AutoTokenizer.from_pretrained(model_path)
# AutoModel is used so that embed_text can read `last_hidden_state` from the output.
# NOTE(review): trust_remote_code=True permits custom code shipped with a
# checkpoint to execute — confirm it is actually required for this local path.
legal_bert_model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

In [None]:
def embed_text(text: str):
    """Embed ``text`` by mean-pooling the encoder's last hidden state.

    The text is tokenized with truncation and padding enabled, run through
    ``legal_bert_model`` without gradient tracking, and the token vectors are
    averaged along dimension 1.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding the pooled embedding. Callers in this notebook
        flatten it before use.
    """
    encoded = legal_bert_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = legal_bert_model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.detach().numpy()

In [None]:
# One lightweight record (ID and text) per question.
questions = [
    {"QuestionID": entry["QuestionID"], "Question": entry["Question"]}
    for entry in data
]

# Flatten the passages of all questions into a single list, in question order.
# A passage attached to several questions therefore appears several times.
corpus = [
    {"Passage": psg["Passage"], "DocumentID": psg["DocumentID"], "PassageID": psg["PassageID"]}
    for entry in data
    for psg in entry["Passages"]
]

# Raw passage strings, index-aligned with `corpus`.
bm25_corpus = [record["Passage"] for record in corpus]

# BM25 operates on lower-cased word tokens.
tokenized_corpus = [word_tokenize(text.lower()) for text in bm25_corpus]
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# The retrieval functions below index into `all_question_embeddings`, so a load
# failure is reported and then re-raised. Swallowing it would only postpone the
# failure to a NameError in a later cell.
try:
    all_question_embeddings = np.load("/content/all_question_embeddings.npy")
except Exception as e:
    print(f"An error occurred while loading embeddings: {e}")
    raise
print("Embeddings shape:", all_question_embeddings.shape)

Embeddings shape: (25081, 768)


In [None]:
# `index` is searched by get_relevant_vector_passages, so a load failure is
# reported and then re-raised rather than leaving `index` undefined for later cells.
index_file_path = "/content/faiss_index.index"
try:
    index = faiss.read_index(index_file_path)
except Exception as e:
    print(f"An error occurred: {e}")
    raise
print("FAISS index loaded successfully.")

FAISS index loaded successfully.


In [None]:
# `question_to_id_map` is required by get_relevant_vector_passages and by the
# cell that merges the unseen questions, so a load failure is reported and then
# re-raised instead of leaving the name undefined.
try:
    with open("/content/question_to_id_map.pkl", "rb") as f:
        # pickle.load can execute arbitrary code; only load a file produced by this project.
        question_to_id_map = pickle.load(f)
except Exception as e:
    print(f"An error occurred: {e}")
    raise
print("Question to ID map loaded successfully.")

An error occurred: [Errno 2] No such file or directory: '/content/question_to_id_map.pkl'


## Set up Graph DB

In [None]:
driver = GraphDatabase.driver(NEO_URL, auth=(NEO_USER, NEO_PASS))

In [None]:
def load_data_into_neo4j(data):
    """Load questions, groups and passages into Neo4j.

    For every item a ``Question`` node, a ``Group`` node and a
    ``QUESTION_IN_GROUP`` relationship are merged. For every passage of the item
    a ``Passage`` node and a ``QUESTION_HAS_PASSAGE`` relationship are merged.
    All writes use ``MERGE``. The module-level ``driver`` is used for the session.

    Args:
        data: Iterable of dicts with the keys ``QuestionID``, ``Question``,
            ``Group`` and ``Passages``. Each passage is a dict with the keys
            ``DocumentID``, ``PassageID`` and ``Passage``.
    """
    with driver.session() as session:
        for item in data:
            question_id = item["QuestionID"]
            question_text = item["Question"]
            group_id = item["Group"]

            # Create Question node with Group property
            session.run("""
                MERGE (q:Question {QuestionID: $question_id})
                ON CREATE SET q.Question = $question_text, q.Group = $group_id
            """, question_id=question_id, question_text=question_text, group_id=group_id)

            # Create Group node
            session.run("""
                MERGE (g:Group {GroupID: $group_id})
            """, group_id=group_id)

            # Create relationship between Question and Group
            session.run("""
                MATCH (q:Question {QuestionID: $question_id}), (g:Group {GroupID: $group_id})
                MERGE (q)-[:QUESTION_IN_GROUP]->(g)
            """, question_id=question_id, group_id=group_id)

            # Create Passage nodes and link to Question
            for passage in item["Passages"]:
                document_id = passage["DocumentID"]
                passage_id = passage["PassageID"]
                passage_text = passage["Passage"]

                # Create Passage node; its identity is the (PassageID, DocumentID) pair.
                session.run("""
                    MERGE (p:Passage {PassageID: $passage_id, DocumentID: $document_id})
                    ON CREATE SET p.Passage = $passage_text
                """, passage_id=passage_id, document_id=document_id, passage_text=passage_text)

                # Link the question to exactly this passage. The match must use the
                # same (PassageID, DocumentID) identity as the MERGE above; matching
                # on PassageID alone would also link passages of other documents
                # that happen to share the same PassageID.
                session.run("""
                    MATCH (q:Question {QuestionID: $question_id}),
                          (p:Passage {PassageID: $passage_id, DocumentID: $document_id})
                    MERGE (q)-[:QUESTION_HAS_PASSAGE]->(p)
                """, question_id=question_id, passage_id=passage_id, document_id=document_id)

## Retrieval Pipeline

### Graph Retrieval

In [None]:
def get_relevant_graph_passages(question_text, question_id=None, embedder=None, top_k=10):
    """
    Retrieves passages linked to a question from the Neo4j database and returns them
    with relevance scores based on their similarity to the query text.

    The question is looked up by ``question_id`` when one is given, otherwise by an
    exact match on ``question_text``. The linked passages are scored with the cosine
    similarity between ``embed_text(question_text)`` and ``embed_text(passage)``.

    Args:
        question_text (str): The text of the question to match passages against.
        question_id (str): Optional. The ID of the question.
        embedder: Unused; kept for backward compatibility (embeddings always come
            from ``embed_text``).
        top_k (int): Maximum number of passages to return.

    Returns:
        list of dict: Each dict contains PassageID, DocumentID, Passage, and Score,
        sorted by descending Score. Empty when no identifier is given or when the
        question has no linked passages in the graph.
    """
    if question_id:
        query = """
            MATCH (q:Question {QuestionID: $question_id})-[:QUESTION_HAS_PASSAGE]->(p:Passage)
            RETURN p.PassageID AS passage_id, p.DocumentID AS document_id, p.Passage AS passage_text
        """
        params = {"question_id": question_id}
    elif question_text:
        query = """
            MATCH (q:Question {Question: $question_text})-[:QUESTION_HAS_PASSAGE]->(p:Passage)
            RETURN p.PassageID AS passage_id, p.DocumentID AS document_id, p.Passage AS passage_text
        """
        params = {"question_text": question_text}
    else:
        print("Provide either question_id or question_text.")
        return []

    # A driver is opened per call, so it is also closed per call: the context
    # managers release the session and the driver's connections even when the
    # query raises. The driver is only created after the arguments are validated.
    with GraphDatabase.driver(NEO_URL, auth=(NEO_USER, NEO_PASS)) as driver:
        with driver.session() as session:
            result = session.run(query, **params)
            passages = [{"PassageID": record["passage_id"],
                         "DocumentID": record["document_id"],
                         "Passage": record["passage_text"]}
                        for record in result]

    if not passages:
        return []

    query_embedding = embed_text(question_text).flatten().reshape(1, -1)
    passages_embeddings = np.array([embed_text(passage["Passage"]).flatten() for passage in passages])
    similarities = cosine_similarity(query_embedding, passages_embeddings)[0]

    for idx, passage in enumerate(passages):
        passage["Score"] = similarities[idx]

    passages = sorted(passages, key=lambda x: x["Score"], reverse=True)

    return passages[:top_k]

In [None]:
query = "How does the FSRA define and evaluate \"principal risks and uncertainties\" for a Petroleum Reporting Entity, particularly for the remaining six months of the financial year?"
query_id = "7073c16e-1974-4051-9064-9f5706c663c7"

In [None]:
for passage in get_relevant_graph_passages(query):
    print(passage)

{'PassageID': '10.1.7.(2)', 'DocumentID': 11, 'Passage': 'A Reporting Entity must:\n(a)\tprepare such report:\n(i)\tfor the first six months of each financial year or period, and if there is a change to the accounting reference date, prepare such report in respect of the period up to the old accounting reference date; and\n(ii)\tin accordance with the applicable IFRS standards or other standards acceptable to the Regulator;\n(b)\tensure the financial statements have either been audited or reviewed by auditors, and the audit or review by the auditor is included within the report; and\n(c)\tensure that the report includes:\n(i)\texcept in the case of a Mining Exploration Reporting Entity or a Petroleum Exploration Reporting Entity, an indication of important events that have occurred during the first six months of the financial year, and their impact on the financial statements;\n(ii)\texcept in the case of a Mining Exploration Reporting Entity or a Petroleum Exploration Reporting Entity

### Contextual Retrieval

In [None]:
def get_relevant_vector_passages(query, question_id=None, top_k=10):
    """Retrieve candidate passages with BM25 and FAISS, then re-rank them.

    The top ``top_k`` BM25 hits and the top ``top_k`` FAISS hits are merged,
    de-duplicated, and re-scored with the cosine similarity between the question
    embedding and ``embed_text`` of each passage.

    Args:
        query (str): The question text.
        question_id: Optional key into ``question_to_id_map`` selecting a stored
            row of ``all_question_embeddings``. When it is ``None`` or not in the
            map, the embedding computed from ``query`` is used instead.
        top_k (int): Number of hits taken from each retriever and maximum number
            of passages returned.

    Returns:
        list of dict: Passages with PassageID, DocumentID, Passage, Score and
        Rank (1-based), sorted by descending Score.
    """
    query_tokens = word_tokenize(query.lower())
    bm25_scores = bm25.get_scores(query_tokens)

    top_bm25_indices = np.argsort(bm25_scores)[-top_k:][::-1]
    top_bm25_passages = [
        {
            "PassageID": corpus[i]["PassageID"],
            "DocumentID": corpus[i]["DocumentID"],
            "Passage": corpus[i]["Passage"],
            "Score": bm25_scores[i]
        }
        for i in top_bm25_indices
    ]

    query_embedding = embed_text(query).flatten().reshape(1, -1)
    faiss_distances, faiss_indices = index.search(query_embedding, k=top_k)
    # NOTE(review): FAISS hit ids are used as positions in `corpus`, while a later
    # cell adds question embeddings to this same index — confirm that the index
    # rows really correspond to `corpus` entries.
    top_faiss_passages = [
        {
            "PassageID": corpus[i]["PassageID"],
            "DocumentID": corpus[i]["DocumentID"],
            "Passage": corpus[i]["Passage"],
            "Score": 1 - faiss_distances[0][idx]
        }
        for idx, i in enumerate(faiss_indices[0])
        # FAISS pads missing hits with -1; corpus[-1] would silently pick the
        # last passage instead.
        if i >= 0
    ]

    # A passage is identified by (DocumentID, PassageID): the same PassageID can
    # occur in several documents. The first occurrence is kept; its provisional
    # Score is irrelevant because every Score is overwritten below.
    combined_passages = {}
    for passage in top_bm25_passages + top_faiss_passages:
        combined_passages.setdefault((passage["DocumentID"], passage["PassageID"]), passage)

    combined_passages_list = list(combined_passages.values())
    if not combined_passages_list:
        return []
    combined_embeddings = [embed_text(passage["Passage"]).flatten() for passage in combined_passages_list]

    # Prefer the stored embedding for a known question; otherwise fall back to the
    # embedding just computed from the query text instead of raising KeyError.
    embedding_row = question_to_id_map.get(question_id) if question_id is not None else None
    if embedding_row is None:
        question_embedding = query_embedding[0]
    else:
        question_embedding = all_question_embeddings[embedding_row]
    similarities = cosine_similarity([question_embedding], combined_embeddings).flatten()

    for idx, passage in enumerate(combined_passages_list):
        passage["Score"] = similarities[idx]

    ranked_passages = sorted(
        combined_passages_list,
        key=lambda x: x["Score"],
        reverse=True
    )

    for rank, passage in enumerate(ranked_passages[:top_k], start=1):
        passage["Rank"] = rank

    return ranked_passages[:top_k]

In [None]:
top_passages = get_relevant_vector_passages(query, query_id)
print("Top Relevant Passages:")
for passage in top_passages:
    print(passage)

Top Relevant Passages:
{'PassageID': '37)', 'DocumentID': 28, 'Passage': 'BECOMING AWARE OF INSIDE INFORMATION\nIn considering the operation of Rule 7.2.1,  the concept of ‘awareness’, or knowledge, of Inside Information is central to operation of FSRA’s continuous disclosure framework.  In interpreting Rule 7.2.1, the Listing Authority considers that a Reporting Entity  becomes ‘aware’ of Inside Information if, and as soon as, an Officer  of the Reporting Entity has, or ought reasonably to have, come into possession of the Inside Information in the course of the performance of their duties as an Officer of that Reporting Entity.\n', 'Score': 0.7444859, 'Rank': 1}
{'PassageID': '6.5', 'DocumentID': 39, 'Passage': 'The Regulator will withdraw its permission to use an ADGM mark if a product or service fails to, or chooses not to, maintain the corresponding designation.', 'Score': 0.56410515, 'Rank': 2}
{'PassageID': '10.1.7.(2)', 'DocumentID': 11, 'Passage': 'A Reporting Entity must:\n(a

### Hybrid Retrieval

In [None]:
def hybrid_retrieval(query, question_id=None, top_k=10):
    """
    Hybrid retrieval function that retrieves top graph and vector passages,
    recalculates their relevance to the query, and returns the top combined results.

    Args:
        query (str): The question text.
        question_id: Optional question ID forwarded to both retrievers.
        top_k (int): Maximum number of passages to return.

    Returns:
        list of dict: Up to ``top_k`` dicts with PassageID, DocumentID, Score
        (cosine similarity to the query multiplied by 100), Rank (1-based) and
        Passage, sorted by descending Score.

    Raises:
        ValueError: If ``query`` is not a string.
    """
    if not isinstance(query, str):
        raise ValueError(f"Query should be a string, but got {type(query)}")

    # Each retriever contributes at least 10 candidates (the previous fixed pool
    # size) and more when the caller asks for more than 10 final results.
    candidate_k = max(top_k, 10)
    top_graph_passages = get_relevant_graph_passages(query, question_id, top_k=candidate_k)
    top_vector_passages = get_relevant_vector_passages(query, question_id, top_k=candidate_k)

    # A passage is identified by (DocumentID, PassageID): the same PassageID can
    # occur in several documents. Graph results take precedence on duplicates;
    # the provisional Score does not matter because it is recomputed below.
    combined_passages = {}
    for p in top_graph_passages + top_vector_passages:
        combined_passages.setdefault((p["DocumentID"], p["PassageID"]), p)

    combined_passages_list = list(combined_passages.values())
    if not combined_passages_list:
        # cosine_similarity cannot be computed against an empty candidate list.
        return []

    query_embedding = embed_text(query).flatten().reshape(1, -1)
    combined_embeddings = [embed_text(p["Passage"]).flatten() for p in combined_passages_list]
    similarities = cosine_similarity(query_embedding, combined_embeddings).flatten()

    for idx, passage in enumerate(combined_passages_list):
        passage["Score"] = similarities[idx]

    ranked_passages = sorted(combined_passages_list, key=lambda x: x["Score"], reverse=True)

    top_combined_results = []
    for rank, passage in enumerate(ranked_passages[:top_k], start=1):
        top_combined_results.append({
            "PassageID": passage["PassageID"],
            "DocumentID": passage["DocumentID"],
            "Score": passage["Score"]*100,
            "Rank": rank,
            "Passage": passage["Passage"]
        })

    return top_combined_results

In [None]:
query="What type of procedures must a Third Party Provider establish and maintain to handle issues such as major operational and security incidents?"
query_id = "d34e3516-f053-4652-a0ac-ede703144b9a"

In [None]:
top_passages = hybrid_retrieval(query, query_id)
print("Top Relevant Passages:\n\n")
for passage in top_passages:
    print(passage)

Top Relevant Passages:


{'PassageID': '20.14.1.(2)', 'DocumentID': 3, 'Score': 92.97748804092407, 'Rank': 1, 'Passage': 'As part of that framework, the Third Party Provider must establish and maintain effective incident management procedures, including for the detection and classification of major operational and security incidents.'}
{'PassageID': '19.23.1.(2)', 'DocumentID': 3, 'Score': 91.89510345458984, 'Rank': 2, 'Passage': 'Management of operational and security risks. As part of that framework, the Payment Service Provider must establish and maintain effective incident management procedures, including for the detection and classification of major operational and security incidents.'}
{'PassageID': '35)', 'DocumentID': 21, 'Score': 90.609610080719, 'Rank': 3, 'Passage': 'REGULATORY REQUIREMENTS\nThird Party Outsourcing\nIn its assessment of a potential third party service provider, the regulated firm must therefore satisfy itself that the service provider maintains robust proces

## TREC Evaluation

In [None]:
def load_qrels(docs_dir: str, fqrels: str, ndocs: int = 40) -> tuple[Dict[int, Dict[str, str]], Dict[str, Dict[str, int]]]:
    """Build the passage-ID lookup and the relevance judgments (qrels).

    Reads ``1.json`` … ``{ndocs}.json`` from ``docs_dir`` to map each
    (DocumentID, PassageID) pair to the passage's ``ID`` field, then reads the
    question file ``fqrels`` and marks every listed passage as relevant (1).

    Args:
        docs_dir: Directory containing the numbered document JSON files.
        fqrels: Path to a JSON list of questions with ``QuestionID`` and
            ``Passages`` (each passage has ``DocumentID`` and ``PassageID``).
        ndocs: Number of document files to read; defaults to 40.

    Returns:
        A tuple ``(did2pid2id, qrels)``: ``did2pid2id[DocumentID][PassageID]`` is
        the passage ``ID`` and ``qrels[QuestionID][ID]`` is ``1``. Questions
        without passages get no entry in ``qrels``.

    Raises:
        KeyError: If a question references a (DocumentID, PassageID) pair that is
            not present in the document files.
    """
    # NOTE(review): key/value types in the annotations are assumptions (integer
    # DocumentID, string ID) — confirm against the dataset files.
    did2pid2id: Dict[int, Dict[str, str]] = {}
    for i in range(1, ndocs + 1):
        with open(os.path.join(docs_dir, f"{i}.json"), encoding="utf-8") as f:
            doc = json.load(f)
        for psg in doc:
            pid2id = did2pid2id.setdefault(psg["DocumentID"], {})
            # NOTE(review): this compares the passage's ID against the PassageID
            # keys; it looks like a duplicate check — confirm the intended key.
            assert psg["ID"] not in pid2id
            # First occurrence of a PassageID within a document wins.
            pid2id.setdefault(psg["PassageID"], psg["ID"])

    with open(fqrels, encoding="utf-8") as f:
        data = json.load(f)
    qrels: Dict[str, Dict[str, int]] = {}
    for e in data:
        qid = e["QuestionID"]
        for psg in e["Passages"]:
            pid = did2pid2id[psg["DocumentID"]][psg["PassageID"]]
            qrels.setdefault(qid, {})[pid] = 1
    return did2pid2id, qrels

In [None]:
did2pid2id, qrels = load_qrels("/content/ObliQADataset/StructuredRegulatoryDocuments", "/content/ObliQADataset/ObliQA_test.json")
# Written to an absolute path: trec_eval is pointed at /content/qrels, and the
# notebook changes the working directory with %cd, so a relative path is fragile.
with open("/content/qrels", "w") as f:
    for qid, rels in qrels.items():
        for pid, rel in rels.items():
            # One judgment per line: question ID, "Q0", passage ID, relevance.
            f.write(f"{qid} Q0 {pid} {rel}\n")

In [None]:
# The run file is written to the absolute path that the download and trec_eval
# cells read (/content/rankings.trec), independent of the current directory.
# NOTE(review): only test_data[1588:] is ranked although the qrels cover the whole
# test file — confirm the offset is intentional; mode 'w' discards any rankings
# written by an earlier run.
with open('/content/rankings.trec', 'w') as f:
    for question in test_data[1588:]:
        query = question['Question']
        query_id = question['QuestionID']
        retrieved_docs = hybrid_retrieval(query, query_id)
        for doc_index, doc in enumerate(retrieved_docs, start=1):
            doc_id = doc['DocumentID']
            passage_id = doc['PassageID']
            # Falls back to the raw PassageID when the (document, passage) pair
            # is not in the lookup.
            pid = did2pid2id.get(doc_id, {}).get(passage_id, passage_id)
            if not pid:
                print(f"No matching PID found for doc_id: {doc_id}, passage_id: {passage_id}")
                continue
            score = doc['Score']
            f.write(f"{query_id} 0 {pid} {doc_index} {score} HybridRetreival\n")

In [None]:
from google.colab import files

files.download('/content/rankings.trec')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!trec_eval/trec_eval -m recall.10 -m map_cut.10 /content/qrels /content/rankings.trec

recall_10             	all	0.7938
map_cut_10            	all	0.7474


## Rankings File of Unseen Data

In [None]:
# ID/text records for the unseen questions (this rebinds `questions`).
questions = [
    {"QuestionID": item["QuestionID"], "Question": item["Question"]}
    for item in unseen_data
]

dimension = 768
unseen_index = faiss.IndexFlatL2(dimension)

# Position of each unseen question inside the embedding matrix built below.
unseen_question_to_id_map = {
    record["QuestionID"]: position for position, record in enumerate(questions)
}

# One embedding row per unseen question, in the same order as `questions`.
unseen_question_embeddings = np.vstack(
    [embed_text(record["Question"]).flatten() for record in questions]
)
unseen_index.add(unseen_question_embeddings)

# Persist the index, the embedding matrix and the ID map.
faiss.write_index(unseen_index, "/content/unseen_index.index")

np.save("/content/unseen_question_embeddings.npy", unseen_question_embeddings)

with open("/content/unseen_question_to_id_map.pkl", "wb") as f:
    pickle.dump(unseen_question_to_id_map, f)

In [None]:
# Append the unseen-question embeddings below the existing rows.
combined_embeddings = np.vstack((all_question_embeddings, unseen_question_embeddings))

with open("/content/unseen_question_to_id_map.pkl", "rb") as f:
    new_question_to_id_map = pickle.load(f)

# Unseen rows start right after the existing rows of the matrix, so the offset is
# the row count of `all_question_embeddings` (len(question_to_id_map) would be
# wrong whenever the map and the matrix differ in size). A comprehension is used
# so that no loop variable overwrites the global FAISS `index`.
offset = all_question_embeddings.shape[0]
new_question_to_id_map = {
    unseen_id: row + offset for unseen_id, row in new_question_to_id_map.items()
}

combined_question_to_id_map = {**question_to_id_map, **new_question_to_id_map}

# Loaded but not used further in this cell.
new_index = faiss.read_index("/content/unseen_index.index")

# Reload the original index from disk and extend it with the unseen embeddings.
index = faiss.read_index("/content/faiss_index.index")

index.add(unseen_question_embeddings)

faiss.write_index(index, "/content/updated_index.index")

np.save("/content/updated_question_embeddings.npy", combined_embeddings)

with open("/content/updated_question_to_id_map.pkl", "wb") as f:
    pickle.dump(combined_question_to_id_map, f)

In [None]:
# Rebind the retrieval globals to the updated artifacts written by the previous cell.
index_file_path = "/content/updated_index.index"

all_question_embeddings = np.load("/content/updated_question_embeddings.npy")
print("Embeddings shape:", all_question_embeddings.shape)

index = faiss.read_index(index_file_path)
print("FAISS index loaded successfully.")

with open("/content/updated_question_to_id_map.pkl", "rb") as map_file:
    question_to_id_map = pickle.load(map_file)

Embeddings shape: (25527, 768)
FAISS index loaded successfully.


In [None]:
# The run file is written to the absolute path that the download cell reads
# (/content/rankings_unseen.trec), independent of the current directory.
with open('/content/rankings_unseen.trec', 'w') as f:
    for question in unseen_data:
        query = question['Question']
        query_id = question['QuestionID']
        retrieved_docs = hybrid_retrieval(query, query_id)
        for doc_index, doc in enumerate(retrieved_docs, start=1):
            doc_id = doc['DocumentID']
            passage_id = doc['PassageID']
            # Falls back to the raw PassageID when the (document, passage) pair
            # is not in the lookup.
            pid = did2pid2id.get(doc_id, {}).get(passage_id, passage_id)
            score = doc['Score']
            f.write(f"{query_id} 0 {pid} {doc_index} {score} hybret\n")

In [None]:
files.download('/content/rankings_unseen.trec')

# PART 2 - Answer Generation

## Using llama3-70b-8192 and llama3-8b-8192 alternately

In [None]:
def generate_answer(query, context, model="llama3-70b-8192", timeout=60):
    """Ask the chat-completion endpoint at ``GROQ_URL`` to answer ``query`` from ``context``.

    Args:
        query: The question text.
        context: Text of the retrieved passages, inserted into the prompt.
        model: Model name sent in the request body; defaults to the previously
            hard-coded ``"llama3-70b-8192"``.
        timeout: Seconds to wait for the HTTP request before giving up. Without a
            timeout, ``requests`` can block indefinitely.

    Returns:
        The generated answer text on HTTP 200. On any other status, or when the
        request itself fails, a string of the form ``"Error: <details>"``, which
        is also printed.
    """
    prompt = (
        "Based on the following context, provide a detailed and structured answer that directly addresses the question. Each answer sentence must align with a sentence in the source passage(s), covering all critical regulatory obligations and avoiding any contradictions.\n\n"
        f"Question: {query}\n"
        f"Context: {context}\n"
        "Answer: Provide a comprehensive response that reflects all key requirements and procedures mentioned in the regulatory documents, ensuring factual consistency and alignment with the context."
    )

    try:
        response = requests.post(
            GROQ_URL,
            headers={"Authorization": f"Bearer {GROQ_API_KEY}", "Content-Type": "application/json"},
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}]
            },
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors so that a batch
        # run is not aborted by a single failed request.
        print(f"Error: {exc}")
        return f"Error: {exc}"

    if response.status_code == 200:
        return response.json()['choices'][0]['message']['content']
    else:
        print(f"Error: {response.text}")
        return f"Error: {response.text}"

In [None]:
def run_pipeline(query, query_id):
    """Retrieve passages for a question and generate an answer from them.

    Args:
        query: The question text.
        query_id: The question's ID, forwarded to ``hybrid_retrieval``.

    Returns:
        dict with the keys ``QuestionID``, ``Question``, ``RetrievedPassage(s)``
        (list of passage texts) and ``Answer``.
    """
    passages = [hit['Passage'] for hit in hybrid_retrieval(query, query_id)]

    # The passages are joined with single spaces to form the prompt context.
    answer = generate_answer(query, " ".join(passages))

    return {
        "QuestionID": query_id,
        "Question": query,
        "RetrievedPassage(s)": passages,
        "Answer": answer,
    }

In [None]:
#Test Run
result = run_pipeline(query, query_id)
print(result)

In [None]:
#Test Unseen Run
unseen_id="07ced741-8abc-43a2-80b0-7740685481f4"
unseen_query="Can the ADGM provide clarification on the processes and procedures a Reporting Entity should follow if it disagrees with the Listing Authority\u2019s assessment that disclosure is necessary to correct or prevent a false market?"

In [None]:
result = run_pipeline(unseen_query, unseen_id)
print(result)

In [None]:
def load_existing_results(file_path):
    """Return the JSON content of ``file_path``, or ``[]`` if the file does not exist.

    Args:
        file_path: Path of the JSON results file.

    Returns:
        The parsed JSON value, or an empty list when the file is missing.
    """
    try:
        handle = open(file_path, "r")
    except FileNotFoundError:
        # Nothing has been saved yet.
        return []
    with handle:
        return json.load(handle)

In [None]:
results = load_existing_results("/content/generated_answers_unseen.json")

# Resume support: questions that already have a saved result are skipped, so
# re-running this cell does not regenerate them or append duplicate entries.
# Entries whose answer is an "Error: ..." string also count as already processed.
answered_ids = {entry["QuestionID"] for entry in results}

test_questions = [
    {"QuestionID": item["QuestionID"], "Question": item["Question"]}
    for item in unseen_data
    if item["QuestionID"] not in answered_ids
]

batch_size = 5

for i in range(0, len(test_questions), batch_size):
    batch = test_questions[i:i + batch_size]
    batch_results = []

    for q in batch:
        result = run_pipeline(q["Question"], q["QuestionID"])
        batch_results.append(result)
        # Pause between requests.
        time.sleep(10)

    results.extend(batch_results)

    # Save after every batch so that an interruption loses at most one batch.
    with open("/content/generated_answers_unseen.json", "w") as f:
        json.dump(results, f, indent=4)

    print(f"Processed batch {i // batch_size + 1} with {len(batch)} questions.")

In [None]:
files.download('/content/generated_answers_unseen.json')

### RePASs on Llama

In [None]:
%cd /content/RePASs

Evaluation of Answers Generated for Unseen Questions

In [None]:
!python /content/RePASs/scripts/evaluate_model.py --input_file /content/generated_answers_unseen.json --group_method_name my_method