# Installation
- Make sure to install python3.12 and create a virtual environment
- install the dependencies in the requirements.txt (`pip install -r requirements.txt`)

In [1]:
from typing import List
from sentence_transformers import CrossEncoder
import torch
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.cuda.is_available())

False


# Testing bge-reranker-v2-m3

In [3]:
# load model (will auto-download the first time it is run and get cached)
model_name = "BAAI/bge-reranker-v2-m3" # Model size 568M params - https://huggingface.co/BAAI/bge-reranker-v2-m3
reranker = CrossEncoder(model_name, device="cuda" if torch.cuda.is_available() else "cpu")

In [4]:
def rerank(query: str, docs: list[str], top_m: int = 5):
    pairs = [(query, d) for d in docs]  # model expects list of (query, passage) pairs so we create them here
    scores = reranker.predict(pairs)  # get prediction scores for each pair
    ranked = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True) # combine passages and scores and sort them by score
    return ranked[:top_m] 

In [5]:
# example passages to rerank
retrieved_docs = [
    "Thomas Edison invented the electric light bulb in 1879.",
    "Bananas are yellow and grow in tropical regions.",
    "Edison also founded General Electric.",
    "The Wright brothers invented the airplane.",
    "Albert Einstein developed the theory of relativity.",
    "The Great Wall of China is visible from space.",
    "Isaac Newton formulated the laws of motion and universal gravitation.",
    "The capital of France is Paris.",
    "The human body has 206 bones.",
    "The Pacific Ocean is the largest ocean on Earth.",
    "The Mona Lisa was painted by Leonardo da Vinci.",
    "Water boils at 100 degrees Celsius.",
    "Mount Everest is the highest mountain in the world.",
    "Shakespeare wrote many famous plays.",
    "The Amazon rainforest is the largest tropical rainforest.",
    "The speed of light is approximately 299,792 kilometers per second.",
    "The currency of Japan is the yen.",
    "The Eiffel Tower is located in Paris.",
    "The human brain is the most complex organ in the body.",
    "The Great Barrier Reef is the largest coral reef system.",
    "The first manned moon landing was in 1969.",
    "The Statue of Liberty was a gift from France to the United States.",
    "The Nile is the longest river in the world.",
    "The human heart pumps blood throughout the body.",
    "The Taj Mahal is located in India.",
    "The Sahara is the largest hot desert in the world.",
    "The first computer was invented in the 1940s.",
    "The human eye can distinguish about 10 million different colors.",
    "The Colosseum is an ancient amphitheater in Rome.",
    "The Great Depression began in 1929.",
    "The human skeleton provides structure and support to the body.",
    "The Leaning Tower of Pisa is famous for its tilt.",
    "The first successful airplane flight was in 1903.",
    "The human skin is the body's largest organ.",
    "The Golden Gate Bridge is located in San Francisco.",
    "The first telephone was invented by Alexander Graham Bell.",
]

In [6]:
top_docs = rerank("Who invented the light bulb?", retrieved_docs, top_m=10)
top_docs


[('Thomas Edison invented the electric light bulb in 1879.',
  np.float32(0.99463516)),
 ('Isaac Newton formulated the laws of motion and universal gravitation.',
  np.float32(0.004423816)),
 ('Edison also founded General Electric.', np.float32(0.0041850945)),
 ('Albert Einstein developed the theory of relativity.',
  np.float32(0.00369747)),
 ('The first telephone was invented by Alexander Graham Bell.',
  np.float32(0.0020467327)),
 ('The Mona Lisa was painted by Leonardo da Vinci.', np.float32(0.0008676998)),
 ('The Wright brothers invented the airplane.', np.float32(0.0002814222)),
 ('Mount Everest is the highest mountain in the world.',
  np.float32(8.034188e-05)),
 ('The Eiffel Tower is located in Paris.', np.float32(6.339106e-05)),
 ('Shakespeare wrote many famous plays.', np.float32(4.1562165e-05))]

## Comparison between the reranker and the retriever
The reranker should be more powerful than the retriever. Otherwise, it will not improve the ranking of the documents


How to compare? 
- we can try a couple of questions from the dataset and check the rank of the answer sentences (the reranker should improve their rank)

In [7]:
import os
import sys
from pathlib import Path
import json


project_root = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))
sys.path.append(project_root)

DATA_DIR = Path("/Users/hamad/Desktop/Apps/SDAIA/Project/SDAIA-Final-Project/qa_system/data/hotpot_dev_fullwiki_v1.json")


from qa_system.retrieval import Retriever
from qa_system.reranker import Reranker



In [9]:
def load_jsonl_or_json(path: Path):
    with path.open("r", encoding="utf-8") as f:
        raw = f.read().strip()
        if not raw:
            return []
        if "\n" in raw and raw.lstrip().startswith("{"):
            return [json.loads(line) for line in raw.splitlines() if line.strip()]
        return json.loads(raw)

retriever = Retriever()
reranker = Reranker()
dataset = load_jsonl_or_json(DATA_DIR)


[Reranker] Loading BAAI/bge-reranker-v2-m3 on cpu...


In [10]:

TEST_SIZE = 10

queries = []
answers = []

for entry in dataset[:TEST_SIZE]:
    queries.append(entry["question"])
    ans = []
    for supporting_fact in entry["supporting_facts"]:
        title = supporting_fact[0]
        sentence_no = supporting_fact[1]
        
        for passage in entry['context']:
            if title == passage[0]:
                ans.append(title + ": " + passage[1][sentence_no])
                break
        
    answers.append(ans)



for i in range(5):
    print(f"\nQuery: {queries[i]}")
    print("Answers:")
    for j in range(len(answers[i])):
        print(f"\t {j}. {answers[i][j]}")
    print()
    


Query: Were Scott Derrickson and Ed Wood of the same nationality?
Answers:


Query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?
Answers:
	 0. Kiss and Tell (1945 film): Kiss and Tell is a 1945 American comedy film starring then 17-year-old Shirley Temple as Corliss Archer.


Query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?
Answers:
	 0. Animorphs: Animorphs is a science fantasy series of young adult books written by Katherine Applegate and her husband Michael Grant, writing together under the name K. A. Applegate, and published by Scholastic.
	 1. Animorphs:  It is told in first person, with all six main characters taking turns narrating the books through their own perspectives.


Query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?
Answers:
	 0. Laleli Mosque: The Laleli Mosque (Turki

In [11]:
retriever_results = {}
for query in queries:
    results = retriever.retrieve(query, top_k=20)
    retriever_results[query] = results





huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [12]:
reranker_results = {}
for query in retriever_results:
    results = reranker.rerank(query, retriever_results[query], top_k=len(retriever_results[query]))
    reranker_results[query] = results


In [None]:
retriever_results['Were Scott Derrickson and Ed Wood of the same nationality?']

{'text': 'Sinister 2: Sinister 2 is a 2015 American supernatural horror film directed by Ciaran Foy and written by Scott Derrickson and C. Robert Cargill.',
 'id': '52931294a7c56e69d77c5b872cb0d001',
 'retriever_score': 11.20068359375,
 'reranker_score': 0.008293986320495605,
 'score': 0.008293986320495605}

In [37]:
for i in range(5):
    print(f"\nQuery: {queries[i]}")
    print("Retriever Answers:")
    for idx, answer in enumerate(retriever_results[queries[i]][:5]):
        print(f"\t {idx+1}. {answer['text']}")

    print("\nReranker Answers:")
    for idx, answer in enumerate(reranker_results[queries[i]][:5]):
        print(f"\t {idx+1}. {answer['text']}")
    
    


Query: Were Scott Derrickson and Ed Wood of the same nationality?
Retriever Answers:
	 1. Sinister 2: Sinister 2 is a 2015 American supernatural horror film directed by Ciaran Foy and written by Scott Derrickson and C. Robert Cargill.
	 2. Ed Wood (film): Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.
	 3. The Exorcism of Emily Rose: The Exorcism of Emily Rose is a 2005 American legal drama horror film directed by Scott Derrickson and starring Laura Linney and Tom Wilkinson.
	 4. Tommy Swerdlow: Tommy Swerdlow is an American actor and screenwriter.
	 5. Doctor Strange (2016 film):  The film was directed by Scott Derrickson, who wrote it with Jon Spaihts and C. Robert Cargill, and stars Benedict Cumberbatch as Stephen Strange, along with Chiwetel Ejiofor, Rachel McAdams, Benedict Wong, Michael Stuhlbarg, Benjamin Bratt, Scott Adkins, Mads Mikkelsen, and Tilda Swinton.

Reranker An