# BM25

This notebook demonstrates our BM25 retrieval method and reports its performance.

## Import

In [1]:
from bm25 import BM25
import pandas as pd
from tqdm import tqdm
from query_preprocessing import preprocess_queries
from metrics import export_query_result_to_submission_csv, get_answers_for_submission_csv, compute_recall_10

# Silence every warning category so library warnings do not clutter cell output.
# Note: this also hides deprecation warnings for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Reload imported modules before each cell execution, so edits to the local
# .py modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## BM25 Initialization

In [3]:
# Language codes whose pre-built BM25 model should be loaded.
langs = ['en']

# Maps a language code to the BM25 model read from bm25_models/bm25_<lang>.pkl.
# NOTE(review): load_pickle presumably unpickles the file -- only load model
# files that come from a trusted source.
langs_models = {}
for lang in tqdm(langs, desc="Loading BM25 models", unit="lang"):
    langs_models[lang] = BM25.load_pickle(f"bm25_models/bm25_{lang}.pkl")

Loading BM25 models:   0%|          | 0/1 [00:00<?, ?lang/s]Exception ignored in: <generator object tqdm.__iter__ at 0x0000023672A84DC0>
Traceback (most recent call last):
  File "C:\Users\ratas\Documents\EPFL_MA3\DIS_Project\.venv\Lib\site-packages\tqdm\std.py", line 1196, in __iter__
    self.close()
  File "C:\Users\ratas\Documents\EPFL_MA3\DIS_Project\.venv\Lib\site-packages\tqdm\std.py", line 1265, in close
    def close(self):

KeyboardInterrupt: 
Loading BM25 models:   0%|          | 0/1 [04:28<?, ?lang/s]


KeyboardInterrupt: 

## Document Retrieval

In [17]:
# Build the query table from the dev set; the retrieval cell filters it by its 'lang' column.
queries = preprocess_queries("../data/dev.csv")
# Save the preprocessed queries to a CSV file (without the index column)
queries.to_csv("../output/queries.csv", index=False)

Retrieve the top 10 documents for every query.

In [18]:
# Rank the queries of every loaded language with that language's BM25 model and
# export the results to the submission CSV.
#
# The first model processed writes the file from scratch (mode 'w'); every later
# model appends to it (mode 'a'). The decision is based on the iteration position
# in `langs_models` rather than on the global `langs` list, so the export stays
# correct even when `langs` is reassigned or no longer matches the loaded models.
for position, (lang, model) in enumerate(langs_models.items()):
    # Keep only the queries written in the current language
    lang_queries = queries[queries['lang'] == lang]

    # Mapping of query id -> top 10 document ids, as returned by the model
    top10_docid_per_query = model.top_10_docid_for_all_queries(lang_queries)

    # Split the mapping into parallel lists of query ids and document ids
    queries_id = list(top10_docid_per_query.keys())
    docids = list(top10_docid_per_query.values())

    # Truncate the file on the first language only, append afterwards
    mode = 'w' if position == 0 else 'a'

    # Export the results to a CSV file
    export_query_result_to_submission_csv(queries_id, docids, mode=mode)


Ranking queries: 100%|██████████| 200/200 [00:35<00:00,  5.61it/s]
Ranking queries: 100%|██████████| 200/200 [00:31<00:00,  6.35it/s]
Ranking queries: 100%|██████████| 200/200 [00:30<00:00,  6.67it/s]
Ranking queries: 100%|██████████| 200/200 [00:31<00:00,  6.41it/s]
Ranking queries: 100%|██████████| 200/200 [00:26<00:00,  7.47it/s]
Ranking queries: 100%|██████████| 200/200 [00:22<00:00,  8.84it/s]


Compute recall@10

In [19]:
# Prepare the answers for the exported submission before scoring.
# NOTE(review): presumably this writes ../output/submission_answers.csv, which the
# per-language cell reads -- confirm in metrics.py.
get_answers_for_submission_csv()
# Last expression of the cell, so its return value is displayed as the cell output.
compute_recall_10()

0.7658333333333334

In [None]:
# Read the submission file, remove the "query_id" column and add an "id" column
# holding the row position (the default index, starting from 0).
# NOTE(review): the frame is not written back to disk in this cell.
results = (
    pd.read_csv("../output/submission.csv")
    .drop(columns=["query_id"])
    .assign(id=lambda frame: frame.index)
)

In [20]:
# Recall@10 per language: the answers file is read as consecutive fixed-size
# segments, one per language, and the mean of the "correct" column is reported
# for each segment.
results = pd.read_csv("../output/submission_answers.csv")

# Languages in the order their rows appear in the answers file. A dedicated name
# is used so the global `langs` list of the model-loading cell is not overwritten.
# NOTE(review): assumes rows are stored language by language in exactly this
# order -- confirm against the export order of the retrieval cell.
segment_langs = ['fr', 'es', 'de', 'it', 'ar', 'ko']

# Number of queries (rows) per language
segment_size = 200

# Fail loudly instead of printing scores under the wrong language label (too few
# rows) or dying with a bare IndexError (too many rows).
expected_rows = segment_size * len(segment_langs)
if len(results) != expected_rows:
    raise ValueError(
        f"Expected {expected_rows} rows ({len(segment_langs)} languages x "
        f"{segment_size} queries) but submission_answers.csv has {len(results)}; "
        "update segment_langs/segment_size to match the exported languages."
    )

print("Recall@10 per language segment: \n")
# Print the mean of the "correct" column of each segment, rounded to 2 decimal places
for i, lang in enumerate(segment_langs):
    start = i * segment_size
    end = start + segment_size
    mean_value = results["correct"].iloc[start:end].mean()
    print(f"{lang}: {mean_value:.2f}")

Recall@10 per language segment: 

fr: 0.91
es: 0.66
de: 0.91
it: 0.74
ar: 0.65
ko: 0.72
