# BM25

This notebook demonstrates our BM25 retrieval method and reports its performance.

## Import

In [1]:
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from bm25 import BM25_retriever
from metrics import export_query_result_to_submission_csv, get_answers_for_submission_csv, compute_recall_10
from query_preprocessing import preprocess_queries

# Silence all Python warnings so they do not clutter the cell outputs.
import warnings
warnings.filterwarnings('ignore')

# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## BM25 Initialization

In [2]:
# Build one BM25 retriever per language, keyed by language code.
# Each retriever is constructed from its `bm25_models/bm25_<lang>.pkl` file.
langs = ['en', 'fr', 'de', 'es', 'it', 'ko', 'ar']
# For a quick single-language run, use instead: langs = ['en']
langs_models = {}

progress = tqdm(langs, desc="Loading BM25 models", unit="lang")
for lang in progress:
    langs_models[lang] = BM25_retriever(f"bm25_models/bm25_{lang}.pkl")

Loading BM25 models:   0%|          | 0/7 [02:00<?, ?lang/s]


KeyboardInterrupt: 

## Document Retrieval

In [None]:
# Preprocess the raw test queries and keep a CSV copy of the result.
queries = preprocess_queries("../data/test.csv")

# Make sure the output folder exists before writing: assuming `queries` is a
# pandas DataFrame, to_csv raises OSError when the parent directory is missing
# (e.g. on a fresh checkout).
queries_csv = Path("../output/queries.csv")
queries_csv.parent.mkdir(parents=True, exist_ok=True)
queries.to_csv(queries_csv, index=False)

Retrieve the top 10 documents for every query.

In [None]:
# Run retrieval language by language: each model only receives the queries
# written in its own language, and every language's results are exported
# through the same export helper (first call in 'w' mode, later calls in 'a').
for position, (lang, model) in enumerate(langs_models.items()):
    # Keep only the queries for the current language.
    lang_queries = queries[queries['lang'] == lang]

    # Mapping whose keys are query ids and whose values are the retrieved
    # document ids (presumably the top 10 per query, going by the method name).
    top10_docid_per_query = model.top_10_docid_for_all_queries(lang_queries)

    queries_id = list(top10_docid_per_query.keys())
    docids = list(top10_docid_per_query.values())

    # The first model processed starts a fresh file ('w'); the others append ('a').
    # This is decided by iteration position rather than by comparing with
    # `langs[0]`: `langs` is a mutable global that is reassigned elsewhere in
    # the notebook, so relying on it could skip the truncation (leaving stale
    # rows) or truncate in the middle of a run.
    mode = 'w' if position == 0 else 'a'

    export_query_result_to_submission_csv(queries_id, docids, mode=mode)


Compute recall@10

In [None]:
# Build the official submission file from the raw retrieval output: the
# query_id column is removed and the DataFrame's default 0-based integer index
# is written out as the first column.
results = pd.read_csv("../output/submission.csv").drop(columns=["query_id"])
# NOTE(review): with index=True and no index_label, the index column is written
# with an empty header rather than "id" -- confirm this is what the grader expects.
results.to_csv("../output/official_submission.csv", index=True)

In [None]:
# get_answers_for_submission_csv()
# compute_recall_10()

In [None]:
# Report recall@10 per language. The answers file is treated as consecutive
# fixed-size segments of rows, one segment per language, and the mean of the
# "correct" column is printed for each segment.
results = pd.read_csv("../output/submission_answers.csv")

# Label of each consecutive segment, in row order. Kept under its own name so
# the global `langs` used by the model-loading and retrieval cells is not
# overwritten.
# NOTE(review): this order differs from the one used when loading the models
# (there 'de' precedes 'es' and 'ko' precedes 'ar') -- confirm it matches the
# row order of submission_answers.csv.
segment_langs = ['en', 'fr', 'es', 'de', 'it', 'ar', 'ko']

# Number of rows per language segment.
segment_size = 200

# Only complete segments are scored.
num_segments = len(results) // segment_size

print("Recall@10 per language segment: \n")
for i in range(num_segments):
    start = i * segment_size
    segment = results["correct"].iloc[start:start + segment_size]
    # Fall back to a generic label instead of raising IndexError when the file
    # holds more segments than there are language labels.
    label = segment_langs[i] if i < len(segment_langs) else f"segment {i}"
    print(f"{label}: {segment.mean():.2f}")

# Make it visible when some rows were left out of the per-segment means.
leftover_rows = len(results) % segment_size
if leftover_rows:
    print(f"\nNote: {leftover_rows} trailing rows do not fill a complete segment and were ignored.")