# BM25

This notebook demonstrates our BM25 retrieval method and evaluates its performance.

## Import

In [23]:
from tf_idf import TFIDFTrainer
from rank_bm25 import BM25Okapi
import pandas as pd
import data_loader

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Document cleaning

In [24]:
# Load the French corpus (the file name suggests it is already cleaned);
# the trainer object is used here only to obtain its `corpus` frame.
french_corpus_path = "data/corpus.json/clean_corpus_fr.json"
french_doc_df = TFIDFTrainer(french_corpus_path).corpus

Loading corpus...
Corpus loaded successfully !

Information about the given corpus
###################################
Number of documents: 10676
Language (only one language should be displayed): ['fr']



In [35]:
# Inspect the loaded corpus frame: one row per document with its docid, text and lang
french_doc_df

Unnamed: 0,docid,text,lang
0,doc-fr-1447,production café costa rica représente 2016 env...,fr
1,doc-fr-4878,continuité gouvernement cog principe établisse...,fr
2,doc-fr-801,juan manuel fangio né balcarce mort buenos air...,fr
3,doc-fr-1750,louis auguste mathieu legrand peintre dessinat...,fr
4,doc-fr-5810,gare lille saint sauveur simplement gare saint...,fr
...,...,...,...
10671,doc-fr-4573,frelon asiatique frelon pattes jaunes vespa ve...,fr
10672,doc-fr-7662,surveillance épidémiologique activité santé pu...,fr
10673,doc-fr-7669,série télévisée britannique 37 épisodes enviro...,fr
10674,doc-fr-7683,lucien bégule né saint genis laval mort lyon 2...,fr


### BM25 Initialization

In [42]:
# Pull the document texts out of the corpus frame as a plain Python list
corpus = french_doc_df['text'].tolist()

# BM25Okapi is given one token list per document, so whitespace-split each text
tokenized_corpus = [document_text.split() for document_text in corpus]

# Build the BM25 index over the tokenized corpus
bm25 = BM25Okapi(tokenized_corpus)

In [43]:
# Read the full training set, then keep only the rows whose language is French
train_df = pd.read_csv("data/train.csv")
french_train_df = train_df.loc[train_df['lang'].eq('fr')]

In [None]:
from tqdm import tqdm

# Start from an empty frame holding the two per-query result columns
results_df = pd.DataFrame(
    columns=[
        'top_10_doc',           # doc IDs retrieved for the query
        'is_answer_in_top_10',  # whether the positive doc is among them
    ]
)

# Function to get the top-k doc IDs (k defaults to 10) and check if the answer is among them
def get_top_10_and_check(row, k=10):
    """Retrieve the best BM25 matches for one query row and look for its positive doc.

    Relies on the notebook-level `bm25` index, `french_doc_df` corpus frame and
    `data_loader` module.

    Args:
        row: A row of the training frame; its 'query' and 'positive_docs'
            entries are read.
        k: Number of highest-scoring documents to keep. Defaults to 10, the
            cutoff the function name and existing callers assume.

    Returns:
        A tuple ``(top_doc_ids, is_answer_in_top)``: the 'docid' values of the
        k best-scoring documents (best first), and whether
        ``row['positive_docs']`` is one of them.
    """
    # Clean the query with the project helper (presumably the same cleaning the
    # corpus went through -- TODO confirm), then whitespace-tokenize it the way
    # the corpus was tokenized.
    query = data_loader.clean_sentence(row['query'], 'fr')
    tokenized_query = query.split()
    scores = bm25.get_scores(tokenized_query)

    # Reverse the ascending argsort first and truncate afterwards: identical to
    # `[-k:][::-1]` for k >= 1, but also yields an empty selection for k == 0
    # (where `[-0:]` would have returned every document).
    top_indices = scores.argsort()[::-1][:k]

    # `top_indices` are positions in the tokenized corpus, hence positional `iloc`
    top_doc_ids = french_doc_df.iloc[top_indices]['docid'].tolist()

    # NOTE(review): assumes 'positive_docs' holds a single doc ID string -- confirm
    is_answer_in_top = row['positive_docs'] in top_doc_ids
    return top_doc_ids, is_answer_in_top

# Score every French training query, collecting the outputs in plain lists.
# The results frame is then built in a single step instead of being enlarged
# one row at a time with `.at`, which is slow and fragile when the stored
# value is itself a list.
top_10_lists = []
hit_flags = []
for _, row in tqdm(french_train_df.iterrows(), total=french_train_df.shape[0]):
    top_10_doc_ids, is_answer_in_top_10 = get_top_10_and_check(row)
    top_10_lists.append(top_10_doc_ids)
    hit_flags.append(is_answer_in_top_10)

# Reuse the training frame's index so the results line up with their queries
# when they are joined back onto it.
results_df = pd.DataFrame(
    {'top_10_doc': top_10_lists, 'is_answer_in_top_10': hit_flags},
    index=french_train_df.index,
)



In [53]:
# Drop the existing columns before joining
french_train_df = french_train_df.drop(columns=['top_10_doc', 'is_answer_in_top_10'], errors='ignore')

# Merge the results back to the original DataFrame
french_train_df = french_train_df.join(results_df)



In [54]:
# Preview the first rows of the training frame with the retrieval columns joined in
french_train_df.head()

Unnamed: 0,query_id,query,positive_docs,negative_docs,lang,top_10_doc,is_answer_in_top_10
10000,q-fr-1080,Quand Antoine Meillet est-il né ?,doc-fr-7715,"['doc-fr-4657', 'doc-fr-2635', 'doc-fr-7352', ...",fr,"[doc-fr-7656, doc-fr-8717, doc-fr-12048, doc-f...",False
10001,q-fr-1081,Quelles sont les origines de l'algèbre linéair...,doc-fr-7723,"['doc-fr-1298', 'doc-fr-4506', 'doc-fr-6921', ...",fr,"[doc-fr-7404, doc-fr-9341, doc-fr-8341, doc-fr...",False
10002,q-fr-1082,"Quelle est l'étymologie du mot ""algorithme"" et...",doc-fr-7731,"['doc-fr-3025', 'doc-fr-3923', 'doc-fr-5672', ...",fr,"[doc-fr-8048, doc-fr-14074, doc-fr-11133, doc-...",False
10003,q-fr-1083,"Quels sont les pouvoirs exécutif, législatif e...",doc-fr-7739,"['doc-fr-840', 'doc-fr-7178', 'doc-fr-2238', '...",fr,"[doc-fr-12294, doc-fr-13317, doc-fr-8086, doc-...",False
10004,q-fr-1084,Quelle est la langue officielle de l'Autriche ?,doc-fr-7747,"['doc-fr-2144', 'doc-fr-5969', 'doc-fr-3666', ...",fr,"[doc-fr-9596, doc-fr-12551, doc-fr-8680, doc-f...",False


In [55]:
# Recall@10: fraction of French training queries whose positive document
# shows up among the 10 retrieved documents
recall = french_train_df.loc[:, 'is_answer_in_top_10'].mean()
print(f"Recall@10: {recall}")

Recall@10: 0.4191542288557214
