# BM25

This notebook demonstrates our BM25 retrieval method and reports its performance.

## Import

In [1]:
from tf_idf import TFIDFTrainer
from rank_bm25 import BM25Okapi
import pandas as pd
import data_loader

%load_ext autoreload
%autoreload 2

## Document cleaning

In [2]:
# Load the French corpus file through TFIDFTrainer; only its `corpus` attribute is kept.
french_corpus_path = "../data/corpus.json/clean_corpus_fr.json"
french_doc_df = TFIDFTrainer(french_corpus_path).corpus

Loading corpus...
Corpus loaded successfully !

Information about the given corpus
###################################
Number of documents: 10676
Language (only one language should be displayed): ['fr']



In [3]:
# Show the loaded corpus frame (columns: docid, text, lang) as the cell output.
french_doc_df

Unnamed: 0,docid,text,lang
0,doc-fr-1447,production café costa rica représente 2016 env...,fr
1,doc-fr-4878,continuité gouvernement cog principe établisse...,fr
2,doc-fr-801,juan manuel fangio né balcarce mort buenos air...,fr
3,doc-fr-1750,louis auguste mathieu legrand peintre dessinat...,fr
4,doc-fr-5810,gare lille saint sauveur simplement gare saint...,fr
...,...,...,...
10671,doc-fr-4573,frelon asiatique frelon pattes jaunes vespa ve...,fr
10672,doc-fr-7662,surveillance épidémiologique activité santé pu...,fr
10673,doc-fr-7669,série télévisée britannique 37 épisodes enviro...,fr
10674,doc-fr-7683,lucien bégule né saint genis laval mort lyon 2...,fr


### BM25 Initialization

In [4]:
# Pull the document texts out of the corpus frame as a plain Python list.
text_column = french_doc_df['text']
corpus = text_column.tolist()

# Tokenize each document on whitespace (str.split with no separator).
tokenized_corpus = [document.split() for document in corpus]

# Build the BM25 index over the tokenized documents.
bm25 = BM25Okapi(tokenized_corpus)

In [5]:
# Load the training queries, then keep only the rows whose `lang` is 'fr'.
train_df = pd.read_csv("../data/train.csv")
is_french_query = train_df['lang'] == 'fr'
french_train_df = train_df.loc[is_french_query]

In [6]:
from tqdm import tqdm

# Empty results table: one column for the retrieved doc IDs, one for the hit flag.
result_columns = ['top_10_doc', 'is_answer_in_top_10']
results_df = pd.DataFrame(columns=result_columns)

# Rank the corpus against one training query and report whether the expected doc was retrieved
def get_top_10_and_check(row):
    """Return the best BM25 matches for one query and whether the answer is among them.

    Relies on the notebook-level `bm25` index, `french_doc_df` corpus frame and
    `data_loader` module.

    Args:
        row: A row of the training frame; its 'query' and 'positive_docs'
            entries are read.

    Returns:
        A ``(doc_ids, hit)`` tuple: the 'docid' values of the (up to) 10
        highest-scoring documents, best first, and whether
        ``row['positive_docs']`` appears in that list.
    """
    # Clean the query with the French settings before tokenizing on whitespace.
    cleaned_query = data_loader.clean_sentence(row['query'], 'fr')
    query_tokens = cleaned_query.split()

    bm25_scores = bm25.get_scores(query_tokens)
    # argsort is ascending: reverse it so the best score comes first, then keep 10 positions.
    best_positions = bm25_scores.argsort()[::-1][:10]
    best_doc_ids = french_doc_df['docid'].iloc[best_positions].tolist()

    return best_doc_ids, row['positive_docs'] in best_doc_ids

# Run the retrieval for every French training query and collect the outcomes.
# The (doc_ids, hit) tuples are gathered in a plain list and the frame is built once:
# assigning new rows one `.at[...]` at a time makes pandas enlarge the frame on every
# iteration and leaves both columns as generic object dtype.
retrieval_records = [
    get_top_10_and_check(row)
    for _, row in tqdm(french_train_df.iterrows(), total=french_train_df.shape[0])
]

# Reuse the training frame's index so the results stay aligned with their queries.
results_df = pd.DataFrame(
    retrieval_records,
    columns=['top_10_doc', 'is_answer_in_top_10'],
    index=french_train_df.index,
)



100%|██████████| 1608/1608 [01:29<00:00, 17.90it/s]


In [11]:
# Attach the retrieved document IDs to their query IDs.
# `results_df` is indexed by the same labels as `french_train_df`, so an index join
# lines the rows up. The join must actually run: the training frame has no
# `top_10_doc` column of its own, so selecting it without joining raises a KeyError
# on a fresh kernel.
# Selecting only `query_id` before joining keeps the cell re-runnable (no duplicated
# `top_10_doc` column on a second execution).
french_train_df = french_train_df[['query_id']].join(results_df[['top_10_doc']], how='inner')
french_train_df

Unnamed: 0,query_id,top_10_doc
10000,q-fr-1080,"[doc-fr-7656, doc-fr-8717, doc-fr-12048, doc-f..."
10001,q-fr-1081,"[doc-fr-7404, doc-fr-9341, doc-fr-8341, doc-fr..."
10002,q-fr-1082,"[doc-fr-8048, doc-fr-14074, doc-fr-11133, doc-..."
10003,q-fr-1083,"[doc-fr-12294, doc-fr-13317, doc-fr-8086, doc-..."
10004,q-fr-1084,"[doc-fr-9596, doc-fr-12551, doc-fr-8680, doc-f..."
...,...,...
11603,q-fr-1072,"[doc-fr-7662, doc-fr-12947, doc-fr-13576, doc-..."
11604,q-fr-1073,"[doc-fr-7669, doc-fr-10286, doc-fr-10131, doc-..."
11605,q-fr-1075,"[doc-fr-7683, doc-fr-9452, doc-fr-4692, doc-fr..."
11606,q-fr-1077,"[doc-fr-13231, doc-fr-2905, doc-fr-13621, doc-..."


In [31]:
from metrics import export_query_result_to_submission_csv, get_answers_for_submission_csv, compute_recall_10

# Hand the query IDs and their ranked document lists to the submission exporter.
submission_query_ids = french_train_df['query_id'].tolist()
submission_doc_lists = french_train_df['top_10_doc'].tolist()
export_query_result_to_submission_csv(submission_query_ids, submission_doc_lists)

In [32]:
# NOTE(review): helper imported from `metrics`, called with no arguments; presumably it
# prepares the ground-truth answers the exported submission is scored against — confirm
# in `metrics`.
get_answers_for_submission_csv()

In [29]:
# Display the score returned by the `metrics` helper (presumably recall@10 of the exported
# BM25 run against the prepared answers — TODO confirm which files it reads).
compute_recall_10()

0.4191542288557214