# Document Retrieval - BM25s

## Packages required

In [None]:
!python -m spacy download en_core_web_sm 
!python -m spacy download fr_core_news_sm
!python -m spacy download de_core_news_sm 
!python -m spacy download es_core_news_sm
!python -m spacy download it_core_news_sm

## Imports

In [None]:
import pandas as pd
import warnings
from data_helpers import QueryClean
from models.BM25s.bm25s import BM25sRetriever

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from dependencies;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Parameters

In [3]:
# Query split to run retrieval on; interpolated into the path
# data/<query_source>.csv. It must be a plain string: a trailing comma after
# the value would turn it into a one-element tuple and corrupt that path.
query_source = 'test'  # 'train', 'dev' or 'test'

# Pre-processing variant; passed to QueryClean and used to pick the matching
# pre-built model directory.
processing_wanted = 'lc'  # 'lc', 'lc_sw' or 'lc_sw_l'

# k1 value; selects the models/BM25s/bm25s_matrix/<processing>/k1_<k1>/ directory.
k1 = 1.6  # 1.0, 1.1, 1.2, 1.6, 2.0

## Query Preprocessing

In [None]:
# CSV file holding the queries of the selected split
queries_csv = f'data/{query_source}.csv'

# Set up the query cleaner with the chosen pre-processing variant
query = QueryClean(
    queries_path=queries_csv,
    processing_wanted=processing_wanted,
    show_progress=False,
)

# Run the pre-processing; the returned value is iterated per language below
# and used to index query.data_clean.
langs = query.pre_process()

## BM25s model initialization and document retrieval

In [None]:
# Accumulates the matches of every language so they can later be merged
# into a single .csv file
match_per_lang = []

# One BM25sRetriever per language: build it, run the matching, keep the result
for lang in langs:
    # Cleaned queries for this language
    lang_queries = query.data_clean[lang]
    # Path of the .pkl file for this language / pre-processing / k1 setting
    model_path = (
        f'models/BM25s/bm25s_matrix/{processing_wanted}/k1_{k1}/bm25s_{lang}.pkl'
    )

    bm25s = BM25sRetriever(
        queries_df=lang_queries,
        model_path=model_path,
        top_k=10,
    )
    bm25s.match()
    match_per_lang.append(bm25s.matches)

## Submission.csv output

In [None]:
# Stack all the pd.Series to create a unified pd.Series with all the matches
matches = pd.concat(match_per_lang, ignore_index=True)

# Write on disk a .csv file with the matches
matches.to_csv(f'submission.csv',
                       index=True,
                       index_label='id')