# Submission notebook for Kaggle

## Packages required

In [None]:
!pip install numpy pandas spacy nltk tqdm scipy 

## Imports

In [1]:
from data_helpers import QueryClean
from models.BM25s.bm25s import BM25sRetriever
import pandas as pd

## Parameters

In [None]:
# Output CSV paths on Kaggle
test_output_path = '/kaggle/temp/test_submission.csv'
submission_output_path = '/kaggle/working/submission.csv'

# Query pre-processing variant: 'lc', 'lc_sw' or 'lc_sw_l'.
# Must be a plain string: a trailing comma here would turn it into the 1-tuple ('lc',),
# which then ends up verbatim inside the model path built from this value.
processing_wanted = 'lc'

# k1 value used in the model path (directory 'k1_<value>')
k1 = 1.6

## Query Preprocessing

In [None]:
# Build the query cleaner on the competition's test queries, using the
# pre-processing variant selected in the parameters cell.
query = QueryClean(
    queries_path='/kaggle/input/dis-project-1-document-retrieval/test.csv',
    processing_wanted=processing_wanted,
    show_progress=False,
)

# Run the chosen pre-processing step and keep its return value as `langs`
langs = query.pre_process()

## Model BM25 initialization and document retrieval

In [None]:
# Accumulator for the matches of every language, later merged into one .csv file
match_per_lang = []

# One BM25sRetriever per language: point it at that language's cleaned queries
# and its .pkl model file, run the matching, and keep the resulting matches.
for lang in langs:
    bm25s = BM25sRetriever(
        queries_df=query.data_clean[lang],
        model_path=f'models/BM25s/bm25s_matrix/{processing_wanted}/k1_{k1}/bm25s_{lang}.pkl',  # TODO
        top_k=10,
    )
    bm25s.match()
    match_per_lang.append(bm25s.matches)

## Submission.csv output

In [None]:
# Merge the per-language matches into one object with a fresh 0..n-1 index
# (presumably each entry is a pd.Series — confirm against BM25sRetriever.matches)
matches = pd.concat(match_per_lang, ignore_index=True)

# Write the submission file to submission_output_path (/kaggle/working/submission.csv).
# Writing to test_output_path instead would leave no submission.csv in the
# notebook's output directory. The index is written out as the 'id' column.
matches.to_csv(submission_output_path,
               index=True,
               index_label='id')