# Tokenize the corpus

## If the corpus has not been tokenized yet

In [None]:
# Load corpus json
import json
import gc

# Read the whole corpus into memory. The encoding is pinned to UTF-8 (the
# standard JSON encoding) instead of relying on the platform's locale default,
# which would garble or reject non-ASCII text on some systems.
print('Load corpus.json')
with open('data/corpus.json/corpus.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

In [None]:
# Keep only the 'text' field of every document
print('Extract text from docs')
texts = [entry['text'] for entry in documents]

# The parsed JSON is not used after this point: drop the reference and
# trigger a garbage collection pass
del documents
gc.collect()

In [None]:
import bm25s
import Stemmer  # optional: for stemming

# Build an English stemmer (optional step)
stemmer = Stemmer.Stemmer("english")

# Tokenize the corpus, passing the "en" stop-word setting and the stemmer.
# NOTE(review): the previous comment stated that only token ids are kept
# (faster, less memory); that depends on bm25s.tokenize defaults — confirm.
corpus_tokens = bm25s.tokenize(
    texts,
    stemmer=stemmer,
    stopwords="en",
)

# The raw texts are not used after tokenization: drop the reference and
# trigger a garbage collection pass
del texts
gc.collect()

In [None]:
# Save corpus_tokens on disk
import os
import pickle

# Single definition of the output location, reused in the log message
corpus_tokens_path = 'saved_objects/corpus_tokens.pkl'

# open() does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(corpus_tokens_path), exist_ok=True)

# Serialize the tokenized corpus so that later runs can load it from disk
# instead of tokenizing again
with open(corpus_tokens_path, 'wb') as f:
    pickle.dump(corpus_tokens, f)
print(f"Saved {corpus_tokens_path}")

## If the corpus has already been tokenized

In [None]:
# Restore the previously pickled corpus_tokens
import pickle

# NOTE: unpickling can execute arbitrary code; only load a file that was
# produced by this notebook
with open('saved_objects/corpus_tokens.pkl', mode='rb') as pickle_file:
    corpus_tokens = pickle.load(pickle_file)
print("Loaded corpus_tokens from disk.")

# Index the corpus

In [None]:
# gc is imported here as well: when corpus_tokens was loaded from the pickle,
# the tokenization cells (the only other place that imports gc) may not have
# been run, and gc.collect() would raise NameError
import gc

from models.bm25 import BM25

# Create the BM25 model and index the corpus
retriever = BM25()
retriever.index_corpus(corpus_tokens)

# corpus_tokens is not used after indexing in this notebook: drop the
# reference and trigger a garbage collection pass
del corpus_tokens
gc.collect()

# Calculate recall on dev set

In [6]:
# Both imports are repeated here so that this section also works when the
# tokenization cells were skipped (corpus_tokens loaded from disk):
# bm25s.tokenize is called on the queries further down
import bm25s
import Stemmer

# optional: create a stemmer
stemmer = Stemmer.Stemmer("english")

In [7]:
# Load the mapping from document index to the corresponding docid
import json

with open('saved_objects/doc_index_to_docid.json', 'r') as mapping_file:
    # JSON object keys are always strings, so cast them back to int
    doc_index_to_docid = {
        int(doc_index): docid
        for doc_index, docid in json.load(mapping_file).items()
    }

In [10]:
# Function to calculate recall@k (k defaults to 10)
def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    """Return the fraction of relevant documents found in the top-k results.

    Args:
        retrieved_docs: Sliceable sequence of retrieved document ids, best
            match first. Only the first ``k`` entries are considered.
        relevant_docs: Iterable of document ids that are relevant for the
            query. Duplicates are counted once.
        k: Number of leading retrieved documents to consider.

    Returns:
        A float in ``[0, 1]``: ``|top-k ∩ relevant| / |relevant|``. When
        ``relevant_docs`` is empty the recall is undefined; ``0.0`` is
        returned instead of raising ``ZeroDivisionError``.
    """
    relevant_set = set(relevant_docs)
    if not relevant_set:
        # No relevant documents: avoid dividing by zero
        return 0.0
    retrieved_set = set(retrieved_docs[:k])
    return len(retrieved_set & relevant_set) / len(relevant_set)

import pandas as pd

# Load dev set
# The cells below read its 'query', 'lang' and 'positive_docs' columns.
dev_set_path = 'data/dev.csv'
dev_set = pd.read_csv(dev_set_path)


# Positive/Negative docs to list
def docs_to_list(docs):
    """Convert a docs cell into a Python list.

    Args:
        docs: Either a string or an already-parsed value. A string that
            starts with ``[`` and ends with ``]`` is parsed as a Python list
            literal; any other string is treated as a single doc id.

    Returns:
        The parsed list for a list-literal string, a one-element list for any
        other string, and ``docs`` unchanged when it is not a string.

    Raises:
        ValueError, SyntaxError: If a bracketed string is not a valid Python
            literal.
    """
    # Local import keeps this function self-contained within the notebook
    import ast

    if not isinstance(docs, str):
        return docs
    if docs.startswith('[') and docs.endswith(']'):
        # literal_eval only accepts literals, so text coming from the CSV
        # cannot execute arbitrary code the way eval() would
        return ast.literal_eval(docs)
    return [docs]

In [None]:
# Retrieve docs for each query in dev set
# Read the query column directly instead of materializing every row with
# iterrows() just to pick a single field
queries = list(dev_set['query'])

# Tokenize the queries with the stemmer, then search the index asking for
# k=10 results per query
queries_tokens = bm25s.tokenize(queries, stemmer=stemmer)
retrieved_docs_indices, scores = retriever.search(queries_tokens, k=10, n_threads=-1)

In [None]:
import numpy as np

# Calculate recall@10 for each query in dev set
recalls = []
lang_recalls = {}
# Pair every dev row with its search result by position rather than by
# DataFrame index label, so the pairing stays correct even if dev_set does
# not carry the default 0..n-1 index
for lang, raw_positive_docs, doc_indices in zip(
    dev_set['lang'], dev_set['positive_docs'], retrieved_docs_indices
):
    retrieve_docs_ids = [doc_index_to_docid[doc_index] for doc_index in doc_indices]
    positive_docs = docs_to_list(raw_positive_docs)  # convert str to python list
    recall = calculate_recall_at_k(retrieve_docs_ids, positive_docs, k=10)
    recalls.append(recall)

    # Add recall to the bucket of its language
    lang_recalls.setdefault(lang, []).append(recall)

# Calculate average recall
mean_recall_at_10 = np.mean(recalls)
print(f"Mean Recall@10: {mean_recall_at_10:.4f}")

# Calculate average recall for each language
for lang, lang_recall_list in lang_recalls.items():
    mean_lang_recall = np.mean(lang_recall_list)
    print(f"Mean Recall@10 for {lang}: {mean_lang_recall:.4f}")

# Predictions on test set

In [13]:
# Load test set
# The cells below read its 'query' and 'id' columns.
test_set_path = 'data/test.csv'
test_set = pd.read_csv(test_set_path)

In [None]:
# Retrieve docs for each query in test set
# Read the query column directly instead of materializing every row with
# iterrows() just to pick a single field
queries = list(test_set['query'])

# Tokenize the queries with the stemmer, then search the index asking for
# k=10 results per query
queries_tokens = bm25s.tokenize(queries, stemmer=stemmer)
retrieved_docs_indices, scores = retriever.search(queries_tokens, k=10, n_threads=-1)

In [15]:
# Get documents retrieved for each query in test set
# Query ids and search results are paired by position (not by DataFrame index
# label), and the 'id' column is read directly so its values are not affected
# by the per-row dtype upcasting that iterrows() can apply
predicted_docs = [
    (query_id, [doc_index_to_docid[doc_index] for doc_index in doc_indices])
    for query_id, doc_indices in zip(test_set['id'], retrieved_docs_indices)
]

# Create Dataframe with results: one row per query, with its list of docids
results_df = pd.DataFrame(predicted_docs, columns=['id', 'docids'])

# Save to csv
results_df.to_csv('predicted_docs.csv', index=False)