# Tokenize the corpus

## If the corpus has not been tokenized yet

In [1]:
# Load the raw corpus (a JSON collection of documents) into memory
import json
import gc

print('Load corpus.json')
# JSON text is UTF-8 by specification. Passing the encoding explicitly keeps
# the multilingual documents from being decoded with a platform-dependent
# default codec.
with open('data/corpus.json/corpus.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

Load corpus.json


In [2]:
# Keep only what is passed to the tokenizer: one (text, lang) pair per document
texts_and_lang = [(entry["text"], entry["lang"]) for entry in documents]

# The parsed corpus is not referenced again; release it before tokenizing
del documents
gc.collect()

0

In [3]:
# Tokenize the corpus by language and combine the results.
# `tokenize` is a project helper that receives the list of (text, lang) pairs.
# The captured output of this cell shows a "Tokenizing texts" pass followed by
# a "Stemming tokens" pass, and a fallback to English stopwords for "es".
from preprocessing.tokenization import tokenize

corpus_tokens = tokenize(texts_and_lang)

# The raw (text, lang) pairs are not referenced again; free them
del texts_and_lang
gc.collect()

Tokenizing texts:   0%|          | 0/268022 [00:00<?, ?it/s]

es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting t

Stemming tokens:   0%|          | 0/268022 [00:00<?, ?it/s]

IOStream.flush timed out

57




In [4]:
# Save corpus_tokens on disk
import os
import pickle

# Single definition of the cache location (also used in the message below)
corpus_tokens_path = 'saved_objects/corpus_tokens.pkl'

# Create the target folder if needed, so that a missing directory cannot
# discard the result of the long tokenization step
os.makedirs(os.path.dirname(corpus_tokens_path), exist_ok=True)

# Serialize the tokenized corpus so that later sessions can skip tokenization
with open(corpus_tokens_path, 'wb') as f:
    pickle.dump(corpus_tokens, f)
print(f"Saved {corpus_tokens_path}")

Saved saved_objects/corpus_tokens.pkl


## If the corpus has already been tokenized

In [None]:
# Restore the tokenized corpus written to disk by a previous session
import pickle

# NOTE: unpickling can execute arbitrary code; only load a file produced locally
with open('saved_objects/corpus_tokens.pkl', mode='rb') as pickle_file:
    corpus_tokens = pickle.load(pickle_file)
print("Loaded corpus_tokens from disk.")

# Index the corpus

In [5]:
import gc
from models.bm25 import BM25

# BM25 hyperparameters (kept as globals: the MLflow cell reads k1 and b)
k1, b = 1.5, 0.75

# Build the retriever and index the tokenized corpus
retriever = BM25(b=b, k1=k1)
retriever.index_corpus(corpus_tokens)

# corpus_tokens is not referenced after indexing; free it
del corpus_tokens
gc.collect()

Counting Tokens:   0%|          | 0/268022 [00:00<?, ?it/s]

Computing Scores:   0%|          | 0/268022 [00:00<?, ?it/s]

268061

# Calculate recall on dev set

In [6]:
import json


def _load_int_keyed_json(path):
    """Load the JSON object stored at `path` and return it as a dict with int keys.

    JSON object keys are always strings, so integer document indices come
    back as str after loading and have to be converted again.
    """
    with open(path, 'r', encoding='utf-8') as f:
        mapping = json.load(f)
    return {int(key): value for key, value in mapping.items()}  # reconvert keys to int


# Dict to match document index and corresponding docid
doc_index_to_docid = _load_int_keyed_json('saved_objects/doc_index_to_docid.json')

# Dict to match document index and corresponding lang
doc_index_to_lang = _load_int_keyed_json('saved_objects/doc_index_to_lang.json')

In [7]:
# Function to calculate recall@k (k defaults to 10)
def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    """Return the fraction of relevant documents present in the first k results.

    Args:
        retrieved_docs: Sequence of retrieved document ids; only the first
            `k` entries are considered.
        relevant_docs: Iterable of the document ids considered relevant.
        k: Number of leading retrieved results to take into account.

    Returns:
        len(top-k ∩ relevant) / len(relevant) as a float. When
        `relevant_docs` is empty the recall is undefined; 0.0 is returned
        instead of raising ZeroDivisionError.
    """
    relevant_set = set(relevant_docs)
    if not relevant_set:
        # Nothing to find: avoid dividing by zero
        return 0.0
    hits = relevant_set.intersection(retrieved_docs[:k])
    return len(hits) / len(relevant_set)

import pandas as pd

# Load dev set
# (later cells read its 'query', 'lang' and 'positive_docs' columns)
dev_set_path = 'data/dev.csv'
dev_set = pd.read_csv(dev_set_path)


# Positive/Negative docs to list
def docs_to_list(docs):
    """Normalize a docs value to a Python list.

    Args:
        docs: A string holding a list literal (e.g. "['d1', 'd2']"), a
            string holding a single value, or any non-string value.

    Returns:
        The parsed list for a bracketed string, a one-element list for any
        other string, and `docs` unchanged when it is not a string.

    Raises:
        ValueError or SyntaxError: if a bracketed string is not a valid
            Python literal.
    """
    # Local import keeps this cell runnable on its own
    import ast

    if not isinstance(docs, str):
        return docs
    if docs.startswith('[') and docs.endswith(']'):
        # literal_eval only accepts Python literals, so text read from a CSV
        # file can never execute code the way eval() would
        return ast.literal_eval(docs)
    return [docs]

In [8]:
# Retrieve docs for each query in dev set
from preprocessing.tokenization import tokenize

# Language of each dev query, in row order (associates a query with its lang during retrieval)
langs = dev_set['lang'].tolist()
# (query, lang) pairs, so that every query is preprocessed according to its lang
queries_and_lang = list(zip(dev_set['query'].tolist(), langs))

queries_tokens = tokenize(queries_and_lang)

# Top-10 document indices (and their scores) for every dev query
retrieved_docs_indices, scores = retriever.search(
    queries_tokens,
    langs,
    k=10,
    n_threads=-1,
    doc_index_to_lang=doc_index_to_lang,
    filter_by_lang=False,
)

Tokenizing texts:   0%|          | 0/1400 [00:00<?, ?it/s]

es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting t

Stemming tokens:   0%|          | 0/1400 [00:00<?, ?it/s]

Retrieving Documents:   0%|          | 0/1400 [00:00<?, ?it/s]

In [9]:
import numpy as np

# Results are matched to queries by position, so both must have the same length.
# This also catches retrieved_docs_indices having been overwritten by a
# retrieval run on another query set (the same name is reused for the test set).
assert len(retrieved_docs_indices) == len(dev_set), \
    "retrieved_docs_indices does not match dev_set; re-run the dev retrieval cell"

# Calculate recall@10 for each query in dev set
recalls = []
lang_recalls = {}
# Enumerate rows by position instead of using the DataFrame index label, so the
# lookup stays correct even if dev_set does not carry a default 0..n-1 index
for position, (lang, raw_positive_docs) in enumerate(zip(dev_set['lang'], dev_set['positive_docs'])):
    retrieve_docs_ids = [doc_index_to_docid[doc_index] for doc_index in retrieved_docs_indices[position]]
    positive_docs = docs_to_list(raw_positive_docs)  # convert str to python list
    recall = calculate_recall_at_k(retrieve_docs_ids, positive_docs, k=10)
    recalls.append(recall)

    # Add recall to its specific language
    lang_recalls.setdefault(lang, []).append(recall)

# Calculate average recall
mean_recall_at_10 = np.mean(recalls)
print(f"Mean Recall@10: {mean_recall_at_10:.4f}")

# Calculate average recall for each language
for lang, lang_recall_list in lang_recalls.items():
    mean_lang_recall = np.mean(lang_recall_list)
    print(f"Mean Recall@10 for {lang}: {mean_lang_recall:.4f}")

Mean Recall@10: 0.7850
Mean Recall@10 for en: 0.7700
Mean Recall@10 for fr: 0.9000
Mean Recall@10 for de: 0.6950
Mean Recall@10 for es: 0.9350
Mean Recall@10 for it: 0.8000
Mean Recall@10 for ko: 0.6400
Mean Recall@10 for ar: 0.7550


# MLflow logging

In [10]:
# Name under which this run is recorded in MLflow.
# NOTE(review): the name mentions "retrieve_docs_by_lang", while the retrieval
# calls in this notebook pass filter_by_lang=False — confirm the name is accurate.
model_name = "bm25_stopwords_each_lang_stopwords_stemmer_each_lang_retrieve_docs_by_lang_no_spanish"

import mlflow

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Create a new MLflow Experiment
# NOTE(review): the experiment is called "word_embedding" although this run
# logs a BM25 model — confirm this is the intended experiment.
mlflow.set_experiment("word_embedding")

# Start an MLflow run and log its parameters and metrics
with mlflow.start_run(run_name=model_name):
    mlflow.log_param("framework", "BM25S")
    mlflow.log_param("model", "BM25S")
    mlflow.log_param("input", "full_corpus")
    
    # NOTE(review): k1 and b are hyperparameters but are recorded with
    # log_metric rather than log_param — confirm this is deliberate.
    mlflow.log_metric("k1", k1)
    mlflow.log_metric("b", b)

    # Overall dev recall first, then one metric per language
    mlflow.log_metric("recall_at10_dev", mean_recall_at_10)
    for lang, lang_recall_list in lang_recalls.items():
        mean_lang_recall = np.mean(lang_recall_list)
        mlflow.log_metric(f"recall_at10_dev_{lang}", mean_lang_recall)


2024/10/27 15:17:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run bm25_stopwords_each_lang_stopwords_stemmer_each_lang_retrieve_docs_by_lang_no_spanish at: http://127.0.0.1:5000/#/experiments/0/runs/9e2b1b78b6164c7f886a0b2c208d8fdb.
2024/10/27 15:17:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


# Predictions on test set

In [11]:
# Load test set
# (uses `pd` imported in an earlier cell; later cells read its 'id', 'query' and 'lang' columns)
test_set_path = 'data/test.csv'
test_set = pd.read_csv(test_set_path)

In [12]:
# Retrieve docs for each query in test set
# (rebinds queries_and_lang, langs, queries_tokens, retrieved_docs_indices and scores)
from preprocessing.tokenization import tokenize

# Language of each test query, in row order (associates a query with its lang during retrieval)
langs = test_set['lang'].tolist()
# (query, lang) pairs, so that every query is preprocessed according to its lang
queries_and_lang = list(zip(test_set['query'].tolist(), langs))

queries_tokens = tokenize(queries_and_lang)

# Top-10 document indices (and their scores) for every test query
retrieved_docs_indices, scores = retriever.search(
    queries_tokens,
    langs,
    k=10,
    n_threads=-1,
    doc_index_to_lang=doc_index_to_lang,
    filter_by_lang=False,
)

Tokenizing texts:   0%|          | 0/2000 [00:00<?, ?it/s]

es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting to English stopwords
es stopwords not supported, defaulting t

Stemming tokens:   0%|          | 0/2000 [00:00<?, ?it/s]

Retrieving Documents:   0%|          | 0/2000 [00:00<?, ?it/s]

In [13]:
# Results are matched to queries by position, so both must have the same length
# (guards against retrieved_docs_indices still holding results for another query set)
assert len(retrieved_docs_indices) == len(test_set), \
    "retrieved_docs_indices does not match test_set; re-run the test retrieval cell"

# Get documents retrieved for each query in test set
predicted_docs = []
# Enumerate rows by position instead of using the DataFrame index label, so the
# lookup stays correct even if test_set does not carry a default 0..n-1 index
for position, query_id in enumerate(test_set['id'].tolist()):
    retrieve_docs_ids = [doc_index_to_docid[doc_index] for doc_index in retrieved_docs_indices[position]]
    predicted_docs.append((query_id, retrieve_docs_ids))

# Create Dataframe with results
results_df = pd.DataFrame(predicted_docs, columns=['id', 'docids'])

# Save to csv
results_df.to_csv('submission.csv', index=False)