In [None]:
import numpy as np
import pandas as pd
import pickle
import re
import spacy
from tqdm import tqdm
from typing import List

## Preparing the Data

In [None]:
lang = 'fr'

# NOTE: pickle.load can execute arbitrary code while deserialising; only point
# this at token files that were produced by this project.
with open(f"./tokens_merged/tokens_{lang}.pkl", mode="rb") as f:
    ids, corpus = pickle.load(f)

# Positions in `ids` are used further down to index per-document scores that
# are computed from `corpus`, so the two sequences must line up one-to-one.
assert len(ids) == len(corpus), "ids and corpus must have the same length"

In [None]:
# Read the three dataset splits in train / dev / test order.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/dev.csv", "./data/test.csv")
)

In [None]:
# Keep only the rows written in the selected language. The explicit copies
# detach each result from the full CSV frame, so columns can be added to them
# later without pandas raising SettingWithCopyWarning.
train_df = train_df[train_df["lang"] == lang].copy()
dev_df = dev_df[dev_df["lang"] == lang].copy()
test_df = test_df[test_df["lang"] == lang].copy()

In [None]:
# Build the id -> position lookup once instead of scanning `ids` for every row.
# setdefault keeps the first position of a repeated id, matching list.index.
doc_position = {}
for position, doc_id in enumerate(ids):
    doc_position.setdefault(doc_id, position)

# `assign` returns new frames, so nothing is written into a filtered slice.
# An id missing from `ids` raises KeyError here rather than being silently
# turned into NaN.
train_df = train_df.assign(
    positive_docs_i=train_df["positive_docs"].map(lambda doc_id: doc_position[doc_id])
)
dev_df = dev_df.assign(
    positive_docs_i=dev_df["positive_docs"].map(lambda doc_id: doc_position[doc_id])
)

In [None]:
# Preview the language-filtered training rows, including the derived
# positive_docs_i column.
train_df.head()

Unnamed: 0,query_id,query,positive_docs,negative_docs,lang,positive_docs_i
10000,q-fr-1080,Quand Antoine Meillet est-il né ?,doc-fr-7715,"['doc-fr-4657', 'doc-fr-2635', 'doc-fr-7352', ...",fr,3066
10001,q-fr-1081,Quelles sont les origines de l'algèbre linéair...,doc-fr-7723,"['doc-fr-1298', 'doc-fr-4506', 'doc-fr-6921', ...",fr,3074
10002,q-fr-1082,"Quelle est l'étymologie du mot ""algorithme"" et...",doc-fr-7731,"['doc-fr-3025', 'doc-fr-3923', 'doc-fr-5672', ...",fr,3082
10003,q-fr-1083,"Quels sont les pouvoirs exécutif, législatif e...",doc-fr-7739,"['doc-fr-840', 'doc-fr-7178', 'doc-fr-2238', '...",fr,3090
10004,q-fr-1084,Quelle est la langue officielle de l'Autriche ?,doc-fr-7747,"['doc-fr-2144', 'doc-fr-5969', 'doc-fr-3666', ...",fr,3098


In [None]:
# Preview the language-filtered dev rows, including the derived
# positive_docs_i column.
dev_df.head()

Unnamed: 0,query_id,query,positive_docs,negative_docs,lang,positive_docs_i
200,q-fr-0,Quels sont les chiffres concernant les violenc...,doc-fr-0,"['doc-fr-1', 'doc-fr-2', 'doc-fr-3', 'doc-fr-4...",fr,2710
201,q-fr-1,complex au fil du temps. Quelle est la contrib...,doc-fr-8,"['doc-fr-9', 'doc-fr-10', 'doc-fr-11', 'doc-fr...",fr,1244
202,q-fr-6,Quel est le projet de Fiorini dans ses Gravure...,doc-fr-48,"['doc-fr-49', 'doc-fr-50', 'doc-fr-51', 'doc-f...",fr,219
203,q-fr-19,Quelles étaient les conséquences de l'assassin...,doc-fr-152,"['doc-fr-153', 'doc-fr-154', 'doc-fr-155', 'do...",fr,2922
204,q-fr-34,omment Cochrane réussit-il à tromper la frégat...,doc-fr-272,"['doc-fr-273', 'doc-fr-274', 'doc-fr-275', 'do...",fr,1546


## Tokenization

In [None]:
class BaseTokenizer:
    """spaCy-backed tokenizer that cleans text and returns filtered lemmas.

    Subclasses only choose which spaCy pipeline package is loaded.
    """

    def __init__(self, model_name: str):
        """Load the spaCy pipeline, downloading it only when it is missing.

        Args:
            model_name: Name of the spaCy pipeline package to load.

        Raises:
            OSError: If the pipeline still cannot be loaded after the download.
        """
        excluded = ["senter", "ner"]
        try:
            self.nlp = spacy.load(model_name, exclude=excluded)
        except OSError:
            # spaCy reports a missing pipeline package with OSError; fetch it
            # once and retry rather than re-downloading on every instantiation.
            spacy.cli.download(model_name)
            self.nlp = spacy.load(model_name, exclude=excluded)
        self.stop_words = set(self.nlp.Defaults.stop_words)

    @staticmethod
    def preprocess_text(text: str) -> str:
        """Return ``text`` cleaned up and lower-cased.

        URLs and runs of four or more characters that are neither word
        characters nor whitespace are replaced by a space; all whitespace
        (newlines included) is then collapsed to single spaces and the result
        is stripped.
        """
        text = re.sub(r"http[s]?://\S+|www\.\S+", " ", text)
        text = re.sub(r"[^\w\s]{4,}", " ", text)
        # "\s+" already matches newlines, so no separate newline replacement.
        return re.sub(r"\s+", " ", text).strip().lower()

    def tokenize_batch(
        self, texts: List[str], batch_size: int = 64, n_process: int = 8
    ) -> List[List[str]]:
        """Preprocess and lemmatise ``texts``.

        Args:
            texts: Raw input strings.
            batch_size: Forwarded to ``nlp.pipe``.
            n_process: Forwarded to ``nlp.pipe``.

        Returns:
            One list of lemmas per input text, in input order, with stop words
            and punctuation tokens removed.
        """
        print("Tokenizing...")
        preprocessed_texts = [self.preprocess_text(text) for text in texts]
        print("Preprocessed...")
        docs = self.nlp.pipe(
            preprocessed_texts, batch_size=batch_size, n_process=n_process
        )
        print("Docs...")
        # The progress bar advances as each document is consumed here.
        tokenized_texts = [
            [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
            for doc in tqdm(docs, total=len(preprocessed_texts))
        ]
        return tokenized_texts


class EnglishTokenizer(BaseTokenizer):
    """Tokenizer bound to the ``en_core_web_sm`` spaCy pipeline."""

    def __init__(self):
        # Only the pipeline name differs from the base class.
        super().__init__(model_name="en_core_web_sm")


class FrenchTokenizer(BaseTokenizer):
    """Tokenizer bound to the ``fr_core_news_sm`` spaCy pipeline."""

    def __init__(self):
        # Only the pipeline name differs from the base class.
        super().__init__(model_name="fr_core_news_sm")


class GermanTokenizer(BaseTokenizer):
    """Tokenizer bound to the ``de_core_news_sm`` spaCy pipeline."""

    def __init__(self):
        # Only the pipeline name differs from the base class.
        super().__init__(model_name="de_core_news_sm")


class ItalianTokenizer(BaseTokenizer):
    """Tokenizer bound to the ``it_core_news_sm`` spaCy pipeline."""

    def __init__(self):
        # Only the pipeline name differs from the base class.
        super().__init__(model_name="it_core_news_sm")


class SpanishTokenizer(BaseTokenizer):
    """Tokenizer bound to the ``es_core_news_sm`` spaCy pipeline."""

    def __init__(self):
        # Only the pipeline name differs from the base class.
        super().__init__(model_name="es_core_news_sm")


class KoreanTokenizer(BaseTokenizer):
    """Tokenizer bound to the ``ko_core_news_sm`` spaCy pipeline."""

    def __init__(self):
        # Only the pipeline name differs from the base class.
        super().__init__(model_name="ko_core_news_sm")

In [None]:
# Language code -> tokenizer class; only the selected class is instantiated.
tokenizer_classes = {
    "en": EnglishTokenizer,
    "fr": FrenchTokenizer,
    "de": GermanTokenizer,
    "it": ItalianTokenizer,
    "es": SpanishTokenizer,
    "ko": KoreanTokenizer,
}

if lang not in tokenizer_classes:
    raise KeyError("language")

tokenizer = tokenizer_classes[lang]()

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Lemmatise the training queries in a single process.
train_queries = train_df["query"].tolist()
tokenized_queries = tokenizer.tokenize_batch(
    train_queries, batch_size=32, n_process=1
)

Tokenizing...
Preprocessed...
Docs...


100%|██████████| 1608/1608 [00:03<00:00, 505.07it/s]


## Model

In [None]:
from lib.bm25.bm25 import BM25

bm25 = BM25()
bm25.fit(corpus)

# Corpus position of the relevant document for each query, pulled out once so
# the loop does not build a row Series per iteration.
targets = train_df["positive_docs_i"].tolist()
assert len(targets) == len(tokenized_queries), "one target is required per query"

ranks = []
for tokenized_query, target in zip(
    tqdm(tokenized_queries, total=len(tokenized_queries)), targets
):
    # Assumes _scores returns one score per corpus document as a flat
    # sequence -- TODO confirm against lib.bm25.
    scores = np.asarray(bm25._scores(tokenized_query))
    # Pessimistic rank: count every document scoring at least as high as the
    # target (the target itself included), so ties count against it.
    rank_target = int(np.count_nonzero(scores >= scores[target]))
    ranks.append(rank_target)

# recall @ 10
recall_10 = sum(rank <= 10 for rank in ranks) / len(ranks)
recall_10

100%|██████████| 10676/10676 [00:02<00:00, 3997.62it/s]
100%|██████████| 10676/10676 [00:04<00:00, 2494.82it/s]
100%|██████████| 12944661/12944661 [00:22<00:00, 575456.33it/s]
100%|██████████| 1608/1608 [00:01<00:00, 862.11it/s]


0.42039800995024873

In [None]:
# # use tf-idf

# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = TfidfVectorizer(
#     tokenizer=lambda x: x, preprocessor=lambda x: x, lowercase=False
# )
# X = vectorizer.fit_transform(corpus)
# X

# from sklearn.metrics.pairwise import cosine_similarity

# ranks_tfidf = []

# for i, tokenized_query in enumerate(tqdm(tokenized_queries, total=len(tokenized_queries))):
#     target = train_df.iloc[i]["positive_docs_i"]
#     query_vector = vectorizer.transform([tokenized_query])
#     scores = cosine_similarity(X, query_vector).ravel()
#     score_target = scores[target]
#     rank_target = sum(score_target <= score for score in scores)
#     ranks_tfidf.append(rank_target)

# recall_10_tfidf = sum(rank <= 10 for rank in ranks_tfidf) / len(ranks_tfidf)
# recall_10_tfidf

100%|██████████| 1608/1608 [01:45<00:00, 15.31it/s]


0.34577114427860695

In [None]:
# from rank_bm25 import BM25Plus

# bm25_plus = BM25Plus(corpus)

# ranks_bm25_plus = []

# for i, tokenized_query in enumerate(tqdm(tokenized_queries, total=len(tokenized_queries))):
#     target = train_df.iloc[i]["positive_docs_i"]
#     scores = bm25_plus.get_scores(tokenized_query)
#     score_target = scores[target]
#     rank_target = sum(score_target <= score for score in scores)
#     ranks_bm25_plus.append(rank_target)

# recall_10_bm25_plus = sum(rank <= 10 for rank in ranks_bm25_plus) / len(ranks_bm25_plus)
# recall_10_bm25_plus

100%|██████████| 1608/1608 [00:52<00:00, 30.41it/s]


0.42039800995024873

In [None]:
# import fasttext
# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity

# target_indices = train_df["positive_docs_i"].tolist()

In [None]:
# # Load the pre-trained FastText model for French
# ft_model = fasttext.load_model('cc.fr.300.bin')  # Specify the path to the downloaded model

# # Assuming `corpus`, `tokenized_queries`, and `target_indices` are defined
# # `corpus`: List of tokenized documents
# # `tokenized_queries`: List of tokenized query terms
# # `target_indices`: List of indices in `corpus` that are the correct document for each query

# # Function to get average FastText vector for a list of tokens
# def get_average_embedding(tokens):
#     vectors = [ft_model.get_word_vector(token) for token in tokens if token in ft_model]
#     if vectors:
#         return np.mean(vectors, axis=0)
#     else:
#         return np.zeros(ft_model.get_dimension())

# # Calculate embeddings for each document in the corpus, with progress tracking
# print("Calculating embeddings for corpus...")
# corpus_embeddings = [get_average_embedding(doc) for doc in tqdm(corpus)]

# # Function to perform retrieval for a query and return top-k results
# def retrieve_top_k(query_tokens, corpus_embeddings, k=10):
#     query_embedding = get_average_embedding(query_tokens)
#     similarities = cosine_similarity([query_embedding], corpus_embeddings).flatten()
#     top_k_indices = np.argsort(similarities)[-k:][::-1]  # Indices of top-k most similar documents
#     return top_k_indices

# # Calculate recall@10
# correct_retrievals = 0
# k = 10  # Setting k=10 for recall@10

# print("Calculating recall@10 for each query...")
# for i, q_tokens in enumerate(tqdm(tokenized_queries)):
#     top_k_indices = retrieve_top_k(q_tokens, corpus_embeddings, k)
#     if target_indices[i] in top_k_indices:
#         correct_retrievals += 1

# # Calculate recall@10 as a percentage
# recall_at_10 = correct_retrievals / len(tokenized_queries) * 100
# print(f"Recall@10: {recall_at_10:.2f}%")

Calculating embeddings for corpus...


 12%|█▏        | 1284/10676 [40:19<4:54:58,  1.88s/it] 


KeyboardInterrupt: 