In [1]:
import ast
import collections
import gc
import json
import re
import string

import fasttext
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from num2words import num2words
from tqdm import tqdm

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/martinlebras/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read data/test.csv into a DataFrame.
test_file_path = "data/test.csv"
df_test = pd.read_csv(test_file_path)

In [None]:
# Load the full corpus from JSON into `data`.
# (`json` is already imported in the first cell, so it is not re-imported here.)
json_path = "data/corpus.json"
with open(json_path,"r",encoding='utf-8') as file:
    data = json.load(file)

In [2]:
def create_language_corpus(corpus: list, output_dir: str = "data") -> None:
    """Split a mixed-language corpus into one JSON file per language.

    Args:
        corpus: Iterable of document dicts; each must have a "lang" key.
        output_dir: Existing directory that receives the
            ``corpus_<lang>.json`` files. Defaults to "data", which is where
            the embedding cell of this notebook reads them from.

    Returns:
        None. The result is only written to disk.

    A file is written for each of the seven expected languages even when it
    has no documents. Documents carrying any other language tag get their own
    file instead of raising ``KeyError``.
    """
    language_corpus = {"en":[],"fr":[],"es":[],"de":[],"it":[],"ar":[],"ko":[]}
    for document in corpus:
        # setdefault keeps the run alive if an unexpected language tag shows up.
        language_corpus.setdefault(document["lang"], []).append(document)
    for language, documents in tqdm(language_corpus.items()):
        with open(f"{output_dir}/corpus_{language}.json","w",encoding='utf-8') as file:
            json.dump(documents,file,ensure_ascii=False)

In [3]:
def lowercase_sentence_start(text):
    """Lower-case the first character of every sentence in ``text``.

    A sentence start is the very beginning of the string, or the first
    character after ``.``, ``!`` or ``?`` followed by whitespace. Capital
    letters anywhere else are left untouched.

    Args:
        text: Input string.

    Returns:
        The string with sentence-initial letters lower-cased.
    """
    # `\w` rather than `[A-Z]`: the corpus is multilingual, so sentence-initial
    # capitals such as "É" or "Ü" must be caught too. `str.lower()` is a no-op
    # on digits, underscores and letters that are already lower-case.
    pattern = r'([.!?]\s+|^)(\w)'

    def lowercase_match(match):
        return match.group(1) + match.group(2).lower()

    return re.sub(pattern, lowercase_match, text)

In [4]:
def convert_numbers_to_words(text, lang='en'):
    """Spell out every standalone number in ``text`` using ``num2words``.

    Args:
        text: Input string.
        lang: Language code forwarded to ``num2words``.

    Returns:
        ``text`` with each integer or decimal literal (digits, optionally
        followed by a dot and more digits) replaced by its spelled-out form.
    """
    number_pattern = re.compile(r'\b\d+(\.\d+)?\b')

    def spell_out(found):
        literal = found.group(0)
        # A dot means a decimal literal; everything else is a plain integer.
        value = float(literal) if '.' in literal else int(literal)
        return num2words(value, lang=lang)

    return number_pattern.sub(spell_out, text)

In [5]:
# Stop-word sets that were loaded successfully, keyed by NLTK language name.
# Avoids re-reading the word list from disk for every single document.
_STOP_WORDS_CACHE = {}


def remove_stopwords(text, lang='english'):
    """Tokenize ``text`` and drop the stop words of the given language.

    Args:
        text: Input string.
        lang: NLTK stop-word list name (e.g. "english", "french").

    Returns:
        List of tokens produced by ``word_tokenize`` whose lower-cased form
        is not a stop word. If the stop-word list cannot be loaded, nothing
        is filtered and all tokens are returned.
    """
    stop_words = _STOP_WORDS_CACHE.get(lang)
    if stop_words is None:
        try:
            stop_words = frozenset(stopwords.words(lang))
            # Only successful loads are cached, so a list that becomes
            # available later (e.g. after a download) is still picked up.
            _STOP_WORDS_CACHE[lang] = stop_words
        except Exception:
            # Best effort, as before: no list for this language means no
            # filtering. Unlike a bare `except:`, this lets KeyboardInterrupt
            # and SystemExit through.
            stop_words = frozenset()
    word_tokens = word_tokenize(text)
    return [word for word in word_tokens if word.lower() not in stop_words]

In [6]:
def preprocess_document(text: str, lower: bool, stopwords: bool, number: bool, language: str) -> list:
    """Normalise a document and split it into tokens.

    Steps, in order: case handling, optional number spelling, removal of
    ASCII punctuation, then tokenisation with or without stop-word removal.

    Args:
        text: Raw document text.
        lower: Case-handling mode. NOTE: when True only sentence-initial
            letters are lower-cased (capitals elsewhere, e.g. proper names,
            are kept); when False the *whole* text is lower-cased.
        stopwords: When True, tokenize with ``remove_stopwords`` for the
            given language; when False, split on whitespace.
        number: When True, spell out numbers with ``convert_numbers_to_words``.
        language: Two-letter language code ("en", "fr", "de", "es", "ar",
            "ko", "it").

    Returns:
        List of tokens.

    Raises:
        KeyError: If ``stopwords`` is True and ``language`` is not one of the
            supported codes.
    """
    language_mapping = {"en":"english","fr":"french","de":"german","es":"spanish","ar":"arabic","ko":"korean","it":"italian"}
    if lower:
        text = lowercase_sentence_start(text)
    else:
        text = text.lower()
    if number:
        # Must run before punctuation removal so decimal points are still there.
        text = convert_numbers_to_words(text, language)
    pattern = f"[{re.escape(string.punctuation)}]"
    text = re.sub(pattern, '', text)
    if stopwords:
        words = remove_stopwords(text,language_mapping[language])
    else:
        # split() instead of split(" "): no empty tokens for repeated spaces,
        # and newlines / tabs also separate words.
        words = text.split()
    return words

Design notes: one embedding model per language
Open issue: queries whose relevant documents are written in a different language.
Options: use a multilingual model, or assume the most relevant documents are the ones in the same language as the query. TODO: check the training data.
Idea: build sentence embeddings weighted by TF-IDF within the document, then aggregate them across the sentences of the document (min, max, or mean?).

In [7]:
def get_document_vocabulary(words: list) -> dict:
    """Count how often each token occurs in a document.

    Args:
        words: Tokens of one document, as returned by ``preprocess_document``.
            (A plain string would be counted character by character.)

    Returns:
        Dict mapping each distinct token to its number of occurrences.
    """
    return dict(collections.Counter(words))

In [8]:
def get_corpus_frequencies(corpus: list, language: str) -> dict:
    """Build the token-count table of every document in a corpus.

    Args:
        corpus: Documents, each a dict with "docid" and "text" keys.
        language: Language code passed on to ``preprocess_document``.

    Returns:
        Dict mapping each docid to that document's {token: count} dict.
    """
    frequencies = {}
    for document in tqdm(corpus):
        # Fully lower-cased, stop words removed, numbers left as digits.
        tokens = preprocess_document(document["text"], False, True, False, language)
        frequencies[document["docid"]] = get_document_vocabulary(tokens)
    return frequencies

In [9]:
def get_corpus_vocabulary(corpus_frequencies: dict) -> list:
    """Collect the distinct tokens that occur anywhere in the corpus.

    Args:
        corpus_frequencies: Dict mapping docid to a {token: count} dict.

    Returns:
        Sorted list of unique tokens. Sorting makes the saved vocabulary
        reproducible from run to run.
    """
    # Accumulate straight into a set instead of building one huge list with
    # every duplicate and de-duplicating at the end.
    vocabulary = set()
    for document_vocabulary in tqdm(corpus_frequencies.values()):
        vocabulary.update(document_vocabulary)
    return sorted(vocabulary)

In [10]:
def term_frequency(word: str, document_vocabulary: dict) -> float:
    """Term frequency of ``word``, normalised by the most frequent token.

    Args:
        word: Token to look up; must be a key of ``document_vocabulary``.
        document_vocabulary: {token: count} dict of one document.

    Returns:
        Count of ``word`` divided by the highest count in the document.
    """
    occurrences = document_vocabulary[word]
    highest_count = max(document_vocabulary.values())
    return occurrences / highest_count

In [11]:
def inverse_document_frequency(corpus_frequencies: dict, corpus_vocabulary: list) -> dict:
    """Compute log(N / n_word) for every word of the vocabulary.

    Args:
        corpus_frequencies: Dict mapping docid to a {token: count} dict.
        corpus_vocabulary: Words for which an IDF value is wanted.

    Returns:
        Dict mapping each word to its IDF as a Python float, where N is the
        number of documents and n_word the number of documents containing
        the word.
    """
    # Each document contributes at most 1 per word: its keys are unique.
    documents_containing = collections.Counter()
    for word_counts in corpus_frequencies.values():
        documents_containing.update(word_counts.keys())

    total_documents = len(corpus_frequencies)

    return {
        term: float(np.log(total_documents / documents_containing[term]))
        for term in tqdm(corpus_vocabulary)
    }

In [12]:
def tf_idf(word: str, document_vocabulary: dict, idf: dict) -> float:
    """TF-IDF weight of ``word`` in one document.

    Args:
        word: Token to weight.
        document_vocabulary: {token: count} dict of the document.
        idf: Dict mapping words to their inverse document frequency.

    Returns:
        ``term_frequency(word, document_vocabulary)`` times ``idf[word]``.
    """
    tf_component = term_frequency(word, document_vocabulary)
    idf_component = idf[word]
    return tf_component * idf_component

In [13]:
def tf_idf_mean_agg(model, document_vocabulary: dict, idf: dict) -> np.array:
    """Embed a document as the TF-IDF-weighted mean of its word vectors.

    Args:
        model: Object exposing ``get_word_vector(word)`` that returns a float
            NumPy vector (here a fastText model).
        document_vocabulary: {token: count} dict of the document.
        idf: Dict mapping every token of the document to its IDF value.

    Returns:
        NumPy vector: sum of ``weight * vector`` over the distinct tokens,
        divided by the sum of the weights. The weight is
        ``(count / max count in the document) * idf[token]``, the same value
        ``tf_idf`` computes. If every weight is 0 (all tokens have IDF 0) the
        plain unweighted mean is returned instead of a NaN vector.

    Raises:
        ValueError: If ``document_vocabulary`` is empty.
        KeyError: If a token is missing from ``idf``.
    """
    if not document_vocabulary:
        raise ValueError("cannot embed a document with an empty vocabulary")

    # Hoisted out of the loop: computing the max per word was quadratic.
    highest_count = max(document_vocabulary.values())

    weighted_sum = None
    plain_sum = None
    total_weight = 0
    for word, count in document_vocabulary.items():
        word_embedding = model.get_word_vector(word)
        if weighted_sum is None:
            # The vector size is only known once the first vector is seen.
            weighted_sum = np.zeros_like(word_embedding)
            plain_sum = np.zeros_like(word_embedding)
        weight = (count / highest_count) * idf[word]
        weighted_sum += word_embedding * weight
        plain_sum += word_embedding
        total_weight += weight

    if total_weight == 0:
        # No informative token: fall back to the unweighted mean.
        return plain_sum / len(document_vocabulary)
    return weighted_sum / total_weight


In [14]:
def mean_agg(model, document_vocabulary: dict) -> np.array:
    """Embed a text as the unweighted mean of its distinct word vectors.

    Args:
        model: Object exposing ``get_word_vector(word)`` that returns a float
            NumPy vector (here a fastText model).
        document_vocabulary: {token: count} dict; only the keys are used, so
            each distinct token counts once.

    Returns:
        NumPy vector: mean of the word vectors.

    Raises:
        ValueError: If ``document_vocabulary`` is empty, e.g. a query made
            only of stop words. This used to surface as an
            ``UnboundLocalError``.
    """
    if not document_vocabulary:
        raise ValueError("cannot embed an empty vocabulary (no tokens left after preprocessing)")

    document_embedding = None
    for word in document_vocabulary:
        word_embedding = model.get_word_vector(word)
        if document_embedding is None:
            # The vector size is only known once the first vector is seen.
            document_embedding = np.zeros_like(word_embedding)
        document_embedding += word_embedding
    return document_embedding / len(document_vocabulary)

In [15]:
def embed_corpus(corpus: list, model, language: str, freq: bool) -> dict:
    """Embed every document of a corpus with TF-IDF-weighted word vectors.

    Args:
        corpus: Documents, each a dict with "docid" and "text" keys.
        model: Word-vector model passed on to ``tf_idf_mean_agg``.
        language: Language code; used for preprocessing and in file names.
        freq: When True, compute the corpus frequencies, vocabulary and IDF
            table and save them under ``data/corpus_{freq,vocab,idf}_<language>.json``.
            When False, load the IDF table from ``data/corpus_idf_<language>.json``.

    Returns:
        Dict mapping each docid to its embedding as a list of floats.
    """
    documents_embeddings = {}
    corpus_frequencies = None
    if freq:
        corpus_frequencies = get_corpus_frequencies(corpus, language)
        corpus_vocabulary = get_corpus_vocabulary(corpus_frequencies)
        idf = inverse_document_frequency(corpus_frequencies, corpus_vocabulary)
        with open(f"data/corpus_freq_{language}.json","w",encoding='utf-8') as freq_file:
            json.dump(corpus_frequencies, freq_file, ensure_ascii=False)
        with open(f"data/corpus_vocab_{language}.json","w",encoding='utf-8') as vocab_file:
            json.dump(corpus_vocabulary, vocab_file, ensure_ascii=False)
        with open(f"data/corpus_idf_{language}.json","w",encoding='utf-8') as idf_file:
            json.dump(idf, idf_file, ensure_ascii=False)
    else:
        with open(f"data/corpus_idf_{language}.json","r",encoding='utf-8') as idf_file:
            idf = json.load(idf_file)
    print(f"Embedding documents in {language}")
    for document in tqdm(corpus):
        if corpus_frequencies is not None:
            # Already computed above with the same preprocessing settings;
            # reuse it instead of tokenizing the document a second time.
            document_vocabulary = corpus_frequencies[document["docid"]]
        else:
            document_vocabulary = get_document_vocabulary(preprocess_document(document["text"],False,True,False,language))
        documents_embeddings[document["docid"]] = tf_idf_mean_agg(model, document_vocabulary, idf).tolist()
    print(f"Documents embedded in {language}")
    return documents_embeddings

In [16]:
def cosine_similarity(word_a: np.array, word_b: np.array) -> float:
    """Cosine similarity between two 1-D vectors.

    Args:
        word_a: First vector (NumPy array or sequence of numbers).
        word_b: Second vector, same length as ``word_a``.

    Returns:
        Python float. 0.0 when the dot product is 0, which also covers the
        case where one of the vectors is all zeros.

    Raises:
        ValueError: If the two vectors do not have the same length
            (raised by ``np.dot``).
    """
    vector_a = np.asarray(word_a, dtype=float)
    vector_b = np.asarray(word_b, dtype=float)
    # Vectorised dot products replace the element-by-element Python loop.
    dot_ab = float(np.dot(vector_a, vector_b))
    if dot_ab == 0:
        return 0.0
    squared_norms = float(np.dot(vector_a, vector_a)) * float(np.dot(vector_b, vector_b))
    return dot_ab / float(np.sqrt(squared_norms))

TODO: Check whether each document contains a title and, if it does, handle the title differently from the rest of the text.

Create a separate corpus for each language in order to have a separate vocabulary for each language

In [68]:
# One-off preparation step, kept commented out: it splits the full corpus
# (`data`) into one JSON file per language with create_language_corpus.
#print("Creating corpus for each language")
#create_language_corpus(data)
#print("Corpus created for each language")
#gc.collect()
# Embed each per-language corpus with that language's fastText model and save
# the vectors to data/embedded_corpus_<language>.json.
languages = ["en","fr","it","es","de","ar","ko"]
for language in languages:
    print(f"Loading corpus in {language}")
    with open(f"data/corpus_{language}.json","r",encoding='utf-8') as corpus_file:
        corpus = json.load(corpus_file)
    print(f"Corpus loaded in {language}")

    print(f"Loading model in {language}")
    model = fasttext.load_model(f"cc.{language}.300.bin")
    print(f"Model loaded in {language}")

    print(f"Embedding corpus in {language}")
    # freq=False: embed_corpus loads the IDF table from
    # data/corpus_idf_<language>.json instead of recomputing it, so that file
    # must already exist.
    embedded_corpus = embed_corpus(corpus, model, language, False)
    print(f"Corpus embedded in {language}")

    print(f"Saving embedded corpus in {language}")
    with open(f"data/embedded_corpus_{language}.json","w",encoding='utf-8') as file:
        json.dump(embedded_corpus, file, ensure_ascii=False)
    print(f"Embedded corpus saved in {language}")
    # Drop the references to the large objects before the next language is loaded.
    del corpus, model, embedded_corpus
    gc.collect()


Loading corpus in en
Corpus loaded in en
Loading model in en
Model loaded in en
Embedding corpus in en
Embedding documents in en


100%|██████████| 207363/207363 [43:31<00:00, 79.39it/s] 


Documents embedded in en
Corpus embedded in en
Saving embedded corpus in en
Embedded corpus saved in en
Loading corpus in fr
Corpus loaded in fr
Loading model in fr
Model loaded in fr
Embedding corpus in fr
Embedding documents in fr


100%|██████████| 10676/10676 [07:06<00:00, 25.02it/s]


Documents embedded in fr
Corpus embedded in fr
Saving embedded corpus in fr
Embedded corpus saved in fr
Loading corpus in it
Corpus loaded in it
Loading model in it
Model loaded in it
Embedding corpus in it
Embedding documents in it


100%|██████████| 11250/11250 [07:59<00:00, 23.48it/s]


Documents embedded in it
Corpus embedded in it
Saving embedded corpus in it
Embedded corpus saved in it
Loading corpus in es
Corpus loaded in es
Loading model in es
Model loaded in es
Embedding corpus in es
Embedding documents in es


100%|██████████| 11019/11019 [07:16<00:00, 25.26it/s]


Documents embedded in es
Corpus embedded in es
Saving embedded corpus in es
Embedded corpus saved in es
Loading corpus in de
Corpus loaded in de
Loading model in de
Model loaded in de
Embedding corpus in de
Embedding documents in de


100%|██████████| 10992/10992 [06:53<00:00, 26.57it/s]


Documents embedded in de
Corpus embedded in de
Saving embedded corpus in de
Embedded corpus saved in de
Loading corpus in ar
Corpus loaded in ar
Loading model in ar
Model loaded in ar
Embedding corpus in ar
Embedding documents in ar


100%|██████████| 8829/8829 [07:48<00:00, 18.85it/s] 


Documents embedded in ar
Corpus embedded in ar
Saving embedded corpus in ar
Embedded corpus saved in ar
Loading corpus in ko
Corpus loaded in ko
Loading model in ko
Model loaded in ko
Embedding corpus in ko
Embedding documents in ko


100%|██████████| 7893/7893 [05:34<00:00, 23.59it/s] 


Documents embedded in ko
Corpus embedded in ko
Saving embedded corpus in ko
Embedded corpus saved in ko


In [42]:
# Read data/train.csv into a DataFrame.
train_file_path = "data/train.csv"
df_train = pd.read_csv(train_file_path)

In [18]:
# Display the training table for a quick visual check of its columns.
df_train

Unnamed: 0,query_id,query,positive_docs,negative_docs,lang
0,q-en-425512,What is the connection between AAA and Lucha U...,doc-en-798457,"['doc-en-810925', 'doc-en-634020', 'doc-en-143...",en
1,q-en-16636,What is the medical use of iloperidone?,doc-en-121692,"['doc-en-177976', 'doc-en-700330', 'doc-en-567...",en
2,q-en-282671,Who was the provisional administrator in 1940?,doc-en-750259,"['doc-en-805362', 'doc-en-413387', 'doc-en-827...",en
3,q-en-216614,What was the critical reception of the film se...,doc-en-703883,"['doc-en-685958', 'doc-en-84060', 'doc-en-2046...",en
4,q-en-156120,What was the main Spanish record of the year i...,doc-en-648393,"['doc-en-4307', 'doc-en-761696', 'doc-en-79426...",en
...,...,...,...,...,...
21870,q-ar-1187,احتفالية تلعب دورًا كبيرًا في تعزيز الترابط ال...,doc-ar-8463,"['doc-ar-5304', 'doc-ar-1977', 'doc-ar-5843', ...",ar
21871,q-ar-1188,ما هو عدد أتباع كنيسة الأدفنتست في جزيرة سان ا...,doc-ar-8469,"['doc-ar-6798', 'doc-ar-1489', 'doc-ar-3100', ...",ar
21872,q-ar-1189,من هو أنتاناس سمتا؟,doc-ar-8476,"['doc-ar-2898', 'doc-ar-6787', 'doc-ar-3235', ...",ar
21873,q-ar-1191,سؤالي هو: ما هي الميزة التي كانت للإيرلنديين ف...,doc-ar-8491,"['doc-ar-786', 'doc-ar-8084', 'doc-ar-3208', '...",ar


In [19]:
# Training queries grouped by language: {language: {query_id: query text}}.
languages = ["en","fr","es","it","de","ar","ko"]
corpus_query = {
    language: (
        df_train.loc[df_train["lang"] == language, ["query_id", "query"]]
        .set_index("query_id")["query"]
        .to_dict()
    )
    for language in languages
}

In [20]:
def embed_queries(corpus: dict, model, language: str, freq: bool) -> dict:
    """Embed every query as the unweighted mean of its word vectors.

    Args:
        corpus: Dict mapping query_id to the query text.
        model: Word-vector model passed on to ``mean_agg``.
        language: Language code used for preprocessing and in log messages.
        freq: Unused; kept so existing calls keep working.

    Returns:
        Dict mapping query_id to its embedding as a list of floats. Queries
        with no token left after preprocessing (e.g. only stop words) are
        skipped; their id and text are printed so they can be inspected.
    """
    # The IDF table is no longer loaded here: it was read and never used.
    queries_embeddings = {}
    print(f"Embedding queries in {language}")
    for query_id, query_text in tqdm(corpus.items()):
        query_vocabulary = get_document_vocabulary(preprocess_document(query_text,False,True,False,language))
        if not query_vocabulary:
            # Explicit check instead of a bare `except:` so that genuine
            # errors are no longer silently swallowed.
            print(query_id)
            print(query_text)
            continue
        queries_embeddings[query_id] = mean_agg(model, query_vocabulary).tolist()
    print(f"Queries embedded in {language}")
    return queries_embeddings

In [45]:
# Embed the training queries of each language with that language's fastText
# model and save them to data/embedded_queries_<language>.json.
# NOTE: the progress messages say "corpus", but this cell processes queries.
languages = ["en","fr","it","es","de","ar","ko"]
for language in languages:
    print(f"Loading corpus in {language}")
    # {query_id: query text} for this language, built in the corpus_query cell.
    corpus = corpus_query[language]
    print(f"Corpus loaded in {language}")

    print(f"Loading model in {language}")
    model = fasttext.load_model(f"cc.{language}.300.bin")
    print(f"Model loaded in {language}")

    print(f"Embedding corpus in {language}")
    embedded_queries = embed_queries(corpus, model, language, False)
    print(f"Corpus embedded in {language}")

    print(f"Saving embedded corpus in {language}")
    with open(f"data/embedded_queries_{language}.json","w",encoding='utf-8') as file:
        json.dump(embedded_queries, file, ensure_ascii=False)
    print(f"Embedded corpus saved in {language}")
    # Drop the references to the large objects before the next language is loaded.
    del corpus, model, embedded_queries
    gc.collect()

Loading corpus in en
Corpus loaded in en
Loading model in en
Model loaded in en
Embedding corpus in en
Embedding queries in en


100%|██████████| 10000/10000 [00:01<00:00, 7965.68it/s]


Documents queries in en
Corpus embedded in en
Saving embedded corpus in en
Embedded corpus saved in en
Loading corpus in fr
Corpus loaded in fr
Loading model in fr
Model loaded in fr
Embedding corpus in fr
Embedding queries in fr


100%|██████████| 1608/1608 [00:00<00:00, 5724.64it/s]


Documents queries in fr
Corpus embedded in fr
Saving embedded corpus in fr
Embedded corpus saved in fr
Loading corpus in it
Corpus loaded in it
Loading model in it
Model loaded in it
Embedding corpus in it
Embedding queries in it


100%|██████████| 2151/2151 [00:00<00:00, 5826.81it/s]


q-it-867
chi?
Documents queries in it
Corpus embedded in it
Saving embedded corpus in it
Embedded corpus saved in it
Loading corpus in es
Corpus loaded in es
Loading model in es
Model loaded in es
Embedding corpus in es
Embedding queries in es


100%|██████████| 2254/2254 [00:00<00:00, 5189.64it/s]


Documents queries in es
Corpus embedded in es
Saving embedded corpus in es
Embedded corpus saved in es
Loading corpus in de
Corpus loaded in de
Loading model in de
Model loaded in de
Embedding corpus in de
Embedding queries in de


100%|██████████| 1847/1847 [00:00<00:00, 5369.70it/s]

q-de-484
sein könnte. 
Documents queries in de
Corpus embedded in de
Saving embedded corpus in de





Embedded corpus saved in de
Loading corpus in ar
Corpus loaded in ar
Loading model in ar
Model loaded in ar
Embedding corpus in ar
Embedding queries in ar


100%|██████████| 1817/1817 [00:00<00:00, 4433.72it/s]

q-ar-414
أيلول.
Documents queries in ar
Corpus embedded in ar
Saving embedded corpus in ar





Embedded corpus saved in ar
Loading corpus in ko
Corpus loaded in ko
Loading model in ko
Model loaded in ko
Embedding corpus in ko
Embedding queries in ko


100%|██████████| 2198/2198 [00:00<00:00, 7429.97it/s]


Documents queries in ko
Corpus embedded in ko
Saving embedded corpus in ko
Embedded corpus saved in ko


In [29]:
def _normalize_rows(matrix):
    """Scale every row of a float matrix to unit length, in place.

    All-zero rows are left as zeros, so their cosine similarity with
    anything is 0 instead of causing a division by zero.
    """
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    matrix /= norms
    return matrix


def find_top_k_doc(queries,documents,k,language,batch_size=256):
    """Return, for every query, the ids of its k most similar documents.

    Similarity is the cosine similarity between the query embedding and the
    document embedding.

    Args:
        queries: Dict mapping query_id to its embedding (sequence of floats).
        documents: Dict mapping doc_id to its embedding (sequence of floats).
        k: Number of documents to return per query.
        language: Unused; kept so existing calls keep working. It used to
            select a chunked code path for "en" only.
        batch_size: Number of queries scored at once. Bounds peak memory to
            roughly ``batch_size * len(documents)`` similarity values for
            every language.

    Returns:
        Dict mapping each query_id to a list of at most k doc_ids, ordered
        from most to least similar. Ties keep the document order of
        ``documents``.
    """
    queries_ids = list(queries.keys())
    doc_ids = list(documents.keys())
    if not queries_ids:
        return {}
    if not doc_ids:
        return {query_id: [] for query_id in queries_ids}

    # Unit-length rows turn the dot product into the cosine similarity. This
    # replaces the Gram-matrix diagonal and inv(diag(...)) computation, which
    # was far more expensive and failed on zero vectors.
    query_matrix = _normalize_rows(np.array([queries[query_id] for query_id in queries_ids], dtype=float))
    doc_matrix = _normalize_rows(np.array([documents[doc_id] for doc_id in doc_ids], dtype=float))
    doc_ids = np.array(doc_ids)

    top_k_documents_id = dict()
    for start in range(0, len(queries_ids), batch_size):
        cosine_similarities = query_matrix[start:start + batch_size].dot(doc_matrix.T)
        # Sort each row by descending similarity. The previous
        # `argsort(axis=1)[::-1]` reversed the *rows* (queries), not the
        # ranking, so every query received the least similar documents of
        # another query.
        top_k_per_query = np.argsort(-cosine_similarities, axis=1, kind="stable")[:, :k]
        for offset, doc_indices in enumerate(top_k_per_query):
            top_k_documents_id[queries_ids[start + offset]] = doc_ids[doc_indices].tolist()
    return top_k_documents_id

In [28]:
# For each language: load the saved document and query embeddings, retrieve
# 10 documents per query with find_top_k_doc, and save the result to
# data/retrieved_doc_<language>.json.
languages = ["en","fr","es","de","ar","ko","it"]
for language in languages:
    with open(f"data/embedded_corpus_{language}.json","r",encoding='utf-8') as doc_file:
        corpus = json.load(doc_file)
    with open(f"data/embedded_queries_{language}.json","r",encoding='utf-8') as query_file:
        queries = json.load(query_file)
    retrieved_documents = find_top_k_doc(queries,corpus,10,language)
    with open(f"data/retrieved_doc_{language}.json","w",encoding='utf-8') as output_file:
        json.dump(retrieved_documents,output_file,ensure_ascii=False)

In [None]:
# Evaluate retrieval per language: share of the labelled positive and
# negative documents that appear among the documents retrieved for each query.
languages = ["en","fr","es","de","ar","ko","it"]

# Step 1: Load ground truth from CSV file.
# The labels are the same for every language, so the file is read once
# instead of once per loop iteration.
all_ground_truth = pd.read_csv('data/train.csv')

for language in languages:
    # Explicit UTF-8, matching the encoding used when the file was written.
    with open(f'data/retrieved_doc_{language}.json', 'r', encoding='utf-8') as f:
        retrieved_docs = json.load(f)

    ground_truth = all_ground_truth[all_ground_truth["lang"] == language]

    # Step 2: Initialize counters for evaluation
    total_positive = 0
    retrieved_positive = 0
    total_negative = 0
    retrieved_negative = 0

    # Step 3: Evaluate each query
    for _, row in ground_truth.iterrows():
        query_id = row['query_id']
        # 'positive_docs' holds a single document id, wrapped in a set so it
        # can be intersected with the retrieved ids.
        positive_docs = {row['positive_docs']}
        # 'negative_docs' is a Python list literal stored as text.
        # ast.literal_eval parses it without executing arbitrary code,
        # unlike eval().
        negative_docs = set(ast.literal_eval(row['negative_docs']))

        retrieved = set(retrieved_docs.get(str(query_id), []))  # Get the retrieved docs for this query

        # Count positives and negatives
        total_positive += len(positive_docs)
        total_negative += len(negative_docs)

        retrieved_positive += len(positive_docs.intersection(retrieved))  # How many positive docs were retrieved
        retrieved_negative += len(negative_docs.intersection(retrieved))  # How many negative docs were retrieved

    # Step 4: Calculate percentages
    percentage_positive_retrieved = (retrieved_positive / total_positive) * 100 if total_positive > 0 else 0
    percentage_negative_retrieved = (retrieved_negative / total_negative) * 100 if total_negative > 0 else 0

    # Step 5: Output results
    print(f"Percentage of positive documents retrieved in {language}: {percentage_positive_retrieved:.2f}%")
    print(f"Percentage of negative documents retrieved in {language}: {percentage_negative_retrieved:.2f}%")