In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [24]:
import json
json_path = "corpus.json"
with open(json_path,"r",encoding='utf-8') as file:
    data = json.load(file)

In [4]:
test_file_path = "test.csv"
df_test = pd.read_csv(test_file_path)

In [1]:
import numpy as np
import pandas as pd
import fasttext
import re
import collections
import string
from num2words import num2words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import json
from tqdm import tqdm
import gc

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/martinlebras/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def create_language_corpus(corpus: list) -> dict:
    language_corpus = {"en":[],"fr":[],"es":[],"de":[],"it":[],"ar":[],"ko":[]}
    for document in corpus:
        language_corpus[document["lang"]].append(document)
    for language in tqdm(language_corpus.keys()):
        with open(f"corpus_{language}.json","w",encoding='utf-8') as file:
            json.dump(language_corpus[language],file,ensure_ascii=False)
    del language_corpus

In [3]:
def lowercase_sentence_start(text):

    def lowercase_match(match):
        return match.group(1) + match.group(2).lower()
    
    pattern = r'([.!?]\s+|^)([A-Z])'
    
    return re.sub(pattern, lowercase_match, text)

In [4]:
def convert_numbers_to_words(text, lang='en'):
    
    def replace_number(match):
        number_str = match.group(0)
        
        if '.' in number_str:
            return num2words(float(number_str), lang=lang)
        else:
            return num2words(int(number_str), lang=lang)
        
    pattern = r'\b\d+(\.\d+)?\b'
    
    # Substitute all matched numbers with their word equivalents
    return re.sub(pattern, replace_number, text)

In [5]:
def remove_stopwords(text, lang='english'):

    try:
        stop_words = set(stopwords.words(lang))
    except:
        stop_words = set()
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    
    return filtered_text

In [6]:
def preprocess_document(text: str, lower: bool, stopwords: bool, number: bool, language: str): #lower, remove punctuation and stopwords in the given language, convert numbers to their text value, handle names with captial letters
    language_mapping = {"en":"english","fr":"french","de":"german","es":"spanish","ar":"arabic","ko":"korean","it":"italian"}
    if lower:
        text = lowercase_sentence_start(text)
    else:
        text = text.lower()
    if number:
        text = convert_numbers_to_words(text, language)
    pattern = f"[{re.escape(string.punctuation)}]"
    text = re.sub(pattern, '', text)
    if stopwords:
        words = remove_stopwords(text,language_mapping[language])
    else:
        words = text.split(" ")
    return words

Model embedding per language
Issue for query across languages 
Multilingual model or consider most relevant documents are the one with same languages ? Check train data
Sentence embedding with TF-IDF in the document (?) and then regular aggregation across sentences of the document (min,max,mean ?)

In [7]:
def get_document_vocabulary(words: str) -> dict:
    return dict(collections.Counter(words))

In [8]:
def get_corpus_frequencies(corpus: list, language: str) -> dict:
    corpus_dict = {}
    for document in tqdm(corpus):
        corpus_dict[document["docid"]] = get_document_vocabulary(preprocess_document(document["text"],False,True,False,language))
    return corpus_dict

In [9]:
def get_corpus_vocabulary(corpus_frequencies: dict) -> list:
    vocabulary = list()
    for doc_id in tqdm(corpus_frequencies.keys()):
        document_vocabulary = list(corpus_frequencies[doc_id].keys())
        vocabulary.extend(document_vocabulary)
    return list(set(vocabulary))

In [10]:
def term_frequency(word: str, document_vocabulary: dict) -> float:
    return document_vocabulary[word] / max(document_vocabulary.values())

In [11]:
def inverse_document_frequency(corpus_frequencies: dict, corpus_vocabulary: list) -> dict:

    document_count_per_word = collections.defaultdict(int)
    
    for doc_id, word_freq in corpus_frequencies.items():
        for word in word_freq.keys():
            document_count_per_word[word] += 1
    
    num_documents = len(corpus_frequencies)
    
    idf = {}
    for word in tqdm(corpus_vocabulary):
        n_word = document_count_per_word[word]
        idf[word] = float(np.log(num_documents / n_word))
    
    return idf

In [12]:
def tf_idf(word: str, document_vocabulary: dict, idf: dict) -> float:
    return term_frequency(word, document_vocabulary)*idf[word]

In [13]:
def tf_idf_mean_agg(model, document_vocabulary: dict, idf: dict) -> np.array: #Consider a weight normalization step if necessary
    first_word = True
    weights = 0
    for word in document_vocabulary.keys():
        word_embedding = model.get_word_vector(word)
        if first_word:
            document_embedding = np.zeros_like(word_embedding)
            first_word = False
        weight = tf_idf(word, document_vocabulary, idf)
        document_embedding += word_embedding*weight
        weights += weight
    document_embedding = document_embedding/weights
    return document_embedding


In [14]:
def mean_agg(model, document_vocabulary: dict) -> np.array: #Consider a weight normalization step if necessary
    first_word = True
    count = 0
    for word in document_vocabulary.keys():
        word_embedding = model.get_word_vector(word)
        if first_word:
            document_embedding = np.zeros_like(word_embedding)
            first_word = False
        document_embedding += word_embedding
        count += 1
    document_embedding = document_embedding/count
    return document_embedding

In [15]:
def embed_corpus(corpus: list, model, language: str, freq: bool) -> dict:
    documents_embeddings = {}
    if freq:
        corpus_frequencies = get_corpus_frequencies(corpus, language)
        corpus_vocabulary = get_corpus_vocabulary(corpus_frequencies)
        idf = inverse_document_frequency(corpus_frequencies, corpus_vocabulary)
        with open(f"corpus_freq_{language}.json","w",encoding='utf-8') as freq_file:
            json.dump(corpus_frequencies, freq_file, ensure_ascii=False)
        with open(f"corpus_vocab_{language}.json","w",encoding='utf-8') as vocab_file:
            json.dump(corpus_vocabulary, vocab_file, ensure_ascii=False)
        with open(f"corpus_idf_{language}.json","w",encoding='utf-8') as idf_file:
            json.dump(idf, idf_file, ensure_ascii=False)
    else:
        with open(f"corpus_idf_{language}.json","r",encoding='utf-8') as idf_file:
            idf = json.load(idf_file)
    print(f"Embedding documents in {language}")
    for document in tqdm(corpus):
        document_vocabulary = get_document_vocabulary(preprocess_document(document["text"],False,True,False,language))
        documents_embeddings[document["docid"]] = tf_idf_mean_agg(model, document_vocabulary, idf).tolist()
    print(f"Documents embedded in {language}")
    return documents_embeddings

In [16]:
def cosine_similarity(word_a: np.array, word_b: np.array) -> float:
    sumxx, sumxy, sumyy = 0, 0, 0
    for i in range(len(word_a)):
        x = word_a[i]; y = word_b[i]
        sumxx += x*x
        sumyy += y*y
        sumxy += x*y
    if sumxy == 0:
        result = 0
    else:
        result =  sumxy / np.sqrt(sumxx*sumyy)
    return result

Check for titles in the documents and if there is one, handle it differently

Create a separate corpus for each language in order to have a separate vocabulary for each language

In [68]:
#print("Creating corpus for each language")
#create_language_corpus(data)
#print("Corpus created for each language")
#gc.collect()
languages = ["en","fr","it","es","de","ar","ko"]
for language in languages:
    print(f"Loading corpus in {language}")
    with open(f"corpus_{language}.json","r",encoding='utf-8') as corpus_file:
        corpus = json.load(corpus_file)
    print(f"Corpus loaded in {language}")

    print(f"Loading model in {language}")
    model = fasttext.load_model(f"cc.{language}.300.bin")
    print(f"Model loaded in {language}")

    print(f"Embedding corpus in {language}")
    embedded_corpus = embed_corpus(corpus, model, language, False)
    print(f"Corpus embedded in {language}")

    print(f"Saving embedded corpus in {language}")
    with open(f"embedded_corpus_{language}.json","w",encoding='utf-8') as file:
        json.dump(embedded_corpus, file, ensure_ascii=False)
    print(f"Embedded corpus saved in {language}")
    del corpus, model, embedded_corpus
    gc.collect()


Loading corpus in en
Corpus loaded in en
Loading model in en
Model loaded in en
Embedding corpus in en
Embedding documents in en


100%|██████████| 207363/207363 [43:31<00:00, 79.39it/s] 


Documents embedded in en
Corpus embedded in en
Saving embedded corpus in en
Embedded corpus saved in en
Loading corpus in fr
Corpus loaded in fr
Loading model in fr
Model loaded in fr
Embedding corpus in fr
Embedding documents in fr


100%|██████████| 10676/10676 [07:06<00:00, 25.02it/s]


Documents embedded in fr
Corpus embedded in fr
Saving embedded corpus in fr
Embedded corpus saved in fr
Loading corpus in it
Corpus loaded in it
Loading model in it
Model loaded in it
Embedding corpus in it
Embedding documents in it


100%|██████████| 11250/11250 [07:59<00:00, 23.48it/s]


Documents embedded in it
Corpus embedded in it
Saving embedded corpus in it
Embedded corpus saved in it
Loading corpus in es
Corpus loaded in es
Loading model in es
Model loaded in es
Embedding corpus in es
Embedding documents in es


100%|██████████| 11019/11019 [07:16<00:00, 25.26it/s]


Documents embedded in es
Corpus embedded in es
Saving embedded corpus in es
Embedded corpus saved in es
Loading corpus in de
Corpus loaded in de
Loading model in de
Model loaded in de
Embedding corpus in de
Embedding documents in de


100%|██████████| 10992/10992 [06:53<00:00, 26.57it/s]


Documents embedded in de
Corpus embedded in de
Saving embedded corpus in de
Embedded corpus saved in de
Loading corpus in ar
Corpus loaded in ar
Loading model in ar
Model loaded in ar
Embedding corpus in ar
Embedding documents in ar


100%|██████████| 8829/8829 [07:48<00:00, 18.85it/s] 


Documents embedded in ar
Corpus embedded in ar
Saving embedded corpus in ar
Embedded corpus saved in ar
Loading corpus in ko
Corpus loaded in ko
Loading model in ko
Model loaded in ko
Embedding corpus in ko
Embedding documents in ko


100%|██████████| 7893/7893 [05:34<00:00, 23.59it/s] 


Documents embedded in ko
Corpus embedded in ko
Saving embedded corpus in ko
Embedded corpus saved in ko


In [42]:
train_file_path = "train.csv"
df_train = pd.read_csv(train_file_path)

In [18]:
df_train

Unnamed: 0,query_id,query,positive_docs,negative_docs,lang
0,q-en-425512,What is the connection between AAA and Lucha U...,doc-en-798457,"['doc-en-810925', 'doc-en-634020', 'doc-en-143...",en
1,q-en-16636,What is the medical use of iloperidone?,doc-en-121692,"['doc-en-177976', 'doc-en-700330', 'doc-en-567...",en
2,q-en-282671,Who was the provisional administrator in 1940?,doc-en-750259,"['doc-en-805362', 'doc-en-413387', 'doc-en-827...",en
3,q-en-216614,What was the critical reception of the film se...,doc-en-703883,"['doc-en-685958', 'doc-en-84060', 'doc-en-2046...",en
4,q-en-156120,What was the main Spanish record of the year i...,doc-en-648393,"['doc-en-4307', 'doc-en-761696', 'doc-en-79426...",en
...,...,...,...,...,...
21870,q-ar-1187,احتفالية تلعب دورًا كبيرًا في تعزيز الترابط ال...,doc-ar-8463,"['doc-ar-5304', 'doc-ar-1977', 'doc-ar-5843', ...",ar
21871,q-ar-1188,ما هو عدد أتباع كنيسة الأدفنتست في جزيرة سان ا...,doc-ar-8469,"['doc-ar-6798', 'doc-ar-1489', 'doc-ar-3100', ...",ar
21872,q-ar-1189,من هو أنتاناس سمتا؟,doc-ar-8476,"['doc-ar-2898', 'doc-ar-6787', 'doc-ar-3235', ...",ar
21873,q-ar-1191,سؤالي هو: ما هي الميزة التي كانت للإيرلنديين ف...,doc-ar-8491,"['doc-ar-786', 'doc-ar-8084', 'doc-ar-3208', '...",ar


In [19]:
languages = ["en","fr","es","it","de","ar","ko"]
corpus_query = {language:df_train[df_train["lang"] == language][["query_id","query"]].set_index("query_id").to_dict()["query"] for language in languages}

In [20]:
def embed_queries(corpus: dict, model, language: str, freq: bool) -> dict:
    queries_embeddings = {}
    with open(f"corpus_idf_{language}.json","r",encoding='utf-8') as idf_file:
        idf = json.load(idf_file)
    print(f"Embedding queries in {language}")
    for query_id, query_text in tqdm(corpus.items()):
        query_vocabulary = get_document_vocabulary(preprocess_document(query_text,False,True,False,language))
        try:
            queries_embeddings[query_id] = mean_agg(model, query_vocabulary).tolist()
        except:
            print(query_id)
            print(query_text)
    print(f"Documents queries in {language}")
    return queries_embeddings

In [45]:
languages = ["en","fr","it","es","de","ar","ko"]
for language in languages:
    print(f"Loading corpus in {language}")
    corpus = corpus_query[language]
    print(f"Corpus loaded in {language}")

    print(f"Loading model in {language}")
    model = fasttext.load_model(f"cc.{language}.300.bin")
    print(f"Model loaded in {language}")

    print(f"Embedding corpus in {language}")
    embedded_queries = embed_queries(corpus, model, language, False)
    print(f"Corpus embedded in {language}")

    print(f"Saving embedded corpus in {language}")
    with open(f"embedded_queries_{language}.json","w",encoding='utf-8') as file:
        json.dump(embedded_queries, file, ensure_ascii=False)
    print(f"Embedded corpus saved in {language}")
    del corpus, model, embedded_queries
    gc.collect()

Loading corpus in en
Corpus loaded in en
Loading model in en
Model loaded in en
Embedding corpus in en
Embedding queries in en


100%|██████████| 10000/10000 [00:01<00:00, 7965.68it/s]


Documents queries in en
Corpus embedded in en
Saving embedded corpus in en
Embedded corpus saved in en
Loading corpus in fr
Corpus loaded in fr
Loading model in fr
Model loaded in fr
Embedding corpus in fr
Embedding queries in fr


100%|██████████| 1608/1608 [00:00<00:00, 5724.64it/s]


Documents queries in fr
Corpus embedded in fr
Saving embedded corpus in fr
Embedded corpus saved in fr
Loading corpus in it
Corpus loaded in it
Loading model in it
Model loaded in it
Embedding corpus in it
Embedding queries in it


100%|██████████| 2151/2151 [00:00<00:00, 5826.81it/s]


q-it-867
chi?
Documents queries in it
Corpus embedded in it
Saving embedded corpus in it
Embedded corpus saved in it
Loading corpus in es
Corpus loaded in es
Loading model in es
Model loaded in es
Embedding corpus in es
Embedding queries in es


100%|██████████| 2254/2254 [00:00<00:00, 5189.64it/s]


Documents queries in es
Corpus embedded in es
Saving embedded corpus in es
Embedded corpus saved in es
Loading corpus in de
Corpus loaded in de
Loading model in de
Model loaded in de
Embedding corpus in de
Embedding queries in de


100%|██████████| 1847/1847 [00:00<00:00, 5369.70it/s]

q-de-484
sein könnte. 
Documents queries in de
Corpus embedded in de
Saving embedded corpus in de





Embedded corpus saved in de
Loading corpus in ar
Corpus loaded in ar
Loading model in ar
Model loaded in ar
Embedding corpus in ar
Embedding queries in ar


100%|██████████| 1817/1817 [00:00<00:00, 4433.72it/s]

q-ar-414
أيلول.
Documents queries in ar
Corpus embedded in ar
Saving embedded corpus in ar





Embedded corpus saved in ar
Loading corpus in ko
Corpus loaded in ko
Loading model in ko
Model loaded in ko
Embedding corpus in ko
Embedding queries in ko


100%|██████████| 2198/2198 [00:00<00:00, 7429.97it/s]


Documents queries in ko
Corpus embedded in ko
Saving embedded corpus in ko
Embedded corpus saved in ko


In [29]:
def find_top_k_doc(queries,documents,k,language):
    query_matrix = list()
    queries_ids = list()
    for query_id, query_embeddings in queries.items():
        query_matrix.append(query_embeddings)
        queries_ids.append(query_id)
    query_matrix = np.array(query_matrix)
    queries_norms = np.diagonal(query_matrix.dot(query_matrix.T))
    queries_inverse_norms = np.linalg.inv(np.diag(np.sqrt(queries_norms)))
    
    doc_matrix = list()
    doc_ids = list()
    for doc_id, doc_embeddings in documents.items():
        doc_matrix.append(doc_embeddings)
        doc_ids.append(doc_id)

    if language != "en":
        doc_matrix = np.array(doc_matrix)
        documents_norms = np.diagonal(doc_matrix.dot(doc_matrix.T))
        documents_inverse_norms = np.linalg.inv(np.diag(np.sqrt(documents_norms)))
        cosine_similarities = np.dot(queries_inverse_norms,np.dot(np.dot(query_matrix,doc_matrix.T),documents_inverse_norms))
    else:
        step = len(doc_matrix)//10
        cosine_similarities = np.zeros((len(queries_ids),len(doc_ids)))
        for i in range(10):
            if i != 9:
                doc_sub_matrix = np.array(doc_matrix[i*step:(i+1)*step])
            else:
                doc_sub_matrix = np.array(doc_matrix[i*step:])
            documents_norms = np.diagonal(doc_sub_matrix.dot(doc_sub_matrix.T))
            documents_inverse_norms = np.linalg.inv(np.diag(np.sqrt(documents_norms)))
            sub_cosine_similarities = np.dot(queries_inverse_norms,np.dot(np.dot(query_matrix,doc_sub_matrix.T),documents_inverse_norms))
            if i != 9:
                cosine_similarities[:,i*step:(i+1)*step] = sub_cosine_similarities
            else:
                cosine_similarities[:,i*step:] = sub_cosine_similarities
    top_k_per_query = cosine_similarities.argsort(axis=1)[::-1][:,:k]
    doc_ids = np.array(doc_ids)
    top_k_documents_id = dict()
    for i in range(len(queries_ids)):
        top_k_documents_id[queries_ids[i]] = doc_ids[top_k_per_query[i]].tolist()
    return top_k_documents_id

In [28]:
languages = ["en","fr","es","de","ar","ko","it"]
for language in languages:
    with open(f"embedded_corpus_{language}.json","r",encoding='utf-8') as doc_file:
        corpus = json.load(doc_file)
    with open(f"embedded_queries_{language}.json","r",encoding='utf-8') as query_file:
        queries = json.load(query_file)
    retrieved_documents = find_top_k_doc(queries,corpus,10,language)
    with open(f"retrieved_doc_{language}.json","w",encoding='utf-8') as output_file:
        json.dump(retrieved_documents,output_file,ensure_ascii=False)

In [None]:
#Write an evaluation code for the current doc retrieval

In [30]:
with open(f"retrieved_doc_en.json","r",encoding="utf-8") as retrieved_file:
    data = json.load(retrieved_file)

In [75]:
df = pd.DataFrame(data)

In [76]:
df = df.transpose().reset_index()

In [77]:
df

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,9
0,q-en-425512,doc-en-683410,doc-en-87691,doc-en-2392,doc-en-567893,doc-en-449096,doc-en-120949,doc-en-523773,doc-en-377963,doc-en-11048,doc-en-729127
1,q-en-16636,doc-en-11272,doc-en-817826,doc-en-414492,doc-en-156581,doc-en-577807,doc-en-482496,doc-en-654818,doc-en-680868,doc-en-784770,doc-en-740422
2,q-en-282671,doc-en-822640,doc-en-766157,doc-en-608311,doc-en-683410,doc-en-787714,doc-en-567893,doc-en-789308,doc-en-2392,doc-en-87691,doc-en-449096
3,q-en-216614,doc-en-617596,doc-en-207868,doc-en-261720,doc-en-317333,doc-en-784948,doc-en-190,doc-en-11016,doc-en-654980,doc-en-474411,doc-en-675837
4,q-en-156120,doc-en-238769,doc-en-482496,doc-en-685577,doc-en-630923,doc-en-3944,doc-en-626695,doc-en-201419,doc-en-740422,doc-en-273450,doc-en-742864
...,...,...,...,...,...,...,...,...,...,...,...
9995,q-en-491,doc-en-11016,doc-en-784948,doc-en-675837,doc-en-697014,doc-en-654980,doc-en-647534,doc-en-261720,doc-en-641059,doc-en-586849,doc-en-249064
9996,q-en-3689,doc-en-711877,doc-en-675837,doc-en-781613,doc-en-784722,doc-en-484904,doc-en-11016,doc-en-662348,doc-en-828072,doc-en-784948,doc-en-601460
9997,q-en-82165,doc-en-136137,doc-en-400917,doc-en-206058,doc-en-792979,doc-en-666061,doc-en-784689,doc-en-237360,doc-en-289130,doc-en-832700,doc-en-399850
9998,q-en-157794,doc-en-457559,doc-en-524615,doc-en-583706,doc-en-797081,doc-en-824865,doc-en-14628,doc-en-816276,doc-en-1309,doc-en-808912,doc-en-14625


In [45]:
df_train_language

Unnamed: 0,query_id,query,positive_docs,negative_docs,lang
0,q-en-425512,What is the connection between AAA and Lucha U...,doc-en-798457,"['doc-en-810925', 'doc-en-634020', 'doc-en-143...",en
1,q-en-16636,What is the medical use of iloperidone?,doc-en-121692,"['doc-en-177976', 'doc-en-700330', 'doc-en-567...",en
2,q-en-282671,Who was the provisional administrator in 1940?,doc-en-750259,"['doc-en-805362', 'doc-en-413387', 'doc-en-827...",en
3,q-en-216614,What was the critical reception of the film se...,doc-en-703883,"['doc-en-685958', 'doc-en-84060', 'doc-en-2046...",en
4,q-en-156120,What was the main Spanish record of the year i...,doc-en-648393,"['doc-en-4307', 'doc-en-761696', 'doc-en-79426...",en
...,...,...,...,...,...
9995,q-en-491,What is the title of the book written by Śrī M...,doc-en-3808,"['doc-en-598746', 'doc-en-638512', 'doc-en-417...",en
9996,q-en-3689,"the 1983 World Championships, he was still not...",doc-en-26598,"['doc-en-344827', 'doc-en-313792', 'doc-en-248...",en
9997,q-en-82165,What common characteristics did the investigat...,doc-en-442441,"['doc-en-692064', 'doc-en-741970', 'doc-en-140...",en
9998,q-en-157794,What are the limitations of the LAV III's grou...,doc-en-653407,"['doc-en-709748', 'doc-en-631594', 'doc-en-602...",en


In [78]:
evaluation_df = df_train_language.merge(df,left_on="query_id",right_on="index",how="inner",suffixes=['','_retrieved'])

In [87]:
evaluation_df["positive_docs"]

0       doc-en-798457
1       doc-en-121692
2       doc-en-750259
3       doc-en-703883
4       doc-en-648393
            ...      
9995      doc-en-3808
9996     doc-en-26598
9997    doc-en-442441
9998    doc-en-653407
9999    doc-en-788430
Name: positive_docs, Length: 10000, dtype: object

In [100]:
((evaluation_df["positive_docs"] == evaluation_df[0]) | (evaluation_df["positive_docs"] == evaluation_df[1]) | (evaluation_df["positive_docs"] == evaluation_df[2]) | (evaluation_df["positive_docs"] == evaluation_df[3]) | (evaluation_df["positive_docs"] == evaluation_df[4]) | (evaluation_df["positive_docs"] == evaluation_df[5]) | (evaluation_df["positive_docs"] == evaluation_df[6]) | (evaluation_df["positive_docs"] == evaluation_df[7]) | (evaluation_df["positive_docs"] == evaluation_df[8]) | (evaluation_df["positive_docs"] == evaluation_df[9])).astype(int)

0       0
1       0
2       0
3       0
4       0
       ..
9995    0
9996    0
9997    0
9998    0
9999    0
Length: 10000, dtype: int64

In [66]:
evaluation_df["retrieved"] = evaluation_df["retrieved"].str.split(", ")

In [None]:
for language in languages:
    df_train_language = df_train[df_train["lang"] == language]
    with open(f"retrieved_doc_{language}.json","r",encoding="utf-8") as retrieved_file:
        data = json.load(retrieved_file)
    df = pd.DataFrame(data)
    df = df.transpose().reset_index()
    df["retrieved"] = df.apply(lambda x: x[0] + ", " + x[1] + ", " + x[2] + ", " + x[3] + ", " + x[4] + ", " + x[5] + ", " + x[6] + ", " + x[7] + ", " + x[8] + ", " + x[9],axis=1)
    evaluation_df = df_train_language.merge(df[["index","retrieved"]],left_on="query_id",right_on="index")

In [None]:
#Look for embedding combination strategies