In [2]:
import datasets
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


# prepare dataset

In [7]:
# Dataset configuration passed to the loader: 'th' here, or any of the 16 languages.
lang = 'th'
# NOTE(review): newer `datasets` releases deprecate `use_auth_token` in favour of
# `token` -- confirm the installed version before switching.
miracl = datasets.load_dataset(
    path='miracl/miracl',
    name=lang,
    use_auth_token=True,
)

Downloading builder script: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.46k/6.46k [00:00<00:00, 8.67MB/s]
Downloading readme: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.51k/3.51k [00:00<00:00, 12.6MB/s]
Downloading data files:   0%|                                                                                                                                                                                                             | 0/4 [00:00<?, ?it/s]
Downloading data:   0%|                                                                                                                                                                                                             |

In [8]:
all_query = []    # one query string per dev example
all_answers = []  # per query: the texts of its positive passages
all_text = []     # candidate pool: every positive and negative passage text
for data in miracl['dev']:
    positive_texts = [x['text'] for x in data['positive_passages']]
    negative_texts = [x['text'] for x in data['negative_passages']]

    all_query.append(data['query'])
    all_answers.append(positive_texts)

    all_text += positive_texts
    all_text += negative_texts
# Drop duplicate passages while keeping first-seen order. A set would also
# deduplicate, but its iteration order for strings changes between kernel runs
# (hash randomisation), which makes the passage order -- and therefore any
# tie-breaking in the ranking -- irreproducible.
all_text = list(dict.fromkeys(all_text))

# Evaluation function (top-k hit counts and MRR)

In [4]:
def evaluate(docs, doc_embeddings, answers, question_embeddings,mrr_rank=10):
    """Score dense retrieval with top-1/5/10 hit counts and MRR@``mrr_rank``.

    Every question embedding is compared with every document embedding via
    ``np.inner``. Documents are ranked by descending similarity, and the
    best-ranked correct answer decides the outcome for that question.

    Args:
        docs: all candidate documents ``[d1, d2, d3, ...]``, in the same order
            as ``doc_embeddings``. Items must be hashable (e.g. strings).
        doc_embeddings: one embedding per document ``[e1, e2, e3, ...]``.
        answers: for each question, the list of correct documents
            ``[[a1, a2], [a1, a2, a3], ...]``, in the same order as
            ``question_embeddings``.
        question_embeddings: one embedding per question ``[e1, e2, e3, ...]``.
        mrr_rank: only answers ranked inside the first ``mrr_rank`` results
            contribute to the reciprocal-rank sum.

    Returns:
        ``(top_1, top_5, top_10, mrr_score)``. The first three are raw counts
        of questions whose best answer is ranked in the first 1 / 5 / 10
        results (divide by the number of questions to get a rate).
        ``mrr_score`` is already averaged over all questions.

    Raises:
        ValueError: if an answer does not occur in ``docs``.
    """
    # Index the documents once so each answer lookup is a dict hit instead of
    # a scan through a freshly sorted list. Keeping every position of a
    # duplicated document lets the best-ranked copy win.
    positions = {}
    for pos, doc in enumerate(docs):
        positions.setdefault(doc, []).append(pos)

    top_1 = top_5 = top_10 = 0
    mrr_score = 0.0
    n_questions = len(question_embeddings)
    if n_questions == 0:
        # Nothing to score; also avoids dividing by zero below.
        return top_1, top_5, top_10, mrr_score

    sim_score = np.inner(question_embeddings, doc_embeddings)
    for idx, sim in enumerate(sim_score):
        gold = answers[idx]
        if len(gold) == 0:
            # A question without any correct document can never be a hit: it
            # adds nothing to the counters but still counts in the MRR mean.
            continue

        order = np.argsort(sim)[::-1]  # document positions, best match first
        # Invert the permutation: rank_of[p] is the 0-based rank of document p.
        rank_of = np.empty(len(order), dtype=np.intp)
        rank_of[order] = np.arange(len(order))

        try:
            # Several answers may be correct; the best (smallest) rank counts.
            best_rank = min(int(rank_of[p]) for a in gold for p in positions[a])
        except KeyError as exc:
            raise ValueError(
                f"answer {exc.args[0]!r} for question {idx} is not in docs"
            ) from exc

        if best_rank < 1:
            top_1 += 1
        if best_rank < 5:
            top_5 += 1
        if best_rank < 10:
            top_10 += 1
        if best_rank < mrr_rank:
            mrr_score += 1 / (best_rank + 1)

    return top_1, top_5, top_10, mrr_score / n_questions

# sentence-transformer

In [10]:
# Identifier of the first model under test; it is also printed with the scores.
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(model_name_or_path=model_name)

In [11]:
# Embed the passage pool and the queries as normalised numpy arrays, so the
# inner product taken inside evaluate() behaves as a cosine similarity.
doc_context_encoded = model.encode(
    all_text,
    normalize_embeddings=True,
    convert_to_numpy=True,
)
questions = model.encode(
    all_query,
    normalize_embeddings=True,
    convert_to_numpy=True,
)

top_1, top_5, top_10, mrr = evaluate(all_text, doc_context_encoded, all_answers, questions)
print(model_name)
# evaluate() returns raw hit counts; dividing by the number of questions turns
# each one into the fraction of questions answered within the first k results.
# NOTE(review): the printed label is misspelled and says "P@k" although the data
# comes from miracl['dev']; it is left as-is so the recorded output still matches.
for cutoff, hit_count in ((1, top_1), (5, top_5), (10, top_10)):
    precision = hit_count / len(questions)
    print(f"Traninng Score P@{cutoff}: {precision:.4f}")
print(f"Mrr score:{mrr:.4f}")

sentence-transformers/paraphrase-multilingual-mpnet-base-v2
Traninng Score P@1: 0.3820
Traninng Score P@5: 0.6576
Traninng Score P@10: 0.7299
Mrr score:0.4965


# Cohere embedding

In [1]:
import getpass
import os

import cohere

# Keep the API key out of the notebook source (and out of version control):
# read it from the CO_API_KEY environment variable, and fall back to an
# interactive prompt whose input is not echoed or stored in the notebook.
co = cohere.Client(os.environ.get('CO_API_KEY') or getpass.getpass('Cohere API key: '))

In [8]:
bs = 96  # number of texts sent per co.embed request


def _embed_in_batches(texts, batch_size=bs, model='embed-multilingual-v2.0'):
    """Embed ``texts`` with ``co.embed`` in slices of at most ``batch_size``.

    Returns a list with one ``.embeddings`` result per request, in input order.
    """
    # Stepping through the start offsets means a list whose length is an exact
    # multiple of batch_size does not end with an empty slice, so no request
    # is ever sent with zero texts.
    return [
        co.embed(texts=texts[start:start + batch_size], model=model).embeddings
        for start in range(0, len(texts), batch_size)
    ]


doc = _embed_in_batches(all_text)
query = _embed_in_batches(all_query)

# Stitch the per-request results back into one array per side.
questions = np.concatenate(query, 0)
doc_context_encoded = np.concatenate(doc, 0)

top_1, top_5, top_10, mrr = evaluate(all_text, doc_context_encoded, all_answers, questions)
# evaluate() returns raw hit counts; dividing by the number of questions turns
# each one into the fraction of questions answered within the first k results.
for cutoff, hit_count in ((1, top_1), (5, top_5), (10, top_10)):
    precision = hit_count / len(questions)
    print(f"Traninng Score P@{cutoff}: {precision:.4f}")
print(f"Mrr score:{mrr:.4f}")

Traninng Score P@1: 0.6698
Traninng Score P@5: 0.9141
Traninng Score P@10: 0.9495
Mrr score:0.7758


# BGE-M3

In [5]:
from FlagEmbedding import BGEM3FlagModel

# Rebinds `model`: from here on it is the BGE-M3 model, no longer the
# SentenceTransformer loaded earlier.
model = BGEM3FlagModel(model_name_or_path='BAAI/bge-m3', use_fp16=True)

Fetching 23 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 143983.57it/s]


In [13]:
# Only the 'dense_vecs' entry of each encode() result is used for retrieval.
doc_context_encoded = model.encode(all_text)['dense_vecs']
questions = model.encode(all_query)['dense_vecs']

top_1, top_5, top_10, mrr = evaluate(all_text, doc_context_encoded, all_answers, questions)
# evaluate() returns raw hit counts; dividing by the number of questions turns
# each one into the fraction of questions answered within the first k results.
for cutoff, hit_count in ((1, top_1), (5, top_5), (10, top_10)):
    precision = hit_count / len(questions)
    print(f"Traninng Score P@{cutoff}: {precision:.4f}")
print(f"Mrr score:{mrr:.4f}")

Inference Embeddings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 572/572 [01:00<00:00,  9.49it/s]
Inference Embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 62/62 [00:01<00:00, 35.64it/s]


Traninng Score P@1: 0.7967
Traninng Score P@5: 0.9550
Traninng Score P@10: 0.9795
Mrr score:0.8668
