In [1]:
import os

In [2]:
import datasets
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

# prepare dataset

In [3]:
# Language subset of MIRACL to evaluate on.
lang = 'th'  # or any of the 16 languages

# Load the MIRACL split for `lang`; `lang` is passed as the dataset config name.
# NOTE(review): use_auth_token=True presumably relies on locally stored
# Hugging Face credentials -- confirm you are logged in before running.
miracl = datasets.load_dataset(
    path='miracl/miracl',
    name=lang,
    use_auth_token=True,
)

Using the latest cached version of the module from /root/.cache/huggingface/modules/datasets_modules/datasets/miracl--miracl/f598b4ee332f2b16e82c6c83ab1ba82e1a7777ef82e7ce3c1416f6b20a142313 (last modified on Fri Nov 17 04:13:02 2023) since it couldn't be found locally at miracl/miracl., or remotely on the Hugging Face Hub.
Reusing dataset miracl (/root/.cache/huggingface/datasets/miracl___miracl/th/1.0.0/f598b4ee332f2b16e82c6c83ab1ba82e1a7777ef82e7ce3c1416f6b20a142313)


  0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
# Build the evaluation inputs from the dev split:
#   all_query   -- one query string per dev example
#   all_answers -- for each query, the texts of its positive passages
#   all_text    -- the retrieval corpus: every distinct positive/negative
#                  passage text, in first-seen order
all_query = []
all_answers = []
# A dict is used as an insertion-ordered set. Unlike set(), its iteration
# order does not depend on string hash randomisation, so the corpus order
# (and therefore document indices and similarity tie-breaks) is the same on
# every kernel restart.
unique_text = {}
for data in miracl['dev']:
    query = data['query']
    positive_passages = data['positive_passages']
    negative_passages = data['negative_passages']

    positive_texts = [x['text'] for x in positive_passages]

    all_query.append(query)
    all_answers.append(positive_texts)

    unique_text.update(dict.fromkeys(positive_texts))
    unique_text.update(dict.fromkeys(x['text'] for x in negative_passages))
all_text = list(unique_text)

# test model

In [23]:
def evaluate(docs, doc_embeddings, answers, question_embeddings,mrr_rank=10):
    """Score retrieval quality of question embeddings against a document corpus.

    Every question is compared with every document via an inner product, the
    documents are ranked by descending similarity, and the best (lowest) rank
    among the question's answers decides whether it counts as a hit.

    Parameters
    ----------
    docs : list
        All documents, e.g. ``[d1, d2, d3]``. Entries must be hashable
        (strings in this notebook). Duplicates are allowed; the best-ranked
        copy is used.
    doc_embeddings : array-like
        One embedding per entry of ``docs``, in the same order.
    answers : list of list
        For each question, the documents that count as correct, e.g.
        ``[[a1, a2], [a1, a2, a3]]``.
    question_embeddings : array-like
        One embedding per question, aligned with ``answers``.
    mrr_rank : int, default 10
        Only answers ranked strictly below this cutoff contribute to MRR.

    Returns
    -------
    tuple
        ``(top_1, top_5, top_10, mrr_score)``: the number of questions whose
        best answer is ranked within the top 1 / 5 / 10, and the mean
        reciprocal rank averaged over *all* questions.

    Raises
    ------
    ValueError
        If an answer does not occur in ``docs``.

    Notes
    -----
    A question with an empty answer list is counted as a miss, and an empty
    set of questions yields ``(0, 0, 0, 0.0)``.
    """
    num_questions = len(question_embeddings)
    if num_questions == 0:
        # Nothing to score; avoid dividing by zero below.
        return 0, 0, 0, 0.0

    # Map every document to all positions it occupies in `docs`, built once
    # so that each answer lookup is a dict access instead of a list scan.
    doc_positions = {}
    for position, doc in enumerate(docs):
        doc_positions.setdefault(doc, []).append(position)

    top_1 = 0
    top_5 = 0
    top_10 = 0
    mrr_score = 0.0
    sim_score = np.inner(question_embeddings, doc_embeddings)
    for idx, sim in enumerate(sim_score):
        gold = answers[idx]
        if len(gold) == 0:
            # No correct document exists for this question: count it as a miss.
            continue

        missing = [a for a in gold if a not in doc_positions]
        if missing:
            raise ValueError(
                f"{len(missing)} answer(s) for question {idx} are not present in docs"
            )

        # order[r] is the document index at rank r (descending similarity);
        # rank_of is its inverse, giving the rank of each document index.
        order = np.argsort(sim)[::-1]
        rank_of = np.empty(len(order), dtype=np.intp)
        rank_of[order] = np.arange(len(order))

        # Several answers may be correct, so keep the best-ranked one.
        best_rank = min(int(rank_of[doc_positions[a]].min()) for a in gold)

        if best_rank < 1:
            top_1 += 1
        if best_rank < 5:
            top_5 += 1
        if best_rank < 10:
            top_10 += 1
        if best_rank < mrr_rank:
            mrr_score += 1 / (best_rank + 1)

    mrr_score /= num_questions
    return top_1, top_5, top_10, mrr_score

In [6]:
# Identifier of the sentence-embedding checkpoint under evaluation; it is also
# printed as the header of the results report further down.
model_name = 'mrp/simcse-model-roberta-base-thai'
model = SentenceTransformer(model_name_or_path=model_name)

In [24]:
# Embed the corpus and the queries. normalize_embeddings=True is requested so
# that the inner product taken inside evaluate() behaves like cosine similarity.
doc_context_encoded = model.encode(
    all_text,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
questions = model.encode(
    all_query,
    convert_to_numpy=True,
    normalize_embeddings=True,
)

top_1, top_5, top_10, mrr = evaluate(all_text, doc_context_encoded, all_answers, questions)

# Report the fraction of questions answered within the top-k results.
# The printed text is intentionally identical to the original report lines.
print(f'{model_name}')
for k, hits in ((1, top_1), (5, top_5), (10, top_10)):
    precision = hits / len(questions)
    print(f"Traninng Score P@{k}: {precision:.3f}")
print(f"Mrr score:{mrr:.3f}")

mrp/simcse-model-roberta-base-thai
Traninng Score P@1: 0.319
Traninng Score P@5: 0.593
Traninng Score P@10: 0.677
Mrr score:0.433
