In [2]:
from datasets import load_dataset
import pandas as pd

# Hugging Face Hub id of the dataset used throughout this notebook.
DATASET_ID = "chompk/tydiqa-goldp-th"
dataset = load_dataset(DATASET_ID)

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.40k/1.40k [00:00<00:00, 5.27MB/s]


# Dataset preparation

In [3]:
# Build two lookup tables from the validation split:
#   df_document - one row per distinct context passage, with an integer doc_id
#   df_question - one row per question, with the doc_id of its gold passage
validation_split = dataset['validation']

# De-duplicate with dict.fromkeys rather than set(): a dict keeps first-seen
# order, so every run assigns the same doc_id to the same passage. Iterating a
# set of strings can yield a different order in each interpreter session.
unique_contexts = dict.fromkeys(validation_split['context'])
all_doc = {context: doc_id for doc_id, context in enumerate(unique_contexts)}

# [doc_id of the gold passage, question text] for every validation example.
question_contextid_context = [
    [all_doc[item['context']], item['question']]
    for item in validation_split
]

df_question = pd.DataFrame(question_contextid_context, columns=['doc_id', 'question'])
df_document = pd.DataFrame(
    {'doc_id': list(all_doc.values()), 'document': list(all_doc.keys())}
)

# Model evaluation

In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np



In [5]:
def evaluate(question_id,question_all,context_id,context_all,mrr_rank=10,status=True):
    """Score retrieval quality of question embeddings against context embeddings.

    Every question is compared with every context by inner product, contexts
    are ranked by descending similarity, and the rank of the question's gold
    context is used to update the hit counters and the reciprocal-rank sum.

    Args:
        question_id: For each question, the id of its gold context (same order
            as ``question_all``).
        question_all: Sequence of question embedding vectors.
        context_id: Id of each context (same order as ``context_all``).
        context_all: Sequence of context embedding vectors.
        mrr_rank: Only gold contexts ranked strictly inside the first
            ``mrr_rank`` positions contribute to the MRR sum.
        status: Unused; kept so existing callers that pass it keep working.

    Returns:
        ``(top_1, top_5, top_10, mrr_score)``: the three hit values are raw
        counts (not fractions) of questions whose gold context is ranked within
        the first 1, 5 and 10 positions; ``mrr_score`` is the reciprocal-rank
        sum divided by the number of questions. With no questions the result
        is ``(0, 0, 0, 0.0)``.

    Raises:
        ValueError: If a question's gold id does not occur in ``context_id``.
    """
    num_questions = len(question_all)
    if num_questions == 0:
        # Nothing to score; also avoids dividing by zero below.
        return 0, 0, 0, 0.0

    top_1 = 0
    top_5 = 0
    top_10 = 0
    mrr_score = 0.0
    context_id = np.asarray(context_id)
    sim_score = np.inner(question_all, context_all)

    for idx, sim in enumerate(sim_score):
        # Context ids ordered from most to least similar to this question.
        ranked_ids = context_id[np.argsort(sim)[::-1]]
        # Zero-based rank of the first occurrence of the gold context id.
        gold_positions = np.flatnonzero(ranked_ids == question_id[idx])
        if gold_positions.size == 0:
            raise ValueError(
                f"gold context id {question_id[idx]!r} for question {idx} "
                "is not present in context_id"
            )
        idx_search = int(gold_positions[0])

        if idx_search < 1:
            top_1 += 1
        if idx_search < 5:
            top_5 += 1
        if idx_search < 10:
            top_10 += 1
        if idx_search < mrr_rank:
            mrr_score += 1 / (idx_search + 1)

    mrr_score /= num_questions
    return top_1, top_5, top_10, mrr_score

In [8]:
# Checkpoint evaluated by the following cell. The trailing comment names a
# second checkpoint, presumably an alternative to swap in - confirm before use.
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2' # mrp/simcse-model-distil-m-bert
model = SentenceTransformer(model_name)

In [9]:
# Embed all documents and questions with the loaded model, then score retrieval.
# Both encode calls share the same options: numpy output, normalised vectors.
encode_options = dict(convert_to_numpy=True, normalize_embeddings=True)

document_texts = df_document['document'].to_list()
doc_context_id = df_document['doc_id'].to_list()
doc_context_encoded = model.encode(document_texts, **encode_options)

question_texts = df_question['question'].to_list()
question_id = df_question['doc_id'].to_list()
questions = model.encode(question_texts, **encode_options)

top_1, top_5, top_10, mrr = evaluate(
    question_id, questions, doc_context_id, doc_context_encoded
)

# evaluate() returns raw hit counts; divide by the number of questions to report rates.
num_questions = len(questions)
print(f'{model_name}')
precision = top_1 / num_questions
print(f"Traninng Score P@1: {precision:.4f}")
precision = top_5 / num_questions
print(f"Traninng Score P@5: {precision:.4f}")
precision = top_10 / num_questions
print(f"Traninng Score P@10: {precision:.4f}")
print(f"Mrr score:{mrr:.4f}")

sentence-transformers/paraphrase-multilingual-mpnet-base-v2
Traninng Score P@1: 0.5439
Traninng Score P@5: 0.7497
Traninng Score P@10: 0.8178
Mrr score:0.6312


# Cohere embedding

In [6]:
import os

import cohere

# Read the API key from the environment so the secret is never stored in the
# notebook or its saved outputs.
cohere_api_key = os.environ.get('COHERE_API_KEY')
if not cohere_api_key:
    raise RuntimeError(
        "Set the COHERE_API_KEY environment variable before running the Cohere cells."
    )
co = cohere.Client(cohere_api_key)

In [9]:
# Number of texts sent per co.embed request.
# NOTE(review): 96 is presumably the Cohere per-request text limit - confirm against the API docs.
bs = 96


def _embed_in_batches(texts, batch_size=bs, model_id='embed-multilingual-v2.0'):
    """Embed ``texts`` with Cohere in chunks of ``batch_size``.

    Returns a list holding one ``.embeddings`` result per request. Stepping the
    range by ``batch_size`` never produces an empty trailing batch, which the
    previous ``len(texts) // bs + 1`` loop did whenever the number of texts was
    an exact multiple of the batch size.
    """
    return [
        co.embed(texts=texts[start:start + batch_size], model=model_id).embeddings
        for start in range(0, len(texts), batch_size)
    ]


# Documents are embedded first, then questions (same request order as before).
doc = _embed_in_batches(df_document['document'].values.tolist())
query = _embed_in_batches(df_question['question'].values.tolist())

# Stack the per-request results into one array per side.
questions = np.concatenate(query, 0)
doc_context_encoded = np.concatenate(doc, 0)

doc_context_id = df_document['doc_id'].to_list()
question_id = df_question['doc_id'].to_list()

top_1,top_5,top_10,mrr = evaluate(question_id,questions,doc_context_id,doc_context_encoded)

# evaluate() returns raw hit counts; divide by the number of questions to report rates.
precision = top_1 / len(questions)
print(f"Traninng Score P@1: {precision:.4f}")
precision = top_5 / len(questions)
print(f"Traninng Score P@5: {precision:.4f}")
precision = top_10 / len(questions)
print(f"Traninng Score P@10: {precision:.4f}")
print(f"Mrr score:{mrr:.4f}")

Traninng Score P@1: 0.8545
Traninng Score P@5: 0.9620
Traninng Score P@10: 0.9725
Mrr score:0.9033


# BGE-M3

In [6]:
from FlagEmbedding import BGEM3FlagModel
# Checkpoint id for the BGE-M3 run; use_fp16=True is passed through unchanged.
BGE_M3_CHECKPOINT = 'BAAI/bge-m3'
model = BGEM3FlagModel(BGE_M3_CHECKPOINT, use_fp16=True)

Fetching 23 files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 18058.59it/s]


In [7]:
# Encode documents, then questions, and keep only the 'dense_vecs' entry of
# each encode() result for the inner-product ranking done by evaluate().
document_output = model.encode(df_document['document'].to_list())
doc_context_encoded = document_output['dense_vecs']
doc_context_id = df_document['doc_id'].to_list()

question_output = model.encode(df_question['question'].to_list())
questions = question_output['dense_vecs']
question_id = df_question['doc_id'].to_list()

top_1, top_5, top_10, mrr = evaluate(
    question_id, questions, doc_context_id, doc_context_encoded
)

# evaluate() returns raw hit counts; divide by the number of questions to report rates.
num_questions = len(questions)
precision = top_1 / num_questions
print(f"Traninng Score P@1: {precision:.4f}")
precision = top_5 / num_questions
print(f"Traninng Score P@5: {precision:.4f}")
precision = top_10 / num_questions
print(f"Traninng Score P@10: {precision:.4f}")
print(f"Mrr score:{mrr:.4f}")

Inference Embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 56/56 [00:09<00:00,  5.82it/s]
Inference Embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:01<00:00, 32.66it/s]


Traninng Score P@1: 0.8912
Traninng Score P@5: 0.9869
Traninng Score P@10: 0.9934
Mrr score:0.9343
