In [1]:
import os
# Process-wide configuration, set before the ML libraries are imported in the
# cells below: expose only GPU index 2 to CUDA and send both Hugging Face
# caches (models and datasets) to the same directory.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "2",
        "TRANSFORMERS_CACHE": "/workspace/cache",
        "HF_DATASETS_CACHE": "/workspace/cache",
    }
)

In [2]:
import json
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


# Dataset preparation

In [3]:
# Load the "xquad.th" (Thai) configuration of the XQuAD dataset; the cells
# below read its 'validation' split.
dataset = load_dataset("xquad","xquad.th")

Downloading readme: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15.6k/15.6k [00:00<00:00, 15.9MB/s]
Downloading data files:   0%|                                                                                                                                                                                                             | 0/1 [00:00<?, ?it/s]
Downloading data:   0%|                                                                                                                                                                                                              | 0.00/337k [00:00<?, ?B/s][A
Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 33

In [4]:
validation = dataset['validation']

# Map every distinct context passage to an integer id. dict.fromkeys keeps
# first-seen order, so the ids are the same on every run; enumerating a set of
# strings would follow hash order, which changes between Python processes.
all_doc = {c: i for i, c in enumerate(dict.fromkeys(validation['context']))}

# One [gold doc_id, question] row per validation example.
question_contextid_context = [
    [all_doc[context], question]
    for context, question in zip(validation['context'], validation['question'])
]

# df_question: each question with the id of the passage that answers it.
df_question = pd.DataFrame(question_contextid_context, columns=['doc_id', 'question'])
# df_document: each unique passage with its id (the retrieval corpus).
df_document = pd.DataFrame(
    {'doc_id': list(all_doc.values()), 'document': list(all_doc.keys())}
)

# Test models

In [5]:
from sentence_transformers import SentenceTransformer
import numpy as np



In [6]:
def evaluate(question_id,question_all,context_id,context_all,mrr_rank=10,status=True):
    """Score question-to-context retrieval with top-k hit counts and MRR.

    Every question embedding is compared with every context embedding through
    an inner product, the contexts are ranked by descending score, and the
    position of the question's gold context in that ranking is recorded.

    Args:
        question_id: For each question, the id of its gold context.
        question_all: Question embeddings, one row per question.
        context_id: Id of each context, aligned with the rows of ``context_all``.
        context_all: Context embeddings, one row per context.
        mrr_rank: MRR cut-off; a gold context ranked at or beyond this position
            contributes nothing to the reciprocal-rank sum.
        status: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(top_1, top_5, top_10, mrr_score)``. The first three are counts
        (not ratios) of questions whose gold context is ranked first, within
        the first 5 and within the first 10; ``mrr_score`` is the reciprocal
        rank averaged over all questions. With no questions the result is
        ``(0, 0, 0, 0.0)``.

    Raises:
        ValueError: If a question's gold id does not occur in ``context_id``.
    """
    n_questions = len(question_all)
    if n_questions == 0:
        # Nothing to score; avoids dividing by zero when averaging below.
        return 0, 0, 0, 0.0

    context_id = np.asarray(context_id)
    sim_score = np.inner(question_all, context_all)

    top_1 = top_5 = top_10 = 0
    mrr_score = 0.0
    for idx, sim in enumerate(sim_score):
        # Context ids ordered from most to least similar to this question.
        ranked_ids = context_id[np.argsort(sim)[::-1]]
        matches = np.flatnonzero(ranked_ids == question_id[idx])
        if matches.size == 0:
            raise ValueError(f"{question_id[idx]!r} is not in context_id")
        gold_rank = int(matches[0])  # 0-based position of the gold context

        if gold_rank < 1:
            top_1 += 1
        if gold_rank < 5:
            top_5 += 1
        if gold_rank < 10:
            top_10 += 1
        if gold_rank < mrr_rank:
            mrr_score += 1 / (gold_rank + 1)

    mrr_score /= n_questions
    return top_1, top_5, top_10, mrr_score

In [9]:
# First encoder to benchmark. The id after the trailing '#' is another
# checkpoint that can be swapped in (presumably tried earlier — confirm).
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2' # mrp/simcse-model-distil-m-bert
model = SentenceTransformer(model_name)

In [10]:
# Gold passage id per question, and the id of every passage in the corpus.
doc_context_id = df_document['doc_id'].to_list()
question_id = df_question['doc_id'].to_list()

# Encode passages and questions with identical settings. The vectors are
# normalised, so the inner product taken inside evaluate() is a cosine score.
encode_kwargs = dict(convert_to_numpy=True, normalize_embeddings=True)
doc_context_encoded = model.encode(df_document['document'].to_list(), **encode_kwargs)
questions = model.encode(df_question['question'].to_list(), **encode_kwargs)

top_1,top_5,top_10,mrr = evaluate(question_id,questions,doc_context_id,doc_context_encoded)

# Report the share of questions whose gold passage lands in the top k.
print(model_name)
for k, hits in ((1, top_1), (5, top_5), (10, top_10)):
    precision = hits / len(questions)
    print(f"Traninng Score P@{k}: {precision:.4f}")
print(f"Mrr score:{mrr:.4f}")

sentence-transformers/paraphrase-multilingual-mpnet-base-v2
Traninng Score P@1: 0.7126
Traninng Score P@5: 0.9092
Traninng Score P@10: 0.9496
Mrr score:0.7963


In [12]:
# Second encoder to benchmark. Other checkpoints noted by the author as
# alternatives: mrp/simcse-model-roberta-base-thai, mrp/simcse-model-distil-m-bert.
model_name = 'kornwtp/ConGen-paraphrase-multilingual-mpnet-base-v2' # kornwtp/ConGen-simcse-model-roberta-base-thai
model = SentenceTransformer(model_name)
# Cap the encoder's maximum sequence length at 200.
# NOTE(review): the first model cell does not set this cap, so the two runs
# may truncate long passages differently — confirm this is intended.
model.max_seq_length = 200

Downloading:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/kornwtp_ConGen-paraphrase-multilingual-mpnet-base-v2. Creating a new one with MEAN pooling.


In [13]:
# Gold passage id per question, and the id of every passage in the corpus.
doc_context_id = df_document['doc_id'].to_list()
question_id = df_question['doc_id'].to_list()

# Encode passages and questions with identical settings. The vectors are
# normalised, so the inner product taken inside evaluate() is a cosine score.
encode_kwargs = dict(convert_to_numpy=True, normalize_embeddings=True)
doc_context_encoded = model.encode(df_document['document'].to_list(), **encode_kwargs)
questions = model.encode(df_question['question'].to_list(), **encode_kwargs)

top_1,top_5,top_10,mrr = evaluate(question_id,questions,doc_context_id,doc_context_encoded)

# Same report as for the first model, printed with three decimals.
print(model_name)
for k, hits in ((1, top_1), (5, top_5), (10, top_10)):
    precision = hits / len(questions)
    print(f"Traninng Score P@{k}: {precision:.3f}")
print(f"Mrr score:{mrr:.3f}")

kornwtp/ConGen-paraphrase-multilingual-mpnet-base-v2
Traninng Score P@1: 0.754
Traninng Score P@5: 0.931
Traninng Score P@10: 0.966
Mrr score:0.830


# Cohere embedding

In [1]:
import os

import cohere

# Take the Cohere API key from the environment so that no credential is stored
# in the notebook, and stop with a clear message when it has not been set.
cohere_api_key = os.environ.get("COHERE_API_KEY")
if not cohere_api_key:
    raise RuntimeError(
        "Set the COHERE_API_KEY environment variable before running the Cohere cells."
    )
co = cohere.Client(cohere_api_key)

In [8]:
# Number of texts sent per embed request
# (presumably chosen to fit Cohere's per-call limit — confirm).
bs = 96


def _cohere_embed_batches(texts, batch_size):
    """Embed ``texts`` with Cohere, at most ``batch_size`` per request.

    Returns a list with one embeddings list per request. Stepping the start
    offset by ``batch_size`` never produces an empty trailing batch, which the
    ``len(texts)//batch_size + 1`` form did whenever the number of texts was an
    exact multiple of ``batch_size``.
    """
    return [
        co.embed(
            texts=texts[start:start + batch_size],
            model='embed-multilingual-v2.0',
        ).embeddings
        for start in range(0, len(texts), batch_size)
    ]


doc = _cohere_embed_batches(df_document['document'].to_list(), bs)
query = _cohere_embed_batches(df_question['question'].to_list(), bs)

# Stack the per-request batches into one matrix per side.
questions = np.concatenate(query,0)
doc_context_encoded = np.concatenate(doc,0)

doc_context_id = df_document['doc_id'].to_list()
question_id = df_question['doc_id'].to_list()

top_1,top_5,top_10,mrr = evaluate(question_id,questions,doc_context_id,doc_context_encoded)

# Report the share of questions whose gold passage lands in the top k.
for k, hits in ((1, top_1), (5, top_5), (10, top_10)):
    precision = hits / len(questions)
    print(f"Traninng Score P@{k}: {precision:.4f}")
print(f"Mrr score:{mrr:.4f}")

Traninng Score P@1: 0.8252
Traninng Score P@5: 0.9445
Traninng Score P@10: 0.9613
Mrr score:0.8778


# BGE-M3

In [7]:
from FlagEmbedding import BGEM3FlagModel
# Load the BAAI/bge-m3 checkpoint through FlagEmbedding with use_fp16=True.
# NOTE: this rebinds `model`, replacing the SentenceTransformer loaded earlier.
model = BGEM3FlagModel('BAAI/bge-m3',  use_fp16=True) 

Fetching 23 files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [00:00<00:00, 105777.40it/s]


In [9]:
# Gold passage id per question, and the id of every passage in the corpus.
doc_context_id = df_document['doc_id'].to_list()
question_id = df_question['doc_id'].to_list()

# Only the 'dense_vecs' entry of the encoder output is used for retrieval.
doc_context_encoded = model.encode(df_document['document'].to_list())['dense_vecs']
questions = model.encode(df_question['question'].to_list())['dense_vecs']

top_1,top_5,top_10,mrr = evaluate(question_id,questions,doc_context_id,doc_context_encoded)

# Report the share of questions whose gold passage lands in the top k.
for k, hits in ((1, top_1), (5, top_5), (10, top_10)):
    precision = hits / len(questions)
    print(f"Traninng Score P@{k}: {precision:.4f}")
print(f"Mrr score:{mrr:.4f}")

Inference Embeddings: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 32.78it/s]


Traninng Score P@1: 0.9050
Traninng Score P@5: 0.9924
Traninng Score P@10: 0.9941
Mrr score:0.9433
