In [1]:
import json
import pandas as pd

# Dataset preparation

In [2]:
# Load the Thai split of XQuAD (SQuAD-style JSON).
# Source: https://github.com/google-deepmind/xquad/blob/master/xquad.th.json
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can fail or mis-decode this Thai text on
# systems whose default is not UTF-8.
with open('xquad.th.json', encoding='utf-8') as f:
    data = json.load(f)

# Every paragraph ("context") becomes one retrieval document with a running
# integer id; every question is labelled with the id of its own paragraph.
document_id = 0
titleid_title_context = []       # rows of [doc_id, title, context]
question_contextid_context = []  # rows of [doc_id, question]
titleid_title_allcontext = []    # not populated in this cell; kept because other cells may reference it
for item in data['data']:
    title = item['title']
    for context_question in item['paragraphs']:
        # Drop U+FEFF (BOM / zero-width no-break space) characters from the text.
        context = context_question['context'].replace('\ufeff', '')
        titleid_title_context.append([document_id, title, context])
        for q in context_question['qas']:
            question = q['question']
            question_contextid_context.append([document_id, question])
        document_id += 1

# df_question: one row per question with the id of its gold paragraph.
# df_document: one row per paragraph with its id, article title and text.
df_question = pd.DataFrame(question_contextid_context, columns=['doc_id', 'question'])
df_document = pd.DataFrame(titleid_title_context, columns=['doc_id', 'title', 'document'])


# Model evaluation

In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np



In [4]:
def sim_search(question_encoded,doc_encoded):
    """Return document row indices ordered from highest to lowest inner product.

    Args:
        question_encoded: query embedding; it is broadcast to the shape of
            ``doc_encoded`` (presumably a single vector of the same
            dimensionality as each document row -- confirm with callers).
        doc_encoded: array of document embeddings, iterated row by row.

    Returns:
        Array of row indices into ``doc_encoded``, best match first.
    """
    # One copy of the query per document row, so rows can be paired up below.
    tiled_query = np.full(doc_encoded.shape, question_encoded)
    per_doc_score = np.array([
        np.inner(query_row, doc_row)
        for query_row, doc_row in zip(tiled_query, doc_encoded)
    ])
    # argsort is ascending; reverse it to put the largest score first.
    ascending = np.argsort(per_doc_score)
    return ascending[::-1]

def evaluate(question_id,question_all,context_id,context_all,mrr_rank=10,status=True):
    """Score retrieval quality: hit counts at 1/5/10 and mean reciprocal rank.

    For every query, contexts are ranked by descending inner product with the
    query embedding, and the position of the query's ground-truth context id
    in that ranking is looked up (first occurrence if the id appears more than
    once in ``context_id``).

    Args:
        question_id: sequence giving, for each query, the id of its correct
            context.
        question_all: query embeddings, one row per query.
        context_id: sequence of ids, one per row of ``context_all``.
        context_all: context embeddings, one row per context.
        mrr_rank: only ranks strictly below this cutoff contribute to the MRR.
        status: currently unused; kept so existing call sites keep working.

    Returns:
        Tuple ``(top_1, top_5, top_10, mrr_score)``. The first three are raw
        hit counts (not ratios); ``mrr_score`` is averaged over all queries.
        With no queries the result is ``(0, 0, 0, 0.0)``.

    Raises:
        ValueError: if a query's ground-truth id does not occur in
            ``context_id``.
    """
    n_questions = len(question_all)
    if n_questions == 0:
        # Nothing to score; avoids dividing by zero when averaging the MRR.
        return 0, 0, 0, 0.0

    top_1 = top_5 = top_10 = 0
    mrr_score = 0.0
    context_id = np.array(context_id)
    # sim_score[i, j] is the inner product of query i with context j.
    sim_score = np.inner(question_all, context_all)
    for idx, sim in enumerate(sim_score):
        # Context ids ordered from most to least similar for this query.
        ranked_ids = context_id[np.argsort(sim)[::-1]]
        hits = np.flatnonzero(ranked_ids == question_id[idx])
        if hits.size == 0:
            raise ValueError(
                f"ground-truth id {question_id[idx]!r} of query {idx} is not in context_id"
            )
        # 0-based rank of the first matching context; plain int keeps the
        # accumulated scores as built-in Python numbers.
        rank = int(hits[0])
        if rank < 1:
            top_1 += 1
        if rank < 5:
            top_5 += 1
        if rank < 10:
            top_10 += 1
        if rank < mrr_rank:
            mrr_score += 1 / (rank + 1)
    mrr_score /= n_questions
    return top_1, top_5, top_10, mrr_score

In [5]:
# Checkpoint evaluated in the cells below.
# Other checkpoints previously noted here as alternatives:
#   mrp/simcse-model-roberta-base-thai, mrp/simcse-model-distil-m-bert
model_name = 'kornwtp/ConGen-simcse-model-roberta-base-thai'
model = SentenceTransformer(model_name)
# NOTE(review): presumably caps the number of tokens encoded per input, so
# longer paragraphs would be truncated -- confirm against sentence-transformers.
model.max_seq_length = 200



In [6]:
# Embed every paragraph, keeping the ids in the same order as the embeddings.
doc_context_id = df_document['doc_id'].to_list()
doc_context_encoded = model.encode(
    df_document['document'].to_list(),
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Embed every question; question_id holds the id of each question's paragraph.
question_id = df_question['doc_id'].to_list()
questions = model.encode(
    df_question['question'].to_list(),
    convert_to_numpy=True,
    normalize_embeddings=True,
)

top_1, top_5, top_10, mrr = evaluate(question_id, questions, doc_context_id, doc_context_encoded)

# Report hit counts as fractions of all questions, then the MRR.
print(model_name)
for cutoff, hits in ((1, top_1), (5, top_5), (10, top_10)):
    precision = hits / len(questions)
    print(f"Traninng Score P@{cutoff}: {precision:.3f}")
print(f"Mrr score:{mrr:.3f}")

kornwtp/ConGen-simcse-model-roberta-base-thai
Traninng Score P@1: 0.480
Traninng Score P@5: 0.677
Traninng Score P@10: 0.748
Mrr score:0.565


In [7]:
# Checkpoint evaluated in the cells below.
# Other checkpoints previously noted here as alternatives:
#   mrp/simcse-model-roberta-base-thai, mrp/simcse-model-distil-m-bert,
#   kornwtp/ConGen-simcse-model-roberta-base-thai
model_name = 'kornwtp/ConGen-paraphrase-multilingual-mpnet-base-v2'
model = SentenceTransformer(model_name)
# NOTE(review): presumably caps the number of tokens encoded per input, so
# longer paragraphs would be truncated -- confirm against sentence-transformers.
model.max_seq_length = 200



In [8]:
# Embed every paragraph, keeping the ids in the same order as the embeddings.
doc_context_id = df_document['doc_id'].to_list()
doc_context_encoded = model.encode(
    df_document['document'].to_list(),
    convert_to_numpy=True,
    normalize_embeddings=True,
)

# Embed every question; question_id holds the id of each question's paragraph.
question_id = df_question['doc_id'].to_list()
questions = model.encode(
    df_question['question'].to_list(),
    convert_to_numpy=True,
    normalize_embeddings=True,
)

top_1, top_5, top_10, mrr = evaluate(question_id, questions, doc_context_id, doc_context_encoded)

# Report hit counts as fractions of all questions, then the MRR.
print(model_name)
for cutoff, hits in ((1, top_1), (5, top_5), (10, top_10)):
    precision = hits / len(questions)
    print(f"Traninng Score P@{cutoff}: {precision:.3f}")
print(f"Mrr score:{mrr:.3f}")

kornwtp/ConGen-paraphrase-multilingual-mpnet-base-v2
Traninng Score P@1: 0.754
Traninng Score P@5: 0.931
Traninng Score P@10: 0.966
Mrr score:0.830
