In [1]:
import os

# GPU selection and memory-growth flags are exported BEFORE TensorFlow is
# imported, so they are guaranteed to be in the environment by the time any
# CUDA / TensorFlow GPU initialisation happens.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # expose only physical GPU index 2 to this process
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'  # allocate GPU memory on demand

import tensorflow as tf

# TF1-compat session configuration:
#   - allow_soft_placement: fall back to another device when an op cannot run
#     on the requested one,
#   - log_device_placement: log the device mapping,
#   - gpu_options.allow_growth: do not reserve the whole GPU memory up front.
config = tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=True)
config.gpu_options.allow_growth = True
# The session object is deliberately not kept; it is presumably created only
# so the GPU is initialised with `config` (TODO confirm nothing relies on it).
tf.compat.v1.Session(config=config)

from sentence_transformers import SentenceTransformer, util
from tensorflow.keras.layers import Layer,Input,Dense,Lambda
from tensorflow.keras import Model
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import tensorflow_hub as hub
import tensorflow_text
from glob import glob
import setting as set_fnc
import copy as cp

# Load the two multilingual Universal Sentence Encoder checkpoints (v3) from
# TF-Hub: the "large" variant first, then the base variant.
use_enc_large, use_enc_small = (
    hub.load(hub_url)
    for hub_url in (
        "https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3",
        "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3",
    )
)

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla V100-SXM2-32GB-LS, pci bus id: 0000:0a:00.0, compute capability: 7.0



# Load datasets

In [2]:
# Evaluation setup: corpus name, dataset split, and the question languages.
corpus = 'XQUAD'
mode = 'test'
q_lan = ['ar','de','es','ru','th','zh','tr','ro','el','hi','vi']

# df_question maps 'en-<lan>' -> DataFrame of questions for that language.
# df_paragraph is only initialised here; nothing in this cell fills it.
df_paragraph = {}
df_question = {}

# Document table shared by every question language (the "en-en" doc file).
df_doc = pd.read_csv(f'../datasets/{corpus}/{mode}/{corpus.lower()}_doc_en-en.csv')

# One question CSV per language, keyed by its language pair.
for lan in q_lan:
    df_question[f'en-{lan}'] = pd.read_csv(
        f'../datasets/{corpus}/{mode}/{corpus.lower()}_question_en-{lan}.csv'
    )


# mUSE Example

In [4]:
# Load the fine-tuned mUSE model from its local path under ../models/<corpus>/.
# NOTE(review): the checkpoint name lists en-ar/de/es/ru/th/zh/tr, so ro/el/hi/vi
# are presumably languages unseen during fine-tuning — confirm.
mUSE_model = hub.load(f'../models/{corpus}/finetuned_USE_XQUAD_train_en-ar_de_es_ru_th_zh_tr_top0-0_q-d-distillation_1000MSE_0.001MSEq_1.0MSEd_0.01MSEqd_0.001LR_teacher_best_teacher_batchsize_16_acc_metric_3term')

# Encode all documents once; the same embeddings are reused for every language.
doc_context_id = df_doc['doc_id'].to_list()
doc_context_encoded = mUSE_model(df_doc['doc'].to_list()).numpy()

# Label the metrics with the split that was actually loaded (`mode`), instead
# of the previous hard-coded and misspelled "Traninng Score".
split_label = mode.capitalize()

for lan in q_lan:
    lan_questions = df_question[f'en-{lan}']
    question_id = lan_questions['doc_id'].to_list()
    questions = mUSE_model(lan_questions['question'].to_list()).numpy()

    # The first three return values are divided by the number of questions
    # below, i.e. they are treated as hit counts at cut-offs 1, 5 and 10;
    # `mrr` is printed as-is.
    top_1,top_5,top_10,mrr = set_fnc.evaluate(question_id,questions,doc_context_id,doc_context_encoded)

    print(f'USE-{lan}')
    n_questions = len(questions)
    for k, hits in ((1, top_1), (5, top_5), (10, top_10)):
        precision = hits / n_questions
        print(f"{split_label} Score P@{k}: {precision:.3f}")
    print(f"Mrr score:{mrr:.3f}")

100%|██████████| 238/238 [00:00<00:00, 17232.20it/s]
100%|██████████| 238/238 [00:00<00:00, 12288.05it/s]
100%|██████████| 238/238 [00:00<00:00, 10325.24it/s]
100%|██████████| 238/238 [00:00<00:00, 11275.52it/s]


USE-ar
Traninng Score P@1: 0.794
Traninng Score P@5: 0.899
Traninng Score P@10: 0.933
Mrr score:0.843
USE-de
Traninng Score P@1: 0.832
Traninng Score P@5: 0.912
Traninng Score P@10: 0.954
Mrr score:0.871
USE-es
Traninng Score P@1: 0.840
Traninng Score P@5: 0.941
Traninng Score P@10: 0.962
Mrr score:0.882
USE-ru
Traninng Score P@1: 0.832
Traninng Score P@5: 0.933
Traninng Score P@10: 0.958
Mrr score:0.871


100%|██████████| 238/238 [00:00<00:00, 11719.10it/s]
100%|██████████| 238/238 [00:00<00:00, 11038.50it/s]
100%|██████████| 238/238 [00:00<00:00, 10966.46it/s]
100%|██████████| 238/238 [00:00<00:00, 11275.01it/s]


USE-th
Traninng Score P@1: 0.857
Traninng Score P@5: 0.945
Traninng Score P@10: 0.983
Mrr score:0.896
USE-zh
Traninng Score P@1: 0.824
Traninng Score P@5: 0.929
Traninng Score P@10: 0.945
Mrr score:0.866
USE-tr
Traninng Score P@1: 0.803
Traninng Score P@5: 0.912
Traninng Score P@10: 0.954
Mrr score:0.860
USE-ro
Traninng Score P@1: 0.525
Traninng Score P@5: 0.693
Traninng Score P@10: 0.794
Mrr score:0.605


100%|██████████| 238/238 [00:00<00:00, 11926.46it/s]
100%|██████████| 238/238 [00:00<00:00, 9324.59it/s]
100%|██████████| 238/238 [00:00<00:00, 10077.17it/s]

USE-el
Traninng Score P@1: 0.101
Traninng Score P@5: 0.202
Traninng Score P@10: 0.319
Mrr score:0.150
USE-hi
Traninng Score P@1: 0.063
Traninng Score P@5: 0.168
Traninng Score P@10: 0.303
Mrr score:0.111
USE-vi
Traninng Score P@1: 0.298
Traninng Score P@5: 0.420
Traninng Score P@10: 0.525
Mrr score:0.359





# Sentence-Transformer Example

In [None]:
# 'XXXXXXX' is a placeholder: substitute the name or path of the
# Sentence-Transformers checkpoint to evaluate before running this cell.
ST_model = SentenceTransformer('XXXXXXX')

# Call the project helper once per question language (iterating over a copy of
# q_lan), printing a 50-character separator line after each language.
# NOTE(review): presumably the helper encodes and scores the data itself —
# confirm in setting.py.
for lan in list(q_lan):
    set_fnc.sent_bert_encode(
        ST_model,
        'ST_model',
        lan,
        df_doc,
        df_question,
    )
    print(50 * '*')