# Experiment: Use CL-ReKLT model to perform online prompt mining and evaluate the results

---

Created on: 4 May, 11:35

Task: Experiment: Use CL-ReKLT model to perform online prompt mining and evaluate the result

In [24]:
import copy
import re
import os, sys
import json
import jsonlines
import glob


import numpy as np
# sentence tokenizer
from typing import List, Dict, Tuple
from pythainlp.tokenize import word_tokenize as th_word_tokenize
from nltk.tokenize import word_tokenize as en_word_tokenize, sent_tokenize as en_sent_tokenize

import tensorflow_hub as hub
import numpy as np
import tensorflow_text

from tqdm import tqdm
from collections import Counter, defaultdict

from pprint import pprint


### Algorithm

1. Given a question $q_i$ from the train/val or test set of a QA dataset, select the groundtruth passage $p_i$.

2. Then, perform sentence tokenization with     `nltk.tokenize.sent_tokenize` on the paired passage to obtain a list of sentences $s^i_j

$.

3. Find the top-k candidates determined by the cosine similarity scores of question and sentences $\textrm{similarity\_score} = \textrm{embed}(q_i) \cdot \textrm{embed}(s^i_j)$. 

4. Test the ranked candidates against the ground-truth sentence.


In [2]:
def segment_sentence(text:str) -> List[str]:
    return en_sent_tokenize(text)

def find_sentences_matched_answer(answer: str, sentences: List[str]) -> List[Tuple[str, Tuple[int, int]]]:
    sentence_candidates = []
    for i, sentence in enumerate(sentences):
        if answer in sentence:
            search_obj = re.search(answer, sentence)
            sentence_candidates.append((sentence, answer, search_obj.span(0)))

    return sentence_candidates

def find_gt_sentence(qa_item, document):
    
    segments = segment_sentence(document)
    pass

def compute_dot_product_sim(query, keys):
    scores = np.inner(query, keys)
    return scores
import heapq
def select_candidate(question, sentence_candidates, method='first', model=None, topk=10):
    if method == 'first':
        return sentence_candidates[0]
    elif method == 'vsim_dot':
        q_vector = model(question)
        s_vectors = model(sentence_candidates)
        scores = compute_dot_product_sim(q_vector, s_vectors)

        K = min(topk, len(sentence_candidates))
#         print(scores)
        topk_indices = heapq.nlargest(K, range(len(sentence_candidates)), scores.take)


#         print(topk_indices)
        return [(sentence_candidates[i], float(scores[0][i])) for i in topk_indices]
    else:
        raise NotImplementedError()


In [3]:
q = 'What year did Tesla die?'
v =['Tesla died on 7 January 1943.', 'There has been a resurgence in popular interest in Tesla since the 1990s', 'His work fell into relative obscurity after his death.']
# select_candidate(q,v,method='vsim_dot', model=xquad_student_supported_langs_model)

In [4]:

from IPython.core.display import display, HTML

color_mapper = {
 "answer": "#488FB1",
}
css_text = 'p{width: 700px; color: #333; padding: 5px 30px}\n'

for k,v in color_mapper.items():
    css_text += "span."+f"{k.lower()}" \
    +"{\n background-color: " \
    +f"{v}"+"50;\n color: #333;\n border-right: 4px solid " \
    +f"{v}"+";" \
    + "\n align-items: center;" \
    + "\n  margin: 0;" \
    + "\n padding: 2px 8px;" \
    + "\n border-radius: 3px;}"
    
HTML(f"""
<style>
{css_text}
</style>
""")

print(css_text)
def render_doc(doc: str):
    from IPython.display import display, HTML
    display(HTML(f"""
    <style>
    {css_text}
    </style>
    """))
    display(HTML(f'<p>{doc}</p>'))
def visualize(question, candidate_item, paragraph):
#     print(f'Question: {question}')
    sentence_candidate = candidate_item[0]
    answer_span = candidate_item[1]
#     print(f'Answer: {answer_span}')
#     print(f'Sentence containing answer: {sentence_candidate}')

    search_obj = re.search(sentence_candidate, paragraph)
    assert search_obj != None
    start,end = search_obj.span(0)
    
    paragraph_with_label_tag = paragraph[:start] + '<span class="answer">' + paragraph[start:end] + \
                            '</span>' + paragraph[end:] 
#     print('\nContext:\n')
    render_doc(paragraph_with_label_tag)


p{width: 700px; color: #333; padding: 5px 30px}
span.answer{
 background-color: #488FB150;
 color: #333;
 border-right: 4px solid #488FB1;
 align-items: center;
  margin: 0;
 padding: 2px 8px;
 border-radius: 3px;}


### 1) xQuAD

In [5]:
XQUAD_BASE_DIR = '../data/xquad/xx/'
xquad_en = json.load(open(os.path.join(XQUAD_BASE_DIR, 'xquad.en.json'), 'r'))
xquad_en.keys(), \
xquad_en['version']

(dict_keys(['data', 'version']), '1.1')

In [6]:
def get_xquad_answer_str(context, qas):
    context_qa_pairs = []
    for qa in qas:
        question = qa['question']
        answer = qa['answers'][0]['text']
        answer_start = qa['answers'][0]['answer_start']
        context_qa_pairs.append((context, question, answer, answer_start))
    return context_qa_pairs

In [7]:
item = xquad_en['data'][3]['paragraphs'][0]
context_qa_pairs = get_xquad_answer_str(context=item['context'], qas=item['qas'])
context_qa_pairs[0]

('Tesla was renowned for his achievements and showmanship, eventually earning him a reputation in popular culture as an archetypal "mad scientist". His patents earned him a considerable amount of money, much of which was used to finance his own projects with varying degrees of success.:121,154 He lived most of his life in a series of New York hotels, through his retirement. Tesla died on 7 January 1943. His work fell into relative obscurity after his death, but in 1960 the General Conference on Weights and Measures named the SI unit of magnetic flux density the tesla in his honor. There has been a resurgence in popular interest in Tesla since the 1990s.',
 'What year did Tesla die? ',
 '1943',
 399)

In [8]:
def mine_prompt_gt(args):
    context, question, answer, answer_start = args
    sentences = segment_sentence(context)
    

    acc_len = 0
    selected_sentence = '<NA>'
#     print(f'answer_start: {answer_start}')
    for i, sentence_candidate in enumerate(sentences):
        if answer_start >= acc_len and answer_start <=acc_len + len(sentence_candidate):
            selected_sentence = sentence_candidate
            
        acc_len+=len(sentence_candidate)
#     print(f'selected_sentence: {selected_sentence}')
    prompt_template = 'Question: {} Answer: {}'
    
#     display(HTML(f'<p>Question: {question}</p>'))
#     display(HTML(f'<p>Answer: {answer}</p>'))
#     visualize(question=question, candidate_item=(selected_sentence, answer), paragraph=context)
    
    prompt = prompt_template.format(question.strip(), selected_sentence)
#     print('\nPrompt:\n')
#     print(prompt)
#     print('-'*40)
    return selected_sentence

In [9]:
mine_prompt_gt(context_qa_pairs[0])

'Tesla died on 7 January 1943.'

In [10]:
def load_model(directory: str):
    
    model = hub.load(directory)
    return model

In [11]:
muse_small_v3_model = load_model('https://tfhub.dev/google/universal-sentence-encoder-multilingual/3')


2022-05-11 11:13:12.706641: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-11 11:13:12.732829: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fe3f583cde0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-05-11 11:13:12.732847: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


In [75]:
XQUAD_TEACHER_DIR = '../../../CL-ReLKT_store/models/XQUAD/teacher_model/'
# XORQA_TEACHER_DIR = '../../../CL-ReLKT_store/models/XORQA/'
MLQA_TEACHER_DIR = '../../../CL-ReLKT_store/models/MLQA/teacher_model/'


xquad_teacher_model = load_model(XQUAD_TEACHER_DIR)
# xorqa_teacher_model = load_model(XORQA_TEACHER_DIR)
mlqa_teacher_model = load_model(MLQA_TEACHER_DIR)


In [12]:
XQUAD_STUDENT_SUPPORTED_LANGS_DIR = '../../../CL-ReLKT_store/models/XQUAD/student_best_supported_languages/'
XQUAD_STUDENT_UNSUPPORTED_LANGS_DIR = '../../../CL-ReLKT_store/models/XQUAD/student_best_unsupported_languages/'

XORQA_STUDENT_SUPPORTED_LANGS_DIR = '../../../CL-ReLKT_store/models/XORQA/student_best_supported_languages/'
XORQA_STUDENT_UNSUPPORTED_LANGS_DIR = '../../../CL-ReLKT_store/models/XORQA/student_best_unsupported_languages/'

MLQA_STUDENT_SUPPORTED_LANGS_DIR = '../../../CL-ReLKT_store/models/MLQA/student_best_supported_languages/'
MLQA_STUDENT_UNSUPPORTED_LANGS_DIR = '../../../CL-ReLKT_store/models/MLQA/student_best_unsupported_languages/'

In [13]:
xquad_student_supported_langs_model = load_model(XQUAD_STUDENT_SUPPORTED_LANGS_DIR)
xorqa_student_supported_langs_model = load_model(XORQA_STUDENT_SUPPORTED_LANGS_DIR)
mlqa_student_supported_langs_model = load_model(MLQA_STUDENT_SUPPORTED_LANGS_DIR)


xquad_student_unsupported_langs_model = load_model(XQUAD_STUDENT_UNSUPPORTED_LANGS_DIR)
xorqa_student_unsupported_langs_model = load_model(XORQA_STUDENT_UNSUPPORTED_LANGS_DIR)
mlqa_student_unsupported_langs_model = load_model(MLQA_STUDENT_UNSUPPORTED_LANGS_DIR)














In [14]:
MODEL_MAPPING = {
  'model-xquad_student_supported_langs': xquad_student_supported_langs_model,
  'model-xorqa_student_supported_langs': xorqa_student_supported_langs_model,
  'model-mlqa_student_supported_langs': mlqa_student_supported_langs_model,
  'model-xquad_student_unsupported_langs': xquad_student_unsupported_langs_model,
  'model-xorqa_student_unsupported_langs': xorqa_student_unsupported_langs_model,
  'model-mlqa_student_unsupported_langs': mlqa_student_unsupported_langs_model,
  'model-muse_small_v3_model': muse_small_v3_model
}

In [15]:
q = 'What year did Tesla die?'
v =['Tesla died on 7 January 1943.', 'There has been a resurgence in popular interest in Tesla since the 1990s', 'His work fell into relative obscurity after his death.']
select_candidate(q,v,method='vsim_dot', model=xquad_student_supported_langs_model)





[('Tesla died on 7 January 1943.', 0.9757782220840454),
 ('There has been a resurgence in popular interest in Tesla since the 1990s',
  0.9665409922599792),
 ('His work fell into relative obscurity after his death.', 0.417930543422699)]

In [16]:
select_candidate(q,v,method='vsim_dot', model=xorqa_student_supported_langs_model)





[('Tesla died on 7 January 1943.', 0.8128372430801392),
 ('There has been a resurgence in popular interest in Tesla since the 1990s',
  0.7203159928321838),
 ('His work fell into relative obscurity after his death.',
  0.2122974544763565)]

In [17]:
select_candidate(q,v,method='vsim_dot', model=mlqa_student_supported_langs_model)





[('Tesla died on 7 January 1943.', 0.6891652941703796),
 ('There has been a resurgence in popular interest in Tesla since the 1990s',
  0.6083117723464966),
 ('His work fell into relative obscurity after his death.',
  0.25080782175064087)]

```python
class xquad_dataset_item:
    question: str
    context: str
    segmented_context: str
    answer: str
    answer_start: int
    gt_sentence: str
```

In [18]:
xquad_question_counter = Counter()
n_paragraph = len(xquad_en['data'])
xquad_dataset=[]
for i, item in enumerate(xquad_en['data']):
    paragraphs = item['paragraphs']
    print('.' ,end='')
    for j, paragraph in enumerate(paragraphs):
        xquad_question_counter[f'd-{i}_p-{j}'] = len(paragraph['qas'])
        
        context = paragraph['context']
        context_qa_pairs = get_xquad_answer_str(context=context, qas=paragraph['qas'])

        for context_qa_pair in context_qa_pairs:
            context, question, answer, answer_start = context_qa_pair
            gt_sentence = mine_prompt_gt(context_qa_pair)
            qa_item = {
                 'question': question,
                    'context': context,
                    'segmented_context': segment_sentence(context),
                    'answer': answer,
                    'answer_start': answer_start,
                    'gt_sentence': gt_sentence,
            }
            xquad_dataset.append(qa_item)

................................................

In [19]:
int(sum(xquad_question_counter.values()))

1190

In [20]:
xquad_dataset[0]

{'question': 'How many points did the Panthers defense surrender?',
 'context': "The Panthers defense gave up just 308 points, ranking sixth in the league, while also leading the NFL in interceptions with 24 and boasting four Pro Bowl selections. Pro Bowl defensive tackle Kawann Short led the team in sacks with 11, while also forcing three fumbles and recovering two. Fellow lineman Mario Addison added 6½ sacks. The Panthers line also featured veteran defensive end Jared Allen, a 5-time pro bowler who was the NFL's active career sack leader with 136, along with defensive end Kony Ealy, who had 5 sacks in just 9 starts. Behind them, two of the Panthers three starting linebackers were also selected to play in the Pro Bowl: Thomas Davis and Luke Kuechly. Davis compiled 5½ sacks, four forced fumbles, and four interceptions, while Kuechly led the team in tackles (118) forced two fumbles, and intercepted four passes of his own. Carolina's secondary featured Pro Bowl safety Kurt Coleman, who l

In [21]:
def evaluate(dataset, key_name, topk=10):
    matches_at1 = 0
    accuracy = None
    matches_at_k = Counter()
    for item in dataset:
        if item[key_name][0][0] == item['gt_sentence']:
            matches_at1+=1
        for k in range(min(topk, len(item[key_name]))):
            if item[key_name][k][0] == item['gt_sentence']:
                matches_at_k[k+1] +=1
                break
    # key starts from 1 to 10
    for k in range(2, topk+1):      
       
        matches_at_k[k] += matches_at_k[k-1]
    precision_at_k = {}
    for k, count in matches_at_k.items():        
        precision_at_k[k] = float(count / len(dataset))
        
    accuracy = float(matches_at1 / len(dataset))

    return accuracy, precision_at_k

def run_online_prompt_mining(dataset, prefix, model):
    result_dataset = copy.deepcopy(dataset)
#     print(f'prefix: {prefix}')
    for item in tqdm(result_dataset, total=len(result_dataset)):
        question = item['question']
        segmented_context = item['segmented_context']
        selected_candidate = select_candidate(question=question, sentence_candidates=segmented_context,
                         method='vsim_dot', model=model)
        item[f'{prefix}@top_sentence'] = selected_candidate
    print('\n\tEvaluation result:')
        
    key_name = f'{prefix}@top_sentence'
    accuracy, precision_at_k = evaluate(result_dataset, key_name)
    print(f'\t - Accuracy: {accuracy:.4f}')
    print(f'\t - precision_at_k:')
    pprint(precision_at_k)
    evaluation_result = (accuracy, precision_at_k)
    return evaluation_result, result_dataset

In [290]:
result_xquad_dataset_xquad_student_sup = run_online_prompt_mining(xquad_dataset,
                         prefix='dataset-xquad-en_model-xquad_student_sup',
                         model=xquad_student_supported_langs_model)

# result_xquad_dataset_xquad_student_unsup = run_online_prompt_mining(xquad_dataset,
#                          prefix='dataset-xquad-en_model-xquad_student_unsup',
#                          model=xquad_student_unsupported_langs_model)
# result_xquad_dataset_mlqa_student_sup = run_online_prompt_mining(xquad_dataset,
#                          prefix='dataset-xquad-en_model-mlqa_student_sup',
#                          model=mlqa_student_supported_langs_model)

# result_xquad_dataset_mlqa_student_unsup = run_online_prompt_mining(xquad_dataset,
#                          prefix='dataset-xquad-en_model-mlqa_student_unsup',
#                          model=mlqa_student_unsupported_langs_model)
# result_xquad_dataset_xorqa_student_sup = run_online_prompt_mining(xquad_dataset,
#                          prefix='dataset-xquad-en_model-xorqa_student_sup',
#                          model=xorqa_student_supported_langs_model)
# result_xquad_dataset_xorqa_student_unsup = run_online_prompt_mining(xquad_dataset,
#                          prefix='dataset-xquad-en_model-xorqa_student_unsup',
#                          model=xorqa_student_unsupported_langs_model)
# result_xquad_dataset_xquad_student_sup

prefix: dataset-xquad-en_model-xquad_student_sup


100%|███████████████████████████████████████| 1190/1190 [00:19<00:00, 60.53it/s]


Evaluation result:
 - Accuracy: 0.4681
 - precision_at_k:
{1: 0.4680672268907563,
 2: 0.6680672268907563,
 3: 0.8050420168067227,
 4: 0.8991596638655462,
 5: 0.9453781512605042,
 6: 0.9815126050420168,
 7: 0.9899159663865547,
 8: 0.9915966386554622,
 9: 0.9957983193277311,
 10: 0.9983193277310924}





### 2) MLQA

In [77]:
MLQA_BASE_DIR = '../data/mlqa/MLQA_V1/'

mlqa_xx = {}
MLQA_LANGS = ['en', 'ar', 'de', 'es', 'hi', 'vi', 'zh']
for lang in MLQA_LANGS:
    mlqa_xx[f'{lang}_val'] = json.load(open(os.path.join(MLQA_BASE_DIR, 'dev', f'dev-context-en-question-{lang}.json'), 'r'))['data'],
    mlqa_xx[f'{lang}_test'] = json.load(open(os.path.join(MLQA_BASE_DIR, 'test', f'test-context-en-question-{lang}.json'), 'r'))['data'],


In [78]:
len(mlqa_xx['ar_test'][0])

2389

In [79]:
mlqa_xx_dataset = defaultdict(lambda: {'val':[], 'test': []})
for lang in MLQA_LANGS:

    for split_name in ['val', 'test']:
        for i, item in enumerate(mlqa_xx[f'{lang}_{split_name}'][0]):
#             print(item)
#             break
            paragraphs = item['paragraphs']
    #         print('.' ,end='')
            for j, paragraph in enumerate(paragraphs):

                context = paragraph['context']
                context_qa_pairs = get_xquad_answer_str(context=context, qas=paragraph['qas'])

                for context_qa_pair in context_qa_pairs:
                    context, question, answer, answer_start = context_qa_pair
                    gt_sentence = mine_prompt_gt(context_qa_pair)
                    qa_item = {
                         'question': question,
                            'context': context,
                            'segmented_context': segment_sentence(context),
                            'answer': answer,
                            'answer_start': answer_start,
                            'gt_sentence': gt_sentence,
                    }
                    mlqa_xx_dataset[lang][split_name].append(qa_item)
#         break
#     break

In [80]:
len(mlqa_xx_dataset['ar']['val']), \
len(mlqa_xx_dataset['ar']['test'])

(517, 5335)

In [82]:
mlqa_xx_dataset['en']['test'][0]

{'question': 'Who analyzed the biopsies?',
 'context': 'In 1994, five unnamed civilian contractors and the widows of contractors Walter Kasza and Robert Frost sued the USAF and the United States Environmental Protection Agency. Their suit, in which they were represented by George Washington University law professor Jonathan Turley, alleged they had been present when large quantities of unknown chemicals had been burned in open pits and trenches at Groom. Biopsies taken from the complainants were analyzed by Rutgers University biochemists, who found high levels of dioxin, dibenzofuran, and trichloroethylene in their body fat. The complainants alleged they had sustained skin, liver, and respiratory injuries due to their work at Groom, and that this had contributed to the deaths of Frost and Kasza. The suit sought compensation for the injuries they had sustained, claiming the USAF had illegally handled toxic materials, and that the EPA had failed in its duty to enforce the Resource Conser

In [64]:
# result_mlqa_dataset_xquad_student_sup = run_online_prompt_mining(mlqa_dataset['test'],
#                          prefix='dataset-mlqa-en_model-xquad_student_sup',
#                          model=xquad_student_supported_langs_model)
# result_mlqa_dataset_xquad_student_unsup = run_online_prompt_mining(mlqa_dataset['test'],
#                          prefix='dataset-mlqa-en_model-xquad_student_unsup',
                                                                   
#                          model=xquad_student_unsupported_langs_model)
# result_mlqa_dataset_mlqa_student_sup = run_online_prompt_mining(mlqa_dataset['test'],
#                          prefix='dataset-mlqa-en_model-mlqa_student_sup',
#                          model=mlqa_student_supported_langs_model)
# result_mlqa_dataset_mlqa_student_unsup = run_online_prompt_mining(mlqa_dataset['test'],
#                          prefix='dataset-mlqa-en_model-mlqa_student_unsup',
#                          model=mlqa_student_unsupported_langs_model)

# result_mlqa_dataset_xorqa_student_sup = run_online_prompt_mining(mlqa_dataset['test'],
#                          prefix='dataset-mlqa-en_model-xorqa_student_sup',
#                          model=xorqa_student_supported_langs_model)
# result_mlqa_dataset_xorqa_student_unsup = run_online_prompt_mining(mlqa_dataset['test'],
#                          prefix='dataset-mlqa-en_model-xorqa_student_unsup',
#                          model=xorqa_student_unsupported_langs_model)

### 3) XORQA

In [84]:
import jsonlines


In [85]:
XORQA_BASE_DIR = '../data/xorqa/en/tydi_xor_gp/'
xorqa_xx = {
    'train': json.load(open(os.path.join(XORQA_BASE_DIR, 'gp_squad_train_data.json'), 'r'))['data'],
      'val': json.load(open(os.path.join(XORQA_BASE_DIR, 'gp_squad_dev_data.json'), 'r'))['data'],
    
}

In [86]:
xorqa_xx['train'][-1]

{'title': 'title:Vamsy_parentSection:Introduction_sectionName:Career._sectionIndex:2',
 'paragraphs': [{'context': 'He has published a short stories compilation called "Maa Pasalapudi Kathalu". Besides that compilation, Vamsy has written a wide variety of short stories since 1974 when he was 18 years old. His major works include "Mahallo kokila", "Manchupallaki", "Aa Naati Vaana Chinukulu", "Venditera Kathalu" (original scripts of "Sankarabharanam" and "Anveshana"), "Vennela Bomma", "Gokulam lo Radha", "Ravvala konda", "Sree seetarama lanchi service Rajahmundry", "Manyam rani", "Rangularatnam". He has penned around 150 short stories published in swathi weekly under title "Maa Diguwa Godavari Kathalu" For his contributions to the art of story telling with a native approach through his books he was bestowed with "Sripada Puraskhaaram" at Rajamundry on 17 April 2011.',
   'qas': [{'question': 'మా పసలపూడి కథలు పుస్తకమును ఎవరు రచించారు?',
     'answers': [{'text': 'Vamsy', 'answer_start': 1

In [87]:
xorqa_xx['val'][-100]

{'title': 'title:Orlov Trotter_parentSection:Introduction_sectionName:Development of the breed._sectionIndex:2',
 'paragraphs': [{'context': 'Polkan was crossed with a Dutch mare which, in 1784, produced the grey stallion Bars I (1784–1808), considered the first Orlov trotter. He was 162.5 cm high at the withers which made him taller than most contemporary trotters, possessed a fast trotting gait and featured the beauty and noble bearing which would later distinguish the newly created breed. For seventeen years Bars I was crossed with different mares and sired eleven stallions that carried his distinguishing characteristics. The emergence of the breed was the result of a thorough and elaborate selection process. About 3,000 horses kept at the stud were involved. Unlike many other Russian nobles who were fond of horse-raising, Orlov was a professional breeder who is also credited for creating some seventy different animal breeds including the Russian wolfhound.',
   'qas': [{'question':

In [88]:
def get_xorqa_answer_str(context, qas):
    context_qa_pairs = []
    for qa in qas:
        question = qa['question']
        lang = qa['lang']
        answer = qa['answers'][0]['text']
        answer_start = qa['answers'][0]['answer_start']
        context_qa_pairs.append((context, question, answer, answer_start, lang))
    return context_qa_pairs

In [89]:
xorqa_xx_dataset = defaultdict(lambda: { 'train': [], 'val': [] })

for split_name in ['train', 'val']:
    for i, item in enumerate(xorqa_xx[split_name]):
        paragraphs = item['paragraphs']
#         print('.' ,end='')
        for j, paragraph in enumerate(paragraphs):

            context = paragraph['context']
            context_qa_pairs = get_xorqa_answer_str(context=context, qas=paragraph['qas'])

            for context_qa_pair in context_qa_pairs:
                context, question, answer, answer_start, lang = context_qa_pair
                gt_sentence = mine_prompt_gt((context, question, answer, answer_start))
                qa_item = {
                     'question': question,
                     'lang': lang,
                     'context': context,
                     'segmented_context': segment_sentence(context),
                     'answer': answer,
                     'answer_start': answer_start,
                     'gt_sentence': gt_sentence,
                }
                xorqa_xx_dataset[lang][split_name].append(qa_item)

In [90]:
list(xorqa_xx_dataset.keys())

['bn', 'ja', 'ko', 'ru', 'fi', ' ar', 'te', 'ar']

In [91]:
len(xorqa_xx_dataset['ar']['val'])

485

## Full loop
 
 ```
 results = {
     'dataset-name':
         {
             'model-name':

                 [] # dataset retrieval results
         }
 }
 ```

In [92]:
MODEL_MAPPING = {
#   'model-xquad_student_supported_langs': xquad_student_supported_langs_model,
#   'model-xorqa_student_supported_langs': xorqa_student_supported_langs_model,
#   'model-mlqa_student_supported_langs': mlqa_student_supported_langs_model,
#   'model-xquad_student_unsupported_langs': xquad_student_unsupported_langs_model,
#   'model-xorqa_student_unsupported_langs': xorqa_student_unsupported_langs_model,
#   'model-mlqa_student_unsupported_langs': mlqa_student_unsupported_langs_model,
#   'model-muse_small_v3': muse_small_v3_model,
  'model-xquad_teacher': xquad_teacher_model,
#   'model-xorqa_teacher': xorqa_teacher_model,
  'model-mlqa_teacher': mlqa_teacher_model,

}

In [93]:
DATASET_MAPPING = {
    'dataset-xquad_en_train': xquad_dataset,
}

for lang in list(xorqa_xx_dataset.keys()):
    if len(xorqa_xx_dataset[lang]['train']) != 0:
        DATASET_MAPPING[f'dataset-xorqa_{lang.strip()}_train'] = xorqa_xx_dataset[lang]['train']
    if len(xorqa_xx_dataset[lang]['val']) != 0:
        DATASET_MAPPING[f'dataset-xorqa_{lang.strip()}_val'] = xorqa_xx_dataset[lang]['val']

for lang in list(MLQA_LANGS):
    DATASET_MAPPING[f'dataset-mlqa_{lang.strip()}_val'] = mlqa_xx_dataset[lang]['val']
    DATASET_MAPPING[f'dataset-mlqa_{lang.strip()}_test'] = mlqa_xx_dataset[lang]['test']
    
print(DATASET_MAPPING.keys())

dict_keys(['dataset-xquad_en_train', 'dataset-xorqa_bn_train', 'dataset-xorqa_bn_val', 'dataset-xorqa_ja_train', 'dataset-xorqa_ja_val', 'dataset-xorqa_ko_train', 'dataset-xorqa_ko_val', 'dataset-xorqa_ru_train', 'dataset-xorqa_ru_val', 'dataset-xorqa_fi_train', 'dataset-xorqa_fi_val', 'dataset-xorqa_ar_train', 'dataset-xorqa_te_train', 'dataset-xorqa_te_val', 'dataset-xorqa_ar_val', 'dataset-mlqa_en_val', 'dataset-mlqa_en_test', 'dataset-mlqa_ar_val', 'dataset-mlqa_ar_test', 'dataset-mlqa_de_val', 'dataset-mlqa_de_test', 'dataset-mlqa_es_val', 'dataset-mlqa_es_test', 'dataset-mlqa_hi_val', 'dataset-mlqa_hi_test', 'dataset-mlqa_vi_val', 'dataset-mlqa_vi_test', 'dataset-mlqa_zh_val', 'dataset-mlqa_zh_test'])


In [94]:
len(DATASET_MAPPING)

29

In [95]:
results = defaultdict(lambda: defaultdict())

In [96]:
for dataset_prefix, dataset in DATASET_MAPPING.items():
    print(f'\n\ndataset_prefix: {dataset_prefix}')
    for model_prefix, model in MODEL_MAPPING.items():
        
        print(f'\n - model_prefix: {model_prefix}')
        prefix = f'{dataset_prefix}+{model_prefix}'
        _result = run_online_prompt_mining(dataset,
                             prefix=f'{dataset_prefix}_{model_prefix}',
                             model=model)


        results[dataset_prefix][model_prefix] = _result
        print('--'*50)
    print('\n')    
    print('=='*50)
    print('\n')    



dataset_prefix: dataset-xquad_en_train

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1190/1190 [00:30<00:00, 39.18it/s]



	Evaluation result:
	 - Accuracy: 0.7160
	 - precision_at_k:
{1: 0.7159663865546219,
 2: 0.8789915966386554,
 3: 0.9352941176470588,
 4: 0.9714285714285714,
 5: 0.9882352941176471,
 6: 0.9932773109243698,
 7: 0.9957983193277311,
 8: 0.9966386554621849,
 9: 0.9974789915966387,
 10: 0.9974789915966387}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1190/1190 [00:26<00:00, 44.47it/s]



	Evaluation result:
	 - Accuracy: 0.7319
	 - precision_at_k:
{1: 0.7319327731092437,
 2: 0.8798319327731092,
 3: 0.9411764705882353,
 4: 0.9773109243697479,
 5: 0.9907563025210084,
 6: 0.9932773109243698,
 7: 0.9957983193277311,
 8: 0.9966386554621849,
 9: 0.9983193277310924,
 10: 0.9983193277310924}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_bn_train

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2474/2474 [00:45<00:00, 54.92it/s]



	Evaluation result:
	 - Accuracy: 0.3868
	 - precision_at_k:
{1: 0.3868229587712207,
 2: 0.6329830234438156,
 3: 0.778496362166532,
 4: 0.8779304769603881,
 5: 0.9236054971705739,
 6: 0.9583670169765561,
 7: 0.9737267582861763,
 8: 0.9834276475343573,
 9: 0.9878738884397736,
 10: 0.9907033144704931}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2474/2474 [00:46<00:00, 53.18it/s]



	Evaluation result:
	 - Accuracy: 0.4228
	 - precision_at_k:
{1: 0.42279708973322555,
 2: 0.650767987065481,
 3: 0.7922392886014551,
 4: 0.8811641067097817,
 5: 0.9284559417946645,
 6: 0.9595796281325788,
 7: 0.9753435731608731,
 8: 0.9810024252223121,
 9: 0.9886822958771221,
 10: 0.99232012934519}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_bn_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 523/523 [00:08<00:00, 63.93it/s]



	Evaluation result:
	 - Accuracy: 0.3996
	 - precision_at_k:
{1: 0.39961759082217974,
 2: 0.6749521988527725,
 3: 0.8107074569789675,
 4: 0.8833652007648184,
 5: 0.9311663479923518,
 6: 0.9694072657743786,
 7: 0.9770554493307839,
 8: 0.9847036328871893,
 9: 0.994263862332696,
 10: 0.9961759082217974}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 523/523 [00:08<00:00, 61.05it/s]



	Evaluation result:
	 - Accuracy: 0.4589
	 - precision_at_k:
{1: 0.4588910133843212,
 2: 0.6998087954110899,
 3: 0.8393881453154876,
 4: 0.9139579349904398,
 5: 0.9560229445506692,
 6: 0.9694072657743786,
 7: 0.9847036328871893,
 8: 0.9866156787762906,
 9: 0.9904397705544933,
 10: 0.9961759082217974}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_ja_train

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1927/1927 [00:31<00:00, 61.95it/s]



	Evaluation result:
	 - Accuracy: 0.4982
	 - precision_at_k:
{1: 0.49818370524130773,
 2: 0.742086144265698,
 3: 0.8785677218474313,
 4: 0.934094447327452,
 5: 0.9615983393876492,
 6: 0.9792423456149455,
 7: 0.987026466009341,
 8: 0.9922158796056045,
 9: 0.9963674104826155,
 10: 0.9979242345614946}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1927/1927 [00:32<00:00, 59.71it/s]



	Evaluation result:
	 - Accuracy: 0.5189
	 - precision_at_k:
{1: 0.5189413596263622,
 2: 0.7633627400103788,
 3: 0.8879086663207058,
 4: 0.9403217436429684,
 5: 0.9683445770627919,
 6: 0.9823559937727037,
 7: 0.987026466009341,
 8: 0.9922158796056045,
 9: 0.9953295277633627,
 10: 0.9989621172807472}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_ja_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 371/371 [00:06<00:00, 58.60it/s]



	Evaluation result:
	 - Accuracy: 0.5013
	 - precision_at_k:
{1: 0.5013477088948787,
 2: 0.7628032345013477,
 3: 0.8975741239892183,
 4: 0.967654986522911,
 5: 0.9838274932614556,
 6: 0.9892183288409704,
 7: 0.9946091644204852,
 8: 0.9946091644204852,
 9: 0.9946091644204852,
 10: 0.9946091644204852}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 371/371 [00:06<00:00, 60.80it/s]



	Evaluation result:
	 - Accuracy: 0.5067
	 - precision_at_k:
{1: 0.5067385444743935,
 2: 0.7789757412398922,
 3: 0.9137466307277629,
 4: 0.9622641509433962,
 5: 0.9838274932614556,
 6: 0.9865229110512129,
 7: 0.9946091644204852,
 8: 0.9946091644204852,
 9: 0.9946091644204852,
 10: 0.9946091644204852}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_ko_train

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2395/2395 [00:38<00:00, 61.94it/s]



	Evaluation result:
	 - Accuracy: 0.5340
	 - precision_at_k:
{1: 0.5340292275574112,
 2: 0.7812108559498956,
 3: 0.8960334029227558,
 4: 0.9490605427974947,
 5: 0.9703549060542798,
 6: 0.9832985386221295,
 7: 0.9895615866388309,
 8: 0.9945720250521921,
 9: 0.9966597077244259,
 10: 0.9966597077244259}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2395/2395 [00:38<00:00, 62.02it/s]



	Evaluation result:
	 - Accuracy: 0.5574
	 - precision_at_k:
{1: 0.55741127348643,
 2: 0.7870563674321504,
 3: 0.904384133611691,
 4: 0.9519832985386222,
 5: 0.9736951983298539,
 6: 0.9874739039665971,
 7: 0.9912317327766179,
 8: 0.9941544885177453,
 9: 0.9958246346555324,
 10: 0.9966597077244259}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_ko_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 460/460 [00:07<00:00, 62.12it/s]



	Evaluation result:
	 - Accuracy: 0.5087
	 - precision_at_k:
{1: 0.508695652173913,
 2: 0.7478260869565218,
 3: 0.8695652173913043,
 4: 0.9478260869565217,
 5: 0.9804347826086957,
 6: 0.9869565217391304,
 7: 0.9956521739130435,
 8: 0.9956521739130435,
 9: 0.9956521739130435,
 10: 0.9956521739130435}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 460/460 [00:07<00:00, 62.01it/s]



	Evaluation result:
	 - Accuracy: 0.5239
	 - precision_at_k:
{1: 0.5239130434782608,
 2: 0.7608695652173914,
 3: 0.8913043478260869,
 4: 0.9565217391304348,
 5: 0.9804347826086957,
 6: 0.9891304347826086,
 7: 0.9956521739130435,
 8: 0.9956521739130435,
 9: 0.9956521739130435,
 10: 0.9956521739130435}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_ru_train

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1744/1744 [00:28<00:00, 60.65it/s]



	Evaluation result:
	 - Accuracy: 0.5315
	 - precision_at_k:
{1: 0.5315366972477065,
 2: 0.7557339449541285,
 3: 0.8864678899082569,
 4: 0.9409403669724771,
 5: 0.9655963302752294,
 6: 0.9776376146788991,
 7: 0.989105504587156,
 8: 0.9948394495412844,
 9: 0.9965596330275229,
 10: 0.9971330275229358}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1744/1744 [00:28<00:00, 61.21it/s]



	Evaluation result:
	 - Accuracy: 0.5396
	 - precision_at_k:
{1: 0.5395642201834863,
 2: 0.7677752293577982,
 3: 0.8916284403669725,
 4: 0.9415137614678899,
 5: 0.9673165137614679,
 6: 0.9799311926605505,
 7: 0.9896788990825688,
 8: 0.9931192660550459,
 9: 0.9954128440366973,
 10: 0.9971330275229358}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_ru_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 384/384 [00:06<00:00, 59.38it/s]



	Evaluation result:
	 - Accuracy: 0.5938
	 - precision_at_k:
{1: 0.59375,
 2: 0.8098958333333334,
 3: 0.890625,
 4: 0.9609375,
 5: 0.9895833333333334,
 6: 0.9973958333333334,
 7: 0.9973958333333334,
 8: 1.0,
 9: 1.0,
 10: 1.0}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 384/384 [00:06<00:00, 56.67it/s]



	Evaluation result:
	 - Accuracy: 0.6146
	 - precision_at_k:
{1: 0.6145833333333334,
 2: 0.8020833333333334,
 3: 0.8984375,
 4: 0.9557291666666666,
 5: 0.9817708333333334,
 6: 0.9947916666666666,
 7: 1.0,
 8: 1.0,
 9: 1.0,
 10: 1.0}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_fi_train

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1855/1855 [00:30<00:00, 60.25it/s]



	Evaluation result:
	 - Accuracy: 0.4189
	 - precision_at_k:
{1: 0.4188679245283019,
 2: 0.6619946091644204,
 3: 0.8037735849056604,
 4: 0.8857142857142857,
 5: 0.9369272237196765,
 6: 0.9644204851752022,
 7: 0.9730458221024259,
 8: 0.982210242587601,
 9: 0.9865229110512129,
 10: 0.9908355795148248}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1855/1855 [00:30<00:00, 60.59it/s]



	Evaluation result:
	 - Accuracy: 0.4350
	 - precision_at_k:
{1: 0.4350404312668464,
 2: 0.6711590296495957,
 3: 0.8177897574123989,
 4: 0.8889487870619946,
 5: 0.9401617250673855,
 6: 0.9628032345013477,
 7: 0.9730458221024259,
 8: 0.9838274932614556,
 9: 0.9865229110512129,
 10: 0.9908355795148248}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_fi_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 509/509 [00:09<00:00, 55.90it/s]



	Evaluation result:
	 - Accuracy: 0.3576
	 - precision_at_k:
{1: 0.3575638506876228,
 2: 0.5913555992141454,
 3: 0.8133595284872298,
 4: 0.9017681728880157,
 5: 0.9567779960707269,
 6: 0.9685658153241651,
 7: 0.9744597249508841,
 8: 0.9803536345776032,
 9: 0.9941060903732809,
 10: 0.9960707269155207}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 509/509 [00:08<00:00, 59.05it/s]



	Evaluation result:
	 - Accuracy: 0.3870
	 - precision_at_k:
{1: 0.38703339882121807,
 2: 0.6345776031434185,
 3: 0.8172888015717092,
 4: 0.9076620825147348,
 5: 0.9469548133595285,
 6: 0.9666011787819253,
 7: 0.9783889980353635,
 8: 0.9842829076620825,
 9: 0.9921414538310412,
 10: 0.9941060903732809}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_ar_train

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2303/2303 [00:38<00:00, 59.88it/s]



	Evaluation result:
	 - Accuracy: 0.5119
	 - precision_at_k:
{1: 0.5119409465914025,
 2: 0.756838905775076,
 3: 0.8679982631350412,
 4: 0.9366044290056448,
 5: 0.9617889709075119,
 6: 0.9774207555362571,
 7: 0.9848024316109423,
 8: 0.9874077290490665,
 9: 0.9887103777681285,
 10: 0.9908814589665653}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2303/2303 [00:39<00:00, 58.59it/s]



	Evaluation result:
	 - Accuracy: 0.5332
	 - precision_at_k:
{1: 0.5332175423360833,
 2: 0.7694311767260096,
 3: 0.8745115067303517,
 4: 0.9409465914025185,
 5: 0.9661311333043856,
 6: 0.9791576204950065,
 7: 0.9848024316109423,
 8: 0.9887103777681285,
 9: 0.9908814589665653,
 10: 0.9921841076856275}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_te_train

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1305/1305 [00:20<00:00, 62.51it/s]



	Evaluation result:
	 - Accuracy: 0.3625
	 - precision_at_k:
{1: 0.3624521072796935,
 2: 0.6091954022988506,
 3: 0.7793103448275862,
 4: 0.8804597701149425,
 5: 0.9425287356321839,
 6: 0.9724137931034482,
 7: 0.9808429118773946,
 8: 0.9831417624521073,
 9: 0.993103448275862,
 10: 0.9954022988505747}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1305/1305 [00:20<00:00, 63.80it/s]



	Evaluation result:
	 - Accuracy: 0.3824
	 - precision_at_k:
{1: 0.3823754789272031,
 2: 0.6444444444444445,
 3: 0.782375478927203,
 4: 0.8881226053639847,
 5: 0.9425287356321839,
 6: 0.9670498084291188,
 7: 0.9793103448275862,
 8: 0.9869731800766284,
 9: 0.9915708812260536,
 10: 0.9961685823754789}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_te_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 376/376 [00:06<00:00, 60.27it/s]



	Evaluation result:
	 - Accuracy: 0.3936
	 - precision_at_k:
{1: 0.39361702127659576,
 2: 0.601063829787234,
 3: 0.7952127659574468,
 4: 0.8776595744680851,
 5: 0.9069148936170213,
 6: 0.9680851063829787,
 7: 0.9813829787234043,
 8: 0.9867021276595744,
 9: 0.9867021276595744,
 10: 0.9867021276595744}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 376/376 [00:06<00:00, 60.83it/s]



	Evaluation result:
	 - Accuracy: 0.3644
	 - precision_at_k:
{1: 0.36436170212765956,
 2: 0.6276595744680851,
 3: 0.7792553191489362,
 4: 0.8723404255319149,
 5: 0.9441489361702128,
 6: 0.9680851063829787,
 7: 0.973404255319149,
 8: 0.9840425531914894,
 9: 0.9867021276595744,
 10: 0.9893617021276596}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-xorqa_ar_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:08<00:00, 59.05it/s]



	Evaluation result:
	 - Accuracy: 0.5216
	 - precision_at_k:
{1: 0.5216494845360825,
 2: 0.7381443298969073,
 3: 0.8762886597938144,
 4: 0.931958762886598,
 5: 0.9628865979381444,
 6: 0.9731958762886598,
 7: 0.9835051546391752,
 8: 0.9876288659793815,
 9: 0.9896907216494846,
 10: 0.9917525773195877}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 485/485 [00:07<00:00, 65.38it/s]



	Evaluation result:
	 - Accuracy: 0.5629
	 - precision_at_k:
{1: 0.5628865979381443,
 2: 0.7587628865979381,
 3: 0.8845360824742268,
 4: 0.931958762886598,
 5: 0.9690721649484536,
 6: 0.979381443298969,
 7: 0.9814432989690721,
 8: 0.9855670103092784,
 9: 0.9896907216494846,
 10: 0.9917525773195877}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_en_val

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1148/1148 [00:22<00:00, 50.09it/s]



	Evaluation result:
	 - Accuracy: 0.7247
	 - precision_at_k:
{1: 0.7247386759581882,
 2: 0.8641114982578397,
 3: 0.9259581881533101,
 4: 0.9494773519163763,
 5: 0.9651567944250871,
 6: 0.975609756097561,
 7: 0.985191637630662,
 8: 0.9878048780487805,
 9: 0.9895470383275261,
 10: 0.9921602787456446}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1148/1148 [00:22<00:00, 51.76it/s]



	Evaluation result:
	 - Accuracy: 0.7265
	 - precision_at_k:
{1: 0.7264808362369338,
 2: 0.8693379790940766,
 3: 0.9233449477351916,
 4: 0.9494773519163763,
 5: 0.9651567944250871,
 6: 0.9747386759581882,
 7: 0.9817073170731707,
 8: 0.990418118466899,
 9: 0.9912891986062717,
 10: 0.9930313588850174}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_en_test

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11590/11590 [04:03<00:00, 47.67it/s]



	Evaluation result:
	 - Accuracy: 0.7035
	 - precision_at_k:
{1: 0.7035375323554789,
 2: 0.8440897325280414,
 3: 0.9097497842968076,
 4: 0.9440034512510785,
 5: 0.962381363244176,
 6: 0.9715271786022434,
 7: 0.9805004314063848,
 8: 0.9858498705780846,
 9: 0.9885245901639345,
 10: 0.991458153580673}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11590/11590 [04:30<00:00, 42.84it/s]



	Evaluation result:
	 - Accuracy: 0.7112
	 - precision_at_k:
{1: 0.7112165660051769,
 2: 0.8484037963761863,
 3: 0.9101811906816221,
 4: 0.9446937014667817,
 5: 0.9602243313201035,
 6: 0.9715271786022434,
 7: 0.980327868852459,
 8: 0.9855910267471959,
 9: 0.9891285591026747,
 10: 0.9904227782571182}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_ar_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 517/517 [00:10<00:00, 48.49it/s]



	Evaluation result:
	 - Accuracy: 0.6538
	 - precision_at_k:
{1: 0.6537717601547389,
 2: 0.816247582205029,
 3: 0.8936170212765957,
 4: 0.9342359767891683,
 5: 0.9497098646034816,
 6: 0.9632495164410058,
 7: 0.97678916827853,
 8: 0.9825918762088974,
 9: 0.988394584139265,
 10: 0.9922630560928434}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 517/517 [00:12<00:00, 42.08it/s]



	Evaluation result:
	 - Accuracy: 0.6634
	 - precision_at_k:
{1: 0.6634429400386848,
 2: 0.8278529980657641,
 3: 0.8974854932301741,
 4: 0.9226305609284333,
 5: 0.9497098646034816,
 6: 0.9690522243713733,
 7: 0.9787234042553191,
 8: 0.9825918762088974,
 9: 0.9903288201160542,
 10: 0.9941972920696325}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_ar_test

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5335/5335 [02:08<00:00, 41.47it/s]



	Evaluation result:
	 - Accuracy: 0.6343
	 - precision_at_k:
{1: 0.6343017806935333,
 2: 0.8031865042174321,
 3: 0.8877225866916588,
 4: 0.9244611059044049,
 5: 0.9471415182755389,
 6: 0.9606373008434864,
 7: 0.971883786316776,
 8: 0.9814432989690721,
 9: 0.9861293345829428,
 10: 0.9887535145267105}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5335/5335 [02:06<00:00, 42.16it/s]



	Evaluation result:
	 - Accuracy: 0.6521
	 - precision_at_k:
{1: 0.6521087160262418,
 2: 0.814058106841612,
 3: 0.8894095595126523,
 4: 0.9257731958762887,
 5: 0.9475164011246485,
 6: 0.9613870665417057,
 7: 0.9709465791940018,
 8: 0.9806935332708528,
 9: 0.9861293345829428,
 10: 0.98912839737582}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_de_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 512/512 [00:12<00:00, 40.73it/s]



	Evaluation result:
	 - Accuracy: 0.7051
	 - precision_at_k:
{1: 0.705078125,
 2: 0.849609375,
 3: 0.91796875,
 4: 0.94921875,
 5: 0.9609375,
 6: 0.96875,
 7: 0.978515625,
 8: 0.982421875,
 9: 0.986328125,
 10: 0.986328125}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 512/512 [00:12<00:00, 41.70it/s]



	Evaluation result:
	 - Accuracy: 0.7090
	 - precision_at_k:
{1: 0.708984375,
 2: 0.85546875,
 3: 0.919921875,
 4: 0.939453125,
 5: 0.955078125,
 6: 0.966796875,
 7: 0.9765625,
 8: 0.982421875,
 9: 0.98828125,
 10: 0.98828125}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_de_test

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4517/4517 [01:52<00:00, 40.04it/s]



	Evaluation result:
	 - Accuracy: 0.6719
	 - precision_at_k:
{1: 0.6719061323887536,
 2: 0.8246623865397388,
 3: 0.8968341819791897,
 4: 0.9340270090768209,
 5: 0.9521806508744742,
 6: 0.9670135045384104,
 7: 0.9760903254372371,
 8: 0.9825105158290901,
 9: 0.9851671463360637,
 10: 0.9895948638476865}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4517/4517 [01:55<00:00, 39.13it/s]



	Evaluation result:
	 - Accuracy: 0.6783
	 - precision_at_k:
{1: 0.6783263227806066,
 2: 0.8308611910560106,
 3: 0.8968341819791897,
 4: 0.9344697808279832,
 5: 0.9526234226256365,
 6: 0.9659065751605047,
 7: 0.9760903254372371,
 8: 0.980739428824441,
 9: 0.9867168474651318,
 10: 0.988709320345362}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_es_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:11<00:00, 42.07it/s]



	Evaluation result:
	 - Accuracy: 0.7020
	 - precision_at_k:
{1: 0.702,
 2: 0.85,
 3: 0.918,
 4: 0.95,
 5: 0.966,
 6: 0.974,
 7: 0.978,
 8: 0.984,
 9: 0.99,
 10: 0.992}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:12<00:00, 39.88it/s]



	Evaluation result:
	 - Accuracy: 0.7100
	 - precision_at_k:
{1: 0.71,
 2: 0.864,
 3: 0.918,
 4: 0.952,
 5: 0.972,
 6: 0.974,
 7: 0.98,
 8: 0.986,
 9: 0.992,
 10: 0.992}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_es_test

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5253/5253 [02:21<00:00, 37.23it/s]



	Evaluation result:
	 - Accuracy: 0.6787
	 - precision_at_k:
{1: 0.6786598134399391,
 2: 0.8347610889015801,
 3: 0.905197030268418,
 4: 0.9369883875880449,
 5: 0.9565962307252999,
 6: 0.972206358271464,
 7: 0.9805825242718447,
 8: 0.9851513420902341,
 9: 0.9885779554540263,
 10: 0.9906719969541214}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5253/5253 [02:13<00:00, 39.39it/s]



	Evaluation result:
	 - Accuracy: 0.6901
	 - precision_at_k:
{1: 0.6900818579859128,
 2: 0.8387588044926708,
 3: 0.9008185798591282,
 4: 0.9371787549971445,
 5: 0.9588806396344945,
 6: 0.9712545212259661,
 7: 0.9813439939082429,
 8: 0.984770607272035,
 9: 0.9889586902722254,
 10: 0.9906719969541214}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_hi_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 507/507 [00:11<00:00, 43.27it/s]



	Evaluation result:
	 - Accuracy: 0.2880
	 - precision_at_k:
{1: 0.2879684418145957,
 2: 0.48520710059171596,
 3: 0.6390532544378699,
 4: 0.7554240631163708,
 5: 0.8264299802761341,
 6: 0.863905325443787,
 7: 0.8915187376725838,
 8: 0.903353057199211,
 9: 0.9171597633136095,
 10: 0.9447731755424064}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 507/507 [00:11<00:00, 42.60it/s]



	Evaluation result:
	 - Accuracy: 0.3176
	 - precision_at_k:
{1: 0.3175542406311637,
 2: 0.4930966469428008,
 3: 0.6390532544378699,
 4: 0.7337278106508875,
 5: 0.8205128205128205,
 6: 0.8520710059171598,
 7: 0.8796844181459567,
 8: 0.8994082840236687,
 9: 0.9211045364891519,
 10: 0.9270216962524654}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_hi_test

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4918/4918 [02:03<00:00, 39.75it/s]



	Evaluation result:
	 - Accuracy: 0.2597
	 - precision_at_k:
{1: 0.2596583977226515,
 2: 0.4522163481089874,
 3: 0.5927206181374542,
 4: 0.6966246441642945,
 5: 0.7720618137454249,
 6: 0.8259455063033754,
 7: 0.865189101260675,
 8: 0.892842618950793,
 9: 0.9143960959739732,
 10: 0.9300528670191135}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4918/4918 [02:03<00:00, 39.77it/s]



	Evaluation result:
	 - Accuracy: 0.2631
	 - precision_at_k:
{1: 0.2631150874339162,
 2: 0.45404636030906875,
 3: 0.5892639284261895,
 4: 0.695201301342009,
 5: 0.7686051240341603,
 6: 0.8190321268808459,
 7: 0.8609190727938186,
 8: 0.8899959333062221,
 9: 0.9107360715738105,
 10: 0.9296461976413176}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_vi_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 511/511 [00:14<00:00, 35.08it/s]



	Evaluation result:
	 - Accuracy: 0.3542
	 - precision_at_k:
{1: 0.3542074363992172,
 2: 0.5225048923679061,
 3: 0.6868884540117417,
 4: 0.7808219178082192,
 5: 0.837573385518591,
 6: 0.8904109589041096,
 7: 0.9080234833659491,
 8: 0.9275929549902152,
 9: 0.9432485322896281,
 10: 0.9452054794520548}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 511/511 [00:12<00:00, 39.40it/s]



	Evaluation result:
	 - Accuracy: 0.3659
	 - precision_at_k:
{1: 0.3659491193737769,
 2: 0.5362035225048923,
 3: 0.6829745596868885,
 4: 0.7788649706457925,
 5: 0.8277886497064579,
 6: 0.8767123287671232,
 7: 0.9099804305283757,
 8: 0.9275929549902152,
 9: 0.9432485322896281,
 10: 0.9452054794520548}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_vi_test

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5495/5495 [02:24<00:00, 38.03it/s]



	Evaluation result:
	 - Accuracy: 0.3480
	 - precision_at_k:
{1: 0.34795268425841674,
 2: 0.5339399454049135,
 3: 0.6611464968152866,
 4: 0.7477707006369426,
 5: 0.8065514103730664,
 6: 0.8493175614194722,
 7: 0.8780709736123748,
 8: 0.9020928116469518,
 9: 0.9233848953594177,
 10: 0.9390354868061874}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5495/5495 [02:24<00:00, 38.07it/s]



	Evaluation result:
	 - Accuracy: 0.3576
	 - precision_at_k:
{1: 0.3575978161965423,
 2: 0.5306642402183803,
 3: 0.6518653321201092,
 4: 0.7392174704276615,
 5: 0.8007279344858963,
 6: 0.8445859872611465,
 7: 0.8751592356687898,
 8: 0.902820746132848,
 9: 0.9219290263876251,
 10: 0.937943585077343}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_zh_val

 - model_prefix: model-xquad_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 504/504 [00:12<00:00, 40.16it/s]



	Evaluation result:
	 - Accuracy: 0.6865
	 - precision_at_k:
{1: 0.6865079365079365,
 2: 0.8253968253968254,
 3: 0.8988095238095238,
 4: 0.9305555555555556,
 5: 0.9464285714285714,
 6: 0.9682539682539683,
 7: 0.9761904761904762,
 8: 0.9821428571428571,
 9: 0.9861111111111112,
 10: 0.9880952380952381}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 504/504 [00:12<00:00, 41.99it/s]



	Evaluation result:
	 - Accuracy: 0.6925
	 - precision_at_k:
{1: 0.6924603174603174,
 2: 0.8313492063492064,
 3: 0.9047619047619048,
 4: 0.9384920634920635,
 5: 0.9563492063492064,
 6: 0.9662698412698413,
 7: 0.9761904761904762,
 8: 0.9841269841269841,
 9: 0.9861111111111112,
 10: 0.9880952380952381}
----------------------------------------------------------------------------------------------------






dataset_prefix: dataset-mlqa_zh_test

 - model_prefix: model-xquad_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5137/5137 [02:13<00:00, 38.62it/s]



	Evaluation result:
	 - Accuracy: 0.6564
	 - precision_at_k:
{1: 0.6564142495620011,
 2: 0.822075141132957,
 3: 0.8907922912205567,
 4: 0.9252481993381351,
 5: 0.9501654662254234,
 6: 0.9616507689312829,
 7: 0.9717734086042438,
 8: 0.9780027253260658,
 9: 0.9836480436052171,
 10: 0.9881253649990267}
----------------------------------------------------------------------------------------------------

 - model_prefix: model-mlqa_teacher


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5137/5137 [02:12<00:00, 38.80it/s]


	Evaluation result:
	 - Accuracy: 0.6693
	 - precision_at_k:
{1: 0.6692622153007592,
 2: 0.8288884562974499,
 3: 0.8954642787619232,
 4: 0.9285575238466031,
 5: 0.9488028031925249,
 6: 0.9624294335215106,
 7: 0.9729414054895854,
 8: 0.9781973914736227,
 9: 0.9834533774576601,
 10: 0.9873467004087989}
----------------------------------------------------------------------------------------------------









In [97]:
len(results)

29

In [73]:
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)


In [98]:
json.dump(results, open('./temp.results.model-teachers.json', 'w'), ensure_ascii=False, indent=2)

# json.dump(results, open('./temp.results.mlqa-dataset-non-en.json', 'w'), ensure_ascii=False, indent=2)

# json.dump(results, open('./temp.results.muse-small-v3.json', 'w'), ensure_ascii=False, indent=2)