In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Move the working directory up one level; later cells open relative paths
# such as "keys/bing_translate_key.txt".
# NOTE(review): the target is relative, so every re-run of this cell moves one
# more level up — run it only once per kernel session.
os.chdir("../")

In [3]:
import time
import random
import requests, uuid, json
from typing import List
import spacy
import openai
import numpy as np
import wandb
from datasets import load_dataset
from mega.data.load_datasets import load_xnli_dataset
from mega.data.data_utils import choose_few_shot_examples
from mega.prompting.instructions import INSTRUCTIONS
from mega.prompting.prompting_utils import load_prompt_template
from mega.utils.env_utils import load_openai_env_variables
from mega.models.completion_models import get_model_pred, gpt3x_completion
from mega.prompting.prompting_utils import construct_prompt, construct_qa_prompt
from tqdm import tqdm
from evaluate import load

# Bing translator REST configuration.
# The subscription key is the first line of a local key file.
with open("keys/bing_translate_key.txt") as f:
    subscription_key = f.read().partition("\n")[0]

# Add your location, also known as region. The default is global.
# This is required if using a Cognitive Services resource.
# NOTE(review): `location` is not referenced by the headers built below —
# confirm whether the region needs to be sent for this resource.
location = "centralindia"

endpoint = "https://api.cognitive.microsofttranslator.com/"
path = "/translate?api-version=3.0"
constructed_url = f"{endpoint}{path}"

# The trace id is generated once here and reused by every request that is
# sent with these headers.
trace_id = str(uuid.uuid4())
headers = {
    "Ocp-Apim-Subscription-Key": subscription_key,
    "Content-type": "application/json",
    "X-ClientTraceId": trace_id,
}

In [4]:
# Seed Python's `random` module and NumPy's global RNG with the same fixed
# value so random selections repeat across runs.
random.seed(42)
np.random.seed(42)

In [5]:
# Make sure that {env_name}.env file is present in the envs/ directory
# NOTE(review): `load_env` is not imported in any visible cell (the import
# cell brings in `load_openai_env_variables` from mega.utils.env_utils) —
# confirm it is importable, otherwise this cell raises NameError on a fresh
# kernel.
env_name = "gpt4v2"
load_env(env_name=env_name)

In [6]:
openai.api_version

'2023-03-15-preview'

In [7]:
# Experiment settings read by the cells below.
model = "gpt-4-32k"  # model name passed to gpt3x_completion
pivot_lang = "es"  # language of the dataset the few-shot examples are drawn from
tgt_lang = "es"  # language of the dataset that is evaluated
prompt_name = "answer_given_context_and_question"  # key looked up in PROMPTS_DICT
few_shot_k = 4  # number of few-shot examples requested from choose_few_shot_examples
dataset = "xquad"  # dataset name given to load_qa_dataset and used to index INSTRUCTIONS
short_contexts = False  # when True, few-shot contexts are cut down to the sentence holding the answer
max_tokens = 100  # max_tokens passed to each completion call in the evaluation loop

In [8]:
# Snapshot of the experiment settings, attached to the W&B run as its config.
config = dict(
    model=model,
    pivot_lang=pivot_lang,
    tgt_lang=tgt_lang,
    prompt_name=prompt_name,
    few_shot_k=few_shot_k,
    dataset=dataset,
    short_contexts=short_contexts,
    max_tokens=max_tokens,
)

wandb.init(project="GPT-4-eval", entity="scai-msri", config=config)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkabirahuja2431[0m ([33mscai-msri[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [9]:
class SpacySentenceTokenizer:
    """Callable sentence splitter backed by the spaCy ``xx_ent_wiki_sm`` pipeline."""

    def __init__(self):
        # Load the pipeline, then append the "sentencizer" component to it.
        pipeline = spacy.load('xx_ent_wiki_sm')
        pipeline.add_pipe("sentencizer")
        self.nlp = pipeline

    def __call__(self, text: str) -> List[str]:
        """Return the text of every sentence span found in ``text``, in order."""
        doc = self.nlp(text)
        return [sentence.text for sentence in doc.sents]


In [10]:
def load_qa_dataset(dataset_name, lang, split, dataset_frac = 1, translate_test = False):
    """Load one split of a question-answering dataset for a single language.

    Args:
        dataset_name: One of "indicqa", "xquad", "tydiqa" or "mlqa".
        lang: Language code used to pick the dataset configuration (or, for
            "tydiqa", to filter the rows).
        split: Split name. For "indicqa" and "xquad" a "train" request is
            served from the "squad" dataset instead; for "mlqa" a "train"
            request is switched to "validation".
        dataset_frac: Fraction (between 0 and 1) of the split to keep; rows
            are taken from the start of the split.
        translate_test: For "mlqa" only, load the "mlqa-translate-test"
            configuration instead of the "mlqa.{lang}.{lang}" one.

    Returns:
        The leading ``int(len(split) * dataset_frac)`` rows of the loaded split.

    Raises:
        ValueError: If ``dataset_frac`` is not between 0 and 1.
        NotImplementedError: If ``dataset_name`` is not one of the supported names.
    """
    # Reject a bad fraction up front, before any dataset is fetched.
    if not 0 <= dataset_frac <= 1:
        raise ValueError(f"dataset_frac must be between 0 and 1, got {dataset_frac!r}")

    if dataset_name == "indicqa":
        if split != "train":
            dataset = load_dataset("ai4bharat/IndicQA", f"indicqa.{lang}")[split]
        else:
            dataset = load_dataset("squad")[split]
    elif dataset_name == "xquad":
        if split != "train":
            dataset = load_dataset("xquad", f"xquad.{lang}")[split]
        else:
            dataset = load_dataset("squad")[split]
    elif dataset_name == "tydiqa":
        dataset = load_dataset("tydiqa", 'secondary_task')[split]
        # The part of the example id before the first "-" is looked up in
        # TYDIQA_LANG2CODES to tag each row with a language code.
        dataset = dataset.map(lambda example: {"lang" : TYDIQA_LANG2CODES[example["id"].split("-")[0]]})
        dataset = dataset.filter(lambda example: example["lang"] == lang)
    elif dataset_name == "mlqa":
        if split == "train":
            print("No Training Data for MLQA, switching to validation!")
            split = "validation"
        # Keep the configuration name in its own variable instead of
        # overwriting the `dataset_name` argument.
        if translate_test:
            config_name = f"mlqa-translate-test.{lang}"
        else:
            config_name = f"mlqa.{lang}.{lang}"

        dataset = load_dataset("mlqa", config_name)[split]
    else:
        raise NotImplementedError(f"Unsupported dataset_name: {dataset_name!r}")

    num_rows = int(len(dataset) * dataset_frac)
    return dataset.select(np.arange(num_rows))

In [11]:
# Maps the language name found before the first "-" of a TyDiQA example id to
# the language code that load_qa_dataset compares against its `lang` argument.
TYDIQA_LANG2CODES = {
    "bengali": "bn",
    "korean" : "ko",
    "swahili" : "sw",
    "english" : "en",
    "indonesian" :"id",
    "arabic" : "ar",
    "finnish" : "fi",
    "telugu" : "te",
    "russian" : "ru"
}

In [12]:
# Language code -> English language name, used to fill the "{lang}" slot of the
# final-answer sentence in the few-shot prompts. The first six entries are the
# original ones; the rest cover the remaining XQuAD languages and the codes in
# TYDIQA_LANG2CODES, so prompt construction no longer raises KeyError for them.
LangCodes2LangNames = {
    "en" : "English",
    "ko" : "Korean",
    "es" : "Spanish",
    "de" : "German",
    "fr" : "French",
    "zh" : "Chinese",
    "ar" : "Arabic",
    "bn" : "Bengali",
    "el" : "Greek",
    "fi" : "Finnish",
    "hi" : "Hindi",
    "id" : "Indonesian",
    "ro" : "Romanian",
    "ru" : "Russian",
    "sw" : "Swahili",
    "te" : "Telugu",
    "th" : "Thai",
    "tr" : "Turkish",
    "vi" : "Vietnamese",
}

In [13]:
# Few-shot pool (pivot language) and evaluation set (target language); both
# are read from the "validation" split.
train_dataset = load_qa_dataset(dataset, lang=pivot_lang, split="validation")
test_dataset = load_qa_dataset(dataset, lang=tgt_lang, split="validation")

Found cached dataset xquad (/home/t-kabirahuja/.cache/huggingface/datasets/xquad/xquad.es/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset xquad (/home/t-kabirahuja/.cache/huggingface/datasets/xquad/xquad.es/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)


  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
if short_contexts:
    sent_tokenizer = SpacySentenceTokenizer()

    def _shorten_context(example):
        """Replace the context with the first sentence that contains the first answer.

        Falls back to the unmodified context when no tokenized sentence
        contains the answer text (for example when the sentence splitter cuts
        through the answer), instead of raising IndexError.
        """
        answer_text = example["answers"]["text"][0]
        matching_sentences = (
            sent for sent in sent_tokenizer(example["context"]) if answer_text in sent
        )
        return {"context": next(matching_sentences, example["context"])}

    train_dataset = train_dataset.map(_shorten_context, num_proc = 8)

In [15]:
# Ask choose_few_shot_examples for few_shot_k examples from the few-shot pool,
# using the "random" selection criterion.
train_examples = choose_few_shot_examples(
        train_dataset, few_shot_k, selection_criteria="random")

In [16]:
# Ids of the selected few-shot examples, in selection order.
train_ex_ids = [few_shot_example["id"] for few_shot_example in train_examples]

In [17]:
train_ex_ids

['5725cc38ec44d21400f3d5bb',
 '57097d63ed30961900e841fe',
 '570d3468b3d812140066d543',
 '56e7586d37bdd419002c3eb5']

In [18]:
test_dataset

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 1190
})

In [19]:
# Remove the few-shot examples from the evaluation set so they are not scored.
test_dataset = test_dataset.filter(lambda row: row["id"] not in train_ex_ids)
test_dataset

Loading cached processed dataset at /home/t-kabirahuja/.cache/huggingface/datasets/xquad/xquad.es/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336/cache-fb43d141df00a9d9.arrow


Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 1186
})

In [20]:
# Prompt templates keyed by name. Each template carries "{context}",
# "{question}" and "{answer}" placeholders that fill_template substitutes with
# str.replace.
# NOTE(review): the four leading spaces on the continuation lines are inside
# the triple-quoted literals, so they are part of the prompt text — confirm
# that this indentation is intended.
PROMPTS_DICT = {
    # Plain question-answering template.
    "answer_given_context_and_question" : """{context}
    Q: {question}

    Referring to the passage above, the correct answer to the given question is:
    {answer}""",
    
    # Variant that also asks for the answer in "{language}".
    # NOTE(review): no visible code fills the "{language}" placeholder.
    "lang_instruct_answer_given_context_and_question" : """{context}
    Q: {question}

    Referring to the passage above, the correct answer to the given question is? Please try to answer in {language} and ensure that the answer appears as it is in the passage.
    A: {answer}""",
    
}

In [21]:
# Template selected by the configured prompt_name; read as a global by
# construct_translate_cot_prompt.
prompt_template = PROMPTS_DICT[prompt_name]

In [22]:
# Look up the task instruction registered for the configured dataset and
# print it for inspection.
instruction = INSTRUCTIONS[dataset]
print(instruction)

You are an NLP assistant whose purpose is to solve reading comprehension problems. You will be provided questions on a set of passages and you will need to provide the answer as it appears in the passage. The answer should be in the same language as the question and the passage.


In [23]:
# Wrap the instruction as a "system" chat message; it is placed first in every
# prompt built below.
instruction_prompt = {"role": "system", "content": instruction}
instruction_prompt

{'role': 'system',
 'content': 'You are an NLP assistant whose purpose is to solve reading comprehension problems. You will be provided questions on a set of passages and you will need to provide the answer as it appears in the passage. The answer should be in the same language as the question and the passage.'}

In [24]:
def translate_with_bing(text: str, src: str, dest: str, timeout: float = 30.0) -> str:
    """Uses the bing translator to translate `text` from `src` language to `dest` language

    Sends one POST request to the module-level `constructed_url` using the
    module-level `headers`.

    Args:
        text (str): Text to translate
        src (str): Source language to translate from
        dest (str): Language to translate to
        timeout (float): Seconds to wait for the HTTP request before giving up

    Returns:
        str: Translated text, or the sentinel "<MT Failed>" when the request
        fails or the response does not have the expected structure
    """
    params = {"api-version": "3.0", "from": src, "to": [dest]}
    body = [{"text": text}]

    try:
        response = requests.post(
            constructed_url, params=params, headers=headers, json=body, timeout=timeout
        )
        payload = response.json()
        translation = payload[0]["translations"][0]["text"]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Network errors, non-JSON bodies and error payloads (which lack the
        # "translations" list) all map to the sentinel. The previous handler
        # called pdb.set_trace() without importing pdb, so it raised NameError
        # instead of returning the sentinel.
        translation = "<MT Failed>"

    return translation

In [25]:
def fill_template(template, example, fill_answer=True):
    """Substitute an example's fields into a prompt template.

    The "{context}", "{question}" and "{answer}" placeholders are replaced one
    after another, in that order. With ``fill_answer`` set, "{answer}" becomes
    the example's first answer text; otherwise it becomes an empty string and
    the finished prompt is stripped of surrounding whitespace.
    """
    answer_text = example["answers"]["text"][0] if fill_answer else ""

    filled = template.replace("{context}", example["context"])
    filled = filled.replace("{question}", example["question"])
    filled = filled.replace("{answer}", answer_text)

    return filled if fill_answer else filled.strip()
    
def construct_translate_cot_prompt(example, lang, test=False):
    """Build the chat messages for one translate-then-answer example.

    The first message is always the user turn: the global ``prompt_template``
    filled with the example's context and question, with the answer left out.
    With ``test`` set, only that user turn is returned. Otherwise it is
    followed by five assistant turns: a step-by-step opener, the English
    translations of the passage, the question and the first answer (obtained
    from ``translate_with_bing``), and the final answer sentence naming the
    language through ``LangCodes2LangNames``.
    """
    question_message = {
        "role": "user",
        "content": fill_template(prompt_template, example, fill_answer=False),
    }
    if test:
        return [question_message]

    def to_english(text):
        # Translate one piece of the example from `lang` into English.
        return translate_with_bing(text, src = lang, dest = "en")

    reasoning_steps = ["Let's do it step by step."]
    reasoning_steps.append(
        f'Translation of the passage in English is: "{to_english(example["context"])}"'
    )
    reasoning_steps.append(
        f'Translation of the question in English is: "{to_english(example["question"])}"'
    )
    reasoning_steps.append(
        f'Then the answer in English will be: {to_english(example["answers"]["text"][0])}'
    )
    # Substitute the answer before the language name, one replace after the other.
    final_sentence = "Hence the final answer in {lang} is: {answer}"
    final_sentence = final_sentence.replace("{answer}", example["answers"]["text"][0])
    final_sentence = final_sentence.replace("{lang}", LangCodes2LangNames[lang])
    reasoning_steps.append(final_sentence)

    messages = [question_message]
    for step in reasoning_steps:
        messages.append({"role": "assistant", "content": step})
    return messages

In [26]:
train_dataset[0]["answers"]

{'text': ['308'], 'answer_start': [133]}

In [27]:
train_dataset[0]

{'id': '56beb4343aeaaa14008c925b',
 'context': '\ufeffLos Panthers, que además de liderar las intercepciones de la NFL con 24 y contar con cuatro jugadores de la Pro Bowl, cedieron solo 308 puntos en defensa y se sitúan en el sexto lugar de la liga. Kawann Short, tacle defensivo de la Pro Bowl, lideró al equipo con 11 capturas, 3 balones sueltos forzados y 2 recuperaciones. A su vez, el liniero Mario Addison, consiguió 6 capturas y media. En la línea de los Panthers, también destacó como ala defensiva el veterano Jared Allen ―5 veces jugador de la Pro Bowl y que fue el líder, en activo, de capturas de la NFL con 136― junto con el también ala defensiva Kony Ealy, que lleva 5 capturas en solo 9 partidos como titular. Detrás de ellos, Thomas Davis y Luke Kuechly, dos de los tres apoyadores titulares que también han sido seleccionados para jugar la Pro Bowl. Davis se hizo con 5 capturas y media, 4 balones sueltos forzados y 4 intercepciones, mientras que Kuechly lideró al equipo en derribo

In [28]:
construct_translate_cot_prompt(
    train_dataset[1],
    lang = tgt_lang
)

[{'role': 'user',
  'content': '\ufeffLos Panthers, que además de liderar las intercepciones de la NFL con 24 y contar con cuatro jugadores de la Pro Bowl, cedieron solo 308 puntos en defensa y se sitúan en el sexto lugar de la liga. Kawann Short, tacle defensivo de la Pro Bowl, lideró al equipo con 11 capturas, 3 balones sueltos forzados y 2 recuperaciones. A su vez, el liniero Mario Addison, consiguió 6 capturas y media. En la línea de los Panthers, también destacó como ala defensiva el veterano Jared Allen ―5 veces jugador de la Pro Bowl y que fue el líder, en activo, de capturas de la NFL con 136― junto con el también ala defensiva Kony Ealy, que lleva 5 capturas en solo 9 partidos como titular. Detrás de ellos, Thomas Davis y Luke Kuechly, dos de los tres apoyadores titulares que también han sido seleccionados para jugar la Pro Bowl. Davis se hizo con 5 capturas y media, 4 balones sueltos forzados y 4 intercepciones, mientras que Kuechly lideró al equipo en derribos (118), forzó 2

In [29]:
# Build the few-shot part of the conversation. The few-shot examples come from
# the pivot-language dataset, so they are translated from (and their final
# answer sentence names) pivot_lang rather than tgt_lang; the two only
# coincide when both settings hold the same language.
few_shot_prompts = []
for example in train_examples:
    few_shot_prompts.extend(construct_translate_cot_prompt(example, lang = pivot_lang))

In [30]:
print(few_shot_prompts[4])

{'role': 'assistant', 'content': 'Then the answer in English will be: with common rules for coal and steel'}


In [31]:
# Load the "squad" metric through `evaluate.load`; its compute() results are
# read below through the "exact_match" and "f1" keys.
squad_metric = load("squad")

In [32]:
# Assemble the full prompt for a single, hand-picked test example:
# system instruction, then the few-shot turns, then the test question.
test_example = test_dataset[132]
test_prompt = construct_translate_cot_prompt(test_example, tgt_lang, test=True)

prompt = [instruction_prompt, *few_shot_prompts, *test_prompt]
label = test_example["answers"]["text"][0]

prompt

[{'role': 'system',
  'content': 'You are an NLP assistant whose purpose is to solve reading comprehension problems. You will be provided questions on a set of passages and you will need to provide the answer as it appears in the passage. The answer should be in the same language as the question and the passage.'},
 {'role': 'user',
  'content': 'Los principales Tratados que forman la Unión Europea comenzaron con reglas comunes para el carbón y el acero, y luego la energía atómica, pero se establecieron instituciones más integrales y formales mediante el Tratado de Roma de 1957 y el Tratado de Maastricht de 1992 (ahora: TFEU). Durante los años sesenta y setenta se introdujeron pequeñas modificaciones. Se firmaron importantes tratados de enmienda para completar el desarrollo de un mercado interior único en el Acta Única Europea 1986, para promover el desarrollo de una Europa más social en el Tratado de Ámsterdam de 1997 y para introducir pequeñas modificaciones en el poder relativo de l

In [33]:
type(prompt)

list

In [34]:
# Let the model continue the step-by-step conversation one assistant turn at a
# time, for at most 20 turns, stopping once a turn contains the final-answer
# sentence.
# Work on a copy so that re-running this cell starts again from the same
# `prompt` instead of appending to an already extended conversation.
messages = list(prompt)
preds = []
for _ in range(20):
    pred = gpt3x_completion(
        messages,
        model,
        temperature=0,
        max_tokens=max_tokens,  # use the configured limit, not a second hard-coded value
    )
    messages.append({"role": "assistant", "content": pred})
    preds.append(pred)
    if "Hence the final answer" in pred:
        break

Exceeded Rate Limit. Waiting for 2 seconds
Exceeded Rate Limit. Waiting for 4 seconds
Exceeded Rate Limit. Waiting for 8 seconds
Exceeded Rate Limit. Waiting for 16 seconds
Exceeded Rate Limit. Waiting for 32 seconds
Exceeded Rate Limit. Waiting for 64 seconds
Exceeded Rate Limit. Waiting for 2 seconds
Exceeded Rate Limit. Waiting for 4 seconds
Exceeded Rate Limit. Waiting for 8 seconds
Exceeded Rate Limit. Waiting for 2 seconds
Exceeded Rate Limit. Waiting for 4 seconds
Exceeded Rate Limit. Waiting for 8 seconds
Exceeded Rate Limit. Waiting for 16 seconds
Exceeded Rate Limit. Waiting for 32 seconds
Exceeded Rate Limit. Waiting for 64 seconds
Exceeded Rate Limit. Waiting for 128 seconds
Exceeded Rate Limit. Waiting for 2 seconds
Exceeded Rate Limit. Waiting for 4 seconds
Exceeded Rate Limit. Waiting for 8 seconds
Exceeded Rate Limit. Waiting for 16 seconds
Exceeded Rate Limit. Waiting for 32 seconds
Exceeded Rate Limit. Waiting for 64 seconds
Exceeded Rate Limit. Waiting for 128 second

KeyboardInterrupt: 

In [35]:
preds

['', '', '']

Exception in thread SockSrvRdThr:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/service/server_sock.py", line 100, in run
    sreq = self._sock_client.read_server_request()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/lib/sock_client.py", line 274, in read_server_request
    data = self._read_packet_bytes()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/lib/sock_client.py", line 248, in _read_packet_bytes
    rec = self._extract_packet_bytes()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/lib/sock_client.py", line 230, in _extract_packet_bytes
    assert magic == ord("W")
AssertionError
Exception in thr

Exception in thread SockSrvRdThr:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/service/server_sock.py", line 100, in run
    sreq = self._sock_client.read_server_request()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/lib/sock_client.py", line 274, in read_server_request
    data = self._read_packet_bytes()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/lib/sock_client.py", line 248, in _read_packet_bytes
    rec = self._extract_packet_bytes()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/lib/sock_client.py", line 230, in _extract_packet_bytes
    assert magic == ord("W")
AssertionError
Exception in thr

Exception in thread SockSrvRdThr:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/service/server_sock.py", line 100, in run
    sreq = self._sock_client.read_server_request()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/lib/sock_client.py", line 274, in read_server_request
    data = self._read_packet_bytes()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/lib/sock_client.py", line 248, in _read_packet_bytes
    rec = self._extract_packet_bytes()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/megaenv/lib/python3.8/site-packages/wandb/sdk/lib/sock_client.py", line 230, in _extract_packet_bytes
    assert magic == ord("W")
AssertionError
Exception in thr

Bad pipe message: %s [b'VL\xfa\xa9\xbb=\xd5\x80\xd0']
Bad pipe message: %s [b'\xc0\xb5\xd5\x11\xcf\xf6\xd7 \xe9\x89"\x9cu\x94\x11\xc9YM\xe3E\xe2\xff\xe8\x07\xe3\xb0\xbf\xc5\xcc\x83;\xdc\xd7e<x\xdf\xc8\xcb\xfb']
Bad pipe message: %s [b'<\xc0\x18\x1e.\x18X]\xbb5\x9a\x9b\xf50\x17\x81C\x11 \xcb\xb9f[\x0c\x98\xa4\x94&kQBo\x0c4P\xdf\xc2\x82C\x17\x10\xb2qb\xc6G[\xdd\xfeX,\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06']
Bad pipe message: %s [b'']
Bad pipe message: %s [b'\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \x8f\x1f\xfa\xe1\xa5Q\xb8\xdcAs0\x0fV\x8a\xe3\xae6k\xabu\x89\xba']
Bad pipe message: %s [b';\n5\xadj\x85\x85x\xd8\xa2\xcc.]\x02\xb7\xaa%g\x00\x00|\xc0,\

In [152]:
preds

['Hence the final answer in Korean is: 오타와']

In [153]:
len(pred.split())

8

In [154]:
# Take everything after the first ":" that follows the final-answer marker.
# Splitting on every ":" and keeping the last piece would cut off answers
# that contain a colon themselves (for example a time such as "10:30").
# If the marker is missing, the extracted answer is the empty string.
_, _, answer_tail = preds[-1].partition("Hence the final answer")
pred = answer_tail.split(":", 1)[-1].strip()
pred

'오타와'

In [155]:
# Show the extracted answer next to the gold label, then score this single
# example with the squad metric.
print(f"Prediction: {pred}")
print(f"Label: {label}")

example_id = test_example["id"]
prediction = {"prediction_text": pred, "id": example_id}
reference = {"answers": test_example["answers"], "id": example_id}

results = squad_metric.compute(predictions=[prediction], references=[reference])
print(results)

Prediction: 오타와
Label: 오타와
{'exact_match': 100.0, 'f1': 100.0}


In [165]:
# Evaluate every test example with the translate-then-answer conversation and
# keep running averages of the squad exact-match and F1 scores.
FINAL_ANSWER_MARKER = "Hence the final answer"

f1_sum = 0
em_sum = 0
avg_em = 0
avg_f1 = 0

run_details = {"num_calls": 0}

# Passing the length lets the progress bar show a percentage and an ETA.
pbar = tqdm(enumerate(test_dataset), total=len(test_dataset))

for i, test_example in pbar:
    test_prompt = construct_translate_cot_prompt(test_example, tgt_lang, test=True)

    # A fresh list is built for every example, so appending to it below does
    # not leak assistant turns into the next example.
    prompt = [instruction_prompt] + few_shot_prompts + test_prompt
    label = test_example["answers"]["text"][0]
    preds = []
    final_pred = None
    for _ in range(20):
        try:
            pred = gpt3x_completion(
                prompt,
                model,
                temperature=0,
                evals_per_second=2,
                run_details=run_details,
                max_tokens=max_tokens
            )
        except Exception as err:
            # A failed call scores this example as unanswered. Catching
            # Exception (not a bare except) keeps KeyboardInterrupt able to
            # stop the whole run, and the failure is reported, not hidden.
            tqdm.write(f"Completion failed for example {test_example['id']}: {err!r}")
            break
        prompt += [{"role": "assistant", "content": pred}]
        preds.append(pred)
        if FINAL_ANSWER_MARKER in pred:
            final_pred = pred
            break

    if final_pred is not None:
        # Keep everything after the first ":" that follows the marker, so
        # answers containing ":" themselves are not truncated.
        answer_tail = final_pred.partition(FINAL_ANSWER_MARKER)[2]
        answer_pred = answer_tail.split(":", 1)[-1].strip()
    else:
        answer_pred = ""

    prediction = {"prediction_text": answer_pred, "id": test_example["id"]}
    reference = {}
    reference["answers"] = test_example["answers"]
    reference["id"] = test_example["id"]
    results = squad_metric.compute(
                predictions=[prediction],
                references=[reference])
    f1_sum += results["f1"]
    em_sum += results["exact_match"]

    avg_f1 = f1_sum / (i+1)
    avg_em = em_sum / (i+1)

    # W&B logging is currently disabled:
#     wandb.log({"f1": avg_f1, "em": avg_em}, step = i+1)
#     wandb.log(run_details, step = i+1)
    pbar.set_description(f"em: {avg_em} f1: {avg_f1}")

em: 50.84033613445378 f1: 62.88800698400471: : 238it [2:57:23, 44.72s/it]  


FileNotFoundError: [Errno 2] No such file or directory: '/home/t-kabirahuja/.cache/huggingface/metrics/squad/default'

In [None]:
run_details