In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Move the working directory one level up; presumably so that the relative
# paths and the `mega` package used below resolve — TODO confirm.
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart moves up one more directory each time.
os.chdir("../")

In [16]:
import time
import requests, uuid, json
from typing import List
import spacy
import openai
import numpy as np
import wandb
from datasets import load_dataset
from mega.data.load_datasets import load_xnli_dataset
from mega.data.data_utils import choose_few_shot_examples
from mega.prompting.instructions import INSTRUCTIONS
from mega.prompting.prompting_utils import load_prompt_template
from mega.utils.env_utils import load_env
from mega.models.completion_models import get_model_pred, gpt3x_completion
from mega.prompting.prompting_utils import construct_prompt, construct_qa_prompt
from tqdm import tqdm
from evaluate import load

# ---- Bing translator REST configuration ----
endpoint = "https://api.cognitive.microsofttranslator.com/"
path = "/translate?api-version=3.0"
constructed_url = endpoint + path

# The subscription key is taken from the first line of the local key file.
with open("keys/bing_translate_key.txt") as f:
    subscription_key = f.read().partition("\n")[0]

# Region ("location") of the translator resource; the default is global and a
# value is required when using a Cognitive Services resource.
# NOTE(review): `location` is assigned here but is not referenced by the
# `headers` mapping built below.
location = "centralindia"

# Headers shared by every translation request; the trace id is generated once,
# when this cell runs.
headers = {
    "Ocp-Apim-Subscription-Key": subscription_key,
    "Content-type": "application/json",
    "X-ClientTraceId": str(uuid.uuid4()),
}

In [4]:
# Load the environment named below via load_env.
# Make sure that the {env_name}.env file is present in the envs/ directory.
env_name = "gpt4"
load_env(env_name=env_name)

In [5]:
# Display the API version currently set on the openai module (sanity check).
openai.api_version

'2023-03-15-preview'

In [156]:
# Run configuration: everything a reader might want to tune for one evaluation.
model = "gpt-4-32k"  # model name handed to gpt3x_completion
pivot_lang = "ko"  # language of the training split the few-shot examples come from
tgt_lang = "ko"  # language of the validation split that gets evaluated
prompt_name = "answer_given_context_and_question"  # key into PROMPTS_DICT
few_shot_k = 4  # number of few-shot examples requested from choose_few_shot_examples
dataset = "tydiqa"  # dataset name for load_qa_dataset; also the key into INSTRUCTIONS
short_contexts = False  # when True, training contexts are reduced to the answer sentence
max_tokens = 100  # max_tokens value passed to gpt3x_completion

In [157]:
# Snapshot of the run settings, shaped for the (currently disabled) wandb.init call.
config = dict(
    model=model,
    pivot_lang=pivot_lang,
    tgt_lang=tgt_lang,
    prompt_name=prompt_name,
    few_shot_k=few_shot_k,
    dataset=dataset,
    short_contexts=short_contexts,
    max_tokens=max_tokens,
)

# wandb.init(project="GPT-4-eval", entity="scai-msri", config=config)

In [32]:
class SpacySentenceTokenizer:
    """Callable sentence splitter backed by a spaCy pipeline.

    The constructor loads the 'xx_ent_wiki_sm' pipeline and adds spaCy's
    "sentencizer" component to it; calling the instance returns the text of
    every sentence the pipeline finds.
    """

    def __init__(self):
        pipeline = spacy.load('xx_ent_wiki_sm')
        pipeline.add_pipe("sentencizer")
        self.nlp = pipeline

    def __call__(self, text: str) -> List[str]:
        """Split `text` into sentences.

        Args:
            text (str): Text to segment.

        Returns:
            List[str]: Sentence strings, in document order.
        """
        doc = self.nlp(text)
        return [sentence.text for sentence in doc.sents]


In [33]:
def load_qa_dataset(dataset_name, lang, split, dataset_frac = 1, translate_test = False):
    """Load one split of a QA dataset for `lang` and keep a leading fraction of it.

    Args:
        dataset_name: One of "indicqa", "xquad", "tydiqa" or "mlqa".
        lang: Language code used to pick the dataset configuration (for
            "tydiqa", used to filter examples by their derived "lang" column).
        split: Name of the split to load. For "indicqa" and "xquad" the
            "train" split is read from the "squad" dataset instead; for "mlqa"
            a "train" request is switched to "validation".
        dataset_frac: Fraction of the loaded split to keep, counted from the start.
        translate_test: For "mlqa" only, selects the "mlqa-translate-test.{lang}"
            configuration instead of "mlqa.{lang}.{lang}".

    Returns:
        The first int(len(split) * dataset_frac) examples of the loaded split.

    Raises:
        NotImplementedError: If `dataset_name` is not one of the supported names.
    """
    if dataset_name in ("indicqa", "xquad"):
        if split == "train":
            dataset = load_dataset("squad")[split]
        elif dataset_name == "indicqa":
            dataset = load_dataset("ai4bharat/IndicQA", f"indicqa.{lang}")[split]
        else:
            dataset = load_dataset("xquad", f"xquad.{lang}")[split]
    elif dataset_name == "tydiqa":
        dataset = load_dataset("tydiqa", 'secondary_task')[split]
        # The id starts with the language name; turn it into a code column to filter on.
        dataset = dataset.map(
            lambda example: {"lang": TYDIQA_LANG2CODES[example["id"].split("-")[0]]}
        )
        dataset = dataset.filter(lambda example: example["lang"] == lang)
    elif dataset_name == "mlqa":
        if split == "train":
            print("No Training Data for MLQA, switching to validation!")
            split = "validation"
        config_name = f"mlqa-translate-test.{lang}" if translate_test else f"mlqa.{lang}.{lang}"
        dataset = load_dataset("mlqa", config_name)[split]
    else:
        raise NotImplementedError()

    keep_count = int(len(dataset) * dataset_frac)
    return dataset.select(np.arange(keep_count))

In [34]:
# Maps a TyDiQA language name (the leading part of an example id, as used by
# load_qa_dataset) to its two-letter language code.
TYDIQA_LANG2CODES = dict(
    bengali="bn",
    korean="ko",
    swahili="sw",
    english="en",
    indonesian="id",
    arabic="ar",
    finnish="fi",
    telugu="te",
    russian="ru",
)

In [35]:
# Maps a language code to the English language name inserted into the
# "Hence the final answer in {lang} is: ..." step of the few-shot prompts.
# Includes every code in TYDIQA_LANG2CODES so that building a prompt for any
# TyDiQA language does not fail with a KeyError.
LangCodes2LangNames = {
    "en" : "English",
    "ko" : "Korean",
    "es" : "Spanish",
    "de" : "German",
    "fr" : "French",
    "zh" : "Chinese",
    # Remaining TyDiQA languages
    "bn" : "Bengali",
    "sw" : "Swahili",
    "id" : "Indonesian",
    "ar" : "Arabic",
    "fi" : "Finnish",
    "te" : "Telugu",
    "ru" : "Russian",
}

In [36]:
# Few-shot examples come from the pivot-language training split; evaluation
# runs on the target-language validation split.
train_dataset = load_qa_dataset(dataset, lang=pivot_lang, split="train")
test_dataset = load_qa_dataset(dataset, lang=tgt_lang, split="validation")

Found cached dataset tydiqa (/home/t-kabirahuja/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/t-kabirahuja/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148/cache-e999b14bc35f2874.arrow
Loading cached processed dataset at /home/t-kabirahuja/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148/cache-f1e0547ce5848305.arrow
Found cached dataset tydiqa (/home/t-kabirahuja/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148)


  0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/t-kabirahuja/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148/cache-9154e010a5c45085.arrow
Loading cached processed dataset at /home/t-kabirahuja/.cache/huggingface/datasets/tydiqa/secondary_task/1.0.0/b8a6c4c0db10bf5703d7b36645e5dbae821b8c0e902dac9daeecd459a8337148/cache-b2cf73e837a00e99.arrow


In [37]:
if short_contexts:
    sent_tokenizer = SpacySentenceTokenizer()

    def _answer_sentence_only(example):
        """Reduce an example's context to the first sentence containing its first answer.

        If no single sentence contains the answer text (for instance when the
        sentence splitter cuts through the answer), the full context is kept
        instead of failing with an IndexError.
        """
        answer_text = example["answers"]["text"][0]
        shortened = next(
            (sent for sent in sent_tokenizer(example["context"]) if answer_text in sent),
            example["context"],
        )
        return {"context": shortened}

    train_dataset = train_dataset.map(_answer_sentence_only, num_proc = 8)

In [57]:
# Pick `few_shot_k` demonstrations from the training split using the "random"
# selection criterion.
# NOTE(review): no seed is passed here, so the chosen examples may differ
# between runs — confirm whether choose_few_shot_examples seeds internally.
train_examples = choose_few_shot_examples(
        train_dataset, few_shot_k, selection_criteria="random")

In [58]:
# Prompt templates keyed by prompt name; `prompt_name` selects one of them.
# The continuation lines inside each triple-quoted string are indented in the
# source, so those four leading spaces are part of the prompt text sent to the model.
# NOTE(review): the second template contains a {language} placeholder, which
# fill_template in this notebook does not substitute.
PROMPTS_DICT = {
    "answer_given_context_and_question" : """{context}
    Q: {question}

    Referring to the passage above, the correct answer to the given question is:
    {answer}""",
    
    "lang_instruct_answer_given_context_and_question" : """{context}
    Q: {question}

    Referring to the passage above, the correct answer to the given question is? Please try to answer in {language} and ensure that the answer appears as it is in the passage.
    A: {answer}""",
    
}

In [59]:
# Template chosen by `prompt_name`; read as a global by construct_translate_cot_prompt.
prompt_template = PROMPTS_DICT[prompt_name]

In [60]:
# Look up the task instruction for the selected dataset and print it for inspection.
instruction = INSTRUCTIONS[dataset]
print(instruction)

You are an NLP assistant whose purpose is to solve reading comprehension problems. You will be provided questions on a set of passages and you will need to provide the answer as it appears in the passage. The answer should be in the same language as the question and the passage.


In [80]:
# Wrap the instruction as the system message that opens every chat prompt.
instruction_prompt = {"role": "system", "content": instruction}
instruction_prompt

{'role': 'system',
 'content': 'You are an NLP assistant whose purpose is to solve reading comprehension problems. You will be provided questions on a set of passages and you will need to provide the answer as it appears in the passage. The answer should be in the same language as the question and the passage.'}

In [61]:
def translate_with_bing(text: str, src: str, dest: str) -> str:
    """Uses the bing translator to translate `text` from `src` language to `dest` language

    The request is sent to the module-level `constructed_url` with the
    module-level `headers`. Any failure (network error, non-success HTTP
    status, unparsable or unexpected response body) is reported on stdout and
    turned into the sentinel "<MT Failed>" rather than raised.

    Args:
        text (str): Text to translate
        src (str): Source language to translate from
        dest (str): Language to translate to

    Returns:
        str: Translated text, or "<MT Failed>" if the translation could not be obtained
    """
    params = {"api-version": "3.0", "from": src, "to": [dest]}
    body = [{"text": text}]

    try:
        response = requests.post(
            constructed_url, params=params, headers=headers, json=body, timeout=60
        )
        # Surface HTTP errors explicitly instead of failing later on the payload shape.
        response.raise_for_status()
        return response.json()[0]["translations"][0]["text"]
    except (requests.RequestException, ValueError, LookupError, TypeError) as err:
        # ValueError: body is not JSON; LookupError/TypeError: JSON without the
        # expected [0]["translations"][0]["text"] structure (e.g. an error object).
        print(f"Bing translation ({src} -> {dest}) failed: {err!r}")
        return "<MT Failed>"

In [140]:
def fill_template(template, example, fill_answer=True):
    """Substitute {context}, {question} and {answer} in `template` from `example`.

    All placeholders are replaced in a single pass over the template, so text
    that merely looks like a placeholder inside the context or question (for
    example a literal "{answer}") is left untouched.

    Args:
        template (str): Template containing the placeholders.
        example: Mapping with "context" and "question" and, when `fill_answer`
            is true, "answers" whose "text" list supplies the answer (first entry).
        fill_answer (bool): If true, {answer} becomes the first answer text. If
            false, {answer} is removed, the result is stripped of surrounding
            whitespace, and "answers" is not read.

    Returns:
        str: The filled-in template.
    """
    import re  # local import: `re` is not among this notebook's top-level imports

    values = {
        "{context}": example["context"],
        "{question}": example["question"],
        "{answer}": example["answers"]["text"][0] if fill_answer else "",
    }
    # A callable replacement keeps backslashes in the values literal.
    filled = re.sub(
        r"\{(?:context|question|answer)\}",
        lambda match: values[match.group(0)],
        template,
    )
    return filled if fill_answer else filled.strip()
    
def construct_translate_cot_prompt(example, lang, test=False):
    """Build the chat messages for one example of the translate-then-answer prompt.

    The user message is the module-level `prompt_template` filled with the
    example's context and question (answer left blank). For demonstrations
    (`test=False`) it is followed by five assistant messages: a "step by step"
    opener, the English translations of the passage, the question and the
    answer (each obtained with translate_with_bing), and the final answer in
    the original language.

    Args:
        example: QA example with "context", "question" and "answers".
        lang: Language code of the example; used as the translation source and
            looked up in LangCodes2LangNames for the final step.
        test (bool): If true, return only the user message.

    Returns:
        list: Message dicts with "role" and "content" keys.
    """
    question_turn = {
        "role": "user",
        "content": fill_template(prompt_template, example, fill_answer=False),
    }
    if test:
        return [question_turn]

    def _in_english(text):
        return translate_with_bing(text, src = lang, dest = "en")

    # Steps are built in order so the translations are requested as passage,
    # question, answer.
    reasoning_steps = ["Let's do it step by step."]
    reasoning_steps.append(
        "Translation of the passage in English is: \"{translated_en_passage}\"".replace(
            "{translated_en_passage}", _in_english(example["context"])
        )
    )
    reasoning_steps.append(
        "Translation of the question in English is: \"{translated_en_question}\"".replace(
            "{translated_en_question}", _in_english(example["question"])
        )
    )
    reasoning_steps.append(
        "Then the answer in English will be: {answer_en}".replace(
            "{answer_en}", _in_english(example["answers"]["text"][0])
        )
    )
    closing_step = "Hence the final answer in {lang} is: {answer}".replace(
        "{answer}", example["answers"]["text"][0]
    )
    reasoning_steps.append(closing_step.replace("{lang}", LangCodes2LangNames[lang]))

    messages = [question_turn]
    messages.extend({"role": "assistant", "content": step} for step in reasoning_steps)
    return messages

In [141]:
# Inspect the "answers" field of the first training example.
train_dataset[0]["answers"]

{'text': ['예정'], 'answer_start': [59]}

In [142]:
# Inspect the first training example in full.
train_dataset[0]

{'id': 'korean--5252295675424886259-0',
 'title': '2019년 FIFA U-20 월드컵',
 'context': '2019년 FIFA U-20 월드컵</b>은 2019년 5월 23일부터 6월 15일까지 폴란드에서 개최될 예정인 22번째 FIFA U-20 월드컵이다.',
 'question': '2019년까지 월드컵은 몇개국에서 개최되었는가?',
 'answers': {'text': ['예정'], 'answer_start': [59]},
 'lang': 'ko'}

In [143]:
# Preview the demonstration messages built for one training example.
# The example comes from the training split, which was loaded for `pivot_lang`,
# so that is the source language to translate from.
construct_translate_cot_prompt(
    train_dataset[1],
    lang = pivot_lang
)

[{'role': 'user',
  'content': '연구방식은 크게 두 가지로 나눌 수 있다. 이는 탑다운(Top-down)방식과 바텀업(Bottom-up)방식이다. 탑다운 방식은 자연계에 존재하는 생명체의 유전자를 변형시키는 방식이다. 하나의 세포로 이뤄진 미생물을 예로 들면, 미생물의 유전자 일부를 바꾸는 것이다. 미국 스탠포드대에서 각각의 유전자를 기계의 부품처럼 만들어 다양하게 조합한 후 미생물에 삽입을 시도하고 있는 드루앤디(Drew Endy)의 접근이 한 가지 사례이다. 또 미국 UC버클리의 제이 키슬링(Jay Keasling)이 식물의 유용 유전자를 미생물에 대량으로 삽입하고 있는 것도 이에 해당한다. 이에 비해 바텀업 방식은 화학물질에서 시작해 생명체의 구성요소를 하나하나 만들어 내는 방식이다. 그래서 합성생물학 관련 영문저술에는 종종 ‘무에서 유를 창조한다’는 의미에서 ‘Starting from scratch’라는 즉, ‘처음에서 시작한다’는 뜻의 관용어구가 등장한다. 미생물의 유전자 염기서열을 하나하나를 만든 후 이들을 연결시켜 인공유전체를 합성한 미국의 크레이그 밴터(J.Craig Venter)의 접근이 대표사례이다.\n    Q: 합성생물학을 연구하는 방식은 탑다운 외 다른 방식은 무엇이 있나요?\n\n    Referring to the passage above, the correct answer to the given question is:'},
 {'role': 'assistant', 'content': "Let's do it step by step."},
 {'role': 'assistant',
  'content': 'Translation of the passage in English is: "There are two main types of research methods. This is a top-down and bottom-up method. The top-down method is a method of modifying

In [144]:
# Concatenate the demonstration messages of every few-shot example.
# The examples were drawn from the `pivot_lang` training split, so they are
# translated from `pivot_lang` (this matters whenever pivot_lang != tgt_lang).
few_shot_prompts = []
for example in train_examples:
    few_shot_prompts.extend(construct_translate_cot_prompt(example, lang = pivot_lang))

In [145]:
# Spot-check one of the generated demonstration messages.
print(few_shot_prompts[4])

{'role': 'assistant', 'content': 'Then the answer in English will be: 30 December 1922'}


In [146]:
# Load the "squad" metric via evaluate.load; its compute() results are read
# below through the "exact_match" and "f1" keys.
squad_metric = load("squad")

In [147]:
# Build the full prompt for a single, hand-picked validation example (index 132).
test_example = test_dataset[132]


# test=True returns only the user message (no demonstration answers).
test_prompt = construct_translate_cot_prompt(test_example, tgt_lang, test=True)

# System instruction, then the few-shot demonstrations, then the test question.
prompt = [instruction_prompt] + few_shot_prompts + test_prompt
label = test_example["answers"]["text"][0]


prompt

[{'role': 'system',
  'content': 'You are an NLP assistant whose purpose is to solve reading comprehension problems. You will be provided questions on a set of passages and you will need to provide the answer as it appears in the passage. The answer should be in the same language as the question and the passage.'},
 {'role': 'user',
  'content': '소비에트 사회주의 공화국 연방(, ), 약칭 소비에트 연방() 또는 소련(蘇聯)은 1922년 12월 30일부터 1991년 12월 26일까지 유라시아 북부에 존재하였던 세계 최초의 공산주의 국가였다.\n    Q: 소련의 설립일은 언제인가요?\n\n    Referring to the passage above, the correct answer to the given question is:'},
 {'role': 'assistant', 'content': "Let's do it step by step."},
 {'role': 'assistant',
  'content': 'Translation of the passage in English is: "The Union of Soviet Socialist Republics (, ), abbreviated Soviet Union (), or Soviet Union (蘇聯) was the world\'s first communist state that existed in northern Eurasia from December 30, 1922, to December 26, 1991."'},
 {'role': 'assistant',
  'content': 'Translation of the question in

In [148]:
# Confirm that the prompt is a list of messages.
type(prompt)

list

In [151]:
# Ask the model for up to 20 assistant turns on the single test example,
# stopping once a turn contains the final-answer marker.
# The conversation grows in a copy, so `prompt` stays as built above and
# re-running this cell starts from the same messages every time.
messages = list(prompt)
preds = []
for _ in range(20):

    pred = gpt3x_completion(
        messages,
        model,
        temperature=0,
        max_tokens=max_tokens,  # use the configured limit instead of a hard-coded 100
    )
    # Feed each assistant turn back so the next call continues the chain.
    messages.append({"role": "assistant", "content": pred})
    preds.append(pred)
    if "Hence the final answer" in pred:
        break

In [152]:
# Show every assistant turn produced for the single test example.
preds

['Hence the final answer in Korean is: 오타와']

In [153]:
# Number of whitespace-separated tokens in the last completion.
len(pred.split())

8

In [154]:
# Extract the answer from the closing turn: keep everything after the first
# colon that follows the final-answer marker, so answers that themselves
# contain ":" (times, ratios, ...) are not truncated. If the marker is
# missing, the result is an empty string.
pred = preds[-1].partition("Hence the final answer")[2].partition(":")[2].strip()
pred

'오타와'

In [155]:
# Score the single prediction against its reference with the squad metric.
print(f"Prediction: {pred}")
print(f"Label: {label}")

prediction = {"prediction_text": pred, "id": test_example["id"]}
reference = {"answers": test_example["answers"], "id": test_example["id"]}

results = squad_metric.compute(predictions=[prediction], references=[reference])
print(results)

Prediction: 오타와
Label: 오타와
{'exact_match': 100.0, 'f1': 100.0}


In [165]:
# Evaluate every validation example with the translate-chain-of-thought prompt
# and keep running averages of exact match and F1.
f1_sum = 0
em_sum = 0
avg_em = 0
avg_f1 = 0

run_details = {"num_calls": 0}

final_answer_marker = "Hence the final answer"
max_cot_turns = 20  # upper bound on assistant turns requested per test example

# Passing `total` lets tqdm show progress and an ETA instead of a bare counter.
pbar = tqdm(enumerate(test_dataset), total=len(test_dataset))

for i, test_example in pbar:
    test_prompt = construct_translate_cot_prompt(test_example, tgt_lang, test=True)

    prompt = [instruction_prompt] + few_shot_prompts + test_prompt
    label = test_example["answers"]["text"][0]
    preds = []
    # Stays empty unless a turn containing the final-answer marker is produced.
    answer_pred = ""
    for _ in range(max_cot_turns):
        try:
            pred = gpt3x_completion(
                prompt,
                model,
                temperature=0,
                evals_per_second=2,
                run_details=run_details,
                max_tokens=max_tokens
            )
            # Feed each assistant turn back so the next call continues the chain.
            prompt.append({"role": "assistant", "content": pred})
            preds.append(pred)
            if final_answer_marker in pred:
                # Keep everything after the first colon following the marker, so
                # answers that contain ":" themselves are not truncated.
                answer_pred = pred.partition(final_answer_marker)[2].partition(":")[2].strip()
                break
        except Exception as err:
            # Best effort: a failed completion scores as an empty answer, but the
            # error is reported and KeyboardInterrupt is no longer swallowed.
            pbar.write(f"Example {i} ({test_example['id']}): completion failed: {err!r}")
            break

    prediction = {"prediction_text": answer_pred, "id": test_example["id"]}
    reference = {}
    reference["answers"] = test_example["answers"]
    reference["id"] = test_example["id"]
    results = squad_metric.compute(
                predictions=[prediction],
                references=[reference])
    f1_sum += results["f1"]
    em_sum += results["exact_match"]

    avg_f1 = f1_sum / (i+1)
    avg_em = em_sum / (i+1)

#     wandb.log({"f1": avg_f1, "em": avg_em}, step = i+1)
#     wandb.log(run_details, step = i+1)
    pbar.set_description(f"em: {avg_em} f1: {avg_f1}")

em: 50.84033613445378 f1: 62.88800698400471: : 238it [2:57:23, 44.72s/it]  


FileNotFoundError: [Errno 2] No such file or directory: '/home/t-kabirahuja/.cache/huggingface/metrics/squad/default'

In [None]:
# Show the run_details dict that was passed to every gpt3x_completion call in
# the evaluation loop (initialised there with a "num_calls" counter).
run_details