In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `mega` package) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Later cells expect to run from the notebook's parent directory (e.g. the
# envs/ directory used when loading the environment file).
# Resolve that directory only once per kernel and always chdir to the stored
# absolute path: a bare `os.chdir("../")` would climb one more level every
# time this cell is re-run.
if "NOTEBOOK_ROOT" not in globals():
    NOTEBOOK_ROOT = os.path.abspath(os.pardir)
os.chdir(NOTEBOOK_ROOT)

In [5]:
import time
from typing import List
import spacy
import openai
import numpy as np
import wandb
from datasets import load_dataset
from mega.data.load_datasets import load_xnli_dataset
from mega.data.data_utils import choose_few_shot_examples
from mega.prompting.instructions import INSTRUCTIONS
from mega.prompting.prompting_utils import load_prompt_template
from mega.utils.env_utils import load_env
from mega.models.completion_models import get_model_pred, gpt3x_completion
from mega.prompting.prompting_utils import construct_prompt, construct_qa_prompt
from tqdm import tqdm
from evaluate import load

In [6]:
# Name of the environment file to load: `envs/{env_name}.env` must exist
# relative to the current working directory.
# NOTE(review): load_env presumably populates the OpenAI settings used by the
# completion calls below — confirm in mega.utils.env_utils.
env_name = "gpt4"
load_env(env_name=env_name)

In [7]:
# Sanity check: display the API version the `openai` module is configured with.
openai.api_version

'2023-03-15-preview'

In [8]:
# Run parameters. All of them are recorded in the W&B run config below.
model = "gpt-4-32k"  # model name passed to gpt3x_completion
pivot_lang = "en"  # language of the train split used for few-shot examples
tgt_lang = "en"  # language of the validation split that is evaluated
prompt_name = "answer_given_context_and_question"  # key into PROMPTS_DICT
few_shot_k = 4  # number of few-shot examples requested from the train split
dataset = "xquad"  # dataset name for load_qa_dataset and the INSTRUCTIONS lookup
short_contexts = False  # if True, trim each train context to one answer-bearing sentence
max_tokens = 20  # max_tokens value passed to gpt3x_completion

In [9]:
# Snapshot of the run parameters, attached to the W&B run started below.
config = dict(
    model=model,
    pivot_lang=pivot_lang,
    tgt_lang=tgt_lang,
    prompt_name=prompt_name,
    few_shot_k=few_shot_k,
    dataset=dataset,
    short_contexts=short_contexts,
    max_tokens=max_tokens,
)

wandb.init(
    project="GPT-4-eval",
    entity="scai-msri",
    config=config,
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkabirahuja2431[0m ([33mscai-msri[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
class SpacySentenceTokenizer:
    """Callable that splits a text into sentence strings using spaCy.

    Loads the ``xx_ent_wiki_sm`` pipeline and adds a ``sentencizer``
    component to it.
    """

    def __init__(self):
        pipeline = spacy.load('xx_ent_wiki_sm')
        pipeline.add_pipe("sentencizer")
        self.nlp = pipeline

    def __call__(self, text: str) -> List[str]:
        """Return the text of every sentence span found in ``text``, in order."""
        doc = self.nlp(text)
        return [sentence.text for sentence in doc.sents]


In [11]:
# TyDiQA example ids start with the full language name ("<language>-...");
# this maps that prefix to the language code callers pass as `lang`.
_TYDIQA_LANG2CODES = {
    "arabic": "ar",
    "bengali": "bn",
    "english": "en",
    "finnish": "fi",
    "indonesian": "id",
    "korean": "ko",
    "russian": "ru",
    "swahili": "sw",
    "telugu": "te",
}


def load_qa_dataset(dataset_name, lang, split, dataset_frac = 1, translate_test = False):
    """Load one split of a question-answering dataset, optionally truncated.

    Args:
        dataset_name: One of "indicqa", "xquad", "tydiqa" or "mlqa".
        lang: Language code selecting the dataset configuration (or, for
            "tydiqa", the examples to keep).
        split: Split to return. For "indicqa" and "xquad" the "train" split
            is taken from the `squad` dataset regardless of `lang`; for
            "mlqa" a request for "train" is served from "validation".
        dataset_frac: Fraction of the split to keep; the first
            ``int(len(split) * dataset_frac)`` examples are returned.
        translate_test: Only used for "mlqa": load the
            ``mlqa-translate-test.{lang}`` configuration instead of
            ``mlqa.{lang}.{lang}``.

    Returns:
        The selected (and possibly truncated) dataset split.

    Raises:
        NotImplementedError: If `dataset_name` is not one of the supported names.
    """
    if dataset_name == "indicqa":
        if split != "train":
            dataset = load_dataset("ai4bharat/IndicQA", f"indicqa.{lang}")[split]
        else:
            dataset = load_dataset("squad")[split]
    elif dataset_name == "xquad":
        if split != "train":
            dataset = load_dataset("xquad", f"xquad.{lang}")[split]
        else:
            dataset = load_dataset("squad")[split]
    elif dataset_name == "tydiqa":
        dataset = load_dataset("tydiqa", 'secondary_task')[split]
        # Tag every example with its language code, then keep only `lang`.
        dataset = dataset.map(
            lambda example: {"lang": _TYDIQA_LANG2CODES[example["id"].split("-")[0]]}
        )
        dataset = dataset.filter(lambda example: example["lang"] == lang)
    elif dataset_name == "mlqa":
        if split == "train":
            print("No Training Data for MLQA, switching to validation!")
            split = "validation"
        # Keep the configuration name in its own local instead of overwriting
        # the `dataset_name` argument.
        if translate_test:
            config_name = f"mlqa-translate-test.{lang}"
        else:
            config_name = f"mlqa.{lang}.{lang}"

        dataset = load_dataset("mlqa", config_name)[split]
    else:
        raise NotImplementedError(f"Unsupported QA dataset: {dataset_name!r}")

    # Keep only the leading fraction of the split.
    num_examples = len(dataset)
    selector = np.arange(int(num_examples * dataset_frac))
    return dataset.select(selector)

In [12]:
# Few-shot examples come from the pivot language's train split; evaluation
# runs on the target language's validation split.
train_dataset = load_qa_dataset(dataset, lang=pivot_lang, split="train")
test_dataset = load_qa_dataset(dataset, lang=tgt_lang, split="validation")

Found cached dataset squad (/home/t-kabirahuja/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

Found cached dataset xquad (/home/t-kabirahuja/.cache/huggingface/datasets/xquad/xquad.en/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)


  0%|          | 0/1 [00:00<?, ?it/s]

In [13]:
if short_contexts:
    sent_tokenizer = SpacySentenceTokenizer()

    def _shorten_context(example):
        """Replace the context with the first sentence containing the first answer.

        If no single sentence contains the answer text (e.g. the tokenizer
        split the answer across two sentences), the full context is kept
        instead of raising IndexError.
        """
        answer = example["answers"]["text"][0]
        matching_sentences = (
            sent for sent in sent_tokenizer(example["context"]) if answer in sent
        )
        return {"context": next(matching_sentences, example["context"])}

    train_dataset = train_dataset.map(_shorten_context, num_proc = 8)

In [14]:
# Pick the few-shot examples that are passed to every prompt built below.
# NOTE(review): selection is "random" and no seed is set in this notebook, so
# the chosen examples (and the resulting scores) may differ between runs —
# confirm whether choose_few_shot_examples seeds internally.
train_examples = choose_few_shot_examples(
        train_dataset, few_shot_k, selection_criteria="random")

Exception in thread SystemMonitor:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/mega2env/lib/python3.8/site-packages/wandb/sdk/internal/system/system_monitor.py", line 118, in _start
    asset.start()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/mega2env/lib/python3.8/site-packages/wandb/sdk/internal/system/assets/cpu.py", line 166, in start
    self.metrics_monitor.start()
  File "/home/t-kabirahuja/work/repos/MultilingualBlanketEval/envs/mega2env/lib/python3.8/site-packages/wandb/sdk/internal/system/assets/interfaces.py", line 168, in start
    logger.info(f"Started {self._process.name}")
AttributeError: 'NoneType' object has no attribute 'name'


In [16]:
# Prompt templates keyed by prompt name; `prompt_name` selects one of them.
# Placeholders in braces ({context}, {question}, {answer}, {language}) are
# left in the templates for the prompt-construction helper to fill.
# NOTE(review): the continuation lines inside each triple-quoted string are
# indented, so every line after the first carries four leading spaces into
# the prompt text — confirm this is intended before changing it, since it
# affects what is sent to the model.
PROMPTS_DICT = {
    # Plain template: passage, question, then the answer.
    "answer_given_context_and_question" : """{context}
    Q: {question}

    Referring to the passage above, the correct answer to the given question is:
    {answer}""",
    
    # Variant that additionally asks for the answer in {language}, verbatim
    # from the passage.
    "lang_instruct_answer_given_context_and_question" : """{context}
    Q: {question}

    Referring to the passage above, the correct answer to the given question is? Please try to answer in {language} and ensure that the answer appears as it is in the passage.
    A: {answer}""",
    
}

In [17]:
# Template used for both the few-shot examples and the test example.
prompt_template = PROMPTS_DICT[prompt_name]

In [18]:
# Look up the task instruction for the selected dataset; it is passed to
# construct_qa_prompt as `instruction` below. Printed for inspection.
instruction = INSTRUCTIONS[dataset]
print(instruction)

You are an NLP assistant whose purpose is to solve reading comprehension problems. You will be provided questions on a set of passages and you will need to provide the answer as it appears in the passage. The answer should be in the same language as the question and the passage.


In [19]:
# Load the "squad" metric; its compute() result is read below through the
# "exact_match" and "f1" keys.
squad_metric = load("squad")

In [20]:
# Smoke test: build the chat prompt for one fixed test example (index 132)
# and display it before running the full evaluation.
test_example = test_dataset[132]

# The same template is used for the few-shot examples and the test example.
prompt, label = construct_qa_prompt(
    train_examples,
    test_example,
    train_prompt_template=prompt_template,
    test_prompt_template=prompt_template,
    chat_prompt=True,
    instruction=instruction
)
prompt

[{'role': 'system',
  'content': 'You are an NLP assistant whose purpose is to solve reading comprehension problems. You will be provided questions on a set of passages and you will need to provide the answer as it appears in the passage. The answer should be in the same language as the question and the passage.'},
 {'role': 'user',
  'content': "Under contract from the U.S. Military, Matrox produced a combination computer/LaserDisc player for instructional purposes. The computer was a 286, the LaserDisc player only capable of reading the analog audio tracks. Together they weighed 43 lb (20 kg) and sturdy handles were provided in case two people were required to lift the unit. The computer controlled the player via a 25-pin serial port at the back of the player and a ribbon cable connected to a proprietary port on the motherboard. Many of these were sold as surplus by the military during the 1990s, often without the controller software. Nevertheless, it is possible to control the unit 

In [21]:
# Query the model for the single smoke-test prompt.
# Use the `max_tokens` run parameter rather than a hard-coded 20, so this
# check stays consistent with the value logged to W&B and used by the full
# evaluation loop.
pred = gpt3x_completion(
    prompt,
    model,
    temperature=0,
    max_tokens=max_tokens
)

In [22]:
# Show the model output next to the gold label, then score this one example
# with the SQuAD metric.
print(f"Prediction: {pred}")
print(f"Label: {label}")
prediction = {"prediction_text": pred, "id": test_example["id"]}
reference = {
    "answers": test_example["answers"],
    "id": test_example["id"],
}
results = squad_metric.compute(predictions=[prediction], references=[reference])
print(results)

Prediction: A Machine to End War
Label: "A Machine to End War"
{'exact_match': 100.0, 'f1': 100.0}


In [None]:
# Running sums and running averages of the per-example SQuAD scores.
f1_sum = 0
em_sum = 0
avg_em = 0
avg_f1 = 0

# Dict handed to gpt3x_completion on every call and logged with the metrics.
run_details = {"num_calls": 0}

# Pass `total` explicitly: tqdm cannot infer a length from enumerate(), so
# without it the bar shows neither progress nor an ETA.
pbar = tqdm(enumerate(test_dataset), total=len(test_dataset))

for i, test_example in pbar:
    # Build the few-shot chat prompt for this test example.
    prompt, label = construct_qa_prompt(
        train_examples,
        test_example,
        train_prompt_template=prompt_template,
        test_prompt_template=prompt_template,
        chat_prompt=True,
        instruction=instruction
    )
    pred = gpt3x_completion(
        prompt,
        model,
        temperature=0,
        run_details=run_details,
        max_tokens=max_tokens
    )

    # Score this single prediction against its gold answers.
    prediction = {"prediction_text": pred, "id": test_example["id"]}
    reference = {
        "answers": test_example["answers"],
        "id": test_example["id"],
    }
    results = squad_metric.compute(
        predictions=[prediction],
        references=[reference],
    )
    f1_sum += results["f1"]
    em_sum += results["exact_match"]

    # Averages over the examples processed so far.
    avg_f1 = f1_sum / (i+1)
    avg_em = em_sum / (i+1)

    # Log metrics and call statistics in a single call so they land on the
    # same W&B step; two separate wandb.log calls advance the step twice per
    # example. The metric keys are listed last so they always win.
    wandb.log({**run_details, "f1": avg_f1, "em": avg_em})
    pbar.set_description(f"em: {avg_em} f1: {avg_f1}")
    # Short pause between requests (presumably to stay under the API rate
    # limit — confirm).
    time.sleep(1/2)

em: 80.0 f1: 91.11111111111109: : 30it [01:36,  2.87s/it]             