In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.chdir("../")

In [4]:
import time
import random
from typing import List
import spacy
import openai
import numpy as np
import wandb
from datasets import load_dataset
from mega.data.load_datasets import load_xnli_dataset
from mega.data.data_utils import choose_few_shot_examples
from mega.prompting.instructions import INSTRUCTIONS
from mega.prompting.prompting_utils import load_prompt_template
from mega.utils.env_utils import load_env
# from mega.models.completion_models import get_model_pred, gpt3x_completion
from mega.models.tag_models import get_model_pred, gpt3x_tagger
from mega.prompting.prompting_utils import construct_prompt, construct_qa_prompt, construct_tagging_prompt
from mega.data.load_datasets import load_tagging_dataset
from seqeval.metrics import f1_score
from tqdm.notebook import tqdm
from evaluate import load

# Set seed
random.seed(42)
np.random.seed(42)

In [5]:
# Make sure that {env_name}.env file is present in the envs/ directory
env_name = "melange"
load_env(env_name=env_name)

In [6]:
openai.api_version = "2023-03-15-preview"
openai.api_version

'2023-03-15-preview'

In [7]:
openai.api_base

'https://gpttesting1.openai.azure.com/'

In [8]:
model = "gpt-35-turbo-deployment"
pivot_lang = "en"
tgt_lang = "en"
prompt_name = "structure_prompting_chat"
few_shot_k = 8
dataset = "panx"
# short_contexts = False
max_tokens = 100

In [11]:
config = {
    "model" : model,
    "pivot_lang": pivot_lang,
    "tgt_lang": tgt_lang,
    "prompt_name": prompt_name,
    "few_shot_k": few_shot_k,
    "dataset": dataset,
    "max_tokens": max_tokens
}

# wandb.init(project="GPT-4-eval", entity="scai-msri", config=config)

In [12]:
class SpacySentenceTokenizer:
    
    def __init__(self):
        self.nlp = spacy.load('xx_ent_wiki_sm')
        self.nlp.add_pipe("sentencizer")
        
    def __call__(self, text: str) -> List[str]:
        return list(map(lambda span: span.text, self.nlp(text).sents))


In [13]:
# def load_tagging_dataset(
#     dataset: str,
#     lang: str,
#     split: str,
#     dataset_frac: float = 1.0,
#     xtreme_dir: str = "xtreme/download",
#     delimiter: str = "_",
# ):

#     split = "dev" if split == "validation" else split

#     filename = f"{xtreme_dir}/{dataset}/{split}-{lang}.tsv"
#     inputs, labels = read_conll_data(filename)

#     dataset = Dataset.from_dict({"tokens": inputs, "tags": labels})
#     dataset = dataset.map(
#         lambda example: {
#             "tagged_tokens": [f"{token}{delimiter}{tag}"
#             for token, tag in zip(example["tokens"], example["tags"])]
#         }
#     )

#     N = len(dataset)
#     selector = np.arange(int(N * dataset_frac))
#     return dataset.select(selector)


In [14]:
train_dataset = load_tagging_dataset(dataset,
                                lang = pivot_lang,
                                split="dev")
test_dataset = load_tagging_dataset(dataset,
                                lang = tgt_lang,
                                split="test")

Map:   0%|          | 0/9999 [00:00<?, ? examples/s]

Map:   0%|          | 0/9999 [00:00<?, ? examples/s]

In [15]:
train_dataset[0]

{'tokens': ['Sioux',
  'Falls',
  'Arena',
  '(',
  'Sioux',
  'Falls',
  ',',
  'South',
  'Dakota',
  ')'],
 'tags': ['B-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'B-LOC',
  'I-LOC',
  'I-LOC',
  'I-LOC',
  'I-LOC',
  'O'],
 'tagged_tokens': ['Sioux_B-ORG',
  'Falls_I-ORG',
  'Arena_I-ORG',
  '(_O',
  'Sioux_B-LOC',
  'Falls_I-LOC',
  ',_I-LOC',
  'South_I-LOC',
  'Dakota_I-LOC',
  ')_O']}

In [16]:
train_examples = choose_few_shot_examples(
        train_dataset, few_shot_k, selection_criteria="random")

In [17]:
PROMPTS_DICT = {
    "structure_prompting": """C: {context}\nT: {tagged}""",
    "structure_prompting_chat": """{context}\n{tagged}"""
}

In [18]:
prompt_template = PROMPTS_DICT[prompt_name]

In [19]:
# Loading instruction for the task
instruction = INSTRUCTIONS[dataset]
print(instruction)

You are an NLP assistant whose purpose is to perform Named Entity Recognition (NER). NER involves identifying and classifying named entities in a text into predefined categories such as person names, organizations, locations, and others. You will need to use the tags defined below:
O means the word doesn’t correspond to any entity.
B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity.
B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.
B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.


In [20]:
train_examples

[{'tokens': ["'", "''", 'With', 'James', 'Moody', "''", "'"],
  'tags': ['O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O'],
  'tagged_tokens': ["'_O",
   "''_O",
   'With_O',
   'James_B-PER',
   'Moody_I-PER',
   "''_O",
   "'_O"]},
 {'tokens': ['Derrick', 'Bell', '(', '1930–2011', ')'],
  'tags': ['B-PER', 'I-PER', 'O', 'O', 'O'],
  'tagged_tokens': ['Derrick_B-PER',
   'Bell_I-PER',
   '(_O',
   '1930–2011_O',
   ')_O']},
 {'tokens': ["'", "''", 'Men', "'s", 'Shot', 'Put', "''", "'"],
  'tags': ['O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O'],
  'tagged_tokens': ["'_O",
   "''_O",
   'Men_B-ORG',
   "'s_I-ORG",
   'Shot_I-ORG',
   'Put_I-ORG',
   "''_O",
   "'_O"]},
 {'tokens': ["'", "''", 'Kim', 'Lamarre', "''", "'"],
  'tags': ['O', 'O', 'B-PER', 'I-PER', 'O', 'O'],
  'tagged_tokens': ["'_O",
   "''_O",
   'Kim_B-PER',
   'Lamarre_I-PER',
   "''_O",
   "'_O"]},
 {'tokens': ["'",
   "''",
   'Houston',
   ':',
   'The',
   'Legend',
   'of',
   'Texas',
   "''",
   "'",
   "''"],


In [21]:
valid_labels = set()
for example in train_examples:
    valid_labels.update(example["tags"])
valid_labels = list(valid_labels)
valid_labels

['B-ORG', 'I-ORG', 'B-LOC', 'B-PER', 'O', 'I-PER']

In [22]:
test_example = test_dataset[0]

prompt, label = construct_tagging_prompt(
    train_examples,
    test_example,
    prompt_template=prompt_template,
    chat_prompt=True,
    instruction=instruction
)
prompt

[{'role': 'system',
  'content': 'You are an NLP assistant whose purpose is to perform Named Entity Recognition (NER). NER involves identifying and classifying named entities in a text into predefined categories such as person names, organizations, locations, and others. You will need to use the tags defined below:\nO means the word doesn’t correspond to any entity.\nB-PER/I-PER means the word corresponds to the beginning of/is inside a person entity.\nB-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.\nB-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.'},
 {'role': 'user', 'content': "' '' With James Moody '' '"},
 {'role': 'assistant',
  'content': "'_O ''_O With_O James_B-PER Moody_I-PER ''_O '_O"},
 {'role': 'user', 'content': 'Derrick Bell ( 1930–2011 )'},
 {'role': 'assistant',
  'content': 'Derrick_B-PER Bell_I-PER (_O 1930–2011_O )_O'},
 {'role': 'user', 'content': "' '' Men 's Shot Put '' '"},
 {'ro

In [23]:
prompt[-2]["content"].strip()

"'_O ''_O 03_O :_O '_O ''_O Antwerp_B-LOC (_O ''Antwerpen_O ''_O /_O ''Anvers_O ''_O )_O ,_O Sint-Niklaas_B-LOC"

In [24]:
test_example["tokens"]

['Shortly',
 'afterward',
 ',',
 'an',
 'encouraging',
 'response',
 'influenced',
 'him',
 'to',
 'go',
 'to',
 'India',
 ';',
 'he',
 'arrived',
 'at',
 'Adyar',
 'in',
 '1884',
 '.']

In [25]:
preds = gpt3x_tagger(
    prompt,
    model,
    test_example["tokens"],
    one_shot_tag=True,
    temperature=0,
    max_tokens=100
)

In [26]:
get_model_pred(
    train_examples,
    test_example,
    prompt_template,
    verbalizer={},
    model=model,
    chat_prompt=True,
    instruction=instruction,
    one_shot_tag=True,
    max_tokens=max_tokens
    
)

{'prediction': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  ''],
 'ground_truth': ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O']}

In [27]:
preds = [pred if pred != "" else np.random.choice(valid_labels) for pred in preds]

In [28]:
print(preds)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'I-PER']


In [29]:
print(f"Prediction: {preds}")
print(f"Label: {label}")

f1_score([preds], [label])

# prediction = {"prediction_text": pred, "id": test_example["id"]}
# reference = {}
# reference["answers"] = test_example["answers"]
# reference["id"] = test_example["id"]
# results = squad_metric.compute(
#             predictions=[prediction],
#             references=[reference]
#         )

Prediction: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'I-PER']
Label: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O']


0.8

In [31]:
f1_sum = 0
em_sum = 0
avg_em = 0
avg_f1 = 0

run_details = {"num_calls": 0}

pbar = tqdm(enumerate(test_dataset.select(range(1000))))

for i, test_example in pbar:    
    prompt, label = construct_tagging_prompt(
        train_examples,
        test_example,
        prompt_template=prompt_template,
        chat_prompt=True,
        instruction=instruction
    )
    preds = gpt3x_tagger(
        prompt,
        model,
        test_example["tokens"],
        one_shot_tag=True,
        temperature=0,
        max_tokens=100
    )
    preds = [pred if pred != "" else np.random.choice(valid_labels) for pred in preds]
    f1_sum += f1_score([preds], [label])
        
    avg_f1 = f1_sum / (i+1)
    
#     wandb.log({"f1": avg_f1})
#     wandb.log(run_details)
    pbar.set_description(f"f1: {avg_f1}")
#     time.sleep(1/2)

0it [00:00, ?it/s]



KeyboardInterrupt: 