In [2]:
import json
from tqdm import tqdm
import os
from datasets import load_dataset
import re
import spacy
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
from collections import Counter

# Natural Questions (NQ)

In [3]:
# Load the flan-UL2 tokenizer; below it is only used to count tokens when
# filtering out instances that are too long.
# NOTE(review): model_max_length=4000 presumably just raises the tokenizer's length cap — confirm.
tokenizer_UL2 = AutoTokenizer.from_pretrained("google/flan-ul2", model_max_length=4000)

In [27]:
NQ_data_dir = r"../data/NQ/raw/v1.0-simplified_nq-dev-all.jsonl" #r"../data/NQ/raw/full_train/v1.0/train/nq-train-01.jsonl" #r"data/NQ/raw/v1.0-simplified_nq-dev-all.jsonl" # replace in the end
# Parse the JSON-Lines file one line at a time: iterating the file handle avoids
# holding a second full copy of the raw text in memory (as readlines() would),
# the explicit encoding avoids platform-dependent decoding, and blank lines are skipped.
with open(NQ_data_dir, 'r', encoding='utf-8') as f1:
    NQ_data = [json.loads(line) for line in f1 if line.strip()]



# NQ_dev_data_dir=r"data/NQ/raw/v1.0-simplified_nq-dev-small.jsonl" #r"data/NQ/raw/v1.0-simplified_nq-dev-all.jsonl" # replace in the end
# with open(NQ_dev_data_dir, 'r') as f1:
#     NQ_dev_data = [json.loads(line) for line in f1.readlines()]

KeyboardInterrupt: 

In [5]:
# Number of NQ instances parsed from the raw file.
len(NQ_data)

6150

In [13]:
# Switch between building the adversarial (unanswerable) split and the
# control-group (answerable) split; the output file name follows the choice.
is_adversarial = True
if is_adversarial:
    outdir_prefix = "adversarial"
else:
    outdir_prefix = "control_group"
outdir = os.path.join(r"../data/NQ", outdir_prefix + "_NQ.jsonl") #os.path.join(r"data/NQ", f"{outdir_prefix}_NQ.jsonl")

In [None]:
# NQ_data_small = NQ_data[:10]


In [None]:
# # Check if has unanswerable questions
# sum([sum([annotation['long_answer']['start_byte'] >= 0 and not (bool(annotation['short_answers']) or annotation['yes_no_answer'] != 'NONE') for annotation in instance['annotations']]) >= 1 for instance in NQ_data_small])

In [None]:
# small_NQ_data_dir = r"data/NQ/raw/v1.0-simplified_nq-dev-small.jsonl"
# with open(small_NQ_data_dir, 'w') as f1:
#     for instance in NQ_data_small:
#         f1.write(json.dumps(instance))
#         f1.write("\n")

In [21]:
def get_sentence_embedding(sentence, model, tokenizer):
    """Embed `sentence` by mean-pooling the model's last hidden state.

    The text is tokenized (truncated to 512 tokens), moved to the model's
    device, run through `model` without gradient tracking, and the token
    vectors are averaged over the sequence dimension.

    Returns the pooled vector as a NumPy array on the CPU.
    """
    encoded = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(model.device)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().cpu().numpy()

In [17]:
# Load the Sentence-BERT model and tokenizer
model_name = "sentence-transformers/paraphrase-distilroberta-base-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Prefer the first GPU, but fall back to the CPU so this cell does not crash
# on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(model_name).to(device)

In [24]:
# Build the NQ split selected by `is_adversarial`:
#   * adversarial   -> pair each question with the paragraph that is most similar to it
#                      but was NOT annotated as a long answer (answer is "").
#   * control group -> pair each question with an annotated paragraph that has a
#                      short / yes-no answer.
full_outputs = []

for instance in tqdm(NQ_data, total=len(NQ_data)):
    document_tokens = instance['document_tokens']
    long_answer_candidates = instance['long_answer_candidates']

    # taken from https://github.com/google-research-datasets/natural-questions/blob/master/nq_browser.py lines 106-114 and the https://arxiv.org/abs/2211.05655 paper
    instances_with_long_answers = [annotation for annotation in instance['annotations'] if annotation['long_answer']['start_byte'] >= 0]
    instances_with_short_answers = [annotation for annotation in instances_with_long_answers if bool(annotation['short_answers']) or annotation['yes_no_answer'].upper() != 'NONE']

    # if is_adversarial and (len(instances_with_long_answers) == 0 or len(instances_with_short_answers)>=1):
    #     continue
    if not is_adversarial and len(instances_with_short_answers)==0:
        continue

    if is_adversarial:
        # filter only long answer candidates that start with <P> (a.k.a paragraphs)
        filtered_candidates_indices = [cand_ind for cand_ind, candidate in enumerate(long_answer_candidates) if document_tokens[int(candidate["start_token"])]["token"]=="<P>"]

        # filter out candidates that were annotated as long answers by at least one of the annotators
        annotated_long_answers_indices = {annotation["long_answer"]["candidate_index"] for annotation in instances_with_long_answers}
        filtered_candidates_indices = [cand_ind for cand_ind in filtered_candidates_indices if cand_ind not in annotated_long_answers_indices]

        if not filtered_candidates_indices: # no paragraphs that weren't annotated as long answers
            continue

        # get candidates
        filtered_candidates = [long_answer_candidates[cand_ind] for cand_ind in filtered_candidates_indices]

        # get string paragraphs
        filtered_candidates_tokens = [[tkn["token"] for tkn in document_tokens[int(elem["start_token"]):int(elem["end_token"])]] for elem in filtered_candidates]
        filtered_paragraphs = [" ".join(candidate_tokens[1:]) for candidate_tokens in filtered_candidates_tokens] # [1:] to remove the <P> token

        # Get embeddings for the question and paragraphs
        question_embedding = get_sentence_embedding(instance['question_text'], model, tokenizer)
        filtered_paragraphs_embedding = [get_sentence_embedding(paragraph, model, tokenizer) for paragraph in filtered_paragraphs]

        # find similarities between each paragraph and the question
        similarities = [1 - cosine(question_embedding, p_embedding) for p_embedding in filtered_paragraphs_embedding]

        # choose the paragraph that is the closest to the question in the embedding space
        closest_paragraph_ind = np.argmax(similarities)
        curr_candidate = filtered_candidates[closest_paragraph_ind]
        correct_answer = ""
        context_start_tkn = curr_candidate['start_token']
        context_end_tkn = curr_candidate['end_token']
        curr_annotation_id = None
        curr_long_answer_candidate_index = filtered_candidates_indices[closest_paragraph_ind]
    else:
        # filter only annotations that start with <P> (a.k.a paragraphs)
        filtered_annotations_indices = [ann_ind for ann_ind, annotation in enumerate(instances_with_short_answers) if document_tokens[int(annotation["long_answer"]["start_token"])]["token"]=="<P>"]

        if not filtered_annotations_indices: # none of the annotations with short answers was a paragraph
            continue

        # Drop annotations whose paragraph contains a table. The annotation index and
        # its candidate index are kept together as a pair so that they stay aligned
        # after filtering (looking the candidate up by position in a separately
        # filtered list could select the wrong annotation).
        paragraph_annotations = []
        for ann_ind in filtered_annotations_indices:
            cand_ind = instances_with_short_answers[ann_ind]["long_answer"]["candidate_index"]
            candidate = long_answer_candidates[cand_ind]
            candidate_text = " ".join(tkn["token"] for tkn in document_tokens[int(candidate["start_token"]):int(candidate["end_token"])])
            if "<table>" not in candidate_text.lower():
                paragraph_annotations.append((ann_ind, cand_ind))

        if not paragraph_annotations: # every remaining paragraph contained a table
            continue

        # pick the paragraph chosen by most annotators, and the first annotation pointing to it
        long_answers_candidate_indices_cnt = Counter(cand_ind for _, cand_ind in paragraph_annotations)
        best_long_answers_candidate_index, _ = long_answers_candidate_indices_cnt.most_common(1)[0]
        annotation_ind = next(ann_ind for ann_ind, cand_ind in paragraph_annotations if cand_ind == best_long_answers_candidate_index)
        curr_annotation = instances_with_short_answers[annotation_ind]

        if curr_annotation["yes_no_answer"].upper() != "NONE":
            correct_answer = curr_annotation["yes_no_answer"]
        else:
            # only the first short answer of the chosen annotation is used
            answer_start_tkn = curr_annotation['short_answers'][0]['start_token']
            answer_end_tkn = curr_annotation['short_answers'][0]['end_token']
            correct_answer = " ".join([elem["token"] for elem in document_tokens[answer_start_tkn:answer_end_tkn]])

        context_start_tkn = curr_annotation['long_answer']['start_token']
        context_end_tkn = curr_annotation['long_answer']['end_token']
        curr_annotation_id = curr_annotation['annotation_id']
        curr_long_answer_candidate_index = curr_annotation['long_answer']["candidate_index"]

    paragraph_text = " ".join([elem["token"] for elem in document_tokens[context_start_tkn:context_end_tkn]]).replace("<P>", "").replace("</P>", "").strip()
    question_text = instance['question_text']
    paragraph_tkns = tokenizer_UL2.encode(paragraph_text)
    question_tkns = tokenizer_UL2.encode(question_text)
    if len(paragraph_tkns) + len(question_tkns) > 400: # to accomodate also the instructions
        continue

    full_outputs.append({"example_id": instance['example_id'],
                         "annotation_id": curr_annotation_id,
                         "long_answer_candidate": curr_long_answer_candidate_index,
                         "Question": question_text,
                         "Paragraphs": paragraph_text,
                         "answer": correct_answer})
    # # for the train data (for the classifiers)
    if len(full_outputs) >= 2000:
        break


  2%|▏         | 100/6150 [00:15<15:08,  6.66it/s]


In [10]:
# Number of instances that survived the filters in the extraction loop.
len(full_outputs)

1642

In [25]:
# Persist the extracted instances as one pretty-printed JSON array
# (the file has a .jsonl extension, but it holds a single JSON document).
with open(outdir, 'w') as f1:
    json.dump(full_outputs, f1, indent=2)

# Musique

In [28]:
# Re-load the flan-UL2 tokenizer (same call as in the NQ section) so the MuSiQue
# section has it available for the token-length filters below.
tokenizer_UL2 = AutoTokenizer.from_pretrained("google/flan-ul2", model_max_length=4000)

In [8]:
musique_data_dir = r"../data/musique/raw/data/musique_full_v1.0_dev.jsonl" #r"data/musique/raw/data/musique_full_v1.0_train.jsonl" 
# Parse the JSON-Lines file one line at a time (no intermediate readlines() copy),
# with an explicit encoding and skipping blank lines.
with open(musique_data_dir, 'r', encoding='utf-8') as f1:
    musique_data = [json.loads(line) for line in f1 if line.strip()]

In [9]:
# For every instance, concatenate the text of all of its paragraphs (newline-separated).
paragraphs = []
for instance in musique_data:
    instance_texts = [p["paragraph_text"] for p in instance["paragraphs"]]
    paragraphs.append("\n".join(instance_texts))

In [None]:
# Tokenize each concatenated context with the flan-UL2 tokenizer.
paragraphs_tkn = []
for paragraph_text in paragraphs:
    paragraphs_tkn.append(tokenizer_UL2.encode(paragraph_text))

In [7]:
# Token count of every concatenated context.
paragraphs_tkn_len = list(map(len, paragraphs_tkn))

In [8]:
# Keep only the lengths of contexts shorter than 2000 tokens.
paragraphs_tkn_len_small = list(filter(lambda tkn_len: tkn_len < 2000, paragraphs_tkn_len))

In [9]:
# How many instances have a full context shorter than 2000 tokens.
len(paragraphs_tkn_len_small)

11921

In [11]:
def get_sentence_embedding(sentence, model, tokenizer):
    """Embed `sentence` by mean-pooling the model's last hidden state.

    This redefinition shadows the earlier one in the notebook, so it is kept
    device-aware like that one: the tokenized inputs are moved to the device
    the model lives on and the result is brought back to the CPU. With a CPU
    model this behaves exactly like the plain CPU-only version.

    Args:
        sentence: text to embed (truncated to 512 tokens).
        model: encoder whose output exposes `last_hidden_state` and which has a `device` attribute.
        tokenizer: tokenizer matching `model`.

    Returns:
        The pooled embedding as a NumPy array.
    """
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

In [12]:
# Load the Sentence-BERT model and tokenizer
# Re-load the same sentence-embedding checkpoint for the MuSiQue section.
# Unlike the NQ section, the model is not moved to a GPU here, so it stays on
# the default device.
model_name = "sentence-transformers/paraphrase-distilroberta-base-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [13]:
def get_closest_paragaph_ind(qa_embedding, paragraphs_embedding):
    """Return the index of the paragraph embedding most similar to `qa_embedding`.

    Similarity is cosine similarity (1 - cosine distance); on ties the first
    maximal index is returned.
    """
    similarities = []
    for paragraph_vec in paragraphs_embedding:
        similarities.append(1 - cosine(qa_embedding, paragraph_vec))
    best_index = np.argmax(similarities)
    return best_index

In [14]:
# Split MuSiQue into answerable instances (context = supporting paragraphs) and
# unanswerable instances (context = the paragraphs closest, in embedding space,
# to each decomposed sub-question and sub-answer).
answerable = []
unanswerable = []
for instance in tqdm(musique_data):
    if instance["answerable"]:
        supporting_texts = [p["paragraph_text"] for p in instance['paragraphs'] if p["is_supporting"]]
        curr_paragraphs = "\n ".join(f"Paragraph {i+1}: {p}" for i,p in enumerate(supporting_texts))
        answerable.append({"id": instance['id'],
                            "Question": instance['question'],
                            "Paragraphs": curr_paragraphs,
                            "answer": instance['answer']})
        continue

    # sub-questions first, then sub-answers, as separate texts to embed
    decomposition = instance['question_decomposition']
    curr_qa_decomposition = [step["question"] for step in decomposition] + [step["answer"] for step in decomposition]

    curr_qa_decomposition_embed = [get_sentence_embedding(qa, model, tokenizer) for qa in curr_qa_decomposition]
    curr_paragraphs_embed = [get_sentence_embedding(p["paragraph_text"], model, tokenizer) for p in instance['paragraphs']]

    # one closest paragraph per sub-question / sub-answer, de-duplicated
    closest_paragaphs_ind = set([get_closest_paragaph_ind(qa_embedding, curr_paragraphs_embed) for qa_embedding in curr_qa_decomposition_embed])
    selected_texts = [instance['paragraphs'][paragraph_i]["paragraph_text"] for paragraph_i in closest_paragaphs_ind]
    curr_paragraphs = "\n ".join(f"Paragraph {i+1}: {p}" for i,p in enumerate(selected_texts))

    # take only not too long instances
    curr_paragraphs_tkns = tokenizer_UL2.encode(curr_paragraphs)
    if len(curr_paragraphs_tkns) <= 500:
        unanswerable.append({"id": instance['id'],
                            "Question": instance['question'],
                            "Paragraphs": curr_paragraphs,
                            "answer": "",
                            "actual_answer":instance['answer']})

  0%|          | 4/4834 [07:27<236:07:25, 175.99s/it]

In [14]:
# NOTE(review): these output paths are relative to "data/", whereas the raw
# MuSiQue file above is read from "../data/" — confirm the working directory.
answerable_outdir = "data/musique/control_group_musique_train.jsonl"
unanswerable_outdir = "data/musique/adversarial_musique_train.jsonl"

# Each split is written as a single pretty-printed JSON array.
for split_path, split_records in ((answerable_outdir, answerable), (unanswerable_outdir, unanswerable)):
    with open(split_path, 'w') as f1:
        json.dump(split_records, f1, indent=2)

### filter out the long instances (musique)

In [15]:
# Read back the two MuSiQue splits written above (each file is one JSON array).
answerable_outdir = "data/musique/control_group_musique_train.jsonl" #"data/musique/control_group_musique.jsonl"
unanswerable_outdir = "data/musique/adversarial_musique_train.jsonl" #"data/musique/adversarial_musique.jsonl"

with open(answerable_outdir, 'r') as f1:
    control_group_data = json.load(f1)

with open(unanswerable_outdir, 'r') as f1:
    adversarial_data = json.load(f1)

In [16]:
# Keep only instances whose "Paragraphs" text is at most 500 flan-UL2 tokens long.
control_group_data_filtered = []
for instance in control_group_data:
    if len(tokenizer_UL2.encode(instance["Paragraphs"])) <= 500:
        control_group_data_filtered.append(instance)

adversarial_data_filtered = []
for instance in adversarial_data:
    if len(tokenizer_UL2.encode(instance["Paragraphs"])) <= 500:
        adversarial_data_filtered.append(instance)

In [20]:
# Number of answerable instances left after the 500-token filter.
len(control_group_data_filtered)

17871

In [21]:
# Overwrite both split files in place with their length-filtered versions.
for split_path, split_records in ((answerable_outdir, control_group_data_filtered), (unanswerable_outdir, adversarial_data_filtered)):
    with open(split_path, 'w') as f1:
        json.dump(split_records, f1, indent=2)

# Extract (all annotated) Responses for NQ

In [None]:
# Inputs: the raw NQ dev file and the two extracted NQ splits.
# NOTE(review): these paths are relative to "data/", while the NQ extraction
# section reads/writes under "../data/" — confirm the working directory.
indir_NQ_raw = "data/NQ/raw/v1.0-simplified_nq-dev-all.jsonl"
indir_extracted_control_group_NQ = "data/NQ/control_group_NQ.jsonl"
indir_extracted_adversarial_NQ = "data/NQ/adversarial_NQ.jsonl"

# Output: mapping from instance id to its list of accepted answers.
outdir_all_responses_NQ = "data/NQ/NQ_answers.jsonl"

In [None]:
# Parse the raw NQ JSON-Lines file one line at a time (no intermediate
# readlines() copy), with an explicit encoding and skipping blank lines.
with open(indir_NQ_raw, 'r', encoding='utf-8') as f1:
    NQ_data_raw = [json.loads(line) for line in f1 if line.strip()]

In [None]:
# Load the extracted control-group split and index it: example_id -> annotation_id.
with open(indir_extracted_control_group_NQ, 'r') as f1:
    NQ_data_extracted = json.load(f1)
    extracted_ids = {}
    for elem in NQ_data_extracted:
        extracted_ids[elem["example_id"]] = elem["annotation_id"]
    extracted_example_ids = list(extracted_ids)

In [None]:
# NQ_data_raw = NQ_data[:500]

In [None]:
control_group_output_dict = dict()
for i,instance in tqdm(enumerate(NQ_data), total=len(NQ_data_raw)): 
    if not instance["example_id"] in extracted_example_ids:
        continue
    # if any(annotation["yes_no_answer"].upper() != "NONE" for annotation in instance["annotations"]):
    #     if not all(annotation["yes_no_answer"].upper() != "NONE" for annotation in instance["annotations"]):
    #         print("gotcha")
    curr_annotation_id = extracted_ids[instance["example_id"]]
    curr_long_answer_candidate = [annotation["long_answer"]["candidate_index"] for annotation in instance["annotations"] if annotation["annotation_id"]==curr_annotation_id][0]
    answers_limits = [(short_answer["start_token"], short_answer["end_token"]) for annotation in instance["annotations"] for short_answer in annotation["short_answers"] if annotation["long_answer"]["candidate_index"]==curr_long_answer_candidate]
    all_answers = [" ".join([tkn["token"] for tkn in instance['document_tokens'][s:e]]) for s,e in answers_limits]
    yes_no_answers = [annotation["yes_no_answer"] for annotation in instance["annotations"] if annotation["yes_no_answer"].upper() != "NONE" and annotation["long_answer"]["candidate_index"]==curr_long_answer_candidate]
    all_answers.extend(yes_no_answers)
    all_answers = list(set(all_answers))
    control_group_output_dict[instance["example_id"]] = all_answers

In [None]:
# Unanswerable examples get the key "<example_id>-unanswerable" and an empty answer.
with open(indir_extracted_adversarial_NQ, 'r') as f1:
    adversarial_NQ_data_extracted = json.load(f1)
    adversarial_output_dict = dict.fromkeys(
        (f"{instance['example_id']}-unanswerable" for instance in adversarial_NQ_data_extracted),
        "",
    )

In [None]:
# sanity check (make sure no "overlapping" keys)
# An empty set here means no key is shared between the two dictionaries.
adversarial_output_dict.keys() & control_group_output_dict.keys()

In [None]:
# combine answerable and unanswerable questions
# Merge in place: after this call `adversarial_output_dict` holds both the
# unanswerable and the answerable entries.
adversarial_output_dict.update(control_group_output_dict)

##### save

In [None]:
# Save the combined id -> answers mapping as a single compact JSON object.
with open(outdir_all_responses_NQ, 'w') as f1:
    json.dump(adversarial_output_dict, f1)

# Extract Responses for musique

In [110]:
# Inputs: the extracted MuSiQue splits (answerable / unanswerable).
indir_extracted_control_group_musique = "data/musique/control_group_musique.jsonl"
indir_extracted_adversarial_musique = "data/musique/adversarial_musique.jsonl"

# Output: mapping from instance id to its accepted answers.
outdir_all_responses_musique = "data/musique/musique_answers.jsonl"

In [111]:
# Each extracted split file holds a single JSON array of instances.
with open(indir_extracted_control_group_musique, 'r') as f1:
    extracted_control_group_musique = json.load(f1)

with open(indir_extracted_adversarial_musique, 'r') as f1:
    extracted_adversarial_musique = json.load(f1)

In [112]:
# Peek at the first unanswerable instance to check the record layout.
extracted_adversarial_musique[0]

{'id': '2hop__153573_109006',
 'Question': "Who developed the eponymous character from the series that contains Mickey's Safari in Letterland?",
 'Paragraphs': "Paragraph 1: The White armored car was a series of armored cars developed by the White Motor Company in Cleveland, Ohio from 1915.\n Paragraph 2: The 100 (pronounced The Hundred) is an American post-apocalyptic science fiction drama television series developed by Jason Rothenberg, which premiered on March 19, 2014, on The CW. It is loosely based on a 2013 book of the same name, the first in a book series by Kass Morgan. The series follows a group of teens as they become the first people from a space habitat to return to Earth after a devastating nuclear apocalypse.\n Paragraph 3: Parc Safari is a zoo in Hemmingford, Quebec, Canada, and is one of the region's major tourist attractions; that has both African & Asian species of elephant.",
 'answer': '',
 'actual_answer': 'Walt Disney'}

In [113]:
# Answerable instances map id -> [answer]; unanswerable ones map
# "<id>-unanswerable" -> "" (empty answer).
musique_control_group_dict = {}
for instance in extracted_control_group_musique:
    musique_control_group_dict[instance["id"]] = [instance["answer"]]
musique_adversarial_dict = dict.fromkeys(
    (f"{instance['id']}-unanswerable" for instance in extracted_adversarial_musique),
    "",
)

In [114]:
# sanity check (make sure no "overlapping" keys)
# An empty set here means no key is shared between the two dictionaries.
musique_control_group_dict.keys() & musique_adversarial_dict.keys()


set()

In [115]:
# combine answerable and unanswerable questions
# Merge in place: after this call `musique_adversarial_dict` holds both the
# unanswerable and the answerable entries.
musique_adversarial_dict.update(musique_control_group_dict)

##### save

In [116]:
# Save the combined id -> answers mapping as a single compact JSON object.
with open(outdir_all_responses_musique, 'w') as f1:
    json.dump(musique_adversarial_dict, f1)