In [79]:
import json
import os
import random

import pandas as pd
from datasets import load_dataset
random.seed(666)

In [3]:
# Read the Hugging Face access token from the HF_TOKEN environment variable rather
# than pasting it into the notebook, so the secret never ends up in a saved or
# shared copy. When the variable is unset, token=None is passed and the library
# uses its own default credential lookup.
dataset = load_dataset("naver/processed-ambigqa", token=os.environ.get("HF_TOKEN"))

In [4]:
dataset

DatasetDict({
    test: Dataset({
        features: ['nq_id', 'question', 'nq_answer', 'disambiguated_questions', 'disambiguated_answers', 'input_passages'],
        num_rows: 1069
    })
})

In [17]:
# small sample of ids to iterate on various prompting strategies
N_SAMPLE_IDS = 40

# Copy the column into a plain list before shuffling: random.shuffle works in place
# and needs a mutable sequence, which the column accessor is not guaranteed to return.
# NOTE(review): the shuffle draws from the global RNG seeded in the import cell, so
# re-running this cell without re-seeding yields a different sample.
ids_ = list(dataset["test"]["nq_id"])
random.shuffle(ids_)
ids_ = ids_[:N_SAMPLE_IDS]
print(ids_)

['4910574660426743103', '5706779148132378909', '82822746343941622', '-3775683158491019675', '-5333049627570569397', '5398964400508507960', '-7609206244947636807', '-4995398567532229571', '-2886061734189281128', '3886101581671349079', '7652576904728829439', '-196855123397981737', '-256357645980686898', '3152896460331899719', '651294105754681460', '3127369420834732535', '-6912552847579078268', '-6098775182466144557', '-91977775899598632', '1261841418358932814', '-1233203944270329571', '-7896024025216455001', '-5091579308296294034', '-648842070307266365', '7133553793803549181', '-6335401608477926483', '-4286108617234818000', '1040777011551693796', '4249908079104820528', '-6491913195208307840', '2547299592085156135', '-5333576035638611676', '-2046820450216612197', '1884293144811705689', '-9218899977338994404', '6061459584791759224', '-426004910266087839', '3863117877494111972', '-1382665261163735407', '7494804303418493420']


In [7]:
# System prompt for the GPT calls. It is sent separately from the user prompt
# (it is not concatenated into the templates defined further down).
# TODO: is it really needed? maybe we can have something better

system_prompt = "You are an expert at writing precise detailed instructions for language models. Your sole duty is to write instructions that can be used for precisely evaluating the ability of various retrieval models to follow such instructions. Answer succinctly and carefully, and follow all instructions.\n"
print("SYSTEM PROMPT:\n", system_prompt)

SYSTEM PROMPT:
 You are an expert at writing precise detailed instructions for language models. Your sole duty is to write instructions that can be used for precisely evaluating the ability of various retrieval models to follow such instructions. Answer succinctly and carefully, and follow all instructions.



# V1 prompt
### contrasting queries
(version used for the hackathon)
TODO  
- [x] change the output format (no need for the original query). 
- [ ] can be improved

In [8]:
# V1 template. The six positional `{}` placeholders are filled, in order, by
# fill_template():
#   1. number of disambiguated queries
#   2. comma-separated query labels ("query_1, query_2, ...")
#   3. the original (ambiguous) query
#   4. one "query_i: <text>" line per disambiguated query
#   5. label of the target query (always "query_1")
#   6. comma-separated labels of the queries to contrast against
template_prompt_basic = "## Input data  \nI have an original query that is naturally ambiguous. I also have {} queries that have been disambiguated ({}). \noriginal query: {}\n{}\n## Your task\nI need you to come up with an instruction that can be appended to the end of the original query, to help distinguish documents that are relevant to {} but not relevant to {}. Provide detailed specifics in the instruction for what makes a document relevant. This additional instruction should provide a test for strong frontier language models to determine if they can follow instructions. Also, be very sure that the instruction is generic and does not contain the answer to the query. Output the response in JSON form only with no other text, with the key: “instruction” (str).\n## Your output (JSON only):"
print(template_prompt_basic)

## Input data  
I have an original query that is naturally ambiguous. I also have {} queries that have been disambiguated ({}). 
original query: {}
{}
## Your task
I need you to come up with an instruction that can be appended to the end of the original query, to help distinguish documents that are relevant to {} but not relevant to {}. Provide detailed specifics in the instruction for what makes a document relevant. This additional instruction should provide a test for strong frontier language models to determine if they can follow instructions. Also, be very sure that the instruction is generic and does not contain the answer to the query. Output the response in JSON form only with no other text, with the key: “instruction” (str).
## Your output (JSON only):


In [90]:
def fill_template(query_ori, queries, template):
    """Render the V1 prompt for one ambiguous query.

    Simplification: the **first** disambiguated query is always the target and
    every other query is used as a contrast (this may change later).

    Args:
        query_ori: the original, ambiguous query text.
        queries: sequence of disambiguated query texts.
        template: format string with six positional ``{}`` placeholders
            (count, labels, original query, numbered queries, target label,
            contrast labels).

    Returns:
        The template with all six placeholders filled in.
    """
    labels = [f"query_{rank}" for rank in range(1, len(queries) + 1)]
    numbered_queries = [f"{label}: {text}" for label, text in zip(labels, queries)]
    all_labels = ", ".join(labels)
    contrast_labels = ", ".join(labels[1:])
    return template.format(
        len(queries),
        all_labels,
        query_ori,
        "\n".join(numbered_queries),
        "query_1",
        contrast_labels,
    )

def build_gpt_input(data, template, filter_ids=None, max_queries=6):
    """Build the V1 GPT prompts for a dataset.

    Args:
        data: iterable of rows; each row is indexed by "nq_id", "question" and
            "disambiguated_questions".
        template: format string passed through to ``fill_template``.
        filter_ids: optional collection of nq_ids to keep; ``None`` keeps every row.
        max_queries: maximum number of disambiguated queries put in a prompt
            (defaults to 6, the cap previously hard-coded here).

    Returns:
        dict mapping "<nq_id>_1" to the filled prompt string.
    """
    # Build the lookup set once instead of scanning filter_ids for every row.
    wanted = None if filter_ids is None else set(filter_ids)
    out = {}
    for item in data:
        if wanted is not None and item["nq_id"] not in wanted:
            continue
        queries = item["disambiguated_questions"][:max_queries]
        # The "_1" suffix records that query_1 is the target of the instruction.
        out["{}_{}".format(item["nq_id"], 1)] = fill_template(item["question"], queries, template)
    return out

In [19]:
# V1 prompts for the sampled ids only; keys have the form "<nq_id>_1".
out_basic = build_gpt_input(dataset["test"], template_prompt_basic, ids_)

In [20]:
print(out_basic["4910574660426743103_1"])

## Input data  
I have an original query that is naturally ambiguous. I also have 3 queries that have been disambiguated (query_1, query_2, query_3). 
original query: Who opened the gate in the prison walking dead?
query_1: From the viewer's persepctive, who opened the gate to the prison, at the beginning of the walking dead's "Killer Within" episode?
query_2: Who is the character that is discovered to have opened the gate in the prison walking dead?
query_3: Who is the actor that is discovered to have opened the gate in the prison walking dead?
## Your task
I need you to come up with an instruction that can be appended to the end of the original query, to help distinguish documents that are relevant to query_1 but not relevant to query_2, query_3. Provide detailed specifics in the instruction for what makes a document relevant. This additional instruction should provide a test for strong frontier language models to determine if they can follow instructions. Also, be very sure that the

In [24]:
# Use a context manager so the file is flushed and closed deterministically.
# NOTE(review): this cell writes under data/prompt_samples/ while the V3/V4 and
# post-processing cells use data/prompts_samples/ -- confirm the intended directory.
with open("data/prompt_samples/prompt_v1.json", "w") as f:
    json.dump(out_basic, f)

# V2 prompt
### contrasting queries after estimating passage relevance.  
version used for the hackathon (the basic form, without the passage filtering).  
TODO  
- [x] change the output format (no need for the original query).
- [ ] can be improved
- [ ] maybe check the prompt from RankGPT?
- [ ] filter passages that actually contain the target answer? make the life easier for the LLM?

In [30]:
# V2 template. The eight positional `{}` placeholders are filled, in order, by
# fill_template_with_docs():
#   1. number of disambiguated queries
#   2. comma-separated query labels ("query_1, query_2, ...")
#   3. the original (ambiguous) query
#   4. one "query_i: <text>" line per disambiguated query
#   5. one "document_i: <text>" line per input passage
#   6. label of the target query (always "query_1")
#   7. comma-separated labels of the queries to contrast against
#   8. label of the target query again (always "query_1")
# NOTE(review): the text says "a list of ten documents", but fill_template_with_docs
# numbers however many passages it receives -- confirm every row has exactly ten.
template_prompt_basic_with_docs = "## Input Data  \nI have an original query that is naturally ambiguous. I also have {} queries that have been disambiguated ({}). In addition, I have a list of ten documents which should contain the answers to each query.\noriginal query: {}\n{}\n{}\n## Your task\nI need you to come up with an instruction that can be appended to the end of the original query, to help distinguish documents that are relevant to {} but not relevant to {}. You first need to identify which document is most relevant to {} (if any), before generating the corresponding instruction. Provide detailed specifics in the instruction for what makes this specific document relevant. Remember that this criteria should make the one document relevant and all others irrelevant. This additional instruction should provide a test for strong frontier language models to determine if they can follow instructions. Also, be very sure that the instruction is generic and does not contain the answer to the query. If no document is judged relevant, output “null“ in the corresponding “relevant_document_id” field. Output the response in JSON form only with no other text, with the keys: “instruction” (str) and “relevant_document_id” (str).\n## Your output (JSON only):"
print(template_prompt_basic_with_docs)

## Input Data  
I have an original query that is naturally ambiguous. I also have {} queries that have been disambiguated ({}). In addition, I have a list of ten documents which should contain the answers to each query.
original query: {}
{}
{}
## Your task
I need you to come up with an instruction that can be appended to the end of the original query, to help distinguish documents that are relevant to {} but not relevant to {}. You first need to identify which document is most relevant to {} (if any), before generating the corresponding instruction. Provide detailed specifics in the instruction for what makes this specific document relevant. Remember that this criteria should make the one document relevant and all others irrelevant. This additional instruction should provide a test for strong frontier language models to determine if they can follow instructions. Also, be very sure that the instruction is generic and does not contain the answer to the query. If no document is judged re

In [31]:
def fill_template_with_docs(query_ori, queries, docs, template):
    """Render the V2 prompt (queries plus candidate documents) for one ambiguous query.

    Simplification: the **first** disambiguated query is always the target and
    every other query is used as a contrast.

    Args:
        query_ori: the original, ambiguous query text.
        queries: sequence of disambiguated query texts.
        docs: sequence of passage texts, numbered from ``document_1``.
        template: format string with eight positional ``{}`` placeholders.

    Returns:
        The template with all eight placeholders filled in.
    """
    target_label = "query_1"
    labels = [f"query_{rank}" for rank in range(1, len(queries) + 1)]
    query_block = "\n".join(f"{label}: {text}" for label, text in zip(labels, queries))
    doc_block = "\n".join(f"document_{rank}: {text}" for rank, text in enumerate(docs, start=1))
    return template.format(
        len(queries),
        ", ".join(labels),
        query_ori,
        query_block,
        doc_block,
        target_label,
        ", ".join(labels[1:]),
        target_label,
    )

def build_gpt_input_with_docs(data, template, filter_ids=None, max_queries=6):
    """Build the V2 GPT prompts (queries plus input passages) for a dataset.

    Args:
        data: iterable of rows; each row is indexed by "nq_id", "question",
            "disambiguated_questions" and "input_passages".
        template: format string passed through to ``fill_template_with_docs``.
        filter_ids: optional collection of nq_ids to keep; ``None`` keeps every row.
        max_queries: maximum number of disambiguated queries put in a prompt
            (defaults to 6, the cap previously hard-coded here).

    Returns:
        dict mapping "<nq_id>_1" to the filled prompt string.
    """
    # Build the lookup set once instead of scanning filter_ids for every row.
    wanted = None if filter_ids is None else set(filter_ids)
    out = {}
    for item in data:
        if wanted is not None and item["nq_id"] not in wanted:
            continue
        queries = item["disambiguated_questions"][:max_queries]
        # The "_1" suffix records that query_1 is the target of the instruction.
        out["{}_{}".format(item["nq_id"], 1)] = fill_template_with_docs(
            item["question"], queries, item["input_passages"], template
        )
    return out

In [32]:
# V2 prompts (queries + input passages) for the sampled ids only; keys are "<nq_id>_1".
out_with_docs = build_gpt_input_with_docs(dataset["test"], template_prompt_basic_with_docs, ids_)

In [33]:
print(out_with_docs["4910574660426743103_1"])

## Input Data  
I have an original query that is naturally ambiguous. I also have 3 queries that have been disambiguated (query_1, query_2, query_3). In addition, I have a list of ten documents which should contain the answers to each query.
original query: Who opened the gate in the prison walking dead?
query_1: From the viewer's persepctive, who opened the gate to the prison, at the beginning of the walking dead's "Killer Within" episode?
query_2: Who is the character that is discovered to have opened the gate in the prison walking dead?
query_3: Who is the actor that is discovered to have opened the gate in the prison walking dead?
document_1: # The Prisoners (The Walking Dead)
been better off just leaving once he managed to get the gate open Bex Schwartz wrote in her review for Rolling Stone magazine that when Andrew the tiny prisoner tries to get Oscar to shoot Rick Oscar shoots Andrew instead because Oscar understands life and death and remembers that Andrew was one of the bad du

In [29]:
# Use a context manager so the file is flushed and closed deterministically.
# NOTE(review): this cell writes under data/prompt_samples/ while the V3/V4 and
# post-processing cells use data/prompts_samples/ -- confirm the intended directory.
with open("data/prompt_samples/prompt_v2.json", "w") as f:
    json.dump(out_with_docs, f)

# V3 prompt
### contrasting with answers only

In [38]:
# V3 template. The three positional `{}` placeholders are filled, in order, by
# fill_template_answers():
#   1. the original (ambiguous) query
#   2. a "disambiguated query : <text>" line for the target query
#   3. comma-separated answers of the other disambiguated queries, presented to the
#      model as "topics" that documents must NOT be relevant to
template_prompt_answers = "## Input data  \nI have an original query that is naturally ambiguous. I also have a query that has been disambiguated. \noriginal query: {}\n{}\n## Your task\nI need you to come up with an instruction that can be appended to the end of the original query, to help distinguish documents that are relevant to the disambiguated query, but not relevant to the following topics: {}. Provide detailed specifics in the instruction for what makes a document relevant. This additional instruction should provide a test for strong frontier language models to determine if they can follow instructions. Also, be very sure that the instruction is generic and does not contain the answer to the query. Output the response in JSON form only with no other text, with the key: “instruction” (str).\n## Your output (JSON only):"
print(template_prompt_answers)

## Input data  
I have an original query that is naturally ambiguous. I also have a query that has been disambiguated. 
original query: {}
{}
## Your task
I need you to come up with an instruction that can be appended to the end of the original query, to help distinguish documents that are relevant to the disambiguated query, but not relevant to the following topics: {}. Provide detailed specifics in the instruction for what makes a document relevant. This additional instruction should provide a test for strong frontier language models to determine if they can follow instructions. Also, be very sure that the instruction is generic and does not contain the answer to the query. Output the response in JSON form only with no other text, with the key: “instruction” (str).
## Your output (JSON only):


In [40]:
def fill_template_answers(query_ori, query, answers, template):
    """Render the V3 prompt: one target query contrasted against other answers.

    Args:
        query_ori: the original, ambiguous query text.
        query: the disambiguated query the instruction should target.
        answers: iterable of answer strings (to the other disambiguated
            queries) that relevant documents must not be about.
        template: format string with three positional ``{}`` placeholders.

    Returns:
        The template with all three placeholders filled in.
    """
    target_line = f"disambiguated query : {query}"
    excluded_topics = ", ".join(answers)
    return template.format(query_ori, target_line, excluded_topics)

def build_gpt_input_answers(data, template, filter_ids=None):
    """Build the V3 GPT prompts (target query vs. the other queries' answers).

    For each kept row, the first disambiguated query is the target and the
    answers of all remaining disambiguated queries are the contrast topics.

    Args:
        data: iterable of rows; each row is indexed by "nq_id", "question",
            "disambiguated_questions" and "disambiguated_answers".
        template: format string passed through to ``fill_template_answers``.
        filter_ids: optional collection of nq_ids to keep; ``None`` keeps every row.

    Returns:
        dict mapping "<nq_id>_1" to the filled prompt string.
    """
    kept_rows = (
        row for row in data
        if filter_ids is None or row["nq_id"] in filter_ids
    )
    return {
        "{}_{}".format(row["nq_id"], 1): fill_template_answers(
            row["question"],
            row["disambiguated_questions"][0],
            row["disambiguated_answers"][1:],
            template,
        )
        for row in kept_rows
    }

In [42]:
# V3 prompts (target query vs. other answers) for the sampled ids only; keys are "<nq_id>_1".
out_with_answers = build_gpt_input_answers(dataset["test"], template_prompt_answers, ids_)

In [43]:
print(out_with_answers["4910574660426743103_1"])

## Input data  
I have an original query that is naturally ambiguous. I also have a query that has been disambiguated. 
original query: Who opened the gate in the prison walking dead?
disambiguated query : From the viewer's persepctive, who opened the gate to the prison, at the beginning of the walking dead's "Killer Within" episode?
## Your task
I need you to come up with an instruction that can be appended to the end of the original query, to help distinguish documents that are relevant to the disambiguated query, but not relevant to the following topics: Andrew, Markice Moore. Provide detailed specifics in the instruction for what makes a document relevant. This additional instruction should provide a test for strong frontier language models to determine if they can follow instructions. Also, be very sure that the instruction is generic and does not contain the answer to the query. Output the response in JSON form only with no other text, with the key: “instruction” (str).
## Your o

In [45]:
# Use a context manager so the file is flushed and closed deterministically.
with open("data/prompts_samples/prompt_v3.json", "w") as f:
    json.dump(out_with_answers, f)

# V4 prompt
### contrasting with queries and answers

In [48]:
# V4 template. The six positional `{}` placeholders are filled, in order, by
# fill_template_queries_answers():
#   1. number of disambiguated queries
#   2. comma-separated query labels ("query_1, query_2, ...")
#   3. the original (ambiguous) query
#   4. one "query_i: <text> => answer: <answer>" line per disambiguated query
#   5. label of the target query (always "query_1")
#   6. comma-separated labels of the queries to contrast against
template_prompt_queries_answers = "## Input data  \nI have an original query that is naturally ambiguous. I also have {} queries that have been disambiguated ({}). \noriginal query: {}\n{}\n## Your task\nI need you to come up with an instruction that can be appended to the end of the original query, to help distinguish documents that are relevant to {} (whose answer is unknown) but not relevant to {} (whose answers are given). Provide detailed specifics in the instruction for what makes a document relevant. This additional instruction should provide a test for strong frontier language models to determine if they can follow instructions. Also, be very sure that the instruction is generic and does not contain the answer to the query. Output the response in JSON form only with no other text, with the key: “instruction” (str).\n## Your output (JSON only):"
print(template_prompt_queries_answers)

## Input data  
I have an original query that is naturally ambiguous. I also have {} queries that have been disambiguated ({}). 
original query: {}
{}
## Your task
I need you to come up with an instruction that can be appended to the end of the original query, to help distinguish documents that are relevant to {} (whose answer is unknown) but not relevant to {} (whose answers are given). Provide detailed specifics in the instruction for what makes a document relevant. This additional instruction should provide a test for strong frontier language models to determine if they can follow instructions. Also, be very sure that the instruction is generic and does not contain the answer to the query. Output the response in JSON form only with no other text, with the key: “instruction” (str).
## Your output (JSON only):


In [54]:
def fill_template_queries_answers(query_ori, queries, answers, template):
    """Render the V4 prompt: every disambiguated query shown with its answer.

    The first query is the target; the remaining ones are the contrast set.

    Args:
        query_ori: the original, ambiguous query text.
        queries: sequence of disambiguated query texts.
        answers: sequence of answers aligned with ``queries``; it must hold at
            least ``len(queries)`` entries (extra entries are ignored).
        template: format string with six positional ``{}`` placeholders.

    Returns:
        The template with all six placeholders filled in.
    """
    labels = [f"query_{rank}" for rank in range(1, len(queries) + 1)]
    # Index `answers` explicitly so a too-short answer list still raises IndexError.
    annotated = [
        f"{labels[pos]}: {text} => answer: {answers[pos]}"
        for pos, text in enumerate(queries)
    ]
    return template.format(
        len(queries),
        ", ".join(labels),
        query_ori,
        "\n".join(annotated),
        "query_1",
        ", ".join(labels[1:]),
    )

def build_gpt_input_queries_answers(data, template, filter_ids=None, max_queries=6):
    """Build the V4 GPT prompts (queries annotated with their answers).

    For each kept row the answer of the target query (query_1) is replaced by
    the literal "unknown" so the prompt does not leak it.

    Args:
        data: iterable of rows; each row is indexed by "nq_id", "question",
            "disambiguated_questions" and "disambiguated_answers".
        template: format string passed through to
            ``fill_template_queries_answers``. A non-string value falls back to
            the module-level ``template_prompt_queries_answers`` (see below).
        filter_ids: optional collection of nq_ids to keep; ``None`` keeps every row.
        max_queries: maximum number of disambiguated queries put in a prompt
            (defaults to 6, the cap previously hard-coded here).

    Returns:
        dict mapping "<nq_id>_1" to the filled prompt string.
    """
    # Earlier versions ignored `template` and always used the module-level V4
    # template, so callers could pass anything here. Honour the argument when it
    # is a real template string; otherwise keep the old behaviour.
    if not isinstance(template, str):
        template = template_prompt_queries_answers

    # Build the lookup set once instead of scanning filter_ids for every row.
    wanted = None if filter_ids is None else set(filter_ids)
    out = {}
    for item in data:
        if wanted is not None and item["nq_id"] not in wanted:
            continue
        queries = item["disambiguated_questions"][:max_queries]
        # Build a new list rather than assigning into the row's own list, so the
        # caller's data is never modified.
        answers = ["unknown"] + list(item["disambiguated_answers"][1:])
        out["{}_{}".format(item["nq_id"], 1)] = fill_template_queries_answers(
            item["question"], queries, answers, template
        )
    return out

In [55]:
# Pass the V4 template string as the `template` argument (the fill function itself
# was being passed here by mistake).
out_with_queries_answers = build_gpt_input_queries_answers(dataset["test"], template_prompt_queries_answers, ids_)

In [56]:
print(out_with_queries_answers["4910574660426743103_1"])

## Input data  
I have an original query that is naturally ambiguous. I also have 3 queries that have been disambiguated (query_1, query_2, query_3). 
original query: Who opened the gate in the prison walking dead?
query_1: From the viewer's persepctive, who opened the gate to the prison, at the beginning of the walking dead's "Killer Within" episode? => answer: unknown
query_2: Who is the character that is discovered to have opened the gate in the prison walking dead? => answer: Andrew
query_3: Who is the actor that is discovered to have opened the gate in the prison walking dead? => answer: Markice Moore
## Your task
I need you to come up with an instruction that can be appended to the end of the original query, to help distinguish documents that are relevant to query_1 (whose answer is unknown) but not relevant to query_2, query_3 (whose answers are given). Provide detailed specifics in the instruction for what makes a document relevant. This additional instruction should provide a 

In [57]:
# Use a context manager so the file is flushed and closed deterministically.
with open("data/prompts_samples/prompt_v4.json", "w") as f:
    json.dump(out_with_queries_answers, f)

# V5 prompt
### ?

# post-processing

In [104]:
def post_process(output_file, data, with_docs=False):
    """Convert a GPT output JSON file into a CSV of generated instructions.

    The JSON file is expected to hold a list of single-key dicts
    ``{"<nq_id>_<k>": generation}``, where ``generation`` is a dict with an
    "instruction" key (and a "relevant_document_id" key when ``with_docs`` is
    true). Empty generations are treated as failed: they are printed and skipped.

    The CSV is written to ``data/prompts_samples/instructions/<name>.csv``, where
    ``<name>`` is the part of the input file name before its first dot. The
    directory is created if it does not exist.

    Args:
        output_file: path to the GPT output JSON file.
        data: iterable of dataset rows, each indexed by "nq_id", "question",
            "disambiguated_questions" and "disambiguated_answers".
        with_docs: also export the "relevant_document_id" field.

    Raises:
        KeyError: if a generation refers to an nq_id that is not in ``data``,
            or a non-empty generation lacks an expected key.
    """
    file_name = output_file.split("/")[-1].split(".")[0]
    with open(output_file) as f:
        output = json.load(f)

    # Index the rows once by their exact nq_id (first occurrence wins). A prefix
    # match could pick the wrong row when one id is a prefix of another, and
    # re-filtering the whole dataset for every generation is needlessly slow.
    rows_by_id = {}
    for row in data:
        rows_by_id.setdefault(row["nq_id"], row)

    columns = [
        "nq id",
        "original query",
        "disambiguated query",
        "disambiguated answer",
        "instruction",
    ]
    if with_docs:
        columns.append("relevant_document_id")

    records = []
    for elem in output:
        key = list(elem.keys())[0]
        generation = elem[key]
        if not generation:
            # failed gpt generation
            print(key)
            print(generation)
            continue
        # Keys look like "<nq_id>_<k>"; strip only the trailing suffix.
        nq_id = key.rsplit("_", 1)[0]
        if nq_id not in rows_by_id:
            raise KeyError(f"nq_id {nq_id!r} from {output_file} not found in data")
        row = rows_by_id[nq_id]
        record = {
            "nq id": nq_id,
            "original query": row["question"],
            # Instructions are generated for the first disambiguated query only.
            "disambiguated query": row["disambiguated_questions"][0],
            "disambiguated answer": row["disambiguated_answers"][0],
            "instruction": generation["instruction"],
        }
        if with_docs:
            record["relevant_document_id"] = generation["relevant_document_id"]
        records.append(record)

    out_dir = "data/prompts_samples/instructions"
    os.makedirs(out_dir, exist_ok=True)
    # Passing `columns` keeps the header row even when there are no records.
    pd.DataFrame(records, columns=columns).to_csv(f"{out_dir}/{file_name}.csv", index=False)

In [105]:
# Export the GPT-4 outputs for the V2 prompts to CSV.
# NOTE(review): this is a hard-coded absolute path on one user's scratch space;
# consider a path relative to a configurable data directory.
post_process("/beegfs/scratch/user/hdejean/bergen_branches_test/TAR/scripts/prompt_v2_gpt4.json",
             dataset["test"], with_docs=True)