In [1]:
import pandas as pd
from tqdm import tqdm
from IPython.display import display 
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.chains import LLMChain

from utils import evaluate, ColBERTv2


train = [('Who produced the album that included a re-recording of "Lithium"?', ['Butch Vig']),
         ('Who was the director of the 2009 movie featuring Peter Outerbridge as William Easton?', ['Kevin Greutert']),
         ('The heir to the Du Pont family fortune sponsored what wrestling team?', ['Foxcatcher', 'Team Foxcatcher', 'Foxcatcher Team']),
         ('In what year was the star of To Hell and Back born?', ['1925']),
         ('Which award did the first book of Gary Zukav receive?', ['U.S. National Book Award', 'National Book Award']),
         ('What city was the victim of Joseph Druces working in?', ['Boston, Massachusetts', 'Boston']),]

dev = [('Who has a broader scope of profession: E. L. Doctorow or Julia Peterkin?', ['E. L. Doctorow', 'E.L. Doctorow', 'Doctorow']),
       ('What documentary about the Gilgo Beach Killer debuted on A&E?', ['The Killing Season']),
       ('Right Back At It Again contains lyrics co-written by the singer born in what city?', ['Gainesville, Florida', 'Gainesville']),
       ('What year was the party of the winner of the 1971 San Francisco mayoral election founded?', ['1828']),
       ('Which author is English: John Braine or Studs Terkel?', ['John Braine']),
       ('Anthony Dirrell is the brother of which super middleweight title holder?', ['Andre Dirrell']),
       ('In which city is the sports nutrition business established by Oliver Cookson based ?', ['Cheshire', 'Cheshire, UK']),
       ('Find the birth date of the actor who played roles in First Wives Club and Searching for the Elephant.', ['February 13, 1980']),
       ('Kyle Moran was born in the town on what river?', ['Castletown', 'Castletown River']),
       ("What is the name of one branch of Robert D. Braun's speciality?", ['aeronautical engineering', 'astronautical engineering', 'aeronautics', 'astronautics']),
       ("Where was the actress who played the niece in the Priest film born?", ['Surrey', 'Guildford, Surrey']),
       ('Name the movie in which the daughter of Noel Harrison plays Violet Trefusis.', ['Portrait of a Marriage']),
       ('What year was the father of the Princes in the Tower born?', ['1442'])]

# Initialize training and dev sets
train = [{'question': q, 'answer': a[0]} for q, a in train]
dev = [{'question': q, 'answers': a} for q, a in dev]

# Initialize LLM and retrieval model
llm = OpenAI(temperature=0)
rm = ColBERTv2('http://ec2-44-228-128-229.us-west-2.compute.amazonaws.com:8893/api/search')

# 1 - Vanilla GPT-3.5

In [2]:
example_question_answer_template = """
Question: {question}
Answer: {answer}
"""
example_prompt = PromptTemplate(
    input_variables=['question', 'answer'],
    template=example_question_answer_template,
)

vanilla_prefix = """
Answer questions with short factoid answers.

---

Follow the following format.

Question: $[the question to be answered]
Answer: $[a short factoid answer, often between 1 and 5 words]

---
"""

few_shot_prompt = FewShotPromptTemplate(
    examples=train,
    example_prompt=example_prompt,
    prefix=vanilla_prefix,
    suffix="\nQuestion: {question}\nAnswer:",
    input_variables=['question'],
    example_separator=''
)

vanilla_chain = LLMChain(llm=llm, prompt=few_shot_prompt)

def run_vanilla_chain(example):
    return vanilla_chain.run(question=example['question'])

In [3]:
evaluate(run_vanilla_chain, dev)

100%|██████████| 13/13 [00:12<00:00,  1.02it/s]

Correct: 3 / 13





Unnamed: 0,question,answers,prediction,correct
0,Who has a broader scope of profession: E. L. Doctorow or Julia Peterkin?,"[E. L. Doctorow, E.L. Doctorow, Doctorow]",E. L. Doctorow,✅
1,What documentary about the Gilgo Beach Killer debuted on A&E?,[The Killing Season],The Long Island Serial Killer,❌
2,Right Back At It Again contains lyrics co-written by the singer born in what city?,"[Gainesville, Florida, Gainesville]","Austin, Texas",❌
3,What year was the party of the winner of the 1971 San Francisco mayoral election founded?,[1828],1968,❌
4,Which author is English: John Braine or Studs Terkel?,[John Braine],John Braine,✅
5,Anthony Dirrell is the brother of which super middleweight title holder?,[Andre Dirrell],Andre Dirrell,✅
6,In which city is the sports nutrition business established by Oliver Cookson based ?,"[Cheshire, Cheshire, UK]","London, England",❌
7,Find the birth date of the actor who played roles in First Wives Club and Searching for the Elephant.,"[February 13, 1980]","August 12, 1976",❌
8,Kyle Moran was born in the town on what river?,"[Castletown, Castletown River]",Thames,❌
9,What is the name of one branch of Robert D. Braun's speciality?,"[aeronautical engineering, astronautical engineering, aeronautics, astronautics]",Aerospace engineering,❌


# 2 - Retrieve then read

In [4]:
retrieval_prefix = """
Answer questions with short factoid answers.
"""

retrieval_suffix = """
---

Follow the following format.

Context:
$[sources that may contain relevant content]

Question: $[the question to be answered]

Answer: $[a short factoid answer, often between 1 and 5 words]

---

Context:
{context}

Question: {question}

Answer:
"""

retrieval_prompt = FewShotPromptTemplate(
    examples=train,
    example_prompt=example_prompt,
    prefix=retrieval_prefix,
    suffix=retrieval_suffix,
    input_variables=['context', 'question'],
    example_separator=''
)

chain = LLMChain(llm=llm, prompt=retrieval_prompt)

def run_retrieval_chain(example):
    context = rm(example['question'])[0]
    return chain.run(context=context, question=example['question'])

In [5]:
evaluate(run_retrieval_chain, dev)

100%|██████████| 13/13 [00:15<00:00,  1.20s/it]

Correct: 5 / 13





Unnamed: 0,question,answers,prediction,correct
0,Who has a broader scope of profession: E. L. Doctorow or Julia Peterkin?,"[E. L. Doctorow, E.L. Doctorow, Doctorow]",E.L. Doctorow,✅
1,What documentary about the Gilgo Beach Killer debuted on A&E?,[The Killing Season],The Killing Season,✅
2,Right Back At It Again contains lyrics co-written by the singer born in what city?,"[Gainesville, Florida, Gainesville]","Ocala, Florida",❌
3,What year was the party of the winner of the 1971 San Francisco mayoral election founded?,[1828],1925,❌
4,Which author is English: John Braine or Studs Terkel?,[John Braine],Studs Terkel,❌
5,Anthony Dirrell is the brother of which super middleweight title holder?,[Andre Dirrell],Andre Dirrell,✅
6,In which city is the sports nutrition business established by Oliver Cookson based ?,"[Cheshire, Cheshire, UK]","Manchester, UK.",❌
7,Find the birth date of the actor who played roles in First Wives Club and Searching for the Elephant.,"[February 13, 1980]","August 5, 1862",❌
8,Kyle Moran was born in the town on what river?,"[Castletown, Castletown River]",Liffey,❌
9,What is the name of one branch of Robert D. Braun's speciality?,"[aeronautical engineering, astronautical engineering, aeronautics, astronautics]",Jet Propulsion,❌


# 3 - Retrieve then read with self-consistency

In [39]:
from collections import Counter

prefix = """
Answer questions with short factoid answers.
"""

suffix = """
---

Follow the following format.

Context:
$[sources that may contain relevant content]

Question: $[the question to be answered]

Rationale: Let's think step by step. $[a step-by-step deduction that identifies the correct response, which will be provided below]

Answer: $[a short factoid answer, often between 1 and 5 words]

---

Context:
{context}

Question: {question}

Rationale: Let's think step by step."""

rationale_prompt = FewShotPromptTemplate(
    examples=train,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=['context', 'question'],
    example_separator=''
)

chain = LLMChain(llm=llm, prompt=rationale_prompt)

def run_rationale_chain(example, n=20):
    context = rm(example['question'], k=5)
    context = '\n'.join([f'[{i+1}] {c}' for i, c in enumerate(context)])

    # Self consistency
    answers = []
    for i in range(n):
        completion = chain.run(context=context, question=example['question'])
        last_line = completion.split('\n')[-1].strip()
        answer = last_line.split(':')[-1].strip()
        answers.append(answer)

    return Counter(answers).most_common(1)[0][0]

In [40]:
evaluate(run_rationale_chain, dev)

  8%|▊         | 1/13 [01:13<14:47, 73.96s/it]

['Physician', 'E.L. Doctorow', 'Physician', 'E. L. Doctorow', 'E.L. Doctorow', 'Julia Peterkin', 'E. L. Doctorow', 'Doctorow', 'Physician', 'E.L. Doctorow', 'Doctorow', 'Julia Peterkin', 'E. L. Doctorow', 'Physician', 'Physician', 'E. L. Doctorow', 'E. L. Doctorow', 'Doctorow', 'Physician', 'E. L. Doctorow']


 15%|█▌        | 2/13 [02:14<12:07, 66.14s/it]

['The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season', 'The Killing Season']


 23%|██▎       | 3/13 [03:35<12:10, 73.02s/it]

['Ocala, Florida', 'Ocala, Florida', 'Ocala, Florida', 'Ocala, Florida', 'Ocala, Florida', 'Canada', 'Ocala, Florida', 'Ocala, Florida', 'Ocala, Florida', 'Ocala, Florida', 'Ocala, Florida', 'Ocala, Florida', 'Ocala, Florida', 'Ocala, Florida', 'Ottawa', 'Toronto, Ontario', 'Ocala, Florida.', 'Ocala, Florida', 'Ocala, Florida', 'Ocala, Florida']


 31%|███       | 4/13 [04:50<11:03, 73.72s/it]

['1965', '1828', '1969', 'Prior to 1971', '1849', '1933', '1968', '1828', '1828', '1971', '1934', '1952', '1828', '1968', '1968', '1828', '1971', '1934', '1934', '1945']


 38%|███▊      | 5/13 [06:11<10:10, 76.31s/it]

['John Braine', 'John Braine', 'John Braine', 'John Braine', 'John Braine', 'John Braine', 'John Braine', 'John Braine', 'John Braine', 'John Braine', 'John Braine', 'John Braine.', 'John Braine', 'John Braine', 'John Braine.', 'John Braine', 'John Braine', 'John Braine', 'John Braine', 'John Braine']


 46%|████▌     | 6/13 [07:27<08:52, 76.14s/it]

['Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell', 'Andre Dirrell.']


 54%|█████▍    | 7/13 [08:28<07:06, 71.12s/it]

['Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK', 'Cheshire, UK']


 62%|██████▏   | 8/13 [09:27<05:37, 67.46s/it]

['27 September 1974', '1978', '27 September 1974', '1938', '1943', '1945', '27 September 1974', '27 September 1974', '1978', '1936', '27 September 1974', '27 September 1974', '1943', '1941', '19 December 1944', '1954', '1939', '1974', '1978', '1945']


 69%|██████▉   | 9/13 [10:19<04:09, 62.40s/it]

['Castletown River', 'Boyne', 'Liffey', 'Castletown River', 'River Foyle', 'River Louth', 'River Boyne', 'Irish Sea', 'East River', 'Castletown River', 'Liffey', 'River Foyle', 'River Foyle', 'River Fane', 'Foyle', 'Liffey', 'Castletown River', 'River of Ireland', 'Liffey', 'River Boyne']


 77%|███████▋  | 10/13 [11:27<03:12, 64.20s/it]

['Aerospace Engineering', 'Aerospace Engineering', 'Aeronautics and Astronautics', 'Aeronautics and Astronautics', 'Space technology', 'Aerospace engineering', 'Aerospace engineering', 'Astronautics', 'Space Technology', 'Aeronautics and Astronautics', 'Aerospace engineering', 'Aeronautics and Astronautics', 'Aerospace engineering', 'Aeronautics and Astronautics', 'Aerospace engineering', 'Astronautics', 'Aerospace engineering', 'Aeronautics and astronautics.', 'Aerospace engineering', 'Aerospace engineering']


 85%|████████▍ | 11/13 [13:52<02:58, 89.02s/it]

['London', '1990', '1995', 'London, England', '1990', '1990', 'United States', 'England', '1990', 'London', 'London', '1990', '1995', 'London', 'London', 'New York City, 1958', 'London', 'London, England', '1990', 'London']


 92%|█████████▏| 12/13 [14:50<01:19, 79.62s/it]

['Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', '"Portrait of a Marriage"', 'Portrait of a Marriage.', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage', 'Portrait of a Marriage']


100%|██████████| 13/13 [16:08<00:00, 74.49s/it]

['1442', '1462', '1312', '1312', '1455', 'c. 1455', 'c. 1455', '1442', '1442', '1442', '1442', '1455', 'c. 1455', '1442', '1442', '1442', '1340', '1442', '1455', 'c. 1455']
Correct: 8 / 13





Unnamed: 0,question,answers,prediction,correct
0,Who has a broader scope of profession: E. L. Doctorow or Julia Peterkin?,"[E. L. Doctorow, E.L. Doctorow, Doctorow]",Physician,❌
1,What documentary about the Gilgo Beach Killer debuted on A&E?,[The Killing Season],The Killing Season,✅
2,Right Back At It Again contains lyrics co-written by the singer born in what city?,"[Gainesville, Florida, Gainesville]","Ocala, Florida",❌
3,What year was the party of the winner of the 1971 San Francisco mayoral election founded?,[1828],1828,✅
4,Which author is English: John Braine or Studs Terkel?,[John Braine],John Braine,✅
5,Anthony Dirrell is the brother of which super middleweight title holder?,[Andre Dirrell],Andre Dirrell,✅
6,In which city is the sports nutrition business established by Oliver Cookson based ?,"[Cheshire, Cheshire, UK]","Cheshire, UK",✅
7,Find the birth date of the actor who played roles in First Wives Club and Searching for the Elephant.,"[February 13, 1980]",27 September 1974,❌
8,Kyle Moran was born in the town on what river?,"[Castletown, Castletown River]",Castletown River,✅
9,What is the name of one branch of Robert D. Braun's speciality?,"[aeronautical engineering, astronautical engineering, aeronautics, astronautics]",Aerospace engineering,❌


# 4 - Multi-hop retrieval

In [6]:
search_retrieval_template_first_hop = """
Write a search query that will help answer a complex question.

---

Follow the following format.

Question: $[the question to be answered]
Rationale: Let's think step by step. To answer this question, we first need to find out $[the missing information]
Search Query: $[a simple question for seeking the missing information]

---

Question: {question}
Rationale: Let's think step by step. To answer this question, we first need to find out"""

first_hop_prompt = PromptTemplate(
    input_variables=['question'],
    template=search_retrieval_template_first_hop,
)

first_hop_chain = LLMChain(llm=llm, prompt=first_hop_prompt)

In [7]:
def format_context(context):
    """
    Format and enumerate a list of context strings for use in a prompt.
    """
    return '\n'.join([f'[{i+1}] {c}' for i, c in enumerate(context)])


def extract_last_line(completion, remove_prefix=True):
    """
    Extract the last line of a completion, optionally removing the prefix.
    """
    last_line = completion.split('\n')[-1].strip()
    if remove_prefix:
        last_line = last_line.split(':')[-1].strip()
    return last_line


search_retrieval_template_followup_hop = """
Write a search query that will help answer a complex question.

---

Follow the following format.

Context:
$[sources that may contain relevant content]

Question: $[the question to be answered]
Rationale: Let's think step by step. Based on the context, we have learned the following. $[information from the context that provides useful clues]
Search Query: $[a simple question for seeking the missing information]

---

Context:
{context}

Question: {question}
Rationale: Let's think step by step. Based on the context, we have learned the following."""

followup_hop_prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=search_retrieval_template_followup_hop,
)

followup_hop_chain = LLMChain(llm=llm, prompt=followup_hop_prompt)

In [8]:
prefix = """
Answer questions with short factoid answers.
"""

suffix = """
---

Follow the following format.

Context:
$[sources that may contain relevant content]

Question: $[the question to be answered]

Rationale: Let's think step by step. $[a step-by-step deduction that identifies the correct response, which will be provided below]

Answer: $[a short factoid answer, often between 1 and 5 words]

---

Context:
{context}

Question: {question}

Rationale: Let's think step by step."""

rationale_prompt = FewShotPromptTemplate(
    examples=train,
    example_prompt=example_prompt,
    prefix=prefix,
    suffix=suffix,
    input_variables=['context', 'question'],
    example_separator=''
)

answer_chain = LLMChain(llm=llm, prompt=rationale_prompt)

In [9]:
def run_multihop_chain(question):
    context = []

    # Get first hop retrieval question and context
    first_hop_completion = first_hop_chain.run(question=question)
    retrieval_question_first_hop = extract_last_line(first_hop_completion)
    context.extend(rm(retrieval_question_first_hop, k=2))

    # Get second hop retrieval question and context
    second_hop_completion = followup_hop_chain.run(context=format_context(context), question=question)
    retrieval_question_second_hop = extract_last_line(second_hop_completion)

    context.extend(rm(retrieval_question_second_hop, k=2))

    # Get final answer
    final_completion = answer_chain.run(context=format_context(context), question=question)
    answer = extract_last_line(final_completion)

    return answer

In [11]:
evaluate(run_multihop_chain, dev)

100%|██████████| 13/13 [02:00<00:00,  9.26s/it]

Correct: 10 / 13





Unnamed: 0,question,answers,prediction,correct
0,Who has a broader scope of profession: E. L. Doctorow or Julia Peterkin?,"[E. L. Doctorow, E.L. Doctorow, Doctorow]",E.L. Doctorow,✅
1,What documentary about the Gilgo Beach Killer debuted on A&E?,[The Killing Season],The Killing Season,✅
2,Right Back At It Again contains lyrics co-written by the singer born in what city?,"[Gainesville, Florida, Gainesville]","Gainesville, Florida",✅
3,What year was the party of the winner of the 1971 San Francisco mayoral election founded?,[1828],1986,❌
4,Which author is English: John Braine or Studs Terkel?,[John Braine],John Braine,✅
5,Anthony Dirrell is the brother of which super middleweight title holder?,[Andre Dirrell],Andre Dirrell,✅
6,In which city is the sports nutrition business established by Oliver Cookson based ?,"[Cheshire, Cheshire, UK]","Cheshire, UK",✅
7,Find the birth date of the actor who played roles in First Wives Club and Searching for the Elephant.,"[February 13, 1980]",1958,❌
8,Kyle Moran was born in the town on what river?,"[Castletown, Castletown River]",Castletown River,✅
9,What is the name of one branch of Robert D. Braun's speciality?,"[aeronautical engineering, astronautical engineering, aeronautics, astronautics]",Aeronautics and Astronautics,❌


# 5 - Multi-hop condensed retrieval with automatic demos and query fusion

This is the final program where we'll fix the following problems:

1. The search transformations invoke the LM without demonstrations in the prompt, as we only have training data for question-answer pairs and not intermediate labels (e.g., search queries)

2. The QA prompt uses passages (context) and CoT to answer the question, however the training demonstrations include neither

3. The search transformations commit to a single query per hop, which may single out an unproductive chain of passages and fail to uncover relevant information

In [12]:
# Make copy of train set to annotate
train_copy = train.copy()

In [13]:
def multihop_search_v1(example, max_hops=2, k=2):
    """
    Run multiple hops of search retrieval and return the retrieved passages as context.
    """
    context = []

    for hop in range(max_hops):
        # Generate query
        if hop == 0:
            completion = first_hop_chain.run(question=example['question'])
        else:
            completion = followup_hop_chain.run(context=format_context(context), question=example['question'])

        search_query = extract_last_line(completion)

        # Retrieve k results based on the generated query and add to context
        passages = rm(search_query, k=k)
        context.extend(passages)

    # Add context for later use as demonstration
    example['context'] = context
    return example


def QA_predict(example):
    """
    Run QA on the given example and context.
    """
    print(example['context'])
    completion = answer_chain.run(context=format_context(example['context']), question=example['question'])

    # Add predicted answer for later use as demonstration
    example['prediction'] = extract_last_line(completion)
    return example

In [14]:
def passage_has_answer(passage, answer):
    # TODO: Improve this
    return answer in passage
    

def passage_match(passages, answer):
    """
    Check if any of the passages contains the answer.
    """ 
    return any(passage_has_answer(passage, answer) for passage in passages)


In [15]:
def multihop_attempt(example):
    # Search and skip examples where search fails
    example = multihop_search_v1(example)
    # if not passage_match(example['context'], example['answer']): return None

    # Predict and skip examples where prediction fails
    example = QA_predict(example)
    # if not passage_match(example['prediction'], example['answer']): return None

    return example
    