##Set up

Import all the necessary packages, set up and configure llms. Set GPT-3 as default and switch to Llama 2 when necessary

In [1]:
try:
    # This library is our indicator that the required installs
    # need to be done.
    import datasets
    root_path = '.'
except ModuleNotFoundError:
    !pip install -r requirements.txt
    root_path = 'dspy'

Collecting jupyter>=1.0.0 (from -r requirements.txt (line 7))
  Downloading jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Collecting transformers==4.37.1 (from -r requirements.txt (line 14))
  Downloading transformers-4.37.1-py3-none-any.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.14.6 (from -r requirements.txt (line 15))
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting spacy==3.7.2 (from -r requirements.txt (line 16))
  Downloading spacy-3.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dspy-ai==2.3.1 (from -r requirements.txt (line 17))
  Downloading dspy_ai-2.3.1-py3-n

In [2]:
from datasets import load_dataset
import openai
import os
import dspy
from dotenv import load_dotenv

In [3]:
# keep the API keys in a `.env` file in the local root directory
load_dotenv()

True

In [4]:
os.environ["DSP_NOTEBOOK_CACHEDIR"] = os.path.join(root_path, 'cache')

openai_key = os.getenv('OPENAI_API_KEY')  # or replace with your API key (optional)

anyscale_key = os.getenv('ANYSCALE_API_KEY')  # or replace with your API key (optional)

anyscale_base = os.getenv('ANYSCALE_API_BASE')

In [5]:
gpt_3_turbo = dspy.OpenAI(model='gpt-3.5-turbo', api_key=openai_key)

llama_2 = dspy.Anyscale(model='meta-llama/Llama-2-70b-chat-hf', api_key=anyscale_key, api_base=anyscale_base)

dspy.configure(lm=gpt_3_turbo)

# SQuAD

import SQuAD dataset. Will be using the training set for few-shot examples and the validation set for testing

In [6]:
squad = load_dataset("squad")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [7]:
def get_squad_split(squad, split="validation"):
    """
    Use `split='train'` for the train split.

    Returns
    -------
    list of dspy.Example with attributes question, answer

    """
    data = zip(*[squad[split][field] for field in squad[split].features])
    exs = [dspy.Example(context=context, answer=a['text'][0], question=q).with_inputs("context", "answer")
           for eid, title, context, q, a in data]
    return exs

In [8]:
squad_train = get_squad_split(squad, split="train")

In [9]:
squad_dev = get_squad_split(squad)

In [10]:
squad_train[0]

Example({'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'answer': 'Saint Bernadette Soubirous', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?'}) (input_keys={'context', 'answer'})

In [11]:
squad_dev[200]

Example({'context': 'Despite waiving longtime running back DeAngelo Williams and losing top wide receiver Kelvin Benjamin to a torn ACL in the preseason, the Carolina Panthers had their best regular season in franchise history, becoming the seventh team to win at least 15 regular season games since the league expanded to a 16-game schedule in 1978. Carolina started the season 14–0, not only setting franchise records for the best start and the longest single-season winning streak, but also posting the best start to a season by an NFC team in NFL history, breaking the 13–0 record previously shared with the 2009 New Orleans Saints and the 2011 Green Bay Packers. With their NFC-best 15–1 regular season record, the Panthers clinched home-field advantage throughout the NFC playoffs for the first time in franchise history. Ten players were selected to the Pro Bowl (the most in franchise history) along with eight All-Pro selections.', 'answer': 'Carolina Panthers', 'question': 'Who had the bes

# Testing GPT-3 and Llama 2

Run some initial tests to make sure both models are working and can output

In [12]:
gpt_3_turbo("Which award did Gary Zukav's first book receive?")

['Gary Zukav\'s first book, "The Dancing Wu Li Masters: An Overview of the New Physics," received the 1979 American Book Award for Science.']

In [13]:
llama_2("Which award did Gary Zukav's first book receive?")

['  Gary Zukav\'s first book, "The Dancing Wu Li Masters," received the National Book Award for Science in 1979.']

# Basic Signature and Module

Set up the question generation signature that prompts the LLM to generate the question asked from the given context and the answer. Set up the module for the question generation model, using the signature.

In [58]:
class BasicQGSignature(dspy.Signature):
    __doc__ = """Given the context and an answer, provide a single question that can be answered by the answer based on the context."""

    context = dspy.InputField(desc="will contain answer")
    answer = dspy.InputField()
    question = dspy.OutputField(desc="short question")

In [59]:
class BasicQG(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_question = dspy.Predict(BasicQGSignature)

    def forward(self, context, answer):
        return self.generate_question(context=context, answer=answer)

# Testing question generation

Run test of the models to make sure question generation if working as expected.

In [60]:
basic_qg_model = BasicQG()

In [61]:
test_example = squad_dev[200]
test_context = test_example.context
test_answer = test_example.answer
test_question = test_example.question

In [62]:
basic_qg_model(context=test_context, answer=test_answer)

Prediction(
    question='Which NFL team had their best regular season in franchise history, becoming the seventh team to win at least 15 regular season games since the league expanded to a 16-game schedule in 1978?'
)

In [63]:
with dspy.context(lm=llama_2):
  response = basic_qg_model(context=test_context, answer=test_answer)
response

Prediction(
    question='What NFL team had their best regular season in franchise history in 2015, winning 15 games and clinching home-field advantage throughout the NFC playoffs for the first time?'
)

In [64]:
llama_2.inspect_history(n=1)





Given the context and an answer, provide a single question that can be answered by the answer based on the context.

---

Follow the following format.

Context: will contain answer
Answer: ${answer}
Question: short question

---

Context: Despite waiving longtime running back DeAngelo Williams and losing top wide receiver Kelvin Benjamin to a torn ACL in the preseason, the Carolina Panthers had their best regular season in franchise history, becoming the seventh team to win at least 15 regular season games since the league expanded to a 16-game schedule in 1978. Carolina started the season 14–0, not only setting franchise records for the best start and the longest single-season winning streak, but also posting the best start to a season by an NFC team in NFL history, breaking the 13–0 record previously shared with the 2009 New Orleans Saints and the 2011 Green Bay Packers. With their NFC-best 15–1 regular season record, the Panthers clinched home-field advantage throughout the NFC 

# Testing few-shot prompting

In [22]:
from dspy.teleprompt import LabeledFewShot

dspy/cache/compiler


In [65]:
fewshot_teleprompter = LabeledFewShot(k=3)

In [66]:
fewshot_qg_model = fewshot_teleprompter.compile(basic_qg_model, trainset=squad_train)

In [67]:
fewshot_qg_model(context=test_context, answer=test_answer)

Prediction(
    question='What team had their best regular season in franchise history in 2015?'
)

In [68]:
with dspy.context(lm=llama_2):
  response = fewshot_qg_model(context=test_context, answer=test_answer)
response

Prediction(
    question="Sure! Here are the questions based on the context and answer:\n\n1. What group did Paul VI address in New York in 1965?\n2. What did Sander's study show in terms of black law students rankings?\n3. What problems does linguistic anthropology bring linguistic methods to bear on?\n4. Which NFL team had their best regular season in franchise history in 2015?"
)

In [69]:
gpt_3_turbo.inspect_history(n=1)





Given the context and an answer, provide a single question that can be answered by the answer based on the context.

---

Follow the following format.

Context: will contain answer
Answer: ${answer}
Question: short question

---

Context: Pope Paul VI became the first reigning pontiff ever to visit the Americas when he flew to New York in October 1965 to address the United Nations. As a gesture of goodwill, the pope gave to the UN two pieces of papal jewelry, a diamond cross and ring, with the hopes that the proceeds from their sale at auction would contribute to the UN's efforts to end human suffering. During the pope's visit, as the U.S. involvement in the Vietnam War escalated under President Johnson, Paul VI pleaded for peace before the UN:
Answer: United Nations
Question: What group did Paul VI address in New York in 1965?

---

Context: UCLA professor Richard H. Sander published an article in the November 2004 issue of the Stanford Law Review that questioned the effectiveness

In [70]:
llama_2.inspect_history(n=1)





Given the context and an answer, provide a single question that can be answered by the answer based on the context.

---

Follow the following format.

Context: will contain answer
Answer: ${answer}
Question: short question

---

Context: Pope Paul VI became the first reigning pontiff ever to visit the Americas when he flew to New York in October 1965 to address the United Nations. As a gesture of goodwill, the pope gave to the UN two pieces of papal jewelry, a diamond cross and ring, with the hopes that the proceeds from their sale at auction would contribute to the UN's efforts to end human suffering. During the pope's visit, as the U.S. involvement in the Vietnam War escalated under President Johnson, Paul VI pleaded for peace before the UN:
Answer: United Nations
Question: What group did Paul VI address in New York in 1965?

---

Context: UCLA professor Richard H. Sander published an article in the November 2004 issue of the Stanford Law Review that questioned the effectiveness

# Running experiments

Models are set up. Run experiments on the dev set, using the train set as the few shot examples. Run evaluation on the responses using BLUE metrics, which test similarity of the generated questions to the questions given in the dataset.

In [2]:
import random
import tqdm
import time
import pandas as pd

In [29]:
len(squad_dev)

10570

In [36]:
random.seed(0)
test_examples = random.sample(squad_dev, 400)
len(test_examples)

400

In [43]:
# store results as a csv
# context, answer, gold question, gpt-3, llama2
contexts = []
answers = []
gold_questions = []

for example in test_examples:
  contexts.append(example.context)
  answers.append(example.answer)
  gold_questions.append(example.question)

In [44]:
examples_dict = {"context" : contexts,
                 "answer" : answers,
                 "gold_question" : gold_questions}

In [45]:
examples_df = pd.DataFrame.from_dict(examples_dict)
examples_df.head()

In [71]:
# necessary experiments to run
# GPT-3 zero-shot (baseline) -> gpt3_zero_shot
# GPT-3 few-shot -> gpt3_few_shot
# Llama2 zero-shot -> llama2_zero_shot
# llama2 few-shot -> llama2_few_shot

In [72]:
examples_df.head()

Unnamed: 0,context,answer,gold_question
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?


In [77]:
# baseline model
gpt3_zero_shot = []
for example in tqdm.tqdm(test_examples):
    generated_question = basic_qg_model(context=example.context, answer=example.answer).question
    gpt3_zero_shot.append(generated_question)
    time.sleep(5)

100%|██████████| 400/400 [37:48<00:00,  5.67s/it]


In [78]:
if len(gpt3_zero_shot) < len(examples_df):
  gpt3_zero_shot += [''] * (len(examples_df) - len(gpt3_zero_shot))

examples_df["gpt3_zero_shot"] = gpt3_zero_shot
examples_df.head()

Unnamed: 0,context,answer,gold_question,gpt3_zero_shot
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...,What aspect of Genghis Khan's historical legac...
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...,What did the U.S. Court of Appeals for the Fir...
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?,How many Grammy awards has Lady Gaga won?
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...,In what year did the UK formally subscribe to ...
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?,When does the annual animal migration occur in...


In [79]:
examples_df.to_csv('examples_df_baseline.csv', index=False)

In [84]:
# GPT-3 few shot
gpt3_few_shot = []
for example in tqdm.tqdm(test_examples):
    generated_question = fewshot_qg_model(context=example.context, answer=example.answer).question
    gpt3_few_shot.append(generated_question)
    time.sleep(5)

100%|██████████| 400/400 [37:44<00:00,  5.66s/it]


In [85]:
if len(gpt3_few_shot) < len(examples_df):
  gpt3_few_shot += [''] * (len(examples_df) - len(gpt3_few_shot))

examples_df["gpt3_few_shot"] = gpt3_few_shot
examples_df.head()

Unnamed: 0,context,answer,gold_question,gpt3_zero_shot,gpt3_few_shot
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...,What aspect of Genghis Khan's historical legac...,What aspect of Genghis Khan's legacy is percei...
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...,What did the U.S. Court of Appeals for the Fir...,What did the U.S. Court of Appeals for the Fir...
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,How many Grammy awards has Lady Gaga won?
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...,In what year did the UK formally subscribe to ...,When did the UK Labour Party come to power?
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?,When does the annual animal migration occur in...,When does the annual animal migration occur in...


In [86]:
examples_df.to_csv('examples_df_baseline_gpt3_few_shot.csv', index=False)

In [90]:
# Llama2 zero shot
llama2_zero_shot = []
with dspy.context(lm=llama_2):
  for example in tqdm.tqdm(test_examples):
      generated_question = basic_qg_model(context=example.context, answer=example.answer).question
      llama2_zero_shot.append(generated_question)
      time.sleep(5)

100%|██████████| 400/400 [44:22<00:00,  6.66s/it]


In [91]:
if len(llama2_zero_shot) < len(examples_df):
  llama2_zero_shot += [''] * (len(examples_df) - len(llama2_zero_shot))

examples_df["llama2_zero_shot"] = llama2_zero_shot
examples_df.head()

Unnamed: 0,context,answer,gold_question,gpt3_zero_shot,gpt3_few_shot,llama2_zero_shot
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...,What aspect of Genghis Khan's historical legac...,What aspect of Genghis Khan's legacy is percei...,What is Genghis Khan's reputation among Mongol...
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...,What did the U.S. Court of Appeals for the Fir...,What did the U.S. Court of Appeals for the Fir...,What was the reason for the judge's decision t...
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,How many Grammy awards has Lady Gaga won?
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...,In what year did the UK formally subscribe to ...,When did the UK Labour Party come to power?,What year did the UK formally subscribe to the...
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?,When does the annual animal migration occur in...,When does the annual animal migration occur in...,When does the annual animal migration occur in...


In [92]:
examples_df.to_csv('examples_df_baseline_gpt3_few_shot_llama2.csv', index=False)

In [97]:
# Llama2 few shot
llama2_few_shot = []
with dspy.context(lm=llama_2):
  for example in tqdm.tqdm(test_examples):
      generated_question = fewshot_qg_model(context=example.context, answer=example.answer).question
      llama2_few_shot.append(generated_question)
      time.sleep(5)

100%|██████████| 400/400 [48:27<00:00,  7.27s/it]


In [98]:
if len(llama2_few_shot) < len(examples_df):
  llama2_few_shot += [''] * (len(examples_df) - len(llama2_few_shot))

examples_df["llama2_few_shot"] = llama2_few_shot
examples_df.head()

Unnamed: 0,context,answer,gold_question,gpt3_zero_shot,gpt3_few_shot,llama2_zero_shot,llama2_few_shot
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...,What aspect of Genghis Khan's historical legac...,What aspect of Genghis Khan's legacy is percei...,What is Genghis Khan's reputation among Mongol...,What is the perception of Genghis Khan's bruta...
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...,What did the U.S. Court of Appeals for the Fir...,What did the U.S. Court of Appeals for the Fir...,What was the reason for the judge's decision t...,What was the reason given by the U.S. Court of...
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,"Sure, here are the questions based on the give..."
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...,In what year did the UK formally subscribe to ...,When did the UK Labour Party come to power?,What year did the UK formally subscribe to the...,"Sure, here are the questions based on the cont..."
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?,When does the annual animal migration occur in...,When does the annual animal migration occur in...,When does the annual animal migration occur in...,"Sure, here are the questions based on the give..."


In [99]:
examples_df.to_csv('examples_df.csv', index=False)

# Evaluation

Quantitative evaluation: Use BLEU to evaluate the results similarity to the gold answers. Run sentence BLEU for each sentence (in comparison with gold sentence) and run corpus BLEU for entire corpus.

Qualitative evaluation: examine the generated questions and look at question types and see if there were patterns in which kinds of questions the models answered correctly. Llama2 returned a lot of extra fluff in its answers. Examine how often it did that and if it was more common in the zero-shot and few-shot.

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize
import string

In [3]:
# read results file as csv
examples_df = pd.read_csv('examples_df.csv')
examples_df.head(5)

Unnamed: 0,context,answer,gold_question,gpt3_zero_shot,gpt3_few_shot,llama2_zero_shot,llama2_few_shot
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...,What aspect of Genghis Khan's historical legac...,What aspect of Genghis Khan's legacy is percei...,What is Genghis Khan's reputation among Mongol...,What is the perception of Genghis Khan's bruta...
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...,What did the U.S. Court of Appeals for the Fir...,What did the U.S. Court of Appeals for the Fir...,What was the reason for the judge's decision t...,What was the reason given by the U.S. Court of...
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,"Sure, here are the questions based on the give..."
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...,In what year did the UK formally subscribe to ...,When did the UK Labour Party come to power?,What year did the UK formally subscribe to the...,"Sure, here are the questions based on the cont..."
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?,When does the annual animal migration occur in...,When does the annual animal migration occur in...,When does the annual animal migration occur in...,"Sure, here are the questions based on the give..."


In [None]:
# add new columns containing the sentence bleu scores for each result
# gpt3_zero_shot_bleu
# gpt3_few_shot_bleu
# llama2_zero_shot_bleu
# llama2_few_shot_bleu

In [44]:
exclude = set(string.punctuation)

def tokenize(sent):
  sent_stripped = ''.join(ch for ch in sent if ch not in exclude)
  return (word_tokenize(sent_stripped))

In [57]:
ind = 0
test_reference = tokenize(examples_df["gold_question"].iloc[ind])
test_sent = tokenize(examples_df["gpt3_zero_shot"].iloc[ind])
print(test_reference)
print(test_sent)

['What', 'do', 'some', 'Mongolians', 'feel', 'nonMongolian', 'historians', 'exaggerate', 'about', 'Genghis', 'Khan']
['What', 'aspect', 'of', 'Genghis', 'Khans', 'historical', 'legacy', 'do', 'Mongolians', 'believe', 'has', 'been', 'unfairly', 'exaggerated', 'by', 'nonMongolian', 'historical', 'records']


In [58]:
score = sentence_bleu([test_reference], test_sent)
print(score)

1.322612729825152e-231


In [49]:
gpt3_zero_shot_bleu	= []
gpt3_few_shot_bleu = []
llama2_zero_shot_bleu = []
llama2_few_shot_bleu = []
corpus_references = []

for i in range(len(examples_df)):
  gold = examples_df["gold_question"].iloc[i]
  gpt3_zero_shot_sent = examples_df["gpt3_zero_shot"].iloc[i]
  gpt3_few_shot_sent = examples_df["gpt3_few_shot"].iloc[i]
  llama2_zero_shot_sent = examples_df["llama2_zero_shot"].iloc[i]
  llama2_few_shot_sent = examples_df["llama2_few_shot"].iloc[i]

  gpt3_zero_shot_bleu.append(sentence_bleu([gold], gpt3_zero_shot_sent))
  gpt3_few_shot_bleu.append(sentence_bleu([gold], gpt3_few_shot_sent))
  llama2_zero_shot_bleu.append(sentence_bleu([gold], llama2_zero_shot_sent))
  llama2_few_shot_bleu.append(sentence_bleu([gold], llama2_few_shot_sent))

  corpus_references.append([gold])


In [59]:
gpt3_zero_shot_bleu[0:5]

[0.4643325880151344,
 0.12565920767043673,
 0.80846720196545,
 0.8538651765124674,
 0.8103667351656416]

In [None]:
examples_df["gpt3_zero_shot_bleu"] = gpt3_zero_shot_bleu
examples_df["gpt3_few_shot_bleu"] = gpt3_few_shot_bleu
examples_df["llama2_zero_shot_bleu"] = llama2_zero_shot_bleu
examples_df["llama2_few_shot_bleu"] = llama2_few_shot_bleu

In [61]:
cols = list(examples_df.columns.values)
cols

['context',
 'answer',
 'gold_question',
 'gpt3_zero_shot',
 'gpt3_few_shot',
 'llama2_zero_shot',
 'llama2_few_shot',
 'gpt3_zero_shot_bleu',
 'gpt3_few_shot_bleu',
 'llama2_zero_shot_bleu',
 'llama2_few_shot_bleu']

In [63]:
cols_list = ['context',
 'answer',
 'gold_question',
 'gpt3_zero_shot',
 'gpt3_zero_shot_bleu',
 'gpt3_few_shot',
 'gpt3_few_shot_bleu',
 'llama2_zero_shot',
 'llama2_zero_shot_bleu',
 'llama2_few_shot',
 'llama2_few_shot_bleu']

In [66]:
examples_df = examples_df[cols_list]
examples_df

Unnamed: 0,context,answer,gold_question,gpt3_zero_shot,gpt3_zero_shot_bleu,gpt3_few_shot,gpt3_few_shot_bleu,llama2_zero_shot,llama2_zero_shot_bleu,llama2_few_shot,llama2_few_shot_bleu
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...,What aspect of Genghis Khan's historical legac...,0.464333,What aspect of Genghis Khan's legacy is percei...,0.490631,What is Genghis Khan's reputation among Mongol...,0.347810,What is the perception of Genghis Khan's bruta...,0.387621
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...,What did the U.S. Court of Appeals for the Fir...,0.125659,What did the U.S. Court of Appeals for the Fir...,0.172925,What was the reason for the judge's decision t...,0.140713,What was the reason given by the U.S. Court of...,0.174414
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,0.808467,How many Grammy awards has Lady Gaga won?,0.808467,How many Grammy awards has Lady Gaga won?,0.808467,"Sure, here are the questions based on the give...",0.098372
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...,In what year did the UK formally subscribe to ...,0.853865,When did the UK Labour Party come to power?,0.286595,What year did the UK formally subscribe to the...,0.457002,"Sure, here are the questions based on the cont...",0.179999
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?,When does the annual animal migration occur in...,0.810367,When does the annual animal migration occur in...,0.810367,When does the annual animal migration occur in...,0.646982,"Sure, here are the questions based on the give...",0.113442
...,...,...,...,...,...,...,...,...,...,...,...
395,Oxygen was discovered independently by Carl Wi...,1774,In what year did Joseph Priestley recognize ox...,In what year did Joseph Priestley independentl...,0.532002,When was oxygen discovered by Joseph Priestley?,0.536630,What was the year in which Joseph Priestley di...,0.254374,Sure! Here are the questions based on the cont...,0.104158
396,The Standard Industrial Classification and the...,"building construction, heavy and civil enginee...",What are the three construction subsectors?,What are the three subsectors of the construct...,0.248688,What are the three subsectors of the construct...,0.248688,What are the three subsectors that the classif...,0.411902,Sure! Here are the questions based on the give...,0.088775
397,Tesla's demonstration of his induction motor a...,lighting systems,What did the war first start out as being betw...,What were Edison and Westinghouse initially co...,0.179065,What was the initial competition between Thoma...,0.247633,"What was the focus of the ""War of Currents"" co...",0.213442,"What was the initial focus of the ""War of Curr...",0.222291
398,"By 1954, all U.S. networks had regained contro...",All-Channel Receiver Act,What important legislation was passed by Congr...,What legislation passed by Congress in 1961 ma...,0.573852,What act passed by Congress in 1961 mandated t...,0.483414,What piece of legislation mandated the inclusi...,0.302451,What was the name of the act passed by Congres...,0.465998


In [68]:
corpus_references[0:5]

[['What do some Mongolians feel non-Mongolian historians exaggerate about Genghis Khan?'],
 ['Why is giving a defiant speech sometimes more harmful for the individual?'],
 ['How many Grammys has Lady Gaga won?'],
 ['When did the UK formally subscribe to the Agreement on Social Policy?'],
 ['When does the annual animal migration occur?']]

In [69]:
len(corpus_references)

400

In [71]:
corpus_scores = []

corpus_scores.append(corpus_bleu(corpus_references, list(examples_df["gpt3_zero_shot"])))
corpus_scores.append(corpus_bleu(corpus_references, list(examples_df["gpt3_few_shot"])))
corpus_scores.append(corpus_bleu(corpus_references, list(examples_df["llama2_zero_shot"])))
corpus_scores.append(corpus_bleu(corpus_references, list(examples_df["llama2_few_shot"])))

corpus_scores

[0.4237173464587261, 0.4752617232408119, 0.3355739508589386, 0.176668410026455]

In [72]:
models_list = ["gpt3_zero_shot", "gpt3_few_shot", "llama2_zero_shot", "llama2_few_shot"]

In [73]:
corpus_dict = {"model" : models_list, "corpus_BLEU_score" : corpus_scores}

In [75]:
corpus_df = pd.DataFrame.from_dict(corpus_dict)
corpus_df

Unnamed: 0,model,corpus_BLEU_score
0,gpt3_zero_shot,0.423717
1,gpt3_few_shot,0.475262
2,llama2_zero_shot,0.335574
3,llama2_few_shot,0.176668


In [67]:
examples_df.to_csv('examples_df_BLEU.csv', index=False)

In [76]:
corpus_df.to_csv('corpus_BLEU.csv', index=False)

In [3]:
# read results file as csv
examples_df = pd.read_csv('examples_df_BLEU.csv')
examples_df.head(5)

Unnamed: 0,context,answer,gold_question,gpt3_zero_shot,gpt3_zero_shot_bleu,gpt3_few_shot,gpt3_few_shot_bleu,llama2_zero_shot,llama2_zero_shot_bleu,llama2_few_shot,llama2_few_shot_bleu
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...,What aspect of Genghis Khan's historical legac...,0.464333,What aspect of Genghis Khan's legacy is percei...,0.490631,What is Genghis Khan's reputation among Mongol...,0.34781,What is the perception of Genghis Khan's bruta...,0.387621
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...,What did the U.S. Court of Appeals for the Fir...,0.125659,What did the U.S. Court of Appeals for the Fir...,0.172925,What was the reason for the judge's decision t...,0.140713,What was the reason given by the U.S. Court of...,0.174414
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,0.808467,How many Grammy awards has Lady Gaga won?,0.808467,How many Grammy awards has Lady Gaga won?,0.808467,"Sure, here are the questions based on the give...",0.098372
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...,In what year did the UK formally subscribe to ...,0.853865,When did the UK Labour Party come to power?,0.286595,What year did the UK formally subscribe to the...,0.457002,"Sure, here are the questions based on the cont...",0.179999
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?,When does the annual animal migration occur in...,0.810367,When does the annual animal migration occur in...,0.810367,When does the annual animal migration occur in...,0.646982,"Sure, here are the questions based on the give...",0.113442


In [4]:
gpt3_zero_shot_df = examples_df[["context", "answer", "gold_question", "gpt3_zero_shot", "gpt3_zero_shot_bleu"]]
gpt3_zero_shot_df.head(5)

Unnamed: 0,context,answer,gold_question,gpt3_zero_shot,gpt3_zero_shot_bleu
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...,What aspect of Genghis Khan's historical legac...,0.464333
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...,What did the U.S. Court of Appeals for the Fir...,0.125659
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,0.808467
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...,In what year did the UK formally subscribe to ...,0.853865
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?,When does the annual animal migration occur in...,0.810367


In [5]:
gpt3_zero_shot_df.to_csv('gpt3_zero_shot_df_BLEU.csv', index=False)

In [6]:
gpt3_few_shot_df = examples_df[["context", "answer", "gold_question", "gpt3_few_shot", "gpt3_few_shot_bleu"]]
gpt3_few_shot_df.head(5)

Unnamed: 0,context,answer,gold_question,gpt3_few_shot,gpt3_few_shot_bleu
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...,What aspect of Genghis Khan's legacy is percei...,0.490631
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...,What did the U.S. Court of Appeals for the Fir...,0.172925
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,0.808467
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...,When did the UK Labour Party come to power?,0.286595
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?,When does the annual animal migration occur in...,0.810367


In [7]:
gpt3_few_shot_df.to_csv('gpt3_few_shot_df_BLEU.csv', index=False)

In [8]:
llama2_zero_shot_df = examples_df[["context", "answer", "gold_question", "llama2_zero_shot", "llama2_zero_shot_bleu"]]
llama2_zero_shot_df.head(5)

Unnamed: 0,context,answer,gold_question,llama2_zero_shot,llama2_zero_shot_bleu
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...,What is Genghis Khan's reputation among Mongol...,0.34781
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...,What was the reason for the judge's decision t...,0.140713
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?,How many Grammy awards has Lady Gaga won?,0.808467
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...,What year did the UK formally subscribe to the...,0.457002
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?,When does the annual animal migration occur in...,0.646982


In [10]:
llama2_zero_shot_df.to_csv('llama2_zero_shot_df_BLEU.csv', index=False)

In [9]:
llama2_few_shot_df = examples_df[["context", "answer", "gold_question", "llama2_few_shot", "llama2_few_shot_bleu"]]
llama2_few_shot_df.head(5)

Unnamed: 0,context,answer,gold_question,llama2_few_shot,llama2_few_shot_bleu
0,In the early 1990s the memory of Genghis Khan ...,his brutality,What do some Mongolians feel non-Mongolian his...,What is the perception of Genghis Khan's bruta...,0.387621
1,Some civil disobedience defendants choose to m...,lack of remorse,Why is giving a defiant speech sometimes more ...,What was the reason given by the U.S. Court of...,0.174414
2,Six-time Grammy winner and Academy Award nomin...,Six,How many Grammys has Lady Gaga won?,"Sure, here are the questions based on the give...",0.098372
3,Following the election of the UK Labour Party ...,1997,When did the UK formally subscribe to the Agre...,"Sure, here are the questions based on the cont...",0.179999
4,"The ""Big Five"" game animals of Africa, that is...",between June and September,When does the annual animal migration occur?,"Sure, here are the questions based on the give...",0.113442


In [11]:
llama2_few_shot_df.to_csv('llama2_few_shot_df_BLEU.csv', index=False)