# MultiDoc-QA - Indexing, preparing to rerank, generating explanations and evaluating

Author: Monique Monteiro (moniquelouise@gmail.com)

Inspired by https://github.com/neuralmind-ai/visconde

@inproceedings{10.1007/978-3-031-28238-6_44,
author = {Pereira, Jayr and Fidalgo, Robson and Lotufo, Roberto and Nogueira, Rodrigo},
title = {Visconde: Multi-Document QA With GPT-3 And Neural Reranking},
year = {2023},
isbn = {978-3-031-28237-9},
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg},
url = {https://doi.org/10.1007/978-3-031-28238-6_44},
doi = {10.1007/978-3-031-28238-6_44},
booktitle = {Advances in Information Retrieval: 45th European Conference on Information Retrieval, ECIR 2023, Dublin, Ireland, April 2–6, 2023, Proceedings, Part II},
pages = {534–543},
numpages = {10},
location = {Dublin, Ireland}
}

In [2]:
# Mount Google Drive at /content/gdrive so the notebook can read and write under it;
# force_remount=True asks for a fresh mount even if one already exists.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
# Base folder on the mounted drive; every dataset, index and result path below is built from it.
main_dir = "/content/gdrive/MyDrive/Unicamp-aula-11"

## Libraries Installation

In [4]:
%%shell
# This notebook calls openai.ChatCompletion and imports openai.error, which only
# exist in the pre-1.0 SDK, so keep the install below 1.0.
pip install "openai<1" -q
pip install pyserini -q
pip install faiss-cpu -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.9/71.9 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.6/149.6 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m124.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━



In [10]:
# sentence_transformers supplies the embedding model used in the evaluation section to pick few-shot examples.
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125926 sha256=f8018607602961822d5989958941d69d2fd813265f0927123ec69f903e678a7b
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2


## Dataset Download

In [None]:
%%shell
# Download the IIRC context articles archive and the test split.
wget https://iirc-dataset.s3.us-west-2.amazonaws.com/context_articles.tar.gz 
wget https://iirc-dataset.s3.us-west-2.amazonaws.com/iirc_test.json 

# Unpack the context articles into the current working directory.
tar -xf context_articles.tar.gz

--2023-05-14 19:04:24--  https://iirc-dataset.s3.us-west-2.amazonaws.com/context_articles.tar.gz
Resolving iirc-dataset.s3.us-west-2.amazonaws.com (iirc-dataset.s3.us-west-2.amazonaws.com)... 52.92.181.138, 52.92.181.146, 3.5.78.195, ...
Connecting to iirc-dataset.s3.us-west-2.amazonaws.com (iirc-dataset.s3.us-west-2.amazonaws.com)|52.92.181.138|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 385263479 (367M) [application/x-gzip]
Saving to: ‘context_articles.tar.gz’


2023-05-14 19:04:44 (19.4 MB/s) - ‘context_articles.tar.gz’ saved [385263479/385263479]

--2023-05-14 19:04:44--  https://iirc-dataset.s3.us-west-2.amazonaws.com/iirc_test.json
Resolving iirc-dataset.s3.us-west-2.amazonaws.com (iirc-dataset.s3.us-west-2.amazonaws.com)... 52.218.132.81, 52.218.217.57, 52.92.229.210, ...
Connecting to iirc-dataset.s3.us-west-2.amazonaws.com (iirc-dataset.s3.us-west-2.amazonaws.com)|52.218.132.81|:443... connected.
HTTP request sent, awaiting response... 200 OK
Leng



In [None]:
# Move the downloaded files under main_dir, where the later cells load them from.
!mv context_articles.json {main_dir}/
!mv iirc_test.json {main_dir}/

In [None]:
# Download the IIRC train/dev archive.
!wget https://iirc-dataset.s3.us-west-2.amazonaws.com/iirc_train_dev.tgz 

--2023-05-16 01:10:47--  https://iirc-dataset.s3.us-west-2.amazonaws.com/iirc_train_dev.tgz
Resolving iirc-dataset.s3.us-west-2.amazonaws.com (iirc-dataset.s3.us-west-2.amazonaws.com)... 52.92.138.218, 52.218.168.217, 52.218.235.17, ...
Connecting to iirc-dataset.s3.us-west-2.amazonaws.com (iirc-dataset.s3.us-west-2.amazonaws.com)|52.92.138.218|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5713428 (5.4M) [application/gzip]
Saving to: ‘iirc_train_dev.tgz’


2023-05-16 01:10:49 (5.17 MB/s) - ‘iirc_train_dev.tgz’ saved [5713428/5713428]



In [None]:
# Unpack the train/dev archive into the current working directory.
!tar -xf iirc_train_dev.tgz

In [None]:
# Move the extracted train/dev folder under main_dir.
!mv iirc_train_dev {main_dir}/

## OpenAI GPT 3.5 Turbo (ChatGPT) integration

In [5]:
import openai
import os

In [None]:
# Read the key with getpass so it is neither echoed while typing nor saved in the cell output.
from getpass import getpass

openai.api_key = getpass("Key:")

In [7]:
def generate(system_prompt, prompts, temperature=0):
  """Send one system message plus a sequence of user messages to gpt-3.5-turbo.

  Args:
    system_prompt: content of the single "system" message that opens the chat.
    prompts: iterable of strings; each one becomes a "user" message, in order.
    temperature: sampling temperature forwarded to the API (default 0).

  Returns:
    The "content" field of the first choice's message in the API response.
  """
  conversation = [{"role": "system", "content": system_prompt}]
  for user_text in prompts:
    conversation.append({"role": "user", "content": user_text})

  completion = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=conversation,
      temperature=temperature,
  )
  first_choice = completion["choices"][0]
  return first_choice['message']['content']

## Query Decomposition

In [8]:
def decompose(question):
  """Ask the chat model to split a question into self-contained sub-questions.

  The model is shown five worked examples and then the question as "Example 6".
  Its reply is expected to be either the sentence "The question needs no
  decomposition" or a list of lines shaped like "<n>: <sub-question>".

  Args:
    question: the original question text.

  Returns:
    A non-empty list of sub-question strings. Falls back to [question] when the
    model says no decomposition is needed, when the reply is empty, or when a
    non-blank line does not follow the "<label>: <text>" shape.
  """
  system_prompt = "Decompose a question in self-contained sub-questions. Use \"The question needs no decomposition\" when no decomposition is needed."
  user_prompt_1 = "\n\nExample 1:\n\nQuestion: Is Hamlet more common on IMDB than Comedy of Errors?\n\nDecompositions: \n1: How many listings of Hamlet are there on IMDB?\n2: How many listing of Comedy of Errors is there on IMDB?\n\nExample 2:\n\nQuestion: Are birds important to badminton?\n\nDecompositions:\nThe question needs no decomposition\n\nExample 3:\n\nQuestion: Is it legal for a licensed child driving Mercedes-Benz to be employed in US?\n\nDecompositions:\n1: What is the minimum driving age in the US?\n2: What is the minimum age for someone to be employed in the US?\n\nExample 4:\n\nQuestion: Are all cucumbers the same texture?\n\nDecompositions:\nThe question needs no decomposition\n\nExample 5:\n\nQuestion: Hydrogen's atomic number squared exceeds number of Spice Girls?\n\nDecompositions:\n1: What is the atomic number of hydrogen?\n2: How many Spice Girls are there?"
  user_prompt_2 = "\n\nExample 6:\n\nQuestion: {0}" + "\n\nDecompositions:"

  res = generate(system_prompt, [user_prompt_1, user_prompt_2.format(question)])

  # The few-shot examples show the sentinel without a trailing period, while the
  # model may add one; accept both spellings.
  if res.strip().rstrip('.').lower() == "the question needs no decomposition":
      return [question]

  questions = []
  for line in res.splitlines():
      line = line.strip()
      if not line:
          continue
      # Split on the first colon only, so a sub-question that itself contains
      # ':' is kept whole instead of being cut at its second colon.
      _, separator, text = line.partition(':')
      if not separator:
          # Unexpected reply shape: keep the original question untouched.
          return [question]
      text = text.strip()
      if text:
          questions.append(text)

  # An empty result would leave callers with nothing to search for.
  return questions or [question]


## Indexing

In [None]:
import json

# Load the IIRC test split and the linked context articles; the `with` blocks
# close each file handle as soon as it has been parsed.
with open(f'{main_dir}/iirc_test.json','r') as test_file:
  test_set = json.load(test_file)
with open(f"{main_dir}/context_articles.json",'r') as articles_file:
  context_articles = json.load(articles_file)

In [None]:
from bs4 import BeautifulSoup

In [None]:
def remove_html_tags(html):
  """Return the text of an HTML string with the markup removed.

  The input is parsed by BeautifulSoup using the 'html.parser' backend and the
  result of get_text() is returned.
  """
  return BeautifulSoup(html, 'html.parser').get_text()

In [None]:
# Collect one document per distinct (lower-cased) title: each test article plus
# every linked article that is present in context_articles.
documents = []
all_titles = []
# Set mirror of all_titles so each duplicate check is a constant-time lookup
# instead of a scan over the whole list.
seen_titles = set()

for item in test_set:
  main_key = item['title'].lower()
  if main_key not in seen_titles:
    documents.append({
            "title": item['title'],
            "content": remove_html_tags(item["text"])
        }
    )
    all_titles.append(main_key)
    seen_titles.add(main_key)
  for link in item["links"]:
    link_key = link['target'].lower()
    if link_key in context_articles and link_key not in seen_titles:
      documents.append({
          "title": link['target'],
          "content": remove_html_tags(context_articles[link_key])
      })
      all_titles.append(link_key)
      seen_titles.add(link_key)
    else:
      # Printed both for targets missing from context_articles and for targets
      # that were already collected.
      print(link_key)

9th paratroopers assault regiment "col moschin"
goldfinger (film)
list of international cricket council members
icc americas championship
the rev
avenged sevenfold
fox footy
herald sun
fox footy
herald sun
united states
judeo-iraqi arabic
maya civilization
black watch
suicidal tendencies
western hockey league
national hockey league
home run
minor league baseball
colonel
colonel
massachusetts institute of technology
israel
harvard business review
american football
college football
united states
billboard 200
romeo discography
billboard 200
master p
hip hop history
billboard 200
louisiana
arizona
state farm stadium
louisiana
united states
gulf of mexico
saffir–simpson scale
forgotten realms
list of dungeons & dragons rulebooks
mexico
napoleon iii
american football
national football league
mexico
lucha libre
protagonist
double dragon
world war ii
banff, alberta
american football
quarterback
college football
2009 nfl draft
new york city
los angeles
metal massacre
metal massacre
hull city a

In [None]:
# Both counts should match: every collected document registers exactly one title.
len(documents), len(all_titles)

(7028, 7028)

In [None]:
from tqdm import tqdm
import spacy

# Blank English pipeline with only a sentencizer component; window() uses it to split documents into sentences.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

# NOTE(review): window() declares defaults with these same values and the call
# in this cell does not pass these names, so they look unused here — confirm.
stride = 2
max_length = 3

def window(documents, stride=2, max_length=3):
    """Cut every document into overlapping windows of consecutive sentences.

    Args:
        documents: iterable of dicts with 'title' and 'content' keys.
        stride: number of sentences to advance between windows
            (the default 2 with max_length 3 overlaps by one sentence;
            stride=3 would give windows without overlap).
        max_length: maximum number of sentences per window.

    Returns:
        A list of dicts with keys 'title', 'contents' (title + ". " + window
        text) and 'segment' (window text only).
    """
    windows = []

    for document in tqdm(documents):
        title = document['title']
        sentences = [span.text.strip() for span in nlp(document['content']).sents]
        total = len(sentences)
        for start in range(0, total, stride):
            segment = ' '.join(sentences[start:start + max_length])
            windows.append({
                "title": title,
                "contents": title+". "+segment,
                "segment": segment
            })
            # Stop once a window has reached the last sentence, so no trailing
            # window made only of already-covered sentences is produced.
            if start + max_length >= total:
                break

    return windows

# Split all collected documents into sentence windows using window()'s default stride and length.
treated_documents = window(documents)

100%|██████████| 7028/7028 [03:32<00:00, 33.13it/s]


In [None]:
# Inspect the first window.
treated_documents[0]

{'title': 'Palici',
 'contents': "Palici. The Palici (Παλικοί in Greek), or Palaci, were a pair of indigenous Sicilian chthonic deities in Roman mythology, and to a lesser extent in Greek mythology. They are mentioned in Ovid's Metamorphoses V, 406, and in Virgil's Aeneid IX, 585. Their cult centered on three small lakes that emitted sulphurous vapors in the Palagonia plain, and as a result these twin brothers were associated with geysers and the underworld.",
 'segment': "The Palici (Παλικοί in Greek), or Palaci, were a pair of indigenous Sicilian chthonic deities in Roman mythology, and to a lesser extent in Greek mythology. They are mentioned in Ovid's Metamorphoses V, 406, and in Virgil's Aeneid IX, 585. Their cult centered on three small lakes that emitted sulphurous vapors in the Palagonia plain, and as a result these twin brothers were associated with geysers and the underworld."}

In [None]:
# Inspect the second window to see how it overlaps with the first one.
treated_documents[1]

{'title': 'Palici',
 'contents': 'Palici. Their cult centered on three small lakes that emitted sulphurous vapors in the Palagonia plain, and as a result these twin brothers were associated with geysers and the underworld. There was also a shrine to the Palaci in Palacia, where people could subject themselves or others to tests of reliability through divine judgement; passing meant that an oath could be trusted. The mythological lineage of the Palici is uncertain; one legend made the Palici the sons of Zeus, or possibly Hephaestus, by Aetna or Thalia, but another claimed that the Palici were the sons of the Sicilian deity Adranus.',
 'segment': 'Their cult centered on three small lakes that emitted sulphurous vapors in the Palagonia plain, and as a result these twin brothers were associated with geysers and the underworld. There was also a shrine to the Palaci in Palacia, where people could subject themselves or others to tests of reliability through divine judgement; passing meant tha

In [None]:
# Override locale.getpreferredencoding so that it always reports UTF-8.
# NOTE(review): presumably a workaround so the shell commands below run in this environment — confirm.
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!mkdir {main_dir}/iirc_indices

In [None]:
# Write one JSON object per line for the indexer. The `with` block flushes and
# closes the file before the indexing command in the next cell reads it.
with open(f"{main_dir}/iirc_indices/contents.jsonl",'w') as f:
    for i, doc in enumerate(treated_documents):
        # The position in treated_documents becomes the document id; it is
        # assigned even to entries that are skipped below.
        doc['id'] = i
        # Windows with an empty segment are not written.
        if doc['segment'] != "":
            f.write(json.dumps(doc)+"\n")

In [None]:
# Index the JSONL collection in iirc_indices into iirc_index; -storeRaw keeps each
# document's raw JSON, which the retrieval loop later parses back from hit.raw.
!python3 -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator -threads 1 -input {main_dir}/iirc_indices -index {main_dir}/iirc_index -storeRaw

pyserini.index is deprecated, please use pyserini.index.lucene.
2023-05-17 20:24:17,451 INFO  [main] index.IndexCollection (IndexCollection.java:380) - Setting log level to INFO
2023-05-17 20:24:17,453 INFO  [main] index.IndexCollection (IndexCollection.java:383) - Starting indexer...
2023-05-17 20:24:17,454 INFO  [main] index.IndexCollection (IndexCollection.java:385) - DocumentCollection path: /content/gdrive/MyDrive/Unicamp-aula-11/iirc_indices
2023-05-17 20:24:17,454 INFO  [main] index.IndexCollection (IndexCollection.java:386) - CollectionClass: JsonCollection
2023-05-17 20:24:17,454 INFO  [main] index.IndexCollection (IndexCollection.java:387) - Generator: DefaultLuceneDocumentGenerator
2023-05-17 20:24:17,454 INFO  [main] index.IndexCollection (IndexCollection.java:388) - Threads: 1
2023-05-17 20:24:17,455 INFO  [main] index.IndexCollection (IndexCollection.java:389) - Language: en
2023-05-17 20:24:17,455 INFO  [main] index.IndexCollection (IndexCollection.java:390) - Stemmer: p

## Pyserini searcher

In [9]:
from pyserini.search.lucene import LuceneSearcher
import json

# Open the Lucene index stored under main_dir for searching.
searcher = LuceneSearcher(f'{main_dir}/iirc_index')

## Preparing to rerank

In [None]:
# Number of questions sampled from the test set for this experiment.
n_examples = 50

In [None]:
import random 

#sampled_test_set = random.sample(test_set, n_examples)
# Flatten the test set into one list of questions; each question dict is
# extended in place with its article's text, title and links.
all_q = []

for article in test_set:
  inherited = {
      'text': article['text'],
      'title': article['title'],
      'links': article['links'],
  }
  for question in article['questions']:
    question.update(inherited)
    all_q.append(question)

In [None]:
# Total number of questions across all test articles.
len(all_q)

1301

In [None]:
# Fix the seed so the sample drawn in the next cell is the same on every run.
random.seed(42)

In [None]:
# Draw n_examples distinct questions from the flattened question list.
sampled_q = random.sample(all_q, n_examples)

In [None]:
# Sanity check: should equal n_examples.
len(sampled_q)

50

In [None]:
import json
import numpy as np
from tqdm import tqdm

# Maximum number of hits requested from the searcher for each sub-question.
limit_by_query= 1000
# Accumulates processed questions; kept in its own cell so re-running the loop below resumes instead of restarting.
new_test = []

In [None]:
# Resume from wherever a previous (possibly interrupted) run of this cell stopped.
current_pos = len(new_test)

for i in range(current_pos, len(sampled_q)):
  q = sampled_q[i]
  decomposition = decompose(q['question'])

  # Only passages from the question's own article or from an article it links
  # to are eligible; a set keeps the per-hit title check constant-time.
  titles = {l['target'].lower() for l in q['links']}
  titles.add(q['title'].lower())

  decs = []
  for d in decomposition:
    hits = searcher.search(d,k=limit_by_query)
    chosen = []
    for hit in hits:
      # Each hit carries the raw JSON that was stored at indexing time.
      hit = json.loads(hit.raw)
      if hit['title'].lower() in titles:
        chosen.append(hit)

    decs.append({
        "question": d,
        "documents": chosen
    })

  # Stored once, after every sub-question has its candidate documents.
  q['decomposition'] = decs

  new_test.append(q)

In [None]:
# Original question of the first sampled example.
new_test[0]['question']

'What was the Belarus teams record under coach Bernd Stange?'

In [None]:
# Sub-questions stored for the first sampled example.
dec_q = [dec['question'] for dec in new_test[0]['decomposition']]
dec_q

['How many games did the Belarus team play under coach Bernd Stange?',
 'How many games did the Belarus team win under coach Bernd Stange?',
 'How many games did the Belarus team lose under coach Bernd Stange?',
 'How many games did the Belarus team draw under coach Bernd Stange?']

In [None]:
# Number of sub-questions for the first sampled example.
len(new_test[0]['decomposition'])

4

In [None]:
# Should equal the number of sampled questions once the retrieval loop has finished.
len(new_test)

50

In [None]:
# Persist the questions with their candidate passages; the `with` block makes
# sure the file is flushed and closed.
with open(f"{main_dir}/to_rerank.json",'w') as rerank_file:
  json.dump(new_test, rerank_file)

## Evaluating

In [11]:
import json

# Load the reranked questions, closing the file after parsing.
# NOTE(review): iirc_reranked.json is not written anywhere in this notebook — presumably
# produced by a separate reranking step from to_rerank.json; confirm. From here on
# `test_set` refers to this file rather than to iirc_test.json.
with open(f'{main_dir}/iirc_reranked.json','r') as reranked_file:
  test_set = json.load(reranked_file)

In [12]:
# Number of reranked questions loaded.
len(test_set)

50

In [28]:
# Number of documents attached to the third sub-question of the second example.
len(test_set[1]['decomposition'][2]['documents'])

17

In [13]:
# Questions whose gold answer type is anything other than 'none'.
questions_with_answer = [t for t in test_set if t['answer']['type'] != 'none']
len(questions_with_answer)

36

In [14]:
# Load the examples used as the few-shot pool, closing the file after parsing.
with open(f"{main_dir}/explained_dataset.json",'r') as dev_file:
  dev = json.load(dev_file)

In [15]:
from sentence_transformers import SentenceTransformer, util
# Embedding model used below to score dev questions against each test item with a dot product.
model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')

Downloading (…)8df09/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)50dc78df09/README.md:   0%|          | 0.00/6.14k [00:00<?, ?B/s]

Downloading (…)dc78df09/config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)8df09/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

Downloading (…)df09/train_script.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

Downloading (…)50dc78df09/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)c78df09/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [16]:
# One string per dev example (its question only), embedded in a single batched call.
passages = ["{0}".format(example['question']) for example in dev]
passages_embeddings = model.encode(passages, show_progress_bar=True, batch_size=128)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
# First dev question, as it was passed to the encoder.
passages[0]

'In what year was Columbia University founded?'

In [18]:
import numpy as np
import torch
from tqdm import tqdm
import random

# Number of solved dev examples placed before the actual question.
KSHOT = 4
# Passages taken per sub-question when a question was decomposed.
limit_per_query = 3
# Passages taken when there is a single (non-decomposed) question.
min_total = 3
test = []

system_prompt = "For each example, use the documents to create an \"Answer\" and an \"Evidence\" to the \"Question\". Answer \"not enough information\" when not enough information is provided in the documents.\n\n"


def _format_example(number, hit):
  """Render one solved dev example (documents, question, evidence, answer) as prompt text."""
  prompt = "Example {0}:\n\n".format(number)
  for j,c in enumerate(hit['context']):
    # Passages tagged "main" are labelled with the example's own title; any
    # other passage is labelled with the value of its 'passage' field.
    title = hit['title'] if c['passage'] == "main" else c['passage']
    text = "Title: {0}. Content: {1}".format(title, c['text'])
    prompt+= "Document {0}: {1}\n\n".format(j+1, text)

  evidence = hit['explanation'].replace('\n','') if hit['explanation'] else 'Not found.'
  prompt += "Question: Based on the above documents, {0}\n\nEvidence: {1}\n\nAnswer: {2}.\n\n".format(hit['question'], evidence, hit['answer'])
  return prompt


def _select_documents(decomposition):
  """Pick the passages shown for the actual question from its sub-question results."""
  if not decomposition:
    # Nothing was retrieved; the prompt is built without documents.
    return []
  if len(decomposition) < 2:
    return decomposition[0]['documents'][:min_total]
  chosen = []
  for d in decomposition:
    chosen.extend(d['documents'][:limit_per_query])
  return chosen


def _clean_answers(answer):
  """Turn a gold answer dict into a one-element list holding its text form.

  Returns an empty list when the answer type is none of span/value/binary/none.
  """
  answers = []
  if answer['type'] == "span":
    answers.append(", ".join([a['text'] for a in answer["answer_spans"]]))
  elif answer['type'] == "value":
    answers.append("{0} {1}".format(answer['answer_value'],answer['answer_unit']))
  elif answer['type'] == "binary":
    answers.append(answer['answer_value'])
  elif answer['type'] == "none":
    answers.append("Not enough information")
  return answers


for q in tqdm(test_set):
  # Text used to find similar dev questions: the item's context documents followed by its question.
  item_passage = ""
  for i,c in enumerate(q['context']):
    item_passage+= "Document {0}: {1}\n\n".format(i+1, c['text'])
  item_passage += "{0}".format(q['question'])

  item_embedding = model.encode(item_passage)

  # Never ask for more neighbours than there are dev examples.
  scores = util.dot_score(item_embedding, passages_embeddings)[0]
  top_indices = scores.topk(min(KSHOT, len(dev))).indices.tolist()
  hits = [dev[idx] for idx in top_indices]

  # Least similar example first, so the most similar one sits right before the actual question.
  hits.reverse()

  prompts = [_format_example(number, hit) for number, hit in enumerate(hits, start=1)]

  # The actual question is numbered right after the few-shot examples.
  prompt = "Example {0}:\n\n".format(len(hits)+1)
  for number, c in enumerate(_select_documents(q['decomposition']), start=1):
    text = "Title: {0}. Content: {1}".format(c['title'], c['text'])
    prompt+= "Document {0}: {1}\n\n".format(number, text)

  # Left open at "Evidence:" so the model writes the evidence and then the answer.
  prompt += "Question: Based on the above documents, {0}\n\nEvidence:".format(q['question'])
  prompts.append(prompt)

  q['prompts'] = prompts
  q['system_prompt'] = system_prompt
  q['clean_answers'] = _clean_answers(q['answer'])
  test.append(q)

100%|██████████| 50/50 [00:01<00:00, 47.59it/s]


In [20]:
from openai.error import InvalidRequestError
from tqdm import tqdm
import json
import re

temperature = 0
# Number of completions requested per question; the metric cell takes a majority vote over them.
attempts = 1


def _drop_largest_example(prompts):
  """Remove the longest few-shot example from `prompts` in place.

  The last entry holds the actual question and is never removed.
  Returns True when an example was removed, False when there was none left.
  """
  examples = prompts[:-1]
  if not examples:
    return False
  largest_index = max(range(len(examples)), key=lambda idx: len(examples[idx]))
  del prompts[largest_index]
  return True


for item in tqdm(test):
  item['responses'] = []
  item['completions'] = []
  for _ in range(attempts):
    try:
      res = generate(item['system_prompt'], item['prompts'],temperature=temperature)
    except InvalidRequestError:
      # Shrink the request by dropping the largest few-shot example, then try once more.
      print("Current number of prompts = ", len(item['prompts']))
      if not _drop_largest_example(item['prompts']):
        # Only the question itself is left; there is nothing more to cut.
        raise
      res = generate(item['system_prompt'], item['prompts'],temperature=temperature)

    if "Answer:" not in res:
      # The model wrote only the evidence. Append it to the final prompt and
      # ask again for just the answer.
      print("res = ", res)
      item["new_prompt"] = "{0}{1}\n\nAnswer:".format(item["prompts"][-1], res)
      res2= generate(item['system_prompt'], item['prompts'][:-1] + [item["new_prompt"]], temperature=temperature)
      item['results'] = "{0}\n\nAnswer: {1}".format(res, res2)
      item['responses'].append(res2)
      item['completions'].append("{0}\n\nAnswer: {1}".format(res, res2))
      item["asked_twice"] = True
    else:
      #kudos Marcus Borela (same performance as previous RE-based code)
      response = res.split("Answer:")[1].strip()
      item['responses'].append(response)

      item['results'] = res
      item['completions'].append(res)
      item["asked_twice"] = False

# Save all items with their completions; the `with` block flushes and closes the file.
with open(f"{main_dir}/iirc.json",'w') as results_file:
  json.dump(test, results_file)

 58%|█████▊    | 29/50 [05:13<03:43, 10.62s/it]

Current number of prompts =  5


100%|██████████| 50/50 [08:27<00:00, 10.15s/it]


In [21]:
import json
import argparse
import collections
import numpy as np
import os
import re
import string
import sys
import unicodedata

def normalize_answer(s):
  """Canonicalise an answer string before comparison.

  Steps, in order: decompose with NFKD and drop every non-ASCII character,
  lowercase, delete ASCII punctuation, replace the standalone words
  a/an/the with a space, and collapse runs of whitespace.
  """
  ascii_only = unicodedata.normalize('NFKD', s).encode('ASCII', 'ignore').decode("utf-8")
  lowered = ascii_only.lower()
  # Punctuation is deleted, not replaced by a space, so "a.b" becomes "ab".
  unpunctuated = lowered.translate(str.maketrans('', '', string.punctuation))
  without_articles = re.sub(r'\b(a|an|the)\b', ' ', unpunctuated, flags=re.UNICODE)
  return ' '.join(without_articles.split())

def get_tokens(s):
  """Return the whitespace-separated tokens of the normalised string; [] for empty or None input."""
  return normalize_answer(s).split() if s else []

def compute_exact(a_gold, a_pred):
  """Return 1 when gold and prediction are equal after normalize_answer, otherwise 0."""
  gold_norm = normalize_answer(a_gold)
  pred_norm = normalize_answer(a_pred)
  return 1 if gold_norm == pred_norm else 0

def compute_f1(a_gold, a_pred):
  """Token-level F1 between a gold answer and a prediction.

  Both strings are tokenised with get_tokens. When either side has no tokens
  the score is 1 if both are empty and 0 otherwise. Otherwise the harmonic
  mean of token precision and recall is returned (0 when no token is shared).
  """
  gold_toks = get_tokens(a_gold)
  pred_toks = get_tokens(a_pred)

  if not gold_toks or not pred_toks:
    return int(gold_toks == pred_toks)

  # Multiset intersection: a repeated token counts as many times as it appears on both sides.
  overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  num_same = sum(overlap.values())
  if num_same == 0:
    return 0

  precision = 1.0 * num_same / len(pred_toks)
  recall = 1.0 * num_same / len(gold_toks)
  return (2 * precision * recall) / (precision + recall)


In [22]:
from collections import Counter
from tqdm import tqdm

f1s = []
ems = []

for item in tqdm(test):
    # Majority vote over the normalised responses collected for this item.
    votes = Counter(normalize_answer(a.replace('\n','')) for a in item['responses'])
    response = votes.most_common(1)[0][0]

    gold = item['clean_answers']
    # NOTE(review): this rewrites the GOLD answer, and nothing visible in this
    # notebook produces the long sentence as a gold answer — possibly meant to
    # be applied to the model response instead; confirm.
    if gold[0] == "Not enough information provided in the documents.":
        gold[0] = "Not enough information"

    f1s.append(compute_f1(gold[0], response))
    ems.append(compute_exact(gold[0], response))

print("F1:",np.mean(f1s))
print("EM:",np.mean(ems))

100%|██████████| 50/50 [00:00<00:00, 13338.96it/s]

F1: 0.439500466853408
EM: 0.36





### Conclusions

*   Metrics without removing HTML tags: F1: 0.45294651584974166, EM: 0.34
*   Metrics achieved when removing HTML tags: F1: 0.4319157509157509, EM: 0.36

