In [3]:
import os
# Directory (on the Colab-mounted Drive path) that holds the raw and processed SQuAD files.
squad_dir = '/content/drive/MyDrive/Colab Notebooks/NLP_TRANSFORMERS/QA/data/squad'

# makedirs also creates any missing parent directories (os.mkdir raises
# FileNotFoundError if e.g. `QA/data` does not exist yet), and exist_ok=True
# makes re-running this cell a no-op without a separate existence check.
os.makedirs(squad_dir, exist_ok=True)

In [4]:
# Remote directory hosting the SQuAD JSON files. The trailing '/' matters:
# file names are appended to this string directly when downloading.
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

# SQuAD v2.0 splits to fetch: training set first, then development set.
files = [
    'train-v2.0.json',
    'dev-v2.0.json',
]

In [5]:
import requests

# Download every SQuAD file listed in `files` into `squad_dir`.
for file in files:
    # stream=True keeps the body out of memory until it is written to disk;
    # the timeout stops a stalled connection from hanging the cell forever.
    with requests.get(url + file, stream=True, timeout=60) as res:
        # Fail loudly on 4xx/5xx instead of saving an error page as "JSON".
        res.raise_for_status()
        with open(os.path.join(squad_dir, file), 'wb') as f:
            # 1 MiB chunks: the files are tens of MB, so tiny chunks only
            # add per-write overhead.
            for chunk in res.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

In [6]:
import json

# Parse the downloaded SQuAD v2.0 training file into `squad`.
# The file is opened in binary mode, so json.load decodes the bytes itself.
with open(os.path.join(squad_dir, 'train-v2.0.json'), mode='rb') as train_fh:
  squad = json.load(train_fh)

In [None]:
# Preview a single paragraph (its context plus its questions) instead of the
# whole first group, whose repr contains every paragraph and floods the output.
squad['data'][0]['paragraphs'][0]

In [8]:
# Flatten the nested training data (group -> paragraph -> QA pair) into a flat
# list with one {'question', 'answer', 'context'} record per question.
new_squad = []

for article in squad['data']:
  for para in article['paragraphs']:
    for qa_item in para['qas']:
      # Use the first gold answer when there is one, otherwise the first
      # plausible answer, otherwise None.
      gold = qa_item.get('answers', [])
      candidates = gold if len(gold) > 0 else qa_item.get('plausible_answers', [])
      first_text = candidates[0]['text'] if len(candidates) > 0 else None

      record = {
          'question': qa_item['question'],
          'answer': first_text,
          'context': para['context'],
      }
      new_squad.append(record)

In [9]:
# Sanity check: show the first two and the last two flattened records.
new_squad[:2], new_squad[-2:]

([{'question': 'When did Beyonce start becoming popular?',
   'answer': 'in the late 1990s',
   'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'},
  {'question': 'What areas did Beyonce compete in when she was growing up?',
   'answer': 'singing and dancing',
   'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born

In [10]:
# Persist the flattened training records as train.json inside squad_dir.
train_json_path = os.path.join(squad_dir, 'train.json')
with open(train_json_path, mode='w') as out_fh:
  json.dump(new_squad, out_fh)

#### Building the QA model

In [11]:
# Load the raw SQuAD v2.0 dev split from the directory the download step wrote
# to, reusing squad_dir instead of repeating the absolute path.
# NOTE: this rebinds `squad`, replacing the training data loaded earlier.
with open(os.path.join(squad_dir, 'dev-v2.0.json'), 'r', encoding='utf-8') as f:
  squad = json.load(f)

In [12]:
from transformers import BertTokenizer, BertForQuestionAnswering

# Hugging Face Hub id of the checkpoint used for question answering.
modelname = "deepset/bert-base-cased-squad2"

# Tokenizer and model are loaded from the same checkpoint id.
model = BertForQuestionAnswering.from_pretrained(modelname)
tokenizer = BertTokenizer.from_pretrained(modelname)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
from transformers import pipeline

# Bundle the already-loaded model and tokenizer into a callable QA pipeline.
qa = pipeline(
    task='question-answering',
    model=model,
    tokenizer=tokenizer,
)

In [20]:
# Dev processing
# Flatten the dev split into one record per (question, distinct answer text).
# Questions with neither answers nor plausible_answers produce no record.
# NOTE: this rebinds `new_squad`, replacing the flattened training records.
new_squad = []
for group in squad['data']:
  for paragraph in group['paragraphs']:
    context = paragraph['context']
    for qa_pair in paragraph['qas']:
      question = qa_pair['question']

      # Prefer gold answers; fall back to plausible answers; else nothing.
      answer_list = qa_pair.get('answers') or qa_pair.get('plausible_answers') or []

      # De-duplicate while keeping first-seen order. Iterating a set() of
      # strings can yield a different order on every kernel start (string
      # hashing is randomised), which made dev.json and the first-N sample
      # evaluated later non-reproducible.
      unique_answers = dict.fromkeys(item['text'] for item in answer_list)

      for answer in unique_answers:
        new_squad.append({
            'question': question,
            'answer': answer,
            'context': context
        })



In [34]:
# Inspect a single flattened record (index 1) to check its structure.
new_squad[1]

{'question': 'When were the Normans in Normandy?',
 'answer': '10th and 11th centuries',
 'context': 'The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.'}

In [25]:
# Persist the flattened dev records as dev.json inside squad_dir.
dev_json_path = os.path.join(squad_dir, 'dev.json')
with open(dev_json_path, mode='w') as out_fh:
  json.dump(new_squad, out_fh)

In [29]:
# Run the QA pipeline on the first five records and pair each predicted answer
# with the reference answer stored in the record.
answers = [
    {
        'predicted': qa({
            'question': sample['question'],
            'context': sample['context'],
        })['answer'],
        'true': sample['answer'],
    }
    for sample in new_squad[:5]
]

In [30]:
# Display the predicted/true pairs for a manual side-by-side comparison.
answers

[{'predicted': 'France.', 'true': 'France'},
 {'predicted': '10th and 11th centuries', 'true': '10th and 11th centuries'},
 {'predicted': '10th and 11th centuries',
  'true': 'in the 10th and 11th centuries'},
 {'predicted': 'Denmark, Iceland and Norway',
  'true': 'Denmark, Iceland and Norway'},
 {'predicted': 'Rollo,', 'true': 'Rollo'}]