<a href="https://colab.research.google.com/github/nelsongg/information-retrieval-tensorflow/blob/main/question_answering_dataset_SQUAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q tensorflow_text
!pip install simpleneighbors[annoy]
!pip install -q nltk # natural language tool kit
!pip install -q tqdm # to generate progress bars

[K     |████████████████████████████████| 5.8 MB 16.3 MB/s 
[K     |████████████████████████████████| 588.3 MB 6.4 kB/s 
[K     |████████████████████████████████| 6.0 MB 66.4 MB/s 
[K     |████████████████████████████████| 439 kB 58.3 MB/s 
[K     |████████████████████████████████| 1.7 MB 45.1 MB/s 
[?25hLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpleneighbors[annoy]
  Downloading simpleneighbors-0.1.0-py2.py3-none-any.whl (12 kB)
Collecting annoy>=1.16.0
  Downloading annoy-1.17.1.tar.gz (647 kB)
[K     |████████████████████████████████| 647 kB 14.6 MB/s 
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.1-cp37-cp37m-linux_x86_64.whl size=397034 sha256=54114b6d0757c6c1ca05798da4d98e056fc5d857cf4a003da936b1d3713170a7
  Stored in directory: /root/.cache/pip/wheels/81/94/bf/92cb0e4fef8770fe9c6df0ba588fca30a

In [2]:
import json
import nltk
import os
import pprint
import random
import simpleneighbors
import urllib
from IPython.display import HTML, display
from tqdm.notebook import tqdm
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

SQUAD Dataset

  * Documentation: https://rajpurkar.github.io/SQuAD-explorer/

In [3]:
SQUAD_url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json'
squad_json = json.load(urllib.request.urlopen(SQUAD_url))

In [4]:
#squad_json

In [5]:
#nltk.tokenize.sent_tokenize('En 1874 su fama llegó al médico español Antonio Palau quien se trasladó desde la ciudad de Tucumán para conocerlas. Marchó después hasta Rosario de La Frontera a adquirir las tierras (arriendo) que pertenecían a Melchora Figueroa y Goyechea de Cornejo. El 1 de abril de 1880 Palau instaló unas sencillas carpas en las faldas del cerro del cual bajan las aguas termominerales, con éxito inmediato. A principios del siglo XX se construyó el lujoso balneario y hotel Termas, que alojó a presidentes y visitantes extranjeros, y cuyo diseño y construcción estuvieron a cargo de los notables arquitectos salmantinos Manuel y José Graña, respectivamente padre e hijo. En 1921, durante la presidencia de Hipólito Yrigoyen, el ferrocarril llegó hasta el lugar. En 1886 visitó las termas Domingo Faustino Sarmiento, ilustre escritor, militar y político argentino, expresidente de la nación argentina. Otros expresidentes que llegaron al lugar fueron Julio Argentino Roca, Nicolás Avellaneda, Raúl Alfonsín y Carlos Saúl Menem. Hotel Termas visto desde el aire. En 1904 el Agua Mineral Palau de Rosario de la Frontera obtiene el 1° Premio en la Exposición Universal de Saint Louis, EE. UU., como la mejor agua mineral del mundo. Como parte de dicha exposición se realizan los primeros Juegos Olímpicos en continente americano. A mediados del siglo XX algunos inmigrantes sijs llegaron desde la región del Panyab en India contratados por las empresas de ferrocarriles que poseían sede en Inglaterra. Ellos dieron origen a la importante población argentina con orígenes sijes que llega a ser hoy un 2% de la cantidad de habitantes de Rosario de la Frontera.')

In [6]:
from nltk.text import sent_tokenize
def extract_sentences(squad):
  all_sentences = []
  for data in squad['data']:
    for paragraph in data['paragraphs']:
      sentences = nltk.tokenize.sent_tokenize(paragraph['context'])
      #print(sentences)
      all_sentences.extend(zip(sentences, [paragraph['context']] * len(sentences)))
  return list(set(all_sentences))

In [7]:
sentences = extract_sentences(squad_json)
len(sentences)

10452

In [8]:
#sentences[0:5] #-> we can see the sentence and its context (sentences es una lista de sets, donde cada set essta compuesto por sentence y context)

In [9]:
def extract_questions_answers(squad):
  questions_answers = []
  for data in squad['data']:
    for paragraph in data['paragraphs']:
      for qas in paragraph['qas']:
        if qas['answers']:
          questions_answers.append((qas['question'], qas['answers'][0]['text']))
  return list(set(questions_answers))


In [10]:
questions_answers = extract_questions_answers(squad_json)

In [11]:
len(questions_answers)

10552

In [12]:
questions_answers[0:10]

[('What did water that flowed towards the Pacific have to flow across during the mid-Eocene?',
  'Amazonas Basin'),
 ('What kind of death did Luther say the revolting peasants deserve?',
  'in body and soul'),
 ('When was the Lisbon Treaty established?', '2007'),
 ('Which British sculptor and a leading member of the New Sculpture movement is represented in the the V&A collection?',
  'George Frampton'),
 ('What does the ctenophora use to swim?', '‘combs’ – groups of cilia'),
 ('Which bound of time is more difficult to establish?', 'lower bounds'),
 ("Which of Luther's hymn was the main one for Advent?",
  'Nun komm, der Heiden Heiland'),
 ('What does it mean for a disease to be enzootic?', 'commonly present'),
 ('Where did many Spanish Catholic move after British takeover in Florida?',
  'Most went to Cuba,'),
 ('Where do cryptophyte chloroplasts store starch?',
  'in granules found in the periplastid space')]

In [13]:
print('Sentence and Context\n')
sentence = random.choice(sentences)
print('Sentence: ')
pprint.pprint(sentence[0])
print('\nContext:\n')
pprint.pprint(sentence[1])
print()

Sentence and Context

Sentence: 
'Formal teaching may be carried out by paid professionals.'

Context:

('Teaching may be carried out informally, within the family, which is called '
 'homeschooling, or in the wider community. Formal teaching may be carried out '
 'by paid professionals. Such professionals enjoy a status in some societies '
 'on a par with physicians, lawyers, engineers, and accountants (Chartered or '
 'CPA).')



## Build the Index

In [14]:
model_path = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3'
model = hub.load(model_path)

In [15]:
sentences[0][0]

'Following this exodus, Huguenots remained in large numbers in only one region of France: the rugged Cévennes region in the south.'

In [16]:
sentences[0][1]

'After this, Huguenots (with estimates ranging from 200,000 to 1,000,000) fled to surrounding Protestant countries: England, the Netherlands, Switzerland, Norway, Denmark, and Prussia — whose Calvinist Great Elector Frederick William welcomed them to help rebuild his war-ravaged and underpopulated country. Following this exodus, Huguenots remained in large numbers in only one region of France: the rugged Cévennes region in the south. In the early 18th century, a regional group known as the Camisards who were Huguenots rioted against the Catholic Church in the region, burning churches and killing clergy. It took French troops years to hunt down and destroy all the bands of Camisards, between 1702 and 1709.'

In [17]:
encodings = model.signatures['response_encoder'](input=tf.constant([sentences[0][0]]), context=tf.constant([sentences[0][1]]))
encodings

{'outputs': <tf.Tensor: shape=(1, 512), dtype=float32, numpy=
 array([[-4.16966714e-02, -6.02807626e-02, -7.91354999e-02,
         -4.01331857e-02, -5.07585183e-02, -5.45654409e-02,
         -6.06396757e-02,  1.59942042e-02,  4.69699912e-02,
         -5.75715341e-02,  9.98238195e-03, -7.52302958e-03,
          1.44282160e-02,  5.47618680e-02, -4.63217646e-02,
         -1.06440969e-02, -5.79378288e-03,  5.27780354e-02,
         -6.27923310e-02,  7.36972243e-02, -4.64960188e-02,
          4.25235964e-02, -4.84484201e-03,  4.05097567e-02,
         -6.43942133e-02, -6.51290044e-02, -5.07866824e-03,
         -5.50431572e-02,  4.13628621e-03, -5.06071895e-02,
         -2.71646697e-02, -1.42645603e-02, -3.73821296e-02,
         -4.84618135e-02,  4.59165089e-02, -2.02946309e-02,
         -7.64925107e-02,  5.29165305e-02, -6.34871274e-02,
          6.04689606e-02,  6.43446147e-02, -5.20597445e-03,
         -3.99793033e-03,  1.06739905e-02,  7.00339973e-02,
          7.44327605e-02, -1.45145534e

In [18]:
encodings['outputs'][0]

<tf.Tensor: shape=(512,), dtype=float32, numpy=
array([-4.16966714e-02, -6.02807626e-02, -7.91354999e-02, -4.01331857e-02,
       -5.07585183e-02, -5.45654409e-02, -6.06396757e-02,  1.59942042e-02,
        4.69699912e-02, -5.75715341e-02,  9.98238195e-03, -7.52302958e-03,
        1.44282160e-02,  5.47618680e-02, -4.63217646e-02, -1.06440969e-02,
       -5.79378288e-03,  5.27780354e-02, -6.27923310e-02,  7.36972243e-02,
       -4.64960188e-02,  4.25235964e-02, -4.84484201e-03,  4.05097567e-02,
       -6.43942133e-02, -6.51290044e-02, -5.07866824e-03, -5.50431572e-02,
        4.13628621e-03, -5.06071895e-02, -2.71646697e-02, -1.42645603e-02,
       -3.73821296e-02, -4.84618135e-02,  4.59165089e-02, -2.02946309e-02,
       -7.64925107e-02,  5.29165305e-02, -6.34871274e-02,  6.04689606e-02,
        6.43446147e-02, -5.20597445e-03, -3.99793033e-03,  1.06739905e-02,
        7.00339973e-02,  7.44327605e-02, -1.45145534e-02, -4.00766656e-02,
        6.06216316e-04,  3.23082656e-02,  3.93969640

In [19]:
len(encodings['outputs'][0])

512

In [20]:
index = simpleneighbors.SimpleNeighbors(len(encodings['outputs'][0]), metric='angular')

In [21]:
batch_size = 100
slices = zip(*(iter(sentences),) * batch_size)
num_batches = int(len(sentences) / batch_size)
num_batches

104

In [22]:
for s in tqdm(slices, total=num_batches):
  sentence_batch = list([r for r, c in s])
  context_batch = list([c for r, c in s])
  encodings = model.signatures['response_encoder'](input=tf.constant(sentence_batch), context=tf.constant(context_batch))
  for batch_index, batch in enumerate(sentence_batch):
    index.add_one(batch, encodings['outputs'][batch_index])
index.build()

  0%|          | 0/104 [00:00<?, ?it/s]

## Visualizing the results

In [23]:
number_of_results = 10
question_answer = random.choice(questions_answers)
print(question_answer)

('What kind of cell wall do cyanobacteria have?', 'peptidoglycan')


In [24]:
def show_results(question, answer):
  embedding = model.signatures['question_encoder'](tf.constant([question]))['outputs'][0]
  #print(embedding)
  search_results = index.nearest(embedding, n = number_of_results)

  formatted_result = '''
    <p>Random question selected from SQUAD</p>
    <p><b>%s</b></p>
    <p>Answer:</p>
    <p><b>%s</b></p>
  ''' % (question, answer)

  formatted_result += '<ol>'
  for s in search_results:
    formatted_result += '<li>'
    formatted_result += s
    formatted_result += '</li>'
  formatted_result += '</ol>'

  display(HTML(formatted_result))

In [25]:
show_results(question_answer[0], question_answer[1])