<a href="https://colab.research.google.com/github/nelsongg/information-retrieval-tensorflow/blob/main/question_answering_dataset_SQUAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Dependency installs, left commented out; uncomment on a fresh runtime where these packages are missing.
#!pip install -q tensorflow_text
#!pip install simpleneighbors[annoy]
#!pip install -q nltk # natural language tool kit
#!pip install -q tqdm # to generate progress bars

In [None]:
import json
import os
import pprint
import random
import urllib
import urllib.request

import nltk
import simpleneighbors
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from IPython.display import HTML, display
from tensorflow_text import SentencepieceTokenizer
from tqdm.notebook import tqdm
# Download NLTK's 'punkt' tokenizer data; presumably required by the
# nltk.tokenize.sent_tokenize calls later in this notebook (confirm in the NLTK docs).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

SQUAD Dataset

  * Documentation: https://rajpurkar.github.io/SQuAD-explorer/

In [None]:
SQUAD_url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json'
# Download and parse the SQuAD v1.1 dev set. The `with` block closes the HTTP
# response as soon as the JSON has been read instead of leaving it open.
with urllib.request.urlopen(SQUAD_url) as squad_response:
  squad_json = json.load(squad_response)

In [None]:
#squad_json

In [None]:
#nltk.tokenize.sent_tokenize('En 1874 su fama llegó al médico español Antonio Palau quien se trasladó desde la ciudad de Tucumán para conocerlas. Marchó después hasta Rosario de La Frontera a adquirir las tierras (arriendo) que pertenecían a Melchora Figueroa y Goyechea de Cornejo. El 1 de abril de 1880 Palau instaló unas sencillas carpas en las faldas del cerro del cual bajan las aguas termominerales, con éxito inmediato. A principios del siglo XX se construyó el lujoso balneario y hotel Termas, que alojó a presidentes y visitantes extranjeros, y cuyo diseño y construcción estuvieron a cargo de los notables arquitectos salmantinos Manuel y José Graña, respectivamente padre e hijo. En 1921, durante la presidencia de Hipólito Yrigoyen, el ferrocarril llegó hasta el lugar. En 1886 visitó las termas Domingo Faustino Sarmiento, ilustre escritor, militar y político argentino, expresidente de la nación argentina. Otros expresidentes que llegaron al lugar fueron Julio Argentino Roca, Nicolás Avellaneda, Raúl Alfonsín y Carlos Saúl Menem. Hotel Termas visto desde el aire. En 1904 el Agua Mineral Palau de Rosario de la Frontera obtiene el 1° Premio en la Exposición Universal de Saint Louis, EE. UU., como la mejor agua mineral del mundo. Como parte de dicha exposición se realizan los primeros Juegos Olímpicos en continente americano. A mediados del siglo XX algunos inmigrantes sijs llegaron desde la región del Panyab en India contratados por las empresas de ferrocarriles que poseían sede en Inglaterra. Ellos dieron origen a la importante población argentina con orígenes sijes que llega a ser hoy un 2% de la cantidad de habitantes de Rosario de la Frontera.')

In [None]:
from nltk.text import sent_tokenize
def extract_sentences(squad, tokenizer=None):
  """Collect the unique (sentence, context) pairs of a SQuAD-style dict.

  Args:
    squad: dict with a 'data' list; each entry has a 'paragraphs' list whose
      items carry the paragraph text under 'context'.
    tokenizer: optional callable mapping a paragraph string to an iterable of
      sentence strings. Defaults to `nltk.tokenize.sent_tokenize`; pass a
      different splitter for e.g. non-English text.

  Returns:
    A list of `(sentence, context)` tuples without duplicates, in order of
    first appearance. (The previous `list(set(...))` produced an order that
    depends on string hashing and so changed between interpreter runs.)
  """
  if tokenizer is None:
    tokenizer = nltk.tokenize.sent_tokenize
  # A dict keeps insertion order, so it works as an ordered set here.
  unique_pairs = {}
  for data in squad['data']:
    for paragraph in data['paragraphs']:
      context = paragraph['context']
      for sentence in tokenizer(context):
        unique_pairs[(sentence, context)] = None
  return list(unique_pairs)

In [None]:
# Build the de-duplicated (sentence, context) pairs from the downloaded dataset
# and display how many there are.
sentences = extract_sentences(squad_json)
len(sentences)

10452

In [None]:
#sentences[0:5]  # -> shows each sentence with its context (sentences is a list of (sentence, context) tuples)

In [None]:
def extract_questions_answers(squad):
  """Collect the unique (question, first answer text) pairs of a SQuAD-style dict.

  Args:
    squad: dict with a 'data' list; each entry has a 'paragraphs' list whose
      items hold a 'qas' list of {'question': ..., 'answers': [...]} dicts.

  Returns:
    A list of `(question, answer_text)` tuples without duplicates, in order of
    first appearance. Only the first entry of 'answers' is used, and questions
    whose 'answers' list is empty are skipped. (The previous `list(set(...))`
    produced an order that depends on string hashing and so changed between
    interpreter runs.)
  """
  # A dict keeps insertion order, so it works as an ordered set here.
  unique_pairs = {}
  for data in squad['data']:
    for paragraph in data['paragraphs']:
      for qas in paragraph['qas']:
        answers = qas['answers']
        if answers:
          unique_pairs[(qas['question'], answers[0]['text'])] = None
  return list(unique_pairs)


In [None]:
# Build the de-duplicated (question, first answer) pairs from the downloaded dataset.
questions_answers = extract_questions_answers(squad_json)

In [None]:
# Number of unique (question, answer) pairs.
len(questions_answers)

10552

In [None]:
# Preview the first ten (question, answer) pairs.
questions_answers[0:10]

[('What is the main executive body of the EU?', 'The European Commission'),
 ('What commonality do alternate machine models, such as random access machines, share with Turing machines?',
  'the machines operate deterministically'),
 ('Who were the astronauts aboard the Apollo 11 mission?',
  'Neil Armstrong, Michael Collins and Buzz Aldrin'),
 ('What are the Catechisms of Martin Luther written in?', 'German vernacular'),
 ('How many companies were listed on the WSE on August 2009?', '374'),
 ('What did the Soviets intend to use in spacecraft after the success of Zond 5?',
  'human cosmonauts'),
 ('What runs from the riverside to higher parts of the city center?',
  'Stairs'),
 ("What cable provider did ABC reach an agreement with in 1993 to carry it's owned-and-operated stations in ABC O&O markets?",
  'Time Warner Cable'),
 ('A language solved in quadratic time implies the use of what type of Turing machine?',
  'single-tape Turing machines'),
 ('What were the two main theories of imm

In [None]:
# Sanity check: print one randomly chosen (sentence, context) pair.
print('Sentence and Context\n')
sentence = random.choice(sentences)
sampled_text, sampled_context = sentence

print('Sentence: ')
pprint.pprint(sampled_text)

print('\nContext:\n')
pprint.pprint(sampled_context)
print()

Sentence and Context

Sentence: 
'Q stands for the Quaternary period.'

Context:

('The following four timelines show the geologic time scale. The first shows '
 'the entire time from the formation of the Earth to the present, but this '
 'compresses the most recent eon. Therefore, the second scale shows the most '
 'recent eon with an expanded scale. The second scale compresses the most '
 'recent era, so the most recent era is expanded in the third scale. Since the '
 'Quaternary is a very short period with short epochs, it is further expanded '
 'in the fourth scale. The second, third, and fourth timelines are therefore '
 'each subsections of their preceding timeline as indicated by asterisks. The '
 'Holocene (the latest epoch) is too small to be shown clearly on the third '
 'timeline on the right, another reason for expanding the fourth scale. The '
 'Pleistocene (P) epoch. Q stands for the Quaternary period.')



## Build the Index

In [None]:
# Load the multilingual Universal Sentence Encoder QA model (version 3) from TF Hub.
model_path = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3'
model = hub.load(model_path)

In [None]:
# First element of a pair: the sentence text.
sentences[0][0]

'Like other national British museums, entrance to the museum has been free since 2001.'

In [None]:
# Second element of a pair: the full paragraph the sentence was taken from.
sentences[0][1]

'The Victoria and Albert Museum (often abbreviated as the V&A), London, is the world\'s largest museum of decorative arts and design, housing a permanent collection of over 4.5 million objects. It was founded in 1852 and named after Queen Victoria and Prince Albert. The V&A is located in the Brompton district of the Royal Borough of Kensington and Chelsea, in an area that has become known as "Albertopolis" because of its association with Prince Albert, the Albert Memorial and the major cultural institutions with which he was associated. These include the Natural History Museum, the Science Museum and the Royal Albert Hall. The museum is a non-departmental public body sponsored by the Department for Culture, Media and Sport. Like other national British museums, entrance to the museum has been free since 2001.'

In [None]:
# Encode a single (sentence, context) pair with the model's 'response_encoder'
# signature; the result is a dict whose 'outputs' entry holds the embedding.
encodings = model.signatures['response_encoder'](input=tf.constant([sentences[0][0]]), context=tf.constant([sentences[0][1]]))
encodings

{'outputs': <tf.Tensor: shape=(1, 512), dtype=float32, numpy=
 array([[ 0.02583405,  0.01555862, -0.05111055, -0.05681932,  0.04022868,
         -0.02946803, -0.01213702, -0.02858561,  0.0354651 , -0.05452089,
          0.01888798,  0.01058078, -0.0103096 ,  0.01623675,  0.07251821,
          0.06424652,  0.03333166,  0.01099297,  0.01701182, -0.0356503 ,
         -0.01133491, -0.02006032,  0.00858851, -0.0201178 ,  0.0174555 ,
          0.04908156,  0.01050661,  0.0333305 , -0.0646935 , -0.06643648,
         -0.03217905, -0.06868497, -0.03999384,  0.03073232, -0.04177439,
         -0.0645042 , -0.00152715,  0.01870152, -0.0477081 ,  0.03902281,
         -0.07308477,  0.01935051,  0.03654603, -0.04118524,  0.07037194,
         -0.00198435,  0.00253207,  0.02224534,  0.05974026,  0.02193194,
          0.06376714, -0.03497445, -0.05021532,  0.06491258,  0.02049905,
          0.02547506, -0.01869895,  0.03226592, -0.02413476,  0.02551622,
         -0.0007823 , -0.06340646, -0.06956416, -0

In [None]:
# Embedding vector of the single sentence encoded above (row 0 of 'outputs').
encodings['outputs'][0]

<tf.Tensor: shape=(512,), dtype=float32, numpy=
array([ 0.02583405,  0.01555862, -0.05111055, -0.05681932,  0.04022868,
       -0.02946803, -0.01213702, -0.02858561,  0.0354651 , -0.05452089,
        0.01888798,  0.01058078, -0.0103096 ,  0.01623675,  0.07251821,
        0.06424652,  0.03333166,  0.01099297,  0.01701182, -0.0356503 ,
       -0.01133491, -0.02006032,  0.00858851, -0.0201178 ,  0.0174555 ,
        0.04908156,  0.01050661,  0.0333305 , -0.0646935 , -0.06643648,
       -0.03217905, -0.06868497, -0.03999384,  0.03073232, -0.04177439,
       -0.0645042 , -0.00152715,  0.01870152, -0.0477081 ,  0.03902281,
       -0.07308477,  0.01935051,  0.03654603, -0.04118524,  0.07037194,
       -0.00198435,  0.00253207,  0.02224534,  0.05974026,  0.02193194,
        0.06376714, -0.03497445, -0.05021532,  0.06491258,  0.02049905,
        0.02547506, -0.01869895,  0.03226592, -0.02413476,  0.02551622,
       -0.0007823 , -0.06340646, -0.06956416, -0.02872421,  0.02366752,
       -0.043759

In [None]:
# Length of one embedding vector.
len(encodings['outputs'][0])

512

In [None]:
# Create an empty nearest-neighbour index sized to the embedding length,
# using the 'angular' distance metric.
index = simpleneighbors.SimpleNeighbors(len(encodings['outputs'][0]), metric='angular')

In [None]:
batch_size = 100
# Slice by position so the final, shorter batch is kept: the previous
# zip(*(iter(sentences),) * batch_size) idiom stops after the last *full* group
# and silently left len(sentences) % batch_size sentences out of the index.
# A list (rather than a one-shot iterator) also lets the indexing cell be
# re-run without first re-running this one.
slices = [sentences[start:start + batch_size]
          for start in range(0, len(sentences), batch_size)]
num_batches = len(slices)
num_batches

104

In [None]:
# Encode every batch of (sentence, context) pairs and register each sentence
# in the index under its response embedding, then build the index.
response_encoder = model.signatures['response_encoder']
for pairs in tqdm(slices, total=num_batches):
  # Split the batch's pairs into two parallel lists.
  sentence_batch = [text for text, _ in pairs]
  context_batch = [ctx for _, ctx in pairs]
  encodings = response_encoder(
      input=tf.constant(sentence_batch),
      context=tf.constant(context_batch))
  vectors = encodings['outputs']
  for row, text in enumerate(sentence_batch):
    index.add_one(text, vectors[row])
index.build()

  0%|          | 0/104 [00:00<?, ?it/s]