# **Multilingual Universal Sentence Encoder Q&A Retrieval**

In [6]:
# Install tensorflow-text (pinned to 2.8.*) and the other packages this notebook needs.
%pip install -q "tensorflow-text==2.8.*"
%pip install -q simpleneighbors[annoy]
%pip install -q nltk
%pip install -q tqdm
%pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting ipywidgets
  Downloading ipywidgets-8.0.2-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 37.3 MB/s eta 0:00:01
[?25hCollecting jupyterlab-widgets~=3.0
  Downloading jupyterlab_widgets-3.0.3-py3-none-any.whl (384 kB)
[K     |████████████████████████████████| 384 kB 88.0 MB/s eta 0:00:01
Collecting widgetsnbextension~=4.0
  Downloading widgetsnbextension-4.0.3-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 85.5 MB/s eta 0:00:01
Installing collected packages: jupyterlab-widgets, widgetsnbextension, ipywidgets
Successfully installed ipywidgets-8.0.2 jupyterlab-widgets-3.0.3 widgetsnbextension-4.0.3
Note: you may need to restart the kernel to use updated package

In [5]:
import json
import nltk
import os
import pprint
import random
import simpleneighbors
import urllib
from IPython.display import HTML, display
from tqdm.notebook import tqdm

import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from tensorflow_text import SentencepieceTokenizer

# Fetch the 'punkt' tokenizer data at import time.
# NOTE(review): presumably required by nltk.tokenize.sent_tokenize, which
# extract_sentences_from_squad_json calls — confirm for the installed nltk version.
nltk.download('punkt')


def download_squad(url):
  """Fetch a JSON document from `url` and return it parsed.

  Args:
    url: Address handed to `urllib.request.urlopen`.

  Returns:
    Whatever `json.load` produces for the downloaded payload.
  """
  # A bare `import urllib` does not guarantee that the `request` submodule is
  # loaded, so import it explicitly to work on a fresh kernel.
  import urllib.request

  # Use the response as a context manager so the connection is closed
  # deterministically instead of being left to the garbage collector.
  with urllib.request.urlopen(url) as response:
    return json.load(response)

def extract_sentences_from_squad_json(squad):
  """Collect the distinct (sentence, context) pairs found in a SQuAD dict.

  Each paragraph's `context` is split with `nltk.tokenize.sent_tokenize`, and
  every resulting sentence is paired with the full context it came from.

  Args:
    squad: Mapping with the nesting `data -> paragraphs -> context`.

  Returns:
    A list of unique `(sentence, context)` tuples, in no particular order.
  """
  unique_pairs = set()
  for article in squad['data']:
    for para in article['paragraphs']:
      context = para['context']
      for sentence in nltk.tokenize.sent_tokenize(context):
        unique_pairs.add((sentence, context))
  # Going through a set is what removes duplicates.
  return list(unique_pairs)

def extract_questions_from_squad_json(squad):
  """Collect the distinct (question, first answer text) pairs in a SQuAD dict.

  Args:
    squad: Mapping with the nesting `data -> paragraphs -> qas`, where each
      `qas` entry has a `question` and a list of `answers` with a `text` key.

  Returns:
    A list of unique `(question, answer_text)` tuples, in no particular
    order. Entries whose `answers` list is empty are skipped.
  """
  unique_pairs = {
      (qa['question'], qa['answers'][0]['text'])
      for article in squad['data']
      for para in article['paragraphs']
      for qa in para['qas']
      if qa['answers']
  }
  return list(unique_pairs)

def output_with_highlight(text, highlight):
  """Render `text` as an HTML `<li>` with every `highlight` occurrence in bold.

  Occurrences are matched left to right without overlapping. Both the
  surrounding text and the highlighted spans are HTML-escaped so characters
  such as `<` or `&` show up literally instead of being parsed as markup.

  Args:
    text: The sentence to render.
    highlight: Substring to wrap in `<b>...</b>`. When empty (or otherwise
      falsy) nothing is highlighted.

  Returns:
    A string of the form `"<li> ...</li>\n"`.
  """
  import html  # local import keeps this block self-contained

  # An empty needle matches at index 0 forever, which made the previous
  # find-and-slice loop spin without terminating; treat it as "no highlight".
  if not highlight:
    return "<li> " + html.escape(text, quote=False) + "</li>\n"

  # Split on the raw text first and escape each piece afterwards, so that
  # matching is never confused by the entities escaping introduces.
  bold = '<b>' + html.escape(highlight, quote=False) + '</b>'
  pieces = [html.escape(piece, quote=False) for piece in text.split(highlight)]
  return "<li> " + bold.join(pieces) + "</li>\n"

def display_nearest_neighbors(query_text, answer_text=None):
  """Embed a question, look up its nearest sentences and display them as HTML.

  Relies on the module-level names `model`, `index` and `num_results`, which
  are assigned in other cells and must exist before this is called.

  Args:
    query_text: The question to encode with the model's `question_encoder`
      signature.
    answer_text: Optional reference answer. When given, it is shown under the
      question and each retrieved sentence is rendered through
      `output_with_highlight(sentence, answer_text)`.
  """
  import html  # local import keeps this block self-contained

  query_embedding = model.signatures['question_encoder'](tf.constant([query_text]))['outputs'][0]
  search_results = index.nearest(query_embedding, n=num_results)

  # Escape everything interpolated into markup here, so that characters such
  # as '<' or '&' in the data are displayed literally rather than parsed.
  safe_query = html.escape(query_text, quote=False)

  if answer_text:
    result_md = '''
    <p>Random Question from SQuAD:</p>
    <p>&nbsp;&nbsp;<b>%s</b></p>
    <p>Answer:</p>
    <p>&nbsp;&nbsp;<b>%s</b></p>
    ''' % (safe_query, html.escape(answer_text, quote=False))
  else:
    result_md = '''
    <p>Question:</p>
    <p>&nbsp;&nbsp;<b>%s</b></p>
    ''' % (safe_query,)

  # Close the paragraph before the list: an <ol> may not be nested in a <p>.
  result_md += '''
    <p>Retrieved sentences :</p>
    <ol>
  '''

  if answer_text:
    # output_with_highlight builds the complete <li> element for this branch.
    for s in search_results:
      result_md += output_with_highlight(s, answer_text)
  else:
    for s in search_results:
      result_md += '<li>' + html.escape(s, quote=False) + '</li>\n'

  result_md += "</ol>"
  display(HTML(result_md))

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
squad_url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json'

# Download and parse the dataset.
squad_json = download_squad(squad_url)

# A dangling `my_json = ` used to sit here; it was an incomplete statement
# (a SyntaxError for the whole cell) and the name was never used, so it is gone.

# `sentences` holds (sentence, context) pairs; `questions` holds
# (question, first answer text) pairs.
sentences = extract_sentences_from_squad_json(squad_json)
questions = extract_questions_from_squad_json(squad_json)
print("%s sentences, %s questions extracted from SQuAD %s" % (len(sentences), len(questions), squad_url))

# Show one random (sentence, context) pair as a sanity check.
print("\nExample sentence and context:\n")
sentence = random.choice(sentences)
print("sentence:\n")
pprint.pprint(sentence[0])
print("\ncontext:\n")
pprint.pprint(sentence[1])
print()

10452 sentences, 10552 questions extracted from SQuAD https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json

Example sentence and context:

sentence:

('Some episodes have been returned to the BBC from the archives of other '
 'countries who bought prints for broadcast, or by private individuals who '
 'acquired them by various means.')

context:

('Some episodes have been returned to the BBC from the archives of other '
 'countries who bought prints for broadcast, or by private individuals who '
 'acquired them by various means. Early colour videotape recordings made '
 'off-air by fans have also been retrieved, as well as excerpts filmed from '
 'the television screen onto 8 mm cine film and clips that were shown on other '
 'programmes. Audio versions of all of the lost episodes exist from home '
 'viewers who made tape recordings of the show. Short clips from every story '
 'with the exception of Marco Polo, "Mission to the Unknown" and The Massacre '
 "of St Bartholomew

In [7]:
# TF Hub handle of the multilingual Universal Sentence Encoder Q&A module.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3"
# Other cells use model.signatures['question_encoder'] and
# model.signatures['response_encoder'] from the object loaded here.
# NOTE(review): presumably downloads the module over the network on first use — confirm.
model = hub.load(module_url)

In [8]:
batch_size = 100

# Encode a single (sentence, context) pair up front, only to learn the
# embedding size needed to construct the index.
encodings = model.signatures['response_encoder'](
  input=tf.constant([sentences[0][0]]),
  context=tf.constant([sentences[0][1]]))
index = simpleneighbors.SimpleNeighbors(
    len(encodings['outputs'][0]), metric='angular')

print('Computing embeddings for %s sentences' % len(sentences))
# Walk the list in explicit slices. The previous zip(*(iter(...),) * n) idiom
# silently discarded the final partial batch, so the last
# len(sentences) % batch_size sentences were never added to the index.
num_batches = (len(sentences) + batch_size - 1) // batch_size  # ceiling division
for start in tqdm(range(0, len(sentences), batch_size), total=num_batches):
  batch = sentences[start:start + batch_size]
  response_batch = [r for r, c in batch]
  context_batch = [c for r, c in batch]
  encodings = model.signatures['response_encoder'](
    input=tf.constant(response_batch),
    context=tf.constant(context_batch)
  )
  # Each sentence is stored as the index item, keyed by its own embedding.
  for batch_index, response in enumerate(response_batch):
    index.add_one(response, encodings['outputs'][batch_index])

index.build()
print('simpleneighbors index for %s sentences built.' % len(sentences))

Computing embeddings for 10452 sentences


  0%|          | 0/104 [00:00<?, ?it/s]

simpleneighbors index for 10452 sentences built.


In [9]:
# display_nearest_neighbors reads `num_results` as a module-level name
# (passed as n= to index.nearest), so it must be set before the call below.
num_results = 25

# Each entry of `questions` is a (question, answer text) pair; pick one at
# random and show the retrieved sentences with the answer highlighted.
query = random.choice(questions)
display_nearest_neighbors(query[0], query[1])