In [1]:
import os
import bokeh
import bokeh.models
import bokeh.plotting

import numpy as np
import pandas as pd
import tensorflow_hub as hub
import sklearn.metrics.pairwise
import tensorflow.compat.v2 as tf

from tqdm import tqdm
from tqdm import trange
from simpleneighbors import SimpleNeighbors
from tensorflow_text import SentencepieceTokenizer

In [2]:
def visualize_similarity(embeddings_1, embeddings_2, labels_1, labels_2,
                         plot_title,
                         plot_width = 1200, plot_height = 600,
                         xaxis_font_size = '12pt', yaxis_font_size = '12pt'):

  assert len(embeddings_1) == len(labels_1)
  assert len(embeddings_2) == len(labels_2)

  # arccos based text similarity (Yang et al. 2019; Cer et al. 2019)
  sim = 1 - np.arccos(
      sklearn.metrics.pairwise.cosine_similarity(embeddings_1,
                                                 embeddings_2))/np.pi

  embeddings_1_col, embeddings_2_col, sim_col = [], [], []
  for i in range(len(embeddings_1)):
    for j in range(len(embeddings_2)):
      embeddings_1_col.append(labels_1[i])
      embeddings_2_col.append(labels_2[j])
      sim_col.append(sim[i][j])

  df = pd.DataFrame(zip(embeddings_1_col, embeddings_2_col, sim_col),
                    columns = ['embeddings_1', 'embeddings_2', 'sim'])

  mapper = bokeh.models.LinearColorMapper(
      palette = [*reversed(bokeh.palettes.YlOrRd[9])], low = df.sim.min(),
      high = df.sim.max())

  p = bokeh.plotting.figure(title = plot_title, x_range = labels_1,
                            x_axis_location = "above",
                            y_range = [*reversed(labels_2)],
                            plot_width = plot_width, plot_height = plot_height,
                            tools = "save",toolbar_location = 'below', tooltips = [
                                ('pair', '@embeddings_1 ||| @embeddings_2'),
                                ('sim', '@sim')])
  p.rect(x = "embeddings_1", y = "embeddings_2", width = 1, height = 1, source = df,
         fill_color = {'field': 'sim', 'transform': mapper}, line_color = None)

  p.title.text_font_size = '12pt'
  p.axis.axis_line_color = None
  p.axis.major_tick_line_color = None
  p.axis.major_label_standoff = 16
  p.xaxis.major_label_text_font_size = xaxis_font_size
  p.xaxis.major_label_orientation = 0.25 * np.pi
  p.yaxis.major_label_text_font_size = yaxis_font_size
  p.min_border_right = 300

  bokeh.io.output_notebook()
  bokeh.io.show(p)

In [3]:
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'

model = hub.load(module_url)

def embed_text(input):
  return model(input)

### Computing Text Embeddings

In [4]:
# Some texts of different lengths in different languages.
arabic_sentences = ['كلب', 'الجراء لطيفة.', 'أستمتع بالمشي لمسافات طويلة على طول الشاطئ مع كلبي.']
chinese_sentences = ['狗', '小狗很好。', '我喜欢和我的狗一起沿着海滩散步。']
english_sentences = ['dog', 'Puppies are nice.', 'I enjoy taking long walks along the beach with my dog.']
french_sentences = ['chien', 'Les chiots sont gentils.', 'J\'aime faire de longues promenades sur la plage avec mon chien.']
german_sentences = ['Hund', 'Welpen sind nett.', 'Ich genieße lange Spaziergänge am Strand entlang mit meinem Hund.']
italian_sentences = ['cane', 'I cuccioli sono carini.', 'Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.']
japanese_sentences = ['犬', '子犬はいいです', '私は犬と一緒にビーチを散歩するのが好きです']
korean_sentences = ['개', '강아지가 좋다.', '나는 나의 산책을 해변을 따라 길게 산책하는 것을 즐긴다.']
russian_sentences = ['собака', 'Милые щенки.', 'Мне нравится подолгу гулять по пляжу со своей собакой.']
spanish_sentences = ['perro', 'Los cachorros son agradables.', 'Disfruto de dar largos paseos por la playa con mi perro.']

# Multilingual example
multilingual_example = ["Willkommen zu einfachen, aber", "verrassend krachtige", "multilingüe", "compréhension du langage naturel", "модели.", "大家是什么意思" , "보다 중요한", ".اللغة التي يتحدثونها"]
multilingual_example_in_en =  ["Welcome to simple yet", "surprisingly powerful", "multilingual", "natural language understanding", "models.", "What people mean", "matters more than", "the language they speak."]

In [5]:
# Compute embeddings.
ar_result = embed_text(arabic_sentences)
en_result = embed_text(english_sentences)
es_result = embed_text(spanish_sentences)
de_result = embed_text(german_sentences)
fr_result = embed_text(french_sentences)
it_result = embed_text(italian_sentences)
ja_result = embed_text(japanese_sentences)
ko_result = embed_text(korean_sentences)
ru_result = embed_text(russian_sentences)
zh_result = embed_text(chinese_sentences)

multilingual_result = embed_text(multilingual_example)
multilingual_in_en_result = embed_text(multilingual_example_in_en)

### Visualizing Similarity

In [6]:
### Multilingual Similarity
visualize_similarity(multilingual_in_en_result, multilingual_result,
                     multilingual_example_in_en, multilingual_example,  "Multilingual Universal Sentence Encoder for Semantic Retrieval (Yang et al., 2019)")

In [7]:
### English-Arabic Similarity
visualize_similarity(en_result, ar_result, english_sentences, arabic_sentences, 'English-Arabic Similarity')

### Engish-Russian Similarity
visualize_similarity(en_result, ru_result, english_sentences, russian_sentences, 'English-Russian Similarity')

### English-Spanish Similarity
visualize_similarity(en_result, es_result, english_sentences, spanish_sentences, 'English-Spanish Similarity')

### English-Italian Similarity
visualize_similarity(en_result, it_result, english_sentences, italian_sentences, 'English-Italian Similarity')

### Italian-Spanish Similarity
visualize_similarity(it_result, es_result, italian_sentences, spanish_sentences, 'Italian-Spanish Similarity')

### English-Chinese Similarity
visualize_similarity(en_result, zh_result, english_sentences, chinese_sentences, 'English-Chinese Similarity')

### English-Korean Similarity
visualize_similarity(en_result, ko_result, english_sentences, korean_sentences, 'English-Korean Similarity')

### Chinese-Korean Similarity
visualize_similarity(zh_result, ko_result, chinese_sentences, korean_sentences, 'Chinese-Korean Similarity')

### Creating a Multilingual Semantic-Similarity Search Engine

In [8]:
# Download data to Index

corpus_metadata = [
    ('ar', 'ar-en.txt.zip', 'News-Commentary.ar-en.ar', 'Arabic'),
    ('zh', 'en-zh.txt.zip', 'News-Commentary.en-zh.zh', 'Chinese'),
    ('en', 'en-es.txt.zip', 'News-Commentary.en-es.en', 'English'),
    ('ru', 'en-ru.txt.zip', 'News-Commentary.en-ru.ru', 'Russian'),
    ('es', 'en-es.txt.zip', 'News-Commentary.en-es.es', 'Spanish'),
]

language_to_sentences = {}
language_to_news_path = {}
for language_code, zip_file, news_file, language_name in corpus_metadata:
  zip_path = tf.keras.utils.get_file(
      fname = zip_file,
      origin = 'http://opus.nlpl.eu/download.php?f=News-Commentary/v11/moses/' + zip_file,
      extract = True)
  news_path = os.path.join(os.path.dirname(zip_path), news_file)
  language_to_sentences[language_code] = pd.read_csv(news_path, sep = '\t', header = None)[0][:1000]
  language_to_news_path[language_code] = news_path

  print('{:,} {} sentences'.format(len(language_to_sentences[language_code]), language_name))


Downloading data from http://opus.nlpl.eu/download.php?f=News-Commentary/v11/moses/ar-en.txt.zip
1,000 Arabic sentences
Downloading data from http://opus.nlpl.eu/download.php?f=News-Commentary/v11/moses/en-zh.txt.zip
1,000 Chinese sentences
Downloading data from http://opus.nlpl.eu/download.php?f=News-Commentary/v11/moses/en-es.txt.zip
1,000 English sentences
Downloading data from http://opus.nlpl.eu/download.php?f=News-Commentary/v11/moses/en-ru.txt.zip
1,000 Russian sentences
1,000 Spanish sentences


In [9]:
# Transform sentences into vectors

batch_size = 2048
language_to_embeddings = {}
for language_code, zip_file, news_file, language_name in corpus_metadata:
  print('\nComputing {} embeddings'.format(language_name))
  with tqdm(total = len(language_to_sentences[language_code])) as pbar:
    for batch in pd.read_csv(language_to_news_path[language_code], sep = '\t',header = None, chunksize = batch_size):
      language_to_embeddings.setdefault(language_code, []).extend(embed_text(batch[0]))
      pbar.update(len(batch))

  0%|          | 0/1000 [00:00<?, ?it/s]


Computing Arabic embeddings


83178it [00:55, 1492.94it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]


Computing Chinese embeddings


69206it [00:33, 2049.92it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]


Computing English embeddings


238853it [01:09, 3434.46it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]


Computing Russian embeddings


190092it [01:01, 3086.28it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]


Computing Spanish embeddings


238819it [01:15, 3166.26it/s]


In [10]:
# Building an index of semantic vectors

num_index_trees = 40
language_name_to_index = {}
embedding_dimensions = len(list(language_to_embeddings.values())[0][0])
for language_code, zip_file, news_file, language_name in corpus_metadata:
  print('\nAdding {} embeddings to index'.format(language_name))
  index = SimpleNeighbors(embedding_dimensions, metric = 'dot')

  for i in trange(len(language_to_sentences[language_code])):
    index.add_one(language_to_sentences[language_code][i], language_to_embeddings[language_code][i])

  print('Building {} index with {} trees...'.format(language_name, num_index_trees))
  index.build(n = num_index_trees)
  language_name_to_index[language_name] = index

  0%|          | 0/1000 [00:00<?, ?it/s]


Adding Arabic embeddings to index


100%|██████████| 1000/1000 [03:13<00:00,  5.18it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Building Arabic index with 40 trees...

Adding Chinese embeddings to index


100%|██████████| 1000/1000 [03:15<00:00,  5.13it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Building Chinese index with 40 trees...

Adding English embeddings to index


100%|██████████| 1000/1000 [03:12<00:00,  5.19it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Building English index with 40 trees...

Adding Russian embeddings to index


100%|██████████| 1000/1000 [03:15<00:00,  5.13it/s]
  0%|          | 0/1000 [00:00<?, ?it/s]

Building Russian index with 40 trees...

Adding Spanish embeddings to index


100%|██████████| 1000/1000 [03:15<00:00,  5.11it/s]


Building Spanish index with 40 trees...
