<table class="tfo-notebook-buttons" align="left">
  <td>
    <a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/nlp/multilingual/labse-language-agnostic-bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>    
  </td>
  <td>
    <a href="https://github.com/martin-fabbri/colab-notebooks/blob/master/nlp/multilingual/labse-language-agnostic-bert.ipynb" target="_parent"><img src="https://raw.githubusercontent.com/martin-fabbri/colab-notebooks/master/assets/github.svg" alt="View On Github"/></a>  </td>
</table>

# LaBSE

The language-agnostic BERT sentence embedding support 109 languages encoding text into high dimensional vectors.


## Outline

- [1. LaBSE model](#1)
- [2. Visualize sentence similarity](#2)
- [3. LaBSE vs. Multilingual Sentence Encoder](#3)
- [4. References](#4)

In [34]:
#@title ### Setup environment
%%capture
!pip install bert-for-tf2
!pip install bokeh

In [42]:
#@title Imports & Visualization Utils
#@markdown - visualize_similarity(emb1, emb2, text1, text2)
import bert
import bokeh
import bokeh.models
import bokeh.plotting
import numpy as np
import pandas as pd
import sklearn.metrics.pairwise
import sys
import os
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Lambda

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

def visualize_similarity(
    embeddings_1,
    embeddings_2,
    labels_1,
    labels_2,
    plot_title,
    plot_width=1200,
    plot_height=600,
    xaxis_font_size="12pt",
    yaxis_font_size="12pt",
):

    assert len(embeddings_1) == len(labels_1)
    assert len(embeddings_2) == len(labels_2)

    # arccos based text similarity (Yang et al. 2019; Cer et al. 2019)
    sim = (
        1
        - np.arccos(
            sklearn.metrics.pairwise.cosine_similarity(
                embeddings_1, embeddings_2
            )
        )
        / np.pi
    )

    embeddings_1_col, embeddings_2_col, sim_col = [], [], []
    for i in range(len(embeddings_1)):
        for j in range(len(embeddings_2)):
            embeddings_1_col.append(labels_1[i])
            embeddings_2_col.append(labels_2[j])
            sim_col.append(sim[i][j])
    df = pd.DataFrame(
        zip(embeddings_1_col, embeddings_2_col, sim_col),
        columns=["embeddings_1", "embeddings_2", "sim"],
    )

    mapper = bokeh.models.LinearColorMapper(
        palette=[*reversed(bokeh.palettes.YlOrRd[9])],
        low=df.sim.min(),
        high=df.sim.max(),
    )

    p = bokeh.plotting.figure(
        title=plot_title,
        x_range=labels_1,
        x_axis_location="above",
        y_range=[*reversed(labels_2)],
        plot_width=plot_width,
        plot_height=plot_height,
        tools="save",
        toolbar_location="below",
        tooltips=[("pair", "@embeddings_1 ||| @embeddings_2"), ("sim", "@sim")],
    )
    p.rect(
        x="embeddings_1",
        y="embeddings_2",
        width=1,
        height=1,
        source=df,
        fill_color={"field": "sim", "transform": mapper},
        line_color=None,
    )

    p.title.text_font_size = "12pt"
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 16
    p.xaxis.major_label_text_font_size = xaxis_font_size
    p.xaxis.major_label_orientation = 0.25 * np.pi
    p.yaxis.major_label_text_font_size = yaxis_font_size
    p.min_border_right = 300

    bokeh.io.output_notebook()
    bokeh.io.show(p)

!pip list | grep "tensorflow-hub\|bert\|bokeh"

bert-for-tf2                  0.14.9         
bokeh                         2.1.1          
tensorflow-hub                0.11.0         


In [3]:
module_url = "https://tfhub.dev/google/LaBSE/1"
MAX_SEQ_LENGTH = 64

def embed_text(input):
    return model(input)

In [4]:
labse_layer = hub.KerasLayer(module_url, trainable=True)

input_word_ids = Input(
    shape=(MAX_SEQ_LENGTH,), dtype=tf.int32, name="input_word_ids"
)

input_mask = Input(
    shape=(MAX_SEQ_LENGTH,), dtype=tf.int32, name="input_mask"
)

segment_ids = Input(
    shape=(MAX_SEQ_LENGTH,), dtype=tf.int32, name="segment_ids"
)

pooled_output, _ = labse_layer([input_word_ids, input_mask, segment_ids])

pooled_output = Lambda(
    lambda x: tf.nn.l2_normalize(x, axis=1)
)(pooled_output)

labse_model = Model(
    inputs=[input_word_ids, input_mask, segment_ids],
    outputs=pooled_output
)

In [5]:
vocab_file = labse_layer.resolved_object.vocab_file.asset_path.numpy()
print("Loading LaBSE vocab file from:\n")
print(vocab_file)

Loading LaBSE vocab file from:

b'/tmp/tfhub_modules/0bc004950cec35a1a895621e888926cac85cbeba/assets/cased_vocab.txt'


<a name="1"></a>
## 1. LaBSE model

In [6]:
do_lower_case = labse_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [15]:
def create_input(input_strings, tokenizer, max_seq_length=MAX_SEQ_LENGTH):
    input_ids_all, input_mask_all, segment_ids_all = [], [], []
    for input_string in input_strings:
        input_tokens = ["[CLS]"] + tokenizer.tokenize(input_string) + ["[SEP]"]
        input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        sequece_length = min(len(input_ids), max_seq_length)

        if len(input_ids) >= max_seq_length:
            input_ids = input_ids[:max_seq_length]
        else:
            input_ids += [0] * (max_seq_length - sequece_length)

        input_mask = [1] * sequece_length + [0] * (
            max_seq_length - sequece_length
        )
        segment_ids = [0] * max_seq_length

        input_ids_all.append(input_ids)
        input_mask_all.append(input_mask)
        segment_ids_all.append(segment_ids)
    return (
        np.array(input_ids_all),
        np.array(input_mask_all),
        np.array(segment_ids_all),
    )

In [9]:
def encode(input_text):
    input_ids, input_mask, segment_ids = create_input(input_text, tokenizer)
    encoded_text = labse_model([input_ids, input_mask, segment_ids])
    return encoded_text

In [29]:
english_sentences = [
    # dogs
    "dog",
    "Puppies are nice.",
    "I really like dogs.",

    # expenses
    "I have to pay my bills.",
    "I must cut my expenses.",
    "I am not saving enough.",

    # travel
    "Travel leaves you speachless.",
    "Work, save, travel, repeat.",
    "The world is a book."
]

spanish_sentences = [
    # perros
    "perro",
    "Los cachorros son hermosos.",
    "Me gustan mucho los perros.",

    # gastos
    "Tengo que pagar mis cuentas.",
    "Debo limitar mis gastos.",
    "No estoy ahorrando lo suficiente.",

    # viajar
    "Viajar the deja sin palabras.",
    "Trabaja, ahorra, viaja, repetir.",
    "El mundo es un libro."

]

In [30]:
english_embeddings = encode(english_sentences)
spanish_embeddings = encode(spanish_sentences)

<a name="1"></a>
## 2. Visualize sentence similarity

English-Spanish similarity

In [31]:
np.matmul(english_embeddings, np.transpose(spanish_embeddings))

array([[0.94169664, 0.3821025 , 0.4760863 , 0.15467414, 0.09316662,
        0.08895118, 0.0912079 , 0.20456746, 0.2550047 ],
       [0.35243934, 0.86077833, 0.6258786 , 0.2005299 , 0.14329308,
        0.14446422, 0.129114  , 0.07418032, 0.2710706 ],
       [0.42326206, 0.6748208 , 0.93224126, 0.34993127, 0.31767827,
        0.30824906, 0.1894702 , 0.15512148, 0.29984328],
       [0.07580602, 0.22203556, 0.2923513 , 0.9203923 , 0.67320347,
        0.38553953, 0.1014867 , 0.23426145, 0.20725845],
       [0.03277462, 0.19185612, 0.25489172, 0.7552227 , 0.84708667,
        0.46313635, 0.15214604, 0.30809993, 0.13661781],
       [0.08451916, 0.19679993, 0.28389174, 0.39505637, 0.43518874,
        0.9278028 , 0.20504348, 0.2736635 , 0.19181563],
       [0.02940177, 0.16748562, 0.10989107, 0.01142392, 0.10201485,
        0.15255557, 0.7656269 , 0.25528654, 0.23659357],
       [0.22013086, 0.15794165, 0.21882798, 0.27358148, 0.28550065,
        0.22655389, 0.3208754 , 0.92591226, 0.29516527],


In [33]:
visualize_similarity(
    english_embeddings,
    spanish_embeddings,
    english_sentences,
    spanish_sentences,
    "English-Spanish similarity",
    plot_width=900,
    plot_height=500
)

<a name="3"></a>
## 3. LaBSE vs. Multilingual Universal Sentence Encoder

In [44]:
ml_use_module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
ml_use_model = hub.load(ml_use_module_url)

def ml_use_embed_text(input):
    return ml_use_model(input)

ml_use_en_embeddings = ml_use_embed_text(english_sentences)
ml_use_es_embeddings = ml_use_embed_text(spanish_sentences)

visualize_similarity(
    ml_use_en_embeddings,
    ml_use_es_embeddings,
    english_sentences,
    spanish_sentences,
    "Multilingual USE English-Spanish similarity",
    plot_width=900,
    plot_height=500
)













<a name="4"></a>
## 4. References

### [Language-agnostic BERT Sentence Embedding](https://arxiv.org/abs/2007.01852)
Fangxiaoyu Feng, Yinfei Yang, Daniel Cer, Naveen Arivazhagan, Wei Wang. 2020. arXiv preprint arXiv:2007.01852