<table class="tfo-notebook-buttons" align="left">
  <td>
    <a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/nlp/multilingual/labse-language-agnostic-bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>    
  </td>
  <td>
    <a href="https://github.com/martin-fabbri/colab-notebooks/blob/master/nlp/multilingual/labse-language-agnostic-bert.ipynb" target="_parent"><img src="https://raw.githubusercontent.com/martin-fabbri/colab-notebooks/master/assets/github.svg" alt="View On Github"/></a>  </td>
</table>

# LaBSE

The language-agnostic BERT sentence embedding (LaBSE) model supports 109 languages, encoding text into high-dimensional vectors.


## Outline

- [1. Multilingual sentence visualization](#1)
- [2. LaBSE model](#2)
- [3. Sentence similarity](#3)
- [References](#References)

In [1]:
#@title ### Setup environment
%%capture
!pip install bert-for-tf2
!pip install bokeh

In [20]:
#@title Imports & Visualization Utils
#@markdown - visualize_similarity(emb1, emb2, text1, text2)
import bert
import bokeh
import bokeh.models
import bokeh.plotting
import numpy as np
import pandas as pd
import sklearn.metrics.pairwise
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Lambda


def visualize_similarity(
    embeddings_1,
    embeddings_2,
    labels_1,
    labels_2,
    plot_title,
    plot_width=1200,
    plot_height=600,
    xaxis_font_size="12pt",
    yaxis_font_size="12pt",
):
    """Show a bokeh heatmap of the pairwise similarity of two embedding sets.

    The similarity of every (embeddings_1[i], embeddings_2[j]) pair is the
    angular similarity ``1 - arccos(cosine_similarity) / pi``.

    Args:
        embeddings_1: Embeddings plotted along the x axis, one per label.
        embeddings_2: Embeddings plotted along the y axis, one per label.
        labels_1: Labels for ``embeddings_1``; used as the x-axis categories.
        labels_2: Labels for ``embeddings_2``; used (reversed) as the y-axis
            categories.
        plot_title: Title of the figure.
        plot_width: Figure width passed to ``bokeh.plotting.figure``.
        plot_height: Figure height passed to ``bokeh.plotting.figure``.
        xaxis_font_size: Font size of the x-axis labels.
        yaxis_font_size: Font size of the y-axis labels.

    Returns:
        None. The figure is displayed inline via ``bokeh.io.show``.

    Raises:
        AssertionError: If the number of embeddings and labels differ.
    """
    assert len(embeddings_1) == len(labels_1)
    assert len(embeddings_2) == len(labels_2)

    cosine = sklearn.metrics.pairwise.cosine_similarity(
        embeddings_1, embeddings_2
    )
    # Floating-point rounding can push the cosine of (near-)identical vectors
    # marginally outside [-1, 1], where arccos returns NaN; clamp it first.
    cosine = np.clip(cosine, -1.0, 1.0)

    # arccos based text similarity (Yang et al. 2019; Cer et al. 2019)
    sim = 1 - np.arccos(cosine) / np.pi

    # Long-format table with one row per (label_1, label_2) cell of the heatmap.
    df = pd.DataFrame(
        [
            (labels_1[i], labels_2[j], sim[i][j])
            for i in range(len(embeddings_1))
            for j in range(len(embeddings_2))
        ],
        columns=["embeddings_1", "embeddings_2", "sim"],
    )

    # Colour scale spans exactly the observed similarity range.
    mapper = bokeh.models.LinearColorMapper(
        palette=[*reversed(bokeh.palettes.YlOrRd[9])],
        low=df.sim.min(),
        high=df.sim.max(),
    )

    p = bokeh.plotting.figure(
        title=plot_title,
        x_range=labels_1,
        x_axis_location="above",
        # Reversed so the first label of labels_2 ends up on the top row.
        y_range=[*reversed(labels_2)],
        plot_width=plot_width,
        plot_height=plot_height,
        tools="save",
        toolbar_location="below",
        tooltips=[("pair", "@embeddings_1 ||| @embeddings_2"), ("sim", "@sim")],
    )
    p.rect(
        x="embeddings_1",
        y="embeddings_2",
        width=1,
        height=1,
        source=df,
        fill_color={"field": "sim", "transform": mapper},
        line_color=None,
    )

    p.title.text_font_size = "12pt"
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_standoff = 16
    p.xaxis.major_label_text_font_size = xaxis_font_size
    p.xaxis.major_label_orientation = 0.25 * np.pi
    p.yaxis.major_label_text_font_size = yaxis_font_size
    p.min_border_right = 300

    bokeh.io.output_notebook()
    bokeh.io.show(p)

# Record the installed versions of the packages this notebook depends on.
!pip list | grep "tensorflow-hub\|bert\|bokeh"

bert-for-tf2                  0.14.9         
bokeh                         2.1.1          
tensorflow-hub                0.11.0         


In [4]:
# TF Hub handle of the LaBSE model loaded by hub.KerasLayer below.
module_url = "https://tfhub.dev/google/LaBSE/1"
# Fixed token length of every model input; shorter inputs are padded and
# longer ones truncated to this size in create_input.
MAX_SEQ_LENGTH = 64

def embed_text(input):
    """Run the LaBSE Keras model on already-prepared model inputs.

    Args:
        input: Whatever ``labse_model`` accepts when called; presumably the
            ``[input_word_ids, input_mask, segment_ids]`` triple produced by
            ``create_input`` — verify against the caller.

    Returns:
        The output of ``labse_model`` for ``input``.
    """
    # `labse_model` is the only model this notebook builds; the previous
    # reference to an undefined global `model` raised NameError when called.
    return labse_model(input)

In [5]:
# Load the LaBSE module from TF Hub as a Keras layer.
labse_layer = hub.KerasLayer(module_url, trainable=True)

# The three fixed-length int32 inputs fed to the layer; their content is
# built by create_input (token ids, attention mask, segment ids).
input_word_ids = Input(
    shape=(MAX_SEQ_LENGTH,), dtype=tf.int32, name="input_word_ids"
)

input_mask = Input(
    shape=(MAX_SEQ_LENGTH,), dtype=tf.int32, name="input_mask"
)

segment_ids = Input(
    shape=(MAX_SEQ_LENGTH,), dtype=tf.int32, name="segment_ids"
)

# The layer returns two outputs; only the first (pooled) one is used here.
pooled_output, _ = labse_layer([input_word_ids, input_mask, segment_ids])

# L2-normalize each embedding along axis 1 so that the dot product of two
# embeddings equals their cosine similarity.
pooled_output = Lambda(
    lambda x: tf.nn.l2_normalize(x, axis=1)
)(pooled_output)

# End-to-end model: tokenized inputs -> normalized sentence embedding.
labse_model = Model(
    inputs=[input_word_ids, input_mask, segment_ids],
    outputs=pooled_output
)

In [6]:
# Path of the vocabulary asset bundled with the loaded TF Hub module
# (a bytes object, as the printed output shows).
vocab_file = labse_layer.resolved_object.vocab_file.asset_path.numpy()
print("Loading LaBSE vocab file from:\n")
print(vocab_file)

Loading LaBSE vocab file from:

b'/tmp/tfhub_modules/0bc004950cec35a1a895621e888926cac85cbeba/assets/cased_vocab.txt'


## 2. LaBSE model

In [7]:
# Read the casing setting stored with the module so the tokenizer is
# configured the same way as the vocabulary it ships with.
do_lower_case = labse_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [8]:
# Example sentences in four languages. The lists are index-aligned: entry i
# of each list is intended to express roughly the same meaning.
english_sentences = [
    "dog",
    "Puppies are nice.",
    "I enjoy taking long walks along the beach with my dog.",
]

spanish_sentences = [
    "perro",
    "Los cachorros son hermosos.",
    "Yo disfruto tomar paseos largos por la playa con mi perro.",
]

italian_sentences = [
    "cane",
    "I cuccioli sono carini.",
    "Mi piace fare lunghe passeggiate lungo la spiaggia con il mio cane.",
]

japanese_sentences = ["犬", "子犬はいいです", "私は犬と一緒にビーチを散歩するのが好きです"]

In [9]:
def create_input(input_strings, tokenizer, max_seq_length=MAX_SEQ_LENGTH):
    """Convert sentences into fixed-length BERT-style model inputs.

    Each sentence is tokenized, wrapped in ``[CLS]`` ... ``[SEP]``, converted
    to ids and right-padded with 0 up to ``max_seq_length``.

    Args:
        input_strings: Iterable of sentences to encode.
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        max_seq_length: Length of every returned row.

    Returns:
        Tuple ``(input_ids, input_mask, segment_ids)`` of numpy arrays with
        one row of ``max_seq_length`` entries per sentence. ``input_mask`` is
        1 for real tokens and 0 for padding; ``segment_ids`` is all zeros.
    """
    # Reserve two positions for [CLS] and [SEP] and truncate the word pieces
    # *before* adding them, so over-long sentences keep their [SEP] terminator
    # instead of having it cut off.
    max_tokens = max(max_seq_length - 2, 0)

    input_ids_all, input_mask_all, segment_ids_all = [], [], []
    for input_string in input_strings:
        tokens = tokenizer.tokenize(input_string)[:max_tokens]
        input_tokens = ["[CLS]"] + tokens + ["[SEP]"]
        # The final slice only matters for degenerate max_seq_length < 2.
        input_ids = list(tokenizer.convert_tokens_to_ids(input_tokens))
        input_ids = input_ids[:max_seq_length]

        sequence_length = len(input_ids)
        padding = [0] * (max_seq_length - sequence_length)

        input_ids_all.append(input_ids + padding)
        input_mask_all.append([1] * sequence_length + padding)
        segment_ids_all.append([0] * max_seq_length)
    return (
        np.array(input_ids_all),
        np.array(input_mask_all),
        np.array(segment_ids_all),
    )

In [10]:
def encode(input_text):
    """Embed a batch of sentences with the LaBSE model.

    Args:
        input_text: Iterable of sentences, forwarded to ``create_input``.

    Returns:
        The output of ``labse_model`` for the tokenized sentences.
    """
    # create_input yields (input_ids, input_mask, segment_ids); the model is
    # called with the same three arrays as a list.
    model_inputs = create_input(input_text, tokenizer)
    return labse_model(list(model_inputs))

In [11]:
# One embedding per sentence, for each of the four languages.
english_embeddings = encode(english_sentences)
spanish_embeddings = encode(spanish_sentences)
italian_embeddings = encode(italian_sentences)
japanese_embeddings = encode(japanese_sentences)

## 3. Sentence similarity

English-Spanish similarity

In [12]:
# Pairwise dot products: rows are English sentences, columns are Spanish ones.
# The model L2-normalizes its output, so these are cosine similarities.
np.matmul(english_embeddings, np.transpose(spanish_embeddings))

array([[0.9416965 , 0.38210246, 0.4421917 ],
       [0.3524394 , 0.86077815, 0.3836411 ],
       [0.44596314, 0.44882995, 0.9537374 ]], dtype=float32)

In [17]:
# Heatmap of the same English/Spanish pairs, using the arccos-based
# similarity computed inside visualize_similarity.
visualize_similarity(
    english_embeddings,
    spanish_embeddings,
    english_sentences,
    spanish_sentences,
    "English-Spanish similarity",
    plot_width=900,
    plot_height=500
)

## References

### [Language-agnostic BERT Sentence Embedding](https://arxiv.org/abs/2007.01852)
Fangxiaoyu Feng, Yinfei Yang, Daniel Cer, Naveen Arivazhagan, Wei Wang. 2020. arXiv preprint arXiv:2007.01852

### [Dataset: News-Commentary](http://www.casmacat.eu/corpus/news-commentary.html)
J. Tiedemann, 2012, Parallel Data, Tools and Interfaces in OPUS. In Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)