In [None]:
!ls /notebooks

In [None]:
!pip list > /notebooks/requirements.txt

In [None]:
!pip install transformers sentencepiece datasets tensorflow_text einops subword-nmt langid tensorrt --no-binary :all: tensorflow

# Neural Machine Translation for English and Russian: A BERT-Based Approach


---




## Mathematical knowledge for Text-Data Manipulation

### initial step: imports and necessary tools

In [None]:
import langid
import random
import os
import pathlib

import typing
from typing import Any, Tuple

import string
import re
from string import punctuation
import nltk

import numpy as np
import pandas as pd

import einops
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import transformers as trns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tensorflow.keras.layers import TextVectorization
import tensorflow_text as tf_text

In [None]:
#@title
class ShapeChecker():
  """Checks that named tensor axes keep the same size across calls.

  The first time an axis name is seen its size is cached; every later call
  that uses the same name must report the same size.
  """
  def __init__(self):
    # Keep a cache of every axis-name seen
    self.shapes = {}

  def __call__(self, tensor, names, broadcast=False):
    """Validate `tensor` against the cached axis sizes.

    Args:
      tensor: tensor whose shape is parsed with `einops.parse_shape`.
      names: axis-name pattern handed to `einops.parse_shape`.
      broadcast: when True, axes of size 1 are neither checked nor cached.

    Raises:
      ValueError: if an axis size differs from the size cached for its name.
    """
    if not tf.executing_eagerly():
      # The check only runs eagerly; outside eager mode the call is a no-op.
      return

    parsed = einops.parse_shape(tensor, names)

    for name, new_dim in parsed.items():
      old_dim = self.shapes.get(name, None)

      if (broadcast and new_dim == 1):
        continue

      if old_dim is None:
        # If the axis name is new, add its length to the cache.
        self.shapes[name] = new_dim
        continue

      if new_dim != old_dim:
        raise ValueError(f"Shape mismatch for dimension: '{name}'\n"
                         f"    found: {new_dim}\n"
                         f"    expected: {old_dim}\n")

### HYPERPARAMETERS AND CONSTANTS

In [None]:
MAX_TOKENS_LENGTH = 128
BUFFER_SIZE=None
BATCH_SIZE=64

MAX_VOCAB_SIZE = 150000 # MAXIMUM AMOUNT OF WORDS
UNITS = 256

NUM_HEADS = 8
DENSE_LAYER_NEURONS = 2048
NUM_LAYER = 6
DROPOUT_RATE=0.1

en_vocab_size=None
ru_vocab_size=None

max_subword_length = 10

reserved_tokens = ["[START]", "[END]", "[UNK]", "[SEP]"]

### Data Collection:
Collect a large parallel corpus of text data for the language pairs that you want to translate. A parallel corpus contains sentences in one language and their corresponding translations in another language. You can use existing parallel corpora or create your own.

In [None]:
# path_to_file = pathlib.Path("/content/drive/MyDrive/rus.txt")
# # corpus = pathlib.Path("/content/drive/MyDrive/corpus")

# rus_path = "/content/drive/MyDrive/corpus.en_ru.1m.ru"
# en_path = "/content/drive/MyDrive/corpus.en_ru.1m.en"

In [None]:
rus_path = "../datasets/yandex_rus/corpus.en_ru.1m.ru"
en_path = "../datasets/yandex_en/corpus.en_ru.1m.en"

In [None]:
langid.set_languages(['en', 'ru'])

### Data Preprocessing:
Preprocess the data by cleaning, tokenizing, and normalizing the text. Thereafter, build a vocabulary from the result.

In [None]:
def lower_and_split_punct(text):
    """Lower-case, filter and space out punctuation, then wrap in [START]/[END].

    Args:
        text: string tensor accepted by the `tf.strings` ops used below.

    Returns:
        String tensor of the form "[START] <cleaned text> [END]".
    """
    # Character normalisation (currently disabled).
    # text = tf_text.normalize_utf8(text, "NFKD")
    text = tf.strings.lower(text, encoding="utf-8")
    # Keep spaces, the letters a-z and а-я (plus ё, й, ъ, ь) and the selected punctuation.
    text = tf.strings.regex_replace(text, '[^ a-zа-я.?!,¿ёйъь-]', '')
    # Put a space on both sides of every punctuation mark so that it is
    # separated from the word before it and the word after it.
    text = tf.strings.regex_replace(text, r'([.?!,¿-])', r' \1 ')
    # Strip leading and trailing whitespace.
    text = tf.strings.strip(text)

    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
    return text

In [None]:
txt = tf.constant("Последний я ё ъь")

In [None]:
lower_and_split_punct(txt).numpy().decode("utf-8")

##### Dataset with parallel sentences sized 467000

In [None]:
def load_data(path):
    """Read a headerless tab-separated file of sentence pairs.

    The first column is treated as English and the second as Russian; any
    further columns are dropped. The first rows are printed as a preview.

    Args:
        path: location of the tab-separated file.

    Returns:
        (target, context): arrays of the Russian and English column values,
        in that order.
    """
    frame = pd.read_csv(path, delimiter='\t', header=None, encoding='utf8')
    pairs = frame.iloc[:, :2].rename(columns={0: "English", 1: "Russian"})
    print(pairs.head())
    return pairs['Russian'].values, pairs['English'].values

In [None]:
target_raw, context_raw = load_data(path_to_file)

In [None]:
print(len(context_raw))
print(len(target_raw))

In [None]:
context_raw[6]

In [None]:
target_raw[898]

In [None]:
# a = np.vectorize(lower_and_split_punct)
# target_raw = a(target_raw)
# context_raw = a(context_raw)

In [None]:
len(context_raw)//64

In [None]:
# BUFFER_SIZE = len(context_raw)
# # BUFFER_SIZE = 20000
# BATCH_SIZE = 64

# is_train = np.random.uniform(size=(len(target_raw),)) < 0.8

# train_raw = (
#     tf.data.Dataset
#     .from_tensor_slices((context_raw[is_train], target_raw[is_train]))
#     .shuffle(BUFFER_SIZE)
#     .batch(BATCH_SIZE))
# val_raw = (
#     tf.data.Dataset
#     .from_tensor_slices((context_raw[~is_train], target_raw[~is_train]))
#     .shuffle(BUFFER_SIZE)
#     .batch(BATCH_SIZE))

In [None]:
# for example_context_strings, example_target_strings in train_raw.take(1):
#   # print(example_context_strings[:5])
#   # print(example_context_strings[1])
#   # print(example_target_strings[1])
#   # print(example_target_strings[:5])
#   print(len(example_context_strings))
#   print(len(example_target_strings))
#   # break

##### Dataset with parallel sentences sized 1000000 from Yandex Parallel Corpus

In [None]:
def load_data_yandex_corpus(rus_path, en_path):
    """Load line-aligned Russian/English corpus files and drop very short pairs.

    The Russian file is read in binary mode, so its lines stay as UTF-8
    encoded `bytes`; the English file is read as text. Line terminators are
    kept. A pair survives only when both lines have length >= 4 (bytes for
    Russian, characters for English, terminator included).

    Args:
        rus_path: path of the Russian side of the corpus.
        en_path: path of the English side of the corpus.

    Returns:
        (target, context): NumPy arrays with the kept Russian lines and the
        kept English lines, in that order.
    """
    with open(rus_path, 'rb') as ru_handle:
        ru_lines = ru_handle.readlines()
    with open(en_path, 'r', encoding='utf-8') as en_handle:
        en_lines = en_handle.readlines()

    # Going through a DataFrame keeps the two sides as aligned columns; the
    # constructor is expected to reject files with different line counts.
    frame = pd.DataFrame({'Russian': ru_lines, 'English': en_lines})

    kept_pairs = [
        (en_line, ru_line)
        for en_line, ru_line in zip(frame['English'].values, frame['Russian'].values)
        if len(en_line) >= 4 and len(ru_line) >= 4
    ]

    # A langid-based language filter used to follow here; it is disabled.
    context = np.array([en_line for en_line, _ in kept_pairs])
    target = np.array([ru_line for _, ru_line in kept_pairs])
    return target, context

In [None]:
target_raw_corpus, context_raw_corpus = load_data_yandex_corpus(rus_path, en_path)

In [None]:
print(len(target_raw_corpus))
print(len(context_raw_corpus))

In [None]:
target_raw_corpus[500].decode("utf-8")

In [None]:
context_raw_corpus[500]

#### Combining datasets

In [None]:
target_raw_corpus, context_raw_corpus = np.concatenate((target_raw_corpus, target_raw)), np.concatenate((context_raw_corpus, context_raw))

In [None]:
# Generate a permutation index
permutation = np.random.permutation(len(target_raw_corpus))

# Shuffle both arrays using the permutation index
target_raw_corpus = target_raw_corpus[permutation]
context_raw_corpus = context_raw_corpus[permutation]

#### Subword different approaches

In [None]:
from transformers import BertTokenizer

rus_path = "/notebooks/trg.txt.subwords"
en_path = "/notebooks/ctx.txt.subwords"

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
print()
# Build the subword vocabulary
tokenizer.build_vocab(rus_path, en_path)

# Save the vocabulary
vocab_path = "bert_subword_vocab.txt"
tokenizer.save_vocabulary(vocab_path)

# Example usage
text = "Hello, how are you?"
encoded_tokens = tokenizer.encode(text)
decoded_text = tokenizer.decode(encoded_tokens)

print(decoded_text)
print(encoded_tokens)


In [None]:
BUFFER_SIZE = len(context_raw_corpus)
max_vocab_size = 100000 
BATCH_SIZE = 1

is_train = np.random.uniform(size=(len(target_raw_corpus),)) < 0.8

context_raw_train = context_raw_corpus[is_train]
context_raw_val = context_raw_corpus[~is_train]

target_raw_train = target_raw_corpus[is_train]
target_raw_val = target_raw_corpus[~is_train]

# train_raw = (
#     tf.data.Dataset
#     .from_tensor_slices((context_raw_train, target_raw_train))
#     .shuffle(BUFFER_SIZE)
#     .batch(BATCH_SIZE))
# val_raw = (
#     tf.data.Dataset
#     .from_tensor_slices((context_raw_val, target_raw_val))
#     .shuffle(BUFFER_SIZE)
#     .batch(BATCH_SIZE))

train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_train, target_raw_train)))
    # .shuffle(BUFFER_SIZE))
    # .batch(BATCH_SIZE))
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_val, target_raw_val)))
    # .shuffle(BUFFER_SIZE))
    # .batch(BATCH_SIZE))

In [None]:
for en, ru in train_raw.take(1):
    print(en.numpy().decode("utf-8"))
    print()
    print(ru.numpy().decode("utf-8"))

In [None]:
train_raw_ds = train_raw.map(lambda context, target: (lower_and_split_punct(context), lower_and_split_punct(target)), tf.data.AUTOTUNE)
val_raw_ds = val_raw.map(lambda context, target: (lower_and_split_punct(context), lower_and_split_punct(target)), tf.data.AUTOTUNE)

In [None]:
bert_tokenizer_params=dict(lower_case=True)
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = 100000,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

In [None]:
rus_path = "../datasets/yandex_rus/corpus.en_ru.1m.ru"
en_path = "../datasets/yandex_en/corpus.en_ru.1m.en"

In [None]:
import tensorflow as tf
import tensorflow_text as text

# Read the context and target files
ctx_file = "ctx.txt.subwords"
trg_file = "trg.txt.subwords"

with open(ctx_file, "r") as f:
    ctx_text = f.read()

with open(trg_file, "r") as f:
    trg_text = f.read()

# Load the BERT tokenizer
tokenizer = text.BertTokenizer()

# Tokenize the text
ctx_tokens = tokenizer.tokenize(ctx_text)
trg_tokens = tokenizer.tokenize(trg_text)

# Convert tokens to integer sequences
ctx_token_ids = tokenizer.convert_tokens_to_ids(ctx_tokens)
trg_token_ids = tokenizer.convert_tokens_to_ids(trg_tokens)

# Create TensorFlow datasets
ctx_dataset = tf.data.Dataset.from_tensor_slices(ctx_token_ids)
trg_dataset = tf.data.Dataset.from_tensor_slices(trg_token_ids)

# Print some example token sequences
for ctx_tokens, trg_tokens in zip(ctx_dataset.take(5), trg_dataset.take(5)):
    print("Context tokens:", ctx_tokens)
    print("Target tokens:", trg_tokens)
    print()


In [None]:
rus_tokenizer = tf_text.BertTokenizer(rus_path, **bert_tokenizer_params)
en_tokenizer = tf_text.BertTokenizer(en_path, **bert_tokenizer_params)

In [None]:
# for en, ru in train_raw_ds.take(1):
#     print(en.numpy().decode("utf-8"))
#     print()
#     print(ru.numpy().decode("utf-8"))

In [None]:
# tokenizer_en = tf_text.BertTokenizer(en_path)

In [None]:
tokenizer = trns.BertTokenizer.from_pretrained('bert-base-uncased')

text = "ты очень добрый"
tokens = tokenizer.tokenize(text)
print(tokens)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

print(input_ids)

In [None]:
for en, ru in train_raw_ds.take(1):
    en_tokens = tokenizer.tokenize(en.numpy().decode("utf-8"))
    print("English text: ", en.numpy().decode("utf-8"))
    print("Tokens English: ", en_tokens)
    print("English ids: ", tokenizer.convert_tokens_to_ids(en_tokens))
    print()
    rus_tokens = tokenizer.tokenize(ru.numpy().decode("utf-8"))
    print("Russian text: ", ru.numpy().decode("utf-8"))
    print("Tokens Russian: ", rus_tokens)
    print("Russian ids: ", tokenizer.convert_tokens_to_ids(rus_tokens))

In [None]:
# [item for item in dir(tokenizer) if not item.startswith('_')]

In [None]:
[item for item in dir(tokenizer) if not item.startswith('_')]

#### Subword tokenization

In [None]:
BUFFER_SIZE = len(context_raw_corpus)

is_train = np.random.uniform(size=(len(target_raw_corpus),)) < 0.8

context_raw_train = context_raw_corpus[is_train]
context_raw_val = context_raw_corpus[~is_train]

target_raw_train = target_raw_corpus[is_train]
target_raw_val = target_raw_corpus[~is_train]

train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_train, target_raw_train)))
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_val, target_raw_val)))

In [None]:
train_raw_ds = train_raw.map(lambda context, target: (lower_and_split_punct(context), lower_and_split_punct(target)), tf.data.AUTOTUNE)
val_raw_ds = val_raw.map(lambda context, target: (lower_and_split_punct(context), lower_and_split_punct(target)), tf.data.AUTOTUNE)

In [None]:
for cnt, en in train_raw_ds.take(1):
    print(context_subword_processor.encode(cnt.numpy()))
    print(len(context_subword_processor.encode(en.numpy().decode("utf-8"))))

In [None]:
num_elements = 1

# Take the first num_elements from the train_ds dataset
sample_elements = train_raw_ds.take(num_elements)

# Iterate over the sample_elements dataset and print the context and target
for context, target in sample_elements:
    print("Context:", context)
    print("Target:", target.numpy().decode("utf-8"))
    # print("Context:", len(context))
    # print("Target:", len(target))
    print()

In [None]:
%%time
context_subword_processor = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus( # convertation of a text to encoded tokens using subword
    (context.numpy() for context, target in train_raw_ds),
    target_vocab_size=MAX_VOCAB_SIZE,
    reserved_tokens=reserved_tokens,
    max_subword_length=max_subword_length)

In [None]:
!ls /notebooks

In [None]:
context_subword_processor.save_to_file("/notebooks/ctx")

In [None]:
%%time
target_subword_processor = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (target.numpy() for context, target in train_raw_ds),
    target_vocab_size=MAX_VOCAB_SIZE,
    reserved_tokens=reserved_tokens,
    max_subword_length=max_subword_length)

In [None]:
# Persist the *target* encoder under the "trg" prefix (the context encoder is
# saved separately under "ctx").
target_subword_processor.save_to_file("/notebooks/trg")

In [None]:
%%time
context_encoded_train = [context_subword_processor.encode(s.numpy()) for s, t in train_raw]
context_encoded_val = [context_subword_processor.encode(s.numpy()) for s, t in val_raw]

In [None]:
%%time
target_encoded_train = [target_subword_processor.encode(t.numpy()) for s, t in train_raw]
target_encoded_val = [target_subword_processor.encode(t.numpy()) for s, t in val_raw]

In [None]:
start_text = "[START]"
end_text = "[END]"

In [None]:
context_subword_processor.subwords.insert(0, start_text)
context_subword_processor.subwords.insert(1, end_text)
target_subword_processor.subwords.insert(0, start_text)
target_subword_processor.subwords.insert(1, end_text)

In [None]:
start_token = "START"
end_token = "END"

for subword in context_subword_processor.subwords:
    if subword == start_token:
        print(f"Found '{start_token}' token.")
    elif subword == end_token:
        print(f"Found '{end_token}' token.")

In [None]:
[item for item in dir(context_subword_processor) if not item.startswith('_')]

In [None]:
[item for item in dir(target_subword_processor) if not item.startswith('_')]

In [None]:
START_TOKEN, END_TOKEN = [0], [1] # adding additional tokens to start and end a sentence

In [None]:
print(context_subword_processor.vocab_size)
print(target_subword_processor.vocab_size)

In [None]:
context_subword_processor.subwords[150]

In [None]:
target_subword_processor.subwords[150]

In [None]:
# context_vocab_size = context_subword_processor.vocab_size + 2
# target_vocab_size = target_subword_processor.vocab_size + 2

In [None]:
def prepare_dataset(context, target):
    """Encode one (context, target) sentence pair into padded token-id arrays.

    Both sentences are subword-encoded, then padded or truncated (at the end)
    to MAX_TOKENS_LENGTH ids. The target is split into a decoder input and a
    label sequence that are shifted by one position relative to each other.

    Args:
        context: eager scalar string tensor holding the source sentence.
        target: eager scalar string tensor holding the target sentence.

    Returns:
        ((context_tensor, target_tensor_in), target_tensor_out): int64 arrays
        with a leading axis of size 1, as produced by `pad_sequences`.
    """
    # `.numpy()` is only available on eager tensors; to use this function in
    # `Dataset.map` it has to be wrapped with `tf.py_function`.
    context_ids = context_subword_processor.encode(context.numpy())
    target_ids = target_subword_processor.encode(target.numpy())

    # pad_sequences operates on a batch, so wrap each id list in a 1-element list.
    context_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        [context_ids], maxlen=MAX_TOKENS_LENGTH, dtype="int64",
        padding='post', truncating='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        [target_ids], maxlen=MAX_TOKENS_LENGTH, dtype="int64",
        padding='post', truncating='post')

    # Decoder input drops the last id, the labels drop the first one.
    target_tensor_in = target_tensor[:, :-1]
    target_tensor_out = target_tensor[:, 1:]
    return (context_tensor, target_tensor_in), target_tensor_out

In [None]:
tf.ragged.constant(target_encoded_train[:10])

In [None]:
def prepare_dataset(context: list, target: list):
    """Turn batches of token-id lists into dense, zero-padded tensors.

    Args:
        context: list of token-id lists for the source sentences.
        target: list of token-id lists for the target sentences.

    Returns:
        ((context_tensor, target_in), target_out), where `target_in` is the
        padded target without its last column and `target_out` is the padded
        target without its first column.
    """
    # Each ragged batch is padded with 0 up to its own longest row; there is
    # no truncation to a fixed maximum length here.
    dense_context = tf.ragged.constant(context).to_tensor(default_value=0)
    dense_target = tf.ragged.constant(target).to_tensor(default_value=0)

    decoder_inputs = dense_target[:, :-1]
    labels = dense_target[:, 1:]
    return (dense_context, decoder_inputs), labels

In [None]:
len(target_encoded_train[0])

In [None]:
len(context_encoded_train[0])

In [None]:
# Smoke-test the list-based `prepare_dataset` defined above on 15 examples.
a = prepare_dataset(context_encoded_train[:15], target_encoded_train[:15])

In [None]:
print(a[0][0].shape)
print(a[0][1].shape)
print(a[1].shape)

In [None]:
def seq_dataset(context: list, target: list):
    """Build a shuffled, mapped, batched and prefetched dataset from two sequences.

    NOTE: a later cell in this notebook defines `seq_dataset` again with a
    single `ds` argument; whichever cell ran last is the one in effect.

    Args:
        context: source-side items, sliced together with `target`.
        target: target-side items.

    Returns:
        A `tf.data.Dataset` of `prepare_dataset` outputs in batches of BATCH_SIZE.
    """
    pairs = tf.data.Dataset.from_tensor_slices((context, target))
    pairs = pairs.shuffle(BUFFER_SIZE)
    pairs = pairs.map(prepare_dataset, tf.data.AUTOTUNE)
    pairs = pairs.batch(BATCH_SIZE)
    return pairs.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
def seq_dataset(ds):
    """Map, shuffle, batch and prefetch an existing dataset of sentence pairs.

    NOTE: this replaces the earlier two-argument `seq_dataset` defined in the
    preceding cell of this notebook.

    Args:
        ds: a `tf.data.Dataset` whose elements are passed to `prepare_dataset`.

    Returns:
        The transformed dataset in batches of BATCH_SIZE.
    """
    prepared = ds.map(prepare_dataset, tf.data.AUTOTUNE)
    shuffled = prepared.shuffle(BUFFER_SIZE)
    batched = shuffled.batch(BATCH_SIZE)
    return batched.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
%%time
train_ds = seq_dataset(context_encoded_train, target_encoded_train)
val_ds = seq_dataset(context_encoded_val, target_encoded_val)

In [None]:
%%time
train_ds = seq_dataset(train_raw_ds)
val_ds = seq_dataset(val_raw_ds)

In [None]:
for (ex_context_tok, ex_tar_in), ex_tar_out in train_ds.take(1):
  print(ex_context_tok[0, :].numpy())
  print()
  print(ex_tar_in[0, :].numpy())
  print(ex_tar_out[0, :].numpy())

In [None]:
print(context_subword_processor.decode(ex_context_tok[0, :]))
print(target_subword_processor.decode(ex_tar_in[0, :]))
print(target_subword_processor.decode(ex_tar_out[0, :]))

#### Word tokenization

In [None]:
BUFFER_SIZE = len(context_raw_corpus)

is_train = np.random.uniform(size=(len(target_raw_corpus),)) < 0.8

train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_corpus[is_train], target_raw_corpus[is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_corpus[~is_train], target_raw_corpus[~is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))

In [None]:
for example_context_strings, example_target_strings in train_raw.take(1):
  # print(example_context_strings[:5])
  # print(example_context_strings[1])
  # print(example_target_strings[1])
  # print(example_target_strings[:5])
  print(len(example_context_strings))
  print(len(example_target_strings))
  # break

##### Tokenizing context words, in our case english words

In [None]:
context_text_processor = tf.keras.layers.TextVectorization(
    standardize=lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)

In [None]:
context_text_processor.adapt(train_raw.map(lambda context, target: context, tf.data.AUTOTUNE).prefetch(buffer_size=tf.data.AUTOTUNE))

# Here are the first 5 words from the vocabulary:
context_text_processor.get_vocabulary()[:5]

##### Tokenizing target words, in our case russian words

In [None]:
len(context_text_processor.get_vocabulary())

In [None]:
target_text_processor = tf.keras.layers.TextVectorization(
    standardize=lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)

target_text_processor.adapt(train_raw.map(lambda context, target: target, tf.data.AUTOTUNE).prefetch(buffer_size=tf.data.AUTOTUNE))
target_text_processor.get_vocabulary()[:10]

In [None]:
# target_text_processor.get_vocabulary()

In [None]:
len(target_text_processor.get_vocabulary())

We split our parallel sentences into batches of 64. Each text processor builds a vocabulary that assigns every word its own integer ID.
Below, one batch of 64 sentences is passed through the vectorizer to show the result:

In [None]:
example_tokens_c = context_text_processor(example_context_strings)
example_tokens_c[:3, :]

In [None]:
example_tokens_c.to_tensor()

In [None]:
example_target_strings[:3]

In [None]:
example_tokens_t = target_text_processor(example_target_strings)
example_tokens_t[:3, :]

In [None]:
context_vocab = np.array(context_text_processor.get_vocabulary())
tokens_c = context_vocab[example_tokens_c[0].numpy()]
' '.join(tokens_c)

In [None]:
target_vocab = np.array(target_text_processor.get_vocabulary())
tokens_t = target_vocab[example_tokens_t[0].numpy()]
' '.join(tokens_t)

## Processing the dataset

The `Dataset`s of strings are transformed into 0-padded tensors of token IDs by the `process_text` function below. For training with `keras.Model.fit`, it also converts each `(context, target)` pair into a `((context, target_in), target_out)` pair. Keras expects `(inputs, labels)` pairs: the inputs are `(context, target_in)` and the labels are `target_out`. `target_in` and `target_out` are the same sequence shifted by one step relative to each other, so that at each position the label is the next token.

#### Processing with subword tokenization

In [None]:
def process_text(context, target):
  """Subword variant: map a (context, target) pair to ((context, targ_in), targ_out).

  `targ_in` is the encoded target without its last position and `targ_out`
  is the encoded target without its first position.
  """
  # NOTE(review): this assumes `SubwordTextEncoder.encode` returns a ragged
  # tensor for a batch of strings. It presumably returns a plain Python list
  # for a single string, in which case `.to_tensor()` would fail. TODO confirm.
  context_ids = context_subword_processor.encode(context).to_tensor()
  target_ids = target_subword_processor.encode(target)
  decoder_inputs = target_ids[:,:-1].to_tensor()
  labels = target_ids[:,1:].to_tensor()
  return (context_ids, decoder_inputs), labels


train_ds = train_raw.map(process_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, tf.data.AUTOTUNE)

In [None]:
en_vocab_size = context_subword_processor.vocab_size
ru_vocab_size = target_subword_processor.vocab_size

#### Processing with word tokenization

##### Processing

In [None]:
def process_text(context, target):
  """Word variant: map a (context, target) batch to ((context, targ_in), targ_out).

  Both sides are vectorised with their text processors and converted to
  dense tensors with `.to_tensor()`. `targ_in` is the target without its
  last position; `targ_out` is the target without its first position.
  """
  context_ids = context_text_processor(context).to_tensor()
  target_ids = target_text_processor(target)
  decoder_inputs = target_ids[:,:-1].to_tensor()
  labels = target_ids[:,1:].to_tensor()
  return (context_ids, decoder_inputs), labels


train_ds = train_raw.map(process_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, tf.data.AUTOTUNE)

targ_in for all words except last one

targ_out for all except first one


In [None]:
for (ex_context_tok, ex_tar_in), ex_tar_out in train_ds.take(1):
  print(ex_context_tok[0, :].numpy())
  print()
  print(ex_tar_in[0, :].numpy())
  print(ex_tar_out[0, :].numpy())

In [None]:
en_vocab_size = len(context_text_processor.get_vocabulary())
ru_vocab_size = len(target_text_processor.get_vocabulary())

## Structure and What are transformers and attention used for?

##### Positional Embedding

In [None]:
def positional_encoding(length, depth):
  """Build a sinusoidal position table.

  Args:
    length: number of positions (rows) in the table.
    depth: encoding width; half of the columns hold sines and the other
      half hold cosines.

  Returns:
    A float32 tensor whose first half of the last axis contains the sines
    and whose second half contains the cosines of the position angles.
  """
  half_depth = depth/2

  pos = np.arange(length)[:, np.newaxis]                           # (seq, 1)
  scaled_index = np.arange(half_depth)[np.newaxis, :]/half_depth  # (1, depth/2)

  # One frequency per column, from 1 down towards 1/10000.
  angles = pos * (1 / (10000**scaled_index))                       # (seq, depth/2)

  table = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
  return tf.cast(table, dtype=tf.float32)

In [None]:
pos_encoding = positional_encoding(length=DENSE_LAYER_NEURONS, depth=UNITS)

# Check the shape.
print(pos_encoding.shape)

# Plot the dimensions.
# plt.pcolormesh(pos_encoding.numpy().T, cmap='RdBu')
# plt.ylabel('Depth')
# plt.xlabel('Position')
# plt.colorbar()
# plt.show()
# pos_encoding

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
  """Token embedding scaled by sqrt(d_model) plus a fixed sinusoidal position table."""
  def __init__(self, vocab_size, d_model, max_length=None):
    """Create the embedding and pre-compute the position table.

    Args:
      vocab_size: number of rows in the embedding matrix.
      d_model: embedding width.
      max_length: number of positions in the pre-computed table. Defaults to
        DENSE_LAYER_NEURONS, the value that used to be hard-coded here.
    """
    super().__init__()
    if max_length is None:
      # Backward-compatible default; the table length is unrelated to the
      # feed-forward width, so callers can now set it explicitly.
      max_length = DENSE_LAYER_NEURONS
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=max_length, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    # Delegate to the embedding so the mask from `mask_zero=True` is reused.
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    """Embed token ids `x` and add the position table rows for their positions."""
    length = tf.shape(x)[1]
    x = self.embedding(x)

    # Scale the embeddings by sqrt(d_model) before adding the position table.
    x*= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

In [None]:
pos_encoding

In [None]:
for (en, ru_in), ru_out in train_ds.take(1):
  # print(example_context_strings[1])
  # print(example_target_strings[1])
  break
print(en.shape)
print(ru_in.shape)
print(ru_out.shape)

In [None]:
embed_en = PositionalEmbedding(vocab_size=en_vocab_size, d_model=UNITS)
embed_ru = PositionalEmbedding(vocab_size=ru_vocab_size, d_model=UNITS)

en_emb = embed_en(en)
ru_emb = embed_ru(ru_in)

In [None]:
print(en_emb._keras_mask[0])
print(ru_emb._keras_mask[0])

#### The Encoder/Decoder


The goal of the encoder is to process the context sequence into a sequence of vectors that are useful for the decoder as it attempts to predict the next output for each timestep. Since the context sequence is constant, there is no restriction on how information can flow in the encoder, so every position may attend to every other position (global self-attention):

1. Takes a list of token IDs (from the context tokenizer).
2. Looks up an embedding vector for each token and adds a positional encoding (using `PositionalEmbedding`).
3. Processes the embeddings into a new sequence (using a stack of `EncoderLayer`s: global self-attention followed by a feed-forward network).
4. Returns the processed sequence. This will be passed to the cross-attention layers of the decoder.

The Attention Layer

The attention layer lets the decoder access the information extracted by the encoder. It computes a vector from the entire context sequence, and adds that to the decoder's output.

##### FeedForwarding

In [None]:
class FeedForward(tf.keras.layers.Layer):
  """Two dense layers with dropout, wrapped in a residual add and batch norm."""
  def __init__(self, d_model, dff, dropout_rate=DROPOUT_RATE):
    super().__init__()
    expand = tf.keras.layers.Dense(dff, activation='relu')
    project = tf.keras.layers.Dense(d_model)
    drop = tf.keras.layers.Dropout(dropout_rate)
    self.seq = tf.keras.Sequential([expand, project, drop])
    self.add = tf.keras.layers.Add()
    # Batch normalisation is used here; the layer-norm variant is disabled.
    # self.layer_norm = tf.keras.layers.LayerNormalization()
    self.batch_norm = tf.keras.layers.BatchNormalization()

  def call(self, x):
    """Return batch_norm(x + seq(x))."""
    residual_sum = self.add([x, self.seq(x)])
    return self.batch_norm(residual_sum)

##### Attention

In [None]:
class BaseAttention(tf.keras.layers.Layer):
  """Shared set-up for the attention sub-layers.

  Creates the multi-head attention layer (all keyword arguments are passed
  to `MultiHeadAttention`), a batch-normalisation layer and an `Add` layer
  for the residual connection. Subclasses implement `call`.
  """
  def __init__(self, **kwargs):
    super(BaseAttention, self).__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.batch_norm = tf.keras.layers.BatchNormalization()
    self.add = tf.keras.layers.Add()

In [None]:
class CrossAttention(BaseAttention):
  """Attention from `x` (queries) over `context` (keys and values)."""
  def call(self, x, context):
    """Return batch_norm(x + attention(x, context)) and cache the scores."""
    attended, scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Keep the scores on the layer so they can be plotted later.
    self.last_attn_scores = scores

    return self.batch_norm(self.add([x, attended]))

In [None]:
class GlobalSelfAttention(BaseAttention):
  """Self-attention sub-layer without a causal mask (query, key and value are all `x`)."""
  def call(self, x):
    """Return batch_norm(x + self_attention(x))."""
    attn_output = self.mha(
        query=x,
        value=x,
        key=x
    )
    x = self.add([x, attn_output])
    # Normalise the residual sum with the layer created in BaseAttention,
    # as CrossAttention does; it was previously created but never applied.
    x = self.batch_norm(x)
    return x

In [None]:
class CausalSelfAttention(BaseAttention):
  """Self-attention sub-layer that calls the attention layer with `use_causal_mask=True`."""
  def call(self, x):
    """Return batch_norm(x + causal_self_attention(x))."""
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask=True
    )
    x = self.add([x, attn_output])
    # Normalise the residual sum with the layer created in BaseAttention,
    # as CrossAttention does; it was previously created but never applied.
    x = self.batch_norm(x)
    return x

The Decoder

The decoder's job is to generate predictions for the next token at each location in the target sequence.

1. It looks up embeddings for each token in the target sequence.
2. It uses causal self-attention to process the target sequence, and keep track of what it has generated so far.
3. It uses the self-attention output as the "query" to the cross-attention layer, when attending to the encoder's output.
4. At each location in the output it predicts the next token.

When training, the model predicts the next word at each location. So it's important that the information only flows in one direction through the model. The decoder therefore uses a causal mask in its self-attention when processing the target sequence.

When running inference with this model it produces one word at a time, and those are fed back into the model.


##### Encoder

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: global self-attention followed by the feed-forward block."""
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=DROPOUT_RATE):
    super().__init__()

    attention_config = dict(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)
    self.self_attention = GlobalSelfAttention(**attention_config)

    # `dropout_rate` is not forwarded, so FeedForward uses its own default.
    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    """Apply self-attention, then the feed-forward block."""
    return self.ffn(self.self_attention(x))

In [None]:
sample = EncoderLayer(d_model=UNITS, num_heads=NUM_HEADS, dff=DENSE_LAYER_NEURONS)
print(ru_emb.shape)
print(sample(ru_emb).shape)

In [None]:
class Encoder(tf.keras.layers.Layer):
  """Positional embedding, dropout and a stack of `num_layers` encoder layers."""
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=DROPOUT_RATE):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    layer_config = dict(d_model=d_model,
                        num_heads=num_heads,
                        dff=dff,
                        dropout_rate=dropout_rate)
    self.enc_layers = [EncoderLayer(**layer_config) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    """Encode token ids of shape (batch, seq_len) into (batch, seq_len, d_model)."""
    hidden = self.pos_embedding(x)  # (batch_size, seq_len, d_model)

    # Dropout on the embedded input.
    hidden = self.dropout(hidden)

    for layer_index in range(self.num_layers):
      hidden = self.enc_layers[layer_index](hidden)

    return hidden  # (batch_size, seq_len, d_model)

In [None]:
# Instantiate the encoder.
sample_encoder = Encoder(num_layers=NUM_LAYER,
                         d_model=UNITS,
                         num_heads=NUM_HEADS,
                         dff=DENSE_LAYER_NEURONS,
                         vocab_size=MAX_VOCAB_SIZE)

sample_encoder_output = sample_encoder(en, training=False)

# Print the shape.
print(en.shape)
print(sample_encoder_output.shape)  # Shape `(batch_size, input_seq_len, d_model)`.

##### Decoder

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: causal self-attention, cross-attention, feed-forward."""
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super().__init__()

    # Both attention sub-layers share the same configuration.
    attention_config = dict(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.causal_self_attention = CausalSelfAttention(**attention_config)
    self.cross_attention = CrossAttention(**attention_config)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    """Run the three sub-layers on `x`, attending over `context`."""
    hidden = self.causal_self_attention(x=x)
    hidden = self.cross_attention(x=hidden, context=context)

    # Expose the cross-attention scores for plotting later.
    self.last_attn_scores = self.cross_attention.last_attn_scores

    return self.ffn(hidden)  # (batch_size, seq_len, d_model)

In [None]:
class Decoder(tf.keras.layers.Layer):
  """Positional embedding + dropout followed by a stack of `DecoderLayer`s.

  Args:
    num_layers: Number of `DecoderLayer` instances to build and apply.
    d_model: Model width forwarded to the embedding and every decoder layer.
    num_heads: Attention head count forwarded to every decoder layer.
    dff: Feed-forward size forwarded to every decoder layer.
    vocab_size: Vocabulary size forwarded to `PositionalEmbedding`.
    dropout_rate: Rate for the embedding dropout and for every decoder layer.

  Attributes:
    last_attn_scores: `None` until the first forward pass; afterwards the
      `last_attn_scores` of the final decoder layer from the most recent call.
  """

  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=DROPOUT_RATE):
    super().__init__()

    self.num_layers = num_layers
    self.d_model = d_model

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    # Every layer in the stack is built from the same keyword arguments.
    layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                        dff=dff, dropout_rate=dropout_rate)
    self.dec_layers = [DecoderLayer(**layer_kwargs) for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    """Decode target token IDs `x` while attending to `context`.

    Args:
      x: Token IDs; the original comment gives `(batch, target_seq_len)`.
      context: Passed unchanged to every decoder layer.

    Returns:
      Output of the last decoder layer; per the original comment its shape is
      `(batch_size, target_seq_len, d_model)`.
    """
    # Embed the IDs, then apply dropout to the embedded sequence.
    hidden = self.dropout(self.pos_embedding(x))

    for layer_idx in range(self.num_layers):
      hidden = self.dec_layers[layer_idx](hidden, context)

    # Expose the final layer's cached attention scores on the decoder itself.
    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    return hidden

In [None]:
# Instantiate the decoder.
# Smoke test: build a Decoder from the notebook-level hyperparameters.
sample_decoder = Decoder(num_layers=NUM_LAYER,
                         d_model=UNITS,
                         num_heads=NUM_HEADS,
                         dff=DENSE_LAYER_NEURONS,
                         vocab_size=MAX_VOCAB_SIZE)

# `ru_in` and `en_emb` come from earlier cells (presumably target token IDs and
# an already-embedded/encoded English batch — confirm against their definitions).
output = sample_decoder(
    x=ru_in,
    context=en_emb)

# Print the shapes.
print(ru_in.shape)
print(en_emb.shape)
print(output.shape)

In [None]:
# Attention scores cached by the decoder during the call in the previous cell.
sample_decoder.last_attn_scores.shape  # (batch, heads, target_seq, input_seq)

##### Transformer

In [None]:
class Transformer(tf.keras.Model):
  """Encoder-decoder model that maps (context IDs, target IDs) to logits.

  Args:
    num_layers: Layer count used for both the encoder and the decoder.
    d_model: Model width used for both stacks.
    num_heads: Attention head count used for both stacks.
    dff: Feed-forward size used for both stacks.
    input_vocab_size: Vocabulary size given to the encoder.
    target_vocab_size: Vocabulary size given to the decoder; also the number of
      units of the final `Dense` layer.
    dropout_rate: Dropout rate used for both stacks.
  """

  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=DROPOUT_RATE):
    super().__init__()

    # Settings shared by the two stacks; only the vocabulary size differs.
    shared_kwargs = dict(num_layers=num_layers, d_model=d_model,
                         num_heads=num_heads, dff=dff,
                         dropout_rate=dropout_rate)

    self.encoder = Encoder(vocab_size=input_vocab_size, **shared_kwargs)
    self.decoder = Decoder(vocab_size=target_vocab_size, **shared_kwargs)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    """Compute logits for a `(context, x)` pair.

    Args:
      inputs: A 2-element sequence `(context, x)`. Keras `.fit` requires all
        model inputs to arrive through the first argument, hence the packing.

    Returns:
      The logits only (no attention weights); per the original comment the
      shape is `(batch_size, target_len, target_vocab_size)`.
    """
    context_ids, target_ids = inputs

    encoded = self.encoder(context_ids)  # (batch_size, context_len, d_model)
    decoded = self.decoder(target_ids, encoded)  # (batch_size, target_len, d_model)

    # Final linear projection onto the target vocabulary.
    logits = self.final_layer(decoded)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      # No mask was attached to the logits; nothing to remove.
      pass

    return logits

##### Combination

In [None]:
# Build the full model from the notebook-level hyperparameters.
# `en_vocab_size` / `ru_vocab_size` are defined in an earlier cell.
transformer = Transformer(
    num_layers=NUM_LAYER,
    d_model=UNITS,
    num_heads=NUM_HEADS,
    dff=DENSE_LAYER_NEURONS,
    input_vocab_size=en_vocab_size,
    target_vocab_size=ru_vocab_size,
    dropout_rate=DROPOUT_RATE
)

In [None]:
# Run the model once on a (context, target-input) pair and compare shapes.
output = transformer((en, ru_in))

print(ru_in.shape)
print(en.shape)
print(output.shape)

In [None]:
# Attention scores cached by the last decoder layer during the previous call.
attn_scores = transformer.decoder.dec_layers[-1].last_attn_scores
print(attn_scores.shape)

In [None]:
# Print the Keras summary of the model (it has been called once above).
transformer.summary()

### Training

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning-rate schedule with a linear warm-up and inverse-sqrt decay.

  The rate at a given step is

      rsqrt(d_model) * min(rsqrt(step), step * warmup_steps ** -1.5)

  Args:
    d_model: Model width; the rate is scaled by `d_model ** -0.5`.
    warmup_steps: Controls the `step * warmup_steps ** -1.5` branch of the
      minimum.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    # Keep the value exactly as passed so `get_config` can return it; the
    # float32 tensor below is what the schedule computes with.
    self._d_model_config = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for `step` (a scalar or a tensor of steps)."""
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialized.

    Without this override, serializing the schedule (or an optimizer that uses
    it) fails because the base class does not provide a config.
    """
    return {
        'd_model': self._d_model_config,
        'warmup_steps': self.warmup_steps,
    }

In [None]:
# The schedule is parameterised by the model width (UNITS is passed as d_model).
learning_rate = CustomSchedule(UNITS)

# Adam driven by the custom schedule instead of a fixed learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,epsilon=1e-9)

In [None]:
# Visualise the schedule for steps 0..39999.
plt.plot(learning_rate(tf.range(40000, dtype=tf.float32)))
plt.ylabel('Learning Rate')
plt.xlabel('Train Step')

In [None]:
def masked_loss(label, pred):
  """Sparse categorical cross-entropy averaged over non-zero labels.

  Positions whose label is 0 are excluded from both the numerator and the
  denominator of the mean.

  Args:
    label: Integer class IDs; 0 marks positions to ignore.
    pred: Logits (the loss is built with `from_logits=True`).

  Returns:
    A scalar: summed loss over kept positions divided by their count, or 0
    when no position is kept (instead of the NaN a plain 0/0 would produce).
  """
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  per_token_loss = loss_object(label, pred)

  # 1.0 where the label is non-zero, 0.0 elsewhere, in the loss dtype.
  mask = tf.cast(label != 0, dtype=per_token_loss.dtype)
  per_token_loss *= mask

  # divide_no_nan returns 0 when the mask is empty, so an all-ignored batch
  # cannot inject NaN into training.
  return tf.math.divide_no_nan(tf.reduce_sum(per_token_loss),
                               tf.reduce_sum(mask))


def masked_accuracy(label, pred):
  """Fraction of non-zero-label positions where the arg-max prediction is right.

  Args:
    label: Integer class IDs; 0 marks positions to ignore.
    pred: Scores whose axis 2 is reduced with `argmax` to get predicted IDs.

  Returns:
    A float32 scalar: correct kept positions divided by kept positions, or 0
    when no position is kept (instead of the NaN a plain 0/0 would produce).
  """
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)

  mask = label != 0
  # A position counts as a hit only if it matches AND is not ignored.
  match = (label == pred) & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.math.divide_no_nan(tf.reduce_sum(match), tf.reduce_sum(mask))

In [None]:
# Train with the padding-aware loss and report the padding-aware accuracy.
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [None]:
# Keep the returned History object: the plotting cells below read
# `history.history`, so discarding it breaks a fresh Restart & Run All.
history = transformer.fit(train_ds,
                          epochs=20,
                          validation_data=val_ds)

In [None]:
# history = transformer.fit(
#     train_ds,
#     epochs=100,
#     steps_per_epoch = 100,
#     validation_data=val_ds,
#     validation_steps = 20,
#     callbacks=[
#         tf.keras.callbacks.EarlyStopping(patience=3)])

In [None]:
# Evaluate on 20 batches of the validation set; results come back as a dict.
transformer.evaluate(val_ds, steps=20, return_dict=True)

In [None]:
# Training vs. validation loss per epoch.
# Requires `history` to hold the object returned by `transformer.fit`.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
# Anchor the y-axis at 0 while keeping the automatically chosen upper limit.
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch #')
plt.ylabel('CE/token')
plt.legend()

In [None]:
# List the metric names recorded during training.
history.history.keys()

In [None]:
# Training vs. validation masked accuracy per epoch.
# Requires `history` to hold the object returned by `transformer.fit`.
plt.plot(history.history['masked_accuracy'], label='accuracy')
plt.plot(history.history['val_masked_accuracy'], label='val_accuracy')
# Anchor the y-axis at 0 while keeping the automatically chosen upper limit.
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch #')
# This figure shows accuracy, not cross-entropy, so label it accordingly.
plt.ylabel('Accuracy')
plt.legend()

### Translator

In [None]:
class Translator(tf.Module):
  """Greedy decoder that turns one source sentence into target-language text.

  Args:
    context_text_processor: Callable that maps a batch of source strings to
      token IDs (a ragged result is densified; a dense result is used as is).
    target_text_processor: Object whose `get_vocabulary()` returns the target
      vocabulary list; it must contain "[START]" and "[END]".
    transformer: Model called as `transformer([context_ids, target_ids],
      training=False)` that exposes `decoder.last_attn_scores`.
  """

  @classmethod
  def add_method(cls, fun):
    """Attach `fun` to the class under its own name (usable as a decorator)."""
    setattr(cls, fun.__name__, fun)
    return fun

  def __init__(self, context_text_processor, target_text_processor, transformer):
    self.context_text_processor = context_text_processor
    self.target_text_processor = target_text_processor
    self.transformer = transformer

  def __call__(self, sentence, max_length=MAX_TOKENS_LENGTH):
    """Translate `sentence` by repeatedly picking the arg-max next token.

    Only the first sentence is decoded if a batch is passed.

    Args:
      sentence: A scalar (or 1-D) string tensor; plain Python strings are
        converted to tensors.
      max_length: Maximum number of decoding steps. Defaults to
        `MAX_TOKENS_LENGTH`; the previous default was a vocabulary size, which
        is not a sequence length.

    Returns:
      A tuple `(text, tokens, attention_weights)`:
        text: The generated tokens after the leading "[START]", with "[END]"
          removed, joined by single spaces.
        tokens: Vocabulary strings for every generated ID including
          "[START]", shape `(1, generated_len)`.
        attention_weights: `transformer.decoder.last_attn_scores` from a final
          pass whose decoder input is the output without its last token, so
          row `i` belongs to generated token `i + 1`.
      When executing eagerly `text` is a Python `str` and `tokens` a NumPy
      array of `str`; while tracing a `tf.function` both are string tensors.
    """
    if not isinstance(sentence, tf.Tensor):
      sentence = tf.convert_to_tensor(sentence)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]

    encoder_input = self.context_text_processor(sentence)
    # The processor may or may not be configured to return ragged output.
    if isinstance(encoder_input, tf.RaggedTensor):
      encoder_input = encoder_input.to_tensor()

    # Use the processor given to this instance, not a same-named global.
    vocab = self.target_text_processor.get_vocabulary()
    start = tf.constant(vocab.index("[START]"), dtype=tf.int64)[tf.newaxis]
    end = tf.constant(vocab.index("[END]"), dtype=tf.int64)[tf.newaxis]

    # A TensorArray lets the output grow inside a traced loop.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())
      predictions = self.transformer([encoder_input, output], training=False)

      # Keep only the prediction for the last position.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      pre_id = tf.cast(predicted_id[0], dtype=tf.int64)
      output_array = output_array.write(i+1, pre_id)

      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())  # Shape `(1, generated_len)`.

    # Drop the leading [START]; drop [END] wherever it was produced (it is
    # absent when decoding stopped because `max_length` was reached).
    generated = output[0][1:]
    generated = tf.boolean_mask(generated, tf.not_equal(generated, end))

    # Look tokens up with TF ops so this also works while tracing.
    vocab_tensor = tf.constant(vocab)
    text = tf.strings.reduce_join(tf.gather(vocab_tensor, generated),
                                  separator=' ')
    tokens = tf.gather(vocab_tensor, output)

    if tf.executing_eagerly():
      # Interactive callers get plain Python / NumPy strings.
      text = text.numpy().decode('utf-8')
      tokens = np.array(vocab)[output.numpy()]

    # Re-run the model on everything but the last token so the cached
    # attention scores line up with the generated tokens.
    self.transformer([encoder_input, output[:,:-1]], training=False)
    attention_weights = self.transformer.decoder.last_attn_scores

    return text, tokens, attention_weights

In [None]:
@Translator.add_method
def plot_attention(self, text, **kwargs):
  """Translate `text` and draw the cross-attention between input and output.

  The attention scores are averaged over the heads of the first (only)
  sentence, giving one `(output_tokens, input_tokens)` heat map.

  Args:
    text: Source sentence as a Python string.
    **kwargs: Forwarded to the translator call (e.g. `max_length`).
  """
  assert isinstance(text, str)

  # Calling the translator returns (text, tokens, attention_weights); this
  # class has no `translate` method or `last_attention_weights` attribute.
  _, tokens, attention_weights = self(tf.constant(text), **kwargs)

  # (batch, heads, target_seq, input_seq) -> (target_seq, input_seq).
  attention = tf.reduce_mean(attention_weights[0], axis=0)

  # Input-axis labels: the standardised source text split on whitespace
  # (assumes this matches how the context processor tokenises — confirm).
  context = tf_lower_and_split_punct(text)
  context = context.numpy().decode().split()

  # Output-axis labels: the generated tokens without the leading [START],
  # which is the part the attention rows correspond to.
  predicted = [tok.decode() if isinstance(tok, bytes) else str(tok)
               for tok in np.asarray(tokens)[0][1:]]

  fig = plt.figure(figsize=(10, 10))
  ax = fig.add_subplot(1, 1, 1)

  ax.matshow(attention, cmap='viridis', vmin=0.0)

  fontdict = {'fontsize': 14}

  # Pin one tick per token before labelling so labels cannot drift.
  ax.set_xticks(list(range(len(context))))
  ax.set_xticklabels(context, fontdict=fontdict, rotation=90)
  ax.set_yticks(list(range(len(predicted))))
  ax.set_yticklabels(predicted, fontdict=fontdict)

  ax.set_xlabel('Input text')
  ax.set_ylabel('Output text')

In [None]:
# Wrap the trained model and the two text processors in a Translator.
translator = Translator(context_text_processor, target_text_processor, transformer)

In [None]:
# Plot the attention for one example sentence.
translator.plot_attention(' are you still at home? ')

In [None]:
def print_translation(sentence, tokens, ground_truth):
  """Print the input, the prediction and the reference on aligned lines.

  Each label is left-justified to 15 characters and followed by ": ".

  Args:
    sentence: The source sentence.
    tokens: The predicted translation to display.
    ground_truth: The reference translation.
  """
  # The label carries no colon of its own; the ": " separator adds it, so all
  # three rows look alike (previously this row printed "Input:         :").
  print(f'{"Input":15s}: {sentence}')
  print(f'{"Prediction":15s}: {tokens}')
  print(f'{"Ground truth":15s}: {ground_truth}')

In [None]:
# Translate one example sentence and print it next to the reference.
sentence = ' tom likes you '
# NOTE(review): the reference below is whitespace only — possibly the original
# text was lost; confirm and restore the intended translation.
ground_truth = "  "

translated_text, translated_tokens, attention_weights = translator(
    tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)

In [None]:
class ExportTranslator(tf.Module):
  """Exportable wrapper exposing only the translated text of a translator.

  Args:
    translator: Callable returning a `(result, tokens, attention_weights)`
      triple for a scalar string tensor.
  """

  def __init__(self, translator):
    self.translator = translator

  @tf.function(input_signature=[tf.TensorSpec(shape=[], dtype=tf.string)])
  def __call__(self, sentence):
    """Translate a scalar string tensor and return only the result."""
    # The wrapped translator yields three values; the tokens and the
    # attention weights are discarded here.
    translated, _tokens, _attention = self.translator(sentence)
    return translated

In [None]:
# Rebinds `translator` to the exportable wrapper.
# NOTE(review): re-running this cell wraps the wrapper again, because the name
# being wrapped is the one being reassigned.
translator = ExportTranslator(translator)

In [None]:
# Call the wrapper once with a plain Python string.
translator(' tom was very tired . he likes you . ')

In [None]:
# Write the wrapper to the relative directory 'translator' as a SavedModel.
tf.saved_model.save(translator, export_dir='translator')

In [None]:
# Load the SavedModel back from the same directory.
reloaded = tf.saved_model.load('translator')

In [None]:
# reloaded()

## Evaluation methods and benchmarking techniques

### Hyperparameter Tuning:
Tune the hyperparameters of the model, such as the learning rate, batch size, and number of training epochs, to optimize its performance.


### Evaluation:
Evaluate the performance of the model on the test set using standard metrics such as BLEU, ROUGE, or METEOR.

In [None]:
from datasets import load_metric

bleu = load_metric("bleu")

# One tokenised hypothesis ...
predictions = [
    ["the", "picture", "the", "picture", "by", "me"],
]
# ... scored against two tokenised references for that hypothesis.
references = [
    [
        ["the", "picture", "is", "clicked", "by", "me"],
        ["this", "picture", "was", "clicked", "by", "me"],
    ],
]
print(bleu.compute(predictions=predictions, references=references))


In [None]:
from rouge import Rouge

rouge = Rouge()

# `Rouge.get_scores` works on whitespace-separated strings with exactly one
# reference per hypothesis, so join the token lists from the BLEU cell and use
# the first reference of each hypothesis.
rouge_hypotheses = [' '.join(tokens) for tokens in predictions]
rouge_references = [' '.join(refs[0]) for refs in references]

scores = rouge.get_scores(rouge_hypotheses, rouge_references)
rouge_l_score = scores[0]["rouge-l"]["f"]
print(rouge_l_score)

### Deployment:
Deploy the model for use in production by integrating it into a web or mobile application.

## Reference


https://www.tensorflow.org/text/tutorials/transformer

https://www.tensorflow.org/text/tutorials/nmt_with_attention

https://www.tensorflow.org/text/tutorials/bert_glue