In [5]:
!ls /notebooks

README.md	    ctx.subwords	      requirements.txt
corpus.en_ru.1m.en  machine_translator.ipynb  rus.txt
corpus.en_ru.1m.ru  mt2.ipynb		      trg.subwords


In [3]:
!pip list > /notebooks/requirements.txt

In [1]:
!pip install transformers sentencepiece datasets tensorflow_text einops subword-nmt langid tensorrt --no-binary :all: tensorflow

[33mDEPRECATION: --no-binary currently disables reading from the cache of locally built wheels. In the future --no-binary will not influence the wheel cache. pip 23.1 will enforce this behaviour change. A possible replacement is to use the --no-cache-dir option. You can use the flag --use-feature=no-binary-enable-wheel-cache to test the upcoming behaviour. Discussion can be found at https://github.com/pypa/pip/issues/11453[0m[33m
[31mERROR: Could not find a version that satisfies the requirement tensorflow_text (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow_text[0m[31m
[0m

# Neural Machine Translation for English and Russian: A BERT-Based Approach


---




## Mathematical knowledge for Text-Data Manipulation

### initial step: imports and necessary tools

In [183]:
import langid
import random
import os
import pathlib

import typing
from typing import Any, Tuple

import string
import re
from string import punctuation
import nltk

import numpy as np
import pandas as pd

import einops
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import transformers as trns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tensorflow.keras.layers import TextVectorization
import tensorflow_text as tf_text

In [184]:
#@title
class ShapeChecker():
  def __init__(self):
    # Keep a cache of every axis-name seen
    self.shapes = {}

  def __call__(self, tensor, names, broadcast=False):
    if not tf.executing_eagerly():
      return[item for item in dir(tokenizers.en) if not item.startswith('_')]

    parsed = einops.parse_shape(tensor, names)

    for name, new_dim in parsed.items():
      old_dim = self.shapes.get(name, None)

      if (broadcast and new_dim == 1):
        continue

      if old_dim is None:
        # If the axis name is new, add its length to the cache.
        self.shapes[name] = new_dim
        continue

      if new_dim != old_dim:
        raise ValueError(f"Shape mismatch for dimension: '{name}'\n"
                         f"    found: {new_dim}\n"
                         f"    expected: {old_dim}\n")

### HYPERPARAMETERS AND CONSTANTS

In [185]:
MAX_TOKENS_LENGTH = 128
BUFFER_SIZE=None
BATCH_SIZE=64

MAX_VOCAB_SIZE = 150000 # MAXIMUM AMOUNT OF WORDS
UNITS = 256

NUM_HEADS = 8
DENSE_LAYER_NEURONS = 2048
NUM_LAYER = 6
DROPOUT_RATE=0.1

en_vocab_size=None
ru_vocab_size=None

max_subword_length = 10

reserved_tokens = ["[START]", "[END]", "[UNK]", "[SEP]"]

### Data Collection:
Collect a large parallel corpus of text data for the language pairs that you want to translate. A parallel corpus contains sentences in one language and their corresponding translations in another language. You can use existing parallel corpora or create your own.

In [186]:
# path_to_file = pathlib.Path("/content/drive/MyDrive/rus.txt")
# # corpus = pathlib.Path("/content/drive/MyDrive/corpus")

# rus_path = "/content/drive/MyDrive/corpus.en_ru.1m.ru"
# en_path = "/content/drive/MyDrive/corpus.en_ru.1m.en"

In [187]:
rus_path = "../datasets/yandex_rus/corpus.en_ru.1m.ru"
en_path = "../datasets/yandex_en/corpus.en_ru.1m.en"

In [188]:
langid.set_languages(['en', 'ru'])

### Data Preprocessing:
Preprocess the data by cleaning, tokenizing, and normalizing the text. Therafter, forming a vocabulary from them.

In [189]:
def lower_and_split_punct(text):
    # Нормализация символов.
    # text = tf_text.normalize_utf8(text, "NFKD")
    text = tf.strings.lower(text, encoding="utf-8")
    # Оставляем пробелы, буквы а-я a-z и выбранные знаки препинания.
    text = tf.strings.regex_replace(text, '[^ a-zа-я.?!,¿ёйъь-]', '')
    # Добавляем пробелы вокруг знаков препинания.
    text = tf.strings.regex_replace(text, r'([.?!,¿-])', r'  \1')
    # Удаляем лишние пробелы.
    text = tf.strings.strip(text)

    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
    return text

In [190]:
txt = tf.constant("Последний я ё ъь")

In [191]:
lower_and_split_punct(txt).numpy().decode("utf-8")

'[START] последний я ё ъь [END]'

##### Dataset with parallel sentences sized 467000

In [42]:
def load_data(path):
    data=pd.read_csv(path,delimiter='\t',header=None, encoding='utf8')
    data=data.iloc[:,:2]
    data.rename(columns={0: "English", 1: "Russian"}, inplace=True)
    print(data.head())
    context = data['English'].values
    target = data['Russian'].values
    return target, context

In [43]:
target_raw, context_raw = load_data(path_to_file)

  English        Russian
0     Go.          Марш!
1     Go.           Иди.
2     Go.         Идите.
3     Hi.  Здравствуйте.
4     Hi.        Привет!


In [44]:
print(len(context_raw))
print(len(target_raw))

33385
33385


In [None]:
context_raw[6]

In [None]:
target_raw[898]

In [None]:
# a = np.vectorize(lower_and_split_punct)
# target_raw = a(target_raw)
# context_raw = a(context_raw)

In [None]:
len(context_raw)//64

In [None]:
# BUFFER_SIZE = len(context_raw)
# # BUFFER_SIZE = 20000
# BATCH_SIZE = 64

# is_train = np.random.uniform(size=(len(target_raw),)) < 0.8

# train_raw = (
#     tf.data.Dataset
#     .from_tensor_slices((context_raw[is_train], target_raw[is_train]))
#     .shuffle(BUFFER_SIZE)
#     .batch(BATCH_SIZE))
# val_raw = (
#     tf.data.Dataset
#     .from_tensor_slices((context_raw[~is_train], target_raw[~is_train]))
#     .shuffle(BUFFER_SIZE)
#     .batch(BATCH_SIZE))

In [None]:
# for example_context_strings, example_target_strings in train_raw.take(1):
#   # print(example_context_strings[:5])
#   # print(example_context_strings[1])
#   # print(example_target_strings[1])
#   # print(example_target_strings[:5])
#   print(len(example_context_strings))
#   print(len(example_target_strings))
#   # break

##### Dataset with parallel sentences sized 1000000 from Yandex Parallel Corpus

In [192]:
def load_data_yandex_corpus(rus_path, en_path):
    with open(rus_path, 'rb') as russian_file:
      russian_sentences = russian_file.readlines()

    with open(en_path, 'r', encoding='utf-8') as english_file:
      english_sentences = english_file.readlines()

    data = {
        'Russian': russian_sentences,
        'English': english_sentences
    }
    df = pd.DataFrame(data)

    target = df['Russian'].values
    context = df['English'].values

    filter_context = []
    filter_target = []

    for src, trg in zip(context, target):
      if len(src)>=4 and len(trg)>=4:
        filter_context.append(src)
        filter_target.append(trg)

    target = np.array(filter_target)
    context = np.array(filter_context)

    # filtered_source_sentences = []
    # filtered_target_sentences = []

    # for source_sentence, target_sentence in zip(context, target):
    # # Classify the source sentence language
    #     source_lang, _ = langid.classify(source_sentence)

    # # Classify the target sentence language
    #     target_lang, _ = langid.classify(target_sentence)

    # # Only keep the sentences where both the source and target are identified as the expected languages
    #     if source_lang == 'en' and target_lang == 'ru':
    #         filtered_source_sentences.append(source_sentence)
    #         filtered_target_sentences.append(target_sentence)

    # return filtered_source_sentences, filtered_target_sentences
    return target, context

In [193]:
target_raw_corpus, context_raw_corpus = load_data_yandex_corpus(rus_path, en_path)

In [194]:
print(len(target_raw_corpus))
print(len(context_raw_corpus))

999999
999999


In [195]:
target_raw_corpus[500].decode("utf-8")

'В эру Интернета, мобильных телефонов, систем видеоигр, кабельного телевидения по востребованию, DVR-ов и других технологий, уже не так легко "дойти" до потребителя посредством рекламы для массового рынка (такой, как телевидение или печать).\n'

In [196]:
context_raw_corpus[500]

'With the internet, cellular phones, video game systems, on-demand cable, DVRs and other technologies, consumers are not as accessible via mass market advertising like televison or print.\n'

#### Combining datasets

In [197]:
target_raw_corpus, context_raw_corpus = np.concatenate((target_raw_corpus, target_raw)), np.concatenate((context_raw_corpus, context_raw))

NameError: name 'target_raw' is not defined

In [None]:
# Generate a permutation index
permutation = np.random.permutation(len(target_raw_corpus))

# Shuffle both arrays using the permutation index
target_raw_corpus = target_raw_corpus[permutation]
context_raw_corpus = context_raw_corpus[permutation]

#### Subword different approaches

In [81]:
from transformers import BertTokenizer

rus_path = "/notebooks/trg.txt.subwords"
en_path = "/notebooks/ctx.txt.subwords"

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
print()
# Build the subword vocabulary
tokenizer.build_vocab(rus_path, en_path)

# Save the vocabulary
vocab_path = "bert_subword_vocab.txt"
tokenizer.save_vocabulary(vocab_path)

# Example usage
text = "Hello, how are you?"
encoded_tokens = tokenizer.encode(text)
decoded_text = tokenizer.decode(encoded_tokens)

print(decoded_text)
print(encoded_tokens)


Downloading vocab.txt:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]




AttributeError: 'BertTokenizer' object has no attribute 'build_vocab'

In [30]:
BUFFER_SIZE = len(context_raw_corpus)
max_vocab_size = 100000 
BATCH_SIZE = 1

is_train = np.random.uniform(size=(len(target_raw_corpus),)) < 0.8

context_raw_train = context_raw_corpus[is_train]
context_raw_val = context_raw_corpus[~is_train]

target_raw_train = target_raw_corpus[is_train]
target_raw_val = target_raw_corpus[~is_train]

# train_raw = (
#     tf.data.Dataset
#     .from_tensor_slices((context_raw_train, target_raw_train))
#     .shuffle(BUFFER_SIZE)
#     .batch(BATCH_SIZE))
# val_raw = (
#     tf.data.Dataset
#     .from_tensor_slices((context_raw_val, target_raw_val))
#     .shuffle(BUFFER_SIZE)
#     .batch(BATCH_SIZE))

train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_train, target_raw_train)))
    # .shuffle(BUFFER_SIZE))
    # .batch(BATCH_SIZE))
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_val, target_raw_val)))
    # .shuffle(BUFFER_SIZE))
    # .batch(BATCH_SIZE))

In [46]:
for en, ru in train_raw.take(1):
    print(en.numpy().decode("utf-8"))
    print()
    print(ru.numpy().decode("utf-8"))

This new development in Harry's character may be a disappointment to those readers who enjoyed his old vindictive ways, but it also reinforces the position of pro-Potter people who do not see beneath the surface appearance of the characters and plots.


Такое развитие характера Гарри может разочаровать читателей, полюбивших его былую мстительность, но с другой стороны это преображение укрепляет позицию тех, кто не видит глубже сюжета и изображения героев.



2023-06-15 07:45:13.082870: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype string and shape [799998]
	 [[{{node Placeholder/_1}}]]


In [36]:
train_raw_ds = train_raw.map(lambda context, target: (lower_and_split_punct(context), lower_and_split_punct(target)), tf.data.AUTOTUNE)
val_raw_ds = val_raw.map(lambda context, target: (lower_and_split_punct(context), lower_and_split_punct(target)), tf.data.AUTOTUNE)

In [82]:
bert_tokenizer_params=dict(lower_case=True)
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = 100000,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

In [101]:
rus_path = "../datasets/yandex_rus/corpus.en_ru.1m.ru"
en_path = "../datasets/yandex_en/corpus.en_ru.1m.en"

In [84]:
import tensorflow as tf
import tensorflow_text as text

# Read the context and target files
ctx_file = "ctx.txt.subwords"
trg_file = "trg.txt.subwords"

with open(ctx_file, "r") as f:
    ctx_text = f.read()

with open(trg_file, "r") as f:
    trg_text = f.read()

# Load the BERT tokenizer
tokenizer = text.BertTokenizer()

# Tokenize the text
ctx_tokens = tokenizer.tokenize(ctx_text)
trg_tokens = tokenizer.tokenize(trg_text)

# Convert tokens to integer sequences
ctx_token_ids = tokenizer.convert_tokens_to_ids(ctx_tokens)
trg_token_ids = tokenizer.convert_tokens_to_ids(trg_tokens)

# Create TensorFlow datasets
ctx_dataset = tf.data.Dataset.from_tensor_slices(ctx_token_ids)
trg_dataset = tf.data.Dataset.from_tensor_slices(trg_token_ids)

# Print some example token sequences
for ctx_tokens, trg_tokens in zip(ctx_dataset.take(5), trg_dataset.take(5)):
    print("Context tokens:", ctx_tokens)
    print("Target tokens:", trg_tokens)
    print()


TypeError: __init__() missing 1 required positional argument: 'vocab_lookup_table'

In [83]:
rus_tokenizer = tf_text.BertTokenizer(rus_path, **bert_tokenizer_params)
en_tokenizer = tf_text.BertTokenizer(en_path, **bert_tokenizer_params)

2023-06-15 16:28:03.012886: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at lookup_table_init_op.cc:148 : FAILED_PRECONDITION: HashTable has different value for same key. Key '[END]' has 3 and trying to add value 8


FailedPreconditionError: {{function_node __wrapped__InitializeTableFromTextFileV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} HashTable has different value for same key. Key '[END]' has 3 and trying to add value 8 [Op:InitializeTableFromTextFileV2]

In [None]:
# for en, ru in train_raw_ds.take(1):
#     print(en.numpy().decode("utf-8"))
#     print()
#     print(ru.numpy().decode("utf-8"))

In [None]:
# tokenizer_en = tf_text.BertTokenizer(en_path)

In [None]:
tokenizer = trns.BertTokenizer.from_pretrained('bert-base-uncased')

text = "ты очень добрый"
tokens = tokenizer.tokenize(text)
print(tokens)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

print(input_ids)

['т', '##ы', 'о', '##ч', '##е', '##н', '##ь', 'д', '##о', '##б', '##р', '##ы', '##и']
[1197, 29113, 1193, 29752, 15290, 18947, 23742, 1184, 14150, 29740, 16856, 29113, 10325]


In [48]:
for en, ru in train_raw_ds.take(1):
    en_tokens = tokenizer.tokenize(en.numpy().decode("utf-8"))
    print("English text: ", en.numpy().decode("utf-8"))
    print("Tokens English: ", en_tokens)
    print("English ids: ", tokenizer.convert_tokens_to_ids(en_tokens))
    print()
    rus_tokens = tokenizer.tokenize(ru.numpy().decode("utf-8"))
    print("Russian text: ", ru.numpy().decode("utf-8"))
    print("Tokens Russian: ", rus_tokens)
    print("Russian ids: ", tokenizer.convert_tokens_to_ids(rus_tokens))

English text:  this new development in harrys character may be a disappointment to those readers who enjoyed his old vindictive ways  , but it also reinforces the position of pro  -potter people who do not see beneath the surface appearance of the characters and plots  .
Tokens English:  ['this', 'new', 'development', 'in', 'harry', '##s', 'character', 'may', 'be', 'a', 'disappointment', 'to', 'those', 'readers', 'who', 'enjoyed', 'his', 'old', 'vin', '##dict', '##ive', 'ways', ',', 'but', 'it', 'also', 'reinforce', '##s', 'the', 'position', 'of', 'pro', '-', 'potter', 'people', 'who', 'do', 'not', 'see', 'beneath', 'the', 'surface', 'appearance', 'of', 'the', 'characters', 'and', 'plots', '.']
English ids:  [2023, 2047, 2458, 1999, 4302, 2015, 2839, 2089, 2022, 1037, 10520, 2000, 2216, 8141, 2040, 5632, 2010, 2214, 19354, 29201, 3512, 3971, 1010, 2021, 2009, 2036, 19444, 2015, 1996, 2597, 1997, 4013, 1011, 10693, 2111, 2040, 2079, 2025, 2156, 4218, 1996, 3302, 3311, 1997, 1996, 3494, 

2023-06-15 07:51:07.129043: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype string and shape [799998]
	 [[{{node Placeholder/_1}}]]


In [35]:
# [item for item in dir(tokenizer) if not item.startswith('_')]

In [None]:
[item for item in dir(tokenizer) if not item.startswith('_')]

#### Subword tokenization

In [101]:
BUFFER_SIZE = len(context_raw_corpus)

is_train = np.random.uniform(size=(len(target_raw_corpus),)) < 0.8

context_raw_train = context_raw_corpus[is_train]
context_raw_val = context_raw_corpus[~is_train]

target_raw_train = target_raw_corpus[is_train]
target_raw_val = target_raw_corpus[~is_train]

train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_train, target_raw_train)))
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_val, target_raw_val)))

In [102]:
train_raw_ds = train_raw.map(lambda context, target: (lower_and_split_punct(context), lower_and_split_punct(target)), tf.data.AUTOTUNE)
val_raw_ds = val_raw.map(lambda context, target: (lower_and_split_punct(context), lower_and_split_punct(target)), tf.data.AUTOTUNE)

In [119]:
for cnt, en in train_raw_ds.take(1):
    print(context_subword_processor.encode(cnt.numpy()))
    print(len(context_subword_processor.encode(en.numpy().decode("utf-8"))))

[26, 60, 117, 11, 30402, 1843, 80, 21, 12, 15116, 10, 112, 4368, 61, 4226, 49, 301, 84428, 96072, 3379, 6, 41, 23, 57, 24378, 5, 667, 8, 3145, 16, 15722, 86, 61, 79, 30, 131, 7483, 5, 1918, 3020, 8, 5, 3217, 9, 21725, 7]
376


2023-06-15 17:15:10.299256: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype string and shape [799214]
	 [[{{node Placeholder/_1}}]]


In [103]:
num_elements = 1

# Take the first num_elements from the train_ds dataset
sample_elements = train_raw_ds.take(num_elements)

# Iterate over the sample_elements dataset and print the context and target
for context, target in sample_elements:
    print("Context:", context)
    print("Target:", target.numpy().decode("utf-8"))
    # print("Context:", len(context))
    # print("Target:", len(target))
    print()

Context: tf.Tensor(b'this new development in harrys character may be a disappointment to those readers who enjoyed his old vindictive ways  , but it also reinforces the position of pro  -potter people who do not see beneath the surface appearance of the characters and plots  .', shape=(), dtype=string)
Target: такое развитие характера гарри может разочаровать читателей  , полюбивших его былую мстительность  , но с другой стороны это преображение укрепляет позицию тех  , кто не видит глубже сюжета и изображения героев  .



2023-06-15 16:31:56.432707: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype string and shape [799214]
	 [[{{node Placeholder/_1}}]]


In [121]:
%%time
context_subword_processor = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus( # convertation of a text to encoded tokens using subword
    (context.numpy() for context, target in train_raw_ds),
    target_vocab_size=MAX_VOCAB_SIZE,
    reserved_tokens=reserved_tokens,
    max_subword_length=max_subword_length)

CPU times: user 10min 47s, sys: 18 s, total: 11min 5s
Wall time: 9min 27s


In [109]:
!ls /notebooks

README.md	    corpus.en_ru.1m.ru	      rus.txt
corpus.en_ru.1m.en  machine_translator.ipynb


In [122]:
context_subword_processor.save_to_file("/notebooks/ctx")

In [123]:
%%time
target_subword_processor = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (target.numpy() for context, target in train_raw_ds),
    target_vocab_size=MAX_VOCAB_SIZE,
    reserved_tokens=reserved_tokens,
    max_subword_length=max_subword_length)

CPU times: user 23min 59s, sys: 20.2 s, total: 24min 20s
Wall time: 22min 42s


In [124]:
context_subword_processor.save_to_file("/notebooks/trg")

In [24]:
%%time
context_encoded_train = [context_subword_processor.encode(s.numpy()) for s, t in train_raw]
context_encoded_val = [context_subword_processor.encode(s.numpy()) for s, t in val_raw]

2023-06-15 14:50:51.368299: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype string and shape [200761]
	 [[{{node Placeholder/_1}}]]


CPU times: user 3min 5s, sys: 12 s, total: 3min 17s
Wall time: 3min 6s


In [25]:
%%time
target_encoded_train = [target_subword_processor.encode(t.numpy()) for s, t in train_raw]
target_encoded_val = [target_subword_processor.encode(t.numpy()) for s, t in val_raw]

CPU times: user 3min 30s, sys: 12.9 s, total: 3min 43s
Wall time: 3min 32s


In [86]:
start_text = "[START]"
end_text = "[END]"

In [67]:
context_subword_processor.subwords.insert(0, start_text)
context_subword_processor.subwords.insert(1, end_text)
target_subword_processor.subwords.insert(0, start_text)
target_subword_processor.subwords.insert(1, end_text)

In [31]:
start_token = "START"
end_token = "END"

for subword in context_subword_processor.subwords:
    if subword == start_token:
        print(f"Found '{start_token}' token.")
    elif subword == end_token:
        print(f"Found '{end_token}' token.")

Found 'START' token.
Found 'END' token.


In [28]:
[item for item in dir(context_subword_processor) if not item.startswith('_')]

['build_from_corpus',
 'decode',
 'encode',
 'load_from_file',
 'save_to_file',
 'subwords',
 'vocab_size']

In [29]:
[item for item in dir(target_subword_processor) if not item.startswith('_')]

['build_from_corpus',
 'decode',
 'encode',
 'load_from_file',
 'save_to_file',
 'subwords',
 'vocab_size']

In [109]:
START_TOKEN, END_TOKEN = [0], [1] # adding additional tokens to start and end a sentence

In [35]:
print(context_subword_processor.vocab_size)
print(target_subword_processor.vocab_size)

96271
100981


In [36]:
context_subword_processor.subwords[150]

'before_'

In [37]:
target_subword_processor.subwords[150]

'например'

In [112]:
# context_vocab_size = context_subword_processor.vocab_size + 2
# target_vocab_size = target_subword_processor.vocab_size + 2

In [61]:
def prepare_dataset(context, target):
    
    context_tensor = context_subword_processor.encode(context.numpy())
    context_tensor = tf.constant(context_tensor[:MAX_TOKENS_LENGTH])
    context_tensor = tf.
    
    target_tensor = target_subword_processor.encode(target_numpy())
    target_tensor_in = 
    
    # context_tensor = tf.keras.utils.pad_sequences(context, maxlen=MAX_TOKENS_LENGTH, dtype="int64", padding='post', truncating='post')
    context_tensor = tf.keras.preprocessing.sequence.pad_sequences(context, maxlen=MAX_TOKENS_LENGTH, dtype="int64", padding='post', truncating='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target, maxlen=MAX_TOKENS_LENGTH, dtype="int64", padding='post', truncating='post')
    # target_tensor = tf.keras.utils.pad_sequences(target, maxlen=MAX_TOKENS_LENGTH, dtype="int64", padding='post', truncating='post')
    target_tensor_in = target_tensor[:, :-1]
    target_tensor_out = target_tensor[:, 1:]
    return (context_tensor, target_tensor_in), target_tensor_out

In [73]:
tf.ragged.constant(target_encoded_train[:10])

<tf.RaggedTensor [[100933, 100885, 86893, 415, 3090, 100757, 100765, 502, 14492, 120, 31801,
  17, 100794, 6070, 16265, 9835, 100794, 84095, 34406, 12069, 100766,
  100757, 76001, 22, 7899, 54702, 87, 266, 9431, 37125, 7888, 100933,
  100888, 4291, 70306, 100757, 6, 11, 4110, 210, 73543, 4135, 468, 11161,
  100771, 100735]                                                          ,
 [100933, 100879, 24077, 666, 1632, 100774, 100781, 100757, 21, 100774,
  100782, 100757, 254, 4157, 686, 59833, 132, 91281, 415, 39156, 99504, 6,
  23184, 42, 7, 7963, 100771, 100735]                                     ,
 [100933, 100869, 100757, 269, 43055, 5168, 27312, 158, 147, 70, 559,
  100791, 100801, 100790, 100792, 100800, 100757, 100808, 100790, 100791,
  100791, 100790, 100809, 100797, 100757, 6, 100800, 100798, 100808, 100808,
  100771, 100735]                                                           ,
 [100933, 100900, 100757, 71, 46585, 4157, 155, 36342, 31232, 34, 58627, 8,
  35567, 52, 10077

In [65]:
def prepare_dataset(context: list, target: list):
    context_tensor = tf.ragged.constant(context)
    target_tensor = tf.ragged.constant(target)

    # Pad sequences to a maximum length
    context_tensor = context_tensor.to_tensor(default_value=0)
    target_tensor = target_tensor.to_tensor(default_value=0)

    target_tensor_in = target_tensor[:, :-1]
    target_tensor_out = target_tensor[:, 1:]

    return (context_tensor, target_tensor_in), target_tensor_out

In [62]:
len(target_encoded_train[0])

46

In [63]:
len(context_encoded_train[0])

40

In [71]:
a = prepare_batch(context_encoded_train[:15], target_encoded_train[:15])

In [72]:
print(a[0][0].shape)
print(a[0][1].shape)
print(a[1].shape)

(15, 40)
(15, 45)
(15, 45)


In [135]:
def seq_dataset(context: list, target: list):
    return (tf.data.Dataset.from_tensor_slices((context, target))
            .shuffle(BUFFER_SIZE)
            .map(prepare_dataset, tf.data.AUTOTUNE)
            .batch(BATCH_SIZE)
            .prefetch(buffer_size=tf.data.AUTOTUNE))

In [None]:
def seq_dataset(ds):
    return (ds
            .map(prepare_dataset, tf.data.AUTOTUNE)
            .shuffle(BUFFER_SIZE)
            .batch(BATCH_SIZE)
            .prefetch(buffer_size=tf.data.AUTOTUNE))

In [None]:
%%time
train_ds = seq_dataset(context_encoded_train, target_encoded_train)
val_ds = seq_dataset(context_encoded_val, target_encoded_val)

In [None]:
%%time
train_ds = seq_dataset(train_raw_ds)
val_ds = seq_dataset(val_raw_ds)

In [None]:
for (ex_context_tok, ex_tar_in), ex_tar_out in train_ds.take(1):
  print(ex_context_tok[0, :].numpy())
  print()
  print(ex_tar_in[0, :].numpy())
  print(ex_tar_out[0, :].numpy())

In [53]:
print(context_subword_processor.decode(ex_context_tok[0, :]))
print(target_subword_processor.decode(ex_tar_in[0, :]))
print(target_subword_processor.decode(ex_tar_out[0, :]))

NameError: name 'context_subword_processor' is not defined

#### Word tokenization

In [198]:
BUFFER_SIZE = len(context_raw_corpus)

is_train = np.random.uniform(size=(len(target_raw_corpus),)) < 0.8

train_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_corpus[is_train], target_raw_corpus[is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))
val_raw = (
    tf.data.Dataset
    .from_tensor_slices((context_raw_corpus[~is_train], target_raw_corpus[~is_train]))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE))

In [199]:
for example_context_strings, example_target_strings in train_raw.take(1):
  # print(example_context_strings[:5])
  # print(example_context_strings[1])
  # print(example_target_strings[1])
  # print(example_target_strings[:5])
  print(len(example_context_strings))
  print(len(example_target_strings))
  # break

2023-06-15 19:56:41.490047: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype string and shape [800046]
	 [[{{node Placeholder/_1}}]]
2023-06-15 19:56:41.490519: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [800046]
	 [[{{node Placeholder/_0}}]]


64
64


##### Tokenizing context words, in our case english words

In [200]:
context_text_processor = tf.keras.layers.TextVectorization(
    standardize=lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)

In [201]:
context_text_processor.adapt(train_raw.map(lambda context, target: context, tf.data.AUTOTUNE).prefetch(buffer_size=tf.data.AUTOTUNE))

# Here are the first 10 words from the vocabulary:
context_text_processor.get_vocabulary()[:5]

2023-06-15 19:56:44.871745: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [800046]
	 [[{{node Placeholder/_0}}]]
2023-06-15 19:56:44.872097: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype string and shape [800046]
	 [[{{node Placeholder/_1}}]]


KeyboardInterrupt: 

##### Tokenizing target words, in our case russian words

In [None]:
len(context_text_processor.get_vocabulary())

In [None]:
target_text_processor = tf.keras.layers.TextVectorization(
    standardize=lower_and_split_punct,
    max_tokens=MAX_VOCAB_SIZE,
    ragged=True)

target_text_processor.adapt(train_raw.map(lambda context, target: target, tf.data.AUTOTUNE).prefetch(buffer_size=tf.data.AUTOTUNE))
target_text_processor.get_vocabulary()[:10]

In [202]:
# target_text_processor.get_vocabulary()

In [203]:
len(target_text_processor.get_vocabulary())

150000

We divided our parallel sentences into 64 batches. We vectorized words and assigned at each word its own index id, and this is a vocabulary.
And we gave one batch sized 64 to vectorized and here what we have

In [204]:
example_tokens_c = context_text_processor(example_context_strings)
example_tokens_c[:3, :]

2023-06-15 19:56:57.862467: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at lookup_table_op.cc:929 : FAILED_PRECONDITION: Table not initialized.


FailedPreconditionError: Exception encountered when calling layer 'string_lookup_8' (type StringLookup).

{{function_node __wrapped__LookupTableFindV2_device_/job:localhost/replica:0/task:0/device:CPU:0}} Table not initialized. [Op:LookupTableFindV2]

Call arguments received by layer 'string_lookup_8' (type StringLookup):
  • inputs=<tf.RaggedTensor [[b'[START]', b'special', b'attention', b'should', b'also', b'be', b'given',
  b'to', b'the', b'mv', b'cable', b'the', b'outermost', b'insulation',
  b'of', b'cables', b'shall', b'be', b'considered', b'at', b'ground',
  b'potential', b',', b'therefore', b'shall', b'be', b'kept', b'at', b'a',
  b'certain', b'distance', b'from', b'the', b'transformer', b'live',
  b'parts', b'in', b'the', b'same', b'way', b'as', b'for', b'the', b'other',
  b'accessories', b',', b'in', b'compliance', b'with', b'table', b'.',
  b'on', b'request', b'we', b'can', b'supply', b'connections', b'and',
  b'busbars', b'based', b'on', b'customers', b'specification', b'.',
  b'[END]']                                                                 ,
 [b'[START]', b'as', b'alexander', b'zmeul', b'atafisha', b'the', b'age',
  b'of', b'architectural', b'blockbusters', b'is', b'upon', b'us', b'.',
  b'[END]']                                                              ,
 [b'[START]', b'orco', b'-molcom', b'has', b'adopted', b'the', b'best',
  b'traditions', b'of', b'european', b'development', b',', b'which', b',',
  b'consolidated', b'with', b'many', b'years', b'of', b'experience', b'on',
  b'the', b'russian', b'market', b',', b'undoubtedly', b'makes', b'the',
  b'company', b'one', b'of', b'the', b'key', b'players', b'on', b'the',
  b'real', b'estate', b'market', b'.', b'[END]']                           ,
 [b'[START]', b'engineering', b'and', b'consulting', b'services', b',',
  b'drawing', b'up', b'of', b'analytical', b'reviews', b'about',
  b'situation', b'on', b'market', b'of', b'control', b'and', b'automation',
  b'systems', b'[END]']                                                    ,
 [b'[START]', b'if', b'arriving', b'and', b'departing', b'via',
  b'chhatarpati', b'shivaji', b'international', b'airport', b'it', b'is',
  b'worth', b'noting', b'the', b'kilometres', b'to', b'kamran',
  b'residency', b'.', b'[END]']                                          ,
 [b'[START]', b'undef', b'recognizes', b'the', b'important', b'role',
  b'that', b'political', b'parties', b'play', b'in', b'democratization',
  b'.', b'[END]']                                                       ,
 [b'[START]', b'these', b'gold', b'antennas', b'act', b'physically',
  b'like', b'radio', b'antennas', b'.', b'[END]']                   ,
 [b'[START]', b'the', b'documents', b'to', b'be', b'submitted', b'should',
  b'be', b'translated', b'into', b'russian', b'and', b'be', b'affixed',
  b'with', b'apostille', b'.', b'[END]']                                  ,
 [b'[START]', b'windows', b'system', b'resource', b'manager', b'looks',
  b'for', b'states', b'to', b'be', b'evaluated', b'when', b'the', b'event',
  b'occurs', b'.', b'[END]']                                               ,
 [b'[START]', b'yet', b'he', b'too', b'is', b'a', b'product', b'of', b'the',
  b's', b',', b'an', b'unemployed', b'ex', b'-spy', b'who', b'became', b'a',
  b'top', b'official', b'in', b'the', b'yeltsin', b'kremlin', b'.',
  b'[END]']                                                                 ,
 [b'[START]', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.', b'.',
  b'.', b'.', b'.', b'.', b'.', b'.', b'pension', b'assets', b'nominal',
  b'income', b'ratio', b'mos', b'.', b'-', b'own', b'capital', b'is',
  b'calculated', b'in', b'accordance', b'with', b'the', b'prudential',
  b'rules', b'key', b'indicators', b'of', b'financial', b'market', b'of',
  b'the', b'republic', b'of', b'kazakhstan', b'as', b'of', b'october', b',',
  b'kzt', b'bln', b'.', b'[END]']                                           ,
 [b'[START]', b'if', b'you', b'have', b'registered', b'and', b'for',
  b'some', b'reason', b'cannot', b'attend', b',', b'you', b'are',
  b'requested', b'to', b'cancel', b'your', b'registration', b'online', b'.',
  b'[END]']                                                                 ,
 [b'[START]', b'the', b'revisions', b'entered', b'into', b'force', b'on',
  b'february', b'.', b'[END]']                                           ,
 [b'[START]', b'it', b'is', b'noted', b',', b'that', b'selection', b'of',
  b'events', b'on', b'centrality', b'in', b'cumulative', b'region',
  b'could', b'help', b'to', b'localize', b'a', b'position', b'of', b'a',
  b'critical', b'point', b'.', b'[END]']                                 ,
 [b'[START]', b'despite', b'the', b'assertions', b'of', b'some',
  b'theoreticians', b',', b'print', b'did', b'not', b'lose', b'its',
  b'function', b'and', b'importance', b'when', b'radio', b'and',
  b'television', b'developed', b'in', b'the', b'th', b'century', b'on',
  b'the', b'contrary', b',', b'the', b'continuing', b'growth', b'of',
  b'the', b'print', b'media', b'throughout', b'the', b'world', b'attests',
  b'to', b'extension', b'and', b'expansion', b'of', b'both', b'the',
  b'function', b'and', b'importance', b'of', b'print', b'.', b'[END]']    ,
 [b'[START]', b'you', b'might', b'know', b'that', b'you', b'have', b'a',
  b'certain', b'feeling', b'sometimes', b'and', b'you', b'might', b'call',
  b'it', b'an', b'anxiety', b'in', b'my', b'stomach', b'.', b'[END]']     ,
 [b'[START]', b'our', b'players', b'confronted', b'indeed', b'strong',
  b'fun', b'support', b',', b'therefore', b',', b'it', b'physiologically',
  b'influenced', b'them', b'.', b'[END]']                                 ,
 [b'[START]', b'a', b'dead', b'journalist', b',', b'his', b'headless',
  b'corpse', b'found', b'in', b'a', b'shallow', b'grave', b'.', b'[END]'],
 [b'[START]', b'how', b'many', b'times', b'have', b'i', b'seen', b'a',
  b'television', b'broadcaster', b'afraid', b'to', b'tell', b'the',
  b'truth', b'on', b'-air', b',', b'although', b'no', b'one', b'was',
  b'forcing', b'him', b'to', b'do', b'it', b'.', b'[END]']            ,
 [b'[START]', b'cancers', b'of', b'the', b'colon', b'and', b'breast',
  b'are', b'some', b'of', b'the', b'most', b'common', b'forms', b'of',
  b'the', b'disease', b',', b'and', b'the', b'report', b'says', b'the',
  b'evidence', b'is', b'convincing', b'that', b'body', b'fat', b'plays',
  b'a', b'key', b'role', b'in', b'the', b'development', b'of', b'these',
  b'tumours', b'.', b'[END]']                                           ,
 [b'[START]', b'.', b'more', b'handicapped', b'parking', b'.', b'[END]'],
 [b'[START]', b'painting', b'over', b'the', b'aluminum', b'paint', b'or',
  b'foil', b'with', b'ordinary', b'paint', b'changes', b'the', b'surface',
  b'to', b'emissivity', b'.', b'[END]']                                   ,
 [b'[START]', b'this', b'year', b'we', b'held', b'seminars', b'in', b'all',
  b'areas', b'of', b'the', b'region', b',', b'the', b'focus', b'is', b'not',
  b'on', b'the', b'extras', b',', b'ie', b'quantity', b',', b'but', b'the',
  b'quality', b'of', b'education', b'.', b'[END]']                          ,
 [b'[START]', b'.', b'organization', b'of', b'the', b'judiciary', b'.',
  b'[END]']                                                            ,
 [b'[START]', b'the', b'assembly', b'resolves', b'.', b'to', b'accept',
  b'the', b'statute', b'of', b'the', b'joint', b'inspection', b'unit', b',',
  b'which', b'will', b'continue', b'to', b'be', b'responsible', b'to',
  b'the', b'council', b'in', b'so', b'far', b'as', b'the', b'activities',
  b'of', b'the', b'unit', b'relating', b'to', b'icao', b'are', b'concerned',
  b'.', b'that', b'the', b'competence', b'of', b'the', b'unit', b'shall',
  b'continue', b'to', b'extend', b'over', b'the', b'functions', b'of',
  b'the', b'secretary', b'general', b',', b'but', b'not', b'over', b'those',
  b'of', b'the', b'assembly', b',', b'the', b'council', b'[END]']           ,
 [b'[START]', b'in', b'november', b'slavik', b'fell', b'ill', b'seriously',
  b'.', b'[END]']                                                          ,
 [b'[START]', b'the', b'house', b'was', b'a', b'three', b'-bedroom',
  b'built', b'in', b'the', b'sixties', b'.', b'[END]']              ,
 [b'[START]', b'the', b'pre', b'-hellenic', b'cultures', b'of', b'the',
  b'aegean', b'sea', b'-', b'the', b'minoan', b'on', b'the', b'island',
  b'of', b'crete', b'and', b'the', b'mycenaean', b'on', b'the', b'greek',
  b'mainland', b'-', b'revealed', b'evidence', b'that', b'the', b'near',
  b'eastern', b',', b'not', b'the', b'egyptian', b',', b'culture', b'had',
  b'been', b'adopted', b'.', b'[END]']                                    ,
 [b'[START]', b'another', b'quarter', b'consists', b'of', b'russian',
  b'-speaking', b'ethnic', b'ukrainians', b'who', b'have', b'generally',
  b'sided', b',', b'though', b'not', b'unconditionally', b',', b'with',
  b'the', b'russians', b'in', b'their', b'voting', b'patterns', b'this',
  b'third', b'group', b'will', b'now', b'split', b',', b'with', b'a',
  b'significant', b'portion', b'probably', b'concluding', b'that', b'its',
  b'time', b'to', b'resist', b'moscows', b'bullying', b'by', b'joining',
  b'forces', b'with', b'the', b'orange', b'ukrainians', b'.', b'[END]']   ,
 [b'[START]', b'ruka', b'i', b'think', b'we', b'have', b'become', b'better',
  b'.', b'[END]']                                                           ,
 [b'[START]', b'libby', b'is', b'unliky', b'to', b'nix', b'this', b'deal',
  b'by', b'turning', b'on', b'cheney', b',', b'unless', b'something',
  b'happens', b'to', b'both', b'bush', b'and', b'cheney', b'and', b'he',
  b'is', b'forced', b'to', b'deal', b'with', b'fitzgerald', b'.', b'[END]'],
 [b'[START]', b'a', b'lone', b'woman', b'is', b'troubled', b'with', b'such',
  b'dreams', b'and', b'such', b'thoughts', b'that', b'shes', b'afeard',
  b'of', b'herself', b'sometimes', b'.', b'[END]']                          ,
 [b'[START]', b'in', b'particular', b',', b'the', b'sc', b'recalled',
  b'again', b'the', b'principle', b'that', b'the', b'defender', b'should',
  b'be', b'given', b'actual', b'notice', b'in', b'sufficient', b'time',
  b'to', b'allow', b'him', b'or', b'her', b'to', b'organise', b'a',
  b'defence', b'.', b'[END]']                                             ,
 [b'[START]', b'in', b'order', b'to', b'answer', b'this', b'question',
  b'it', b'is', b'worth', b'to', b'mention', b',', b'that', b'about', b'of',
  b'danish', b'territorial', b'claims', b'in', b'the', b'arctic',
  b'coincide', b'with', b'the', b'ones', b'of', b'canada', b'.', b'[END]']  ,
 [b'[START]', b'vga', b'camera', b'for', b'video', b'telephony', b'no',
  b'vga', b'camera', b'in', b'k', b'[END]']                            ,
 [b'[START]', b'afterwards', b',', b'williams', b'addressed', b'the',
  b'issue', b'in', b'the', b'bands', b'livejournal', b',', b'with', b'a',
  b'post', b'saying', b',', b'we', b'couldve', b'done', b'without', b'a',
  b'cover', b'piece', b'.', b'sorry', b',', b'if', b'it', b'offends',
  b'anyone', b'at', b'kerrang', b'!', b'but', b'i', b'dont', b'think',
  b'there', b'was', b'one', b'bit', b'of', b'truth', b'in', b'that',
  b'article', b'.', b'[END]']                                            ,
 [b'[START]', b'when', b'he', b'used', b'to', b'go', b'back', b'to', b'the',
  b'river', b'later', b'in', b'the', b'day', b'-', b'everything', b'there',
  b'was', b'cleaned', b'up', b'as', b'well', b'-', b'even', b'the',
  b'place', b'where', b'he', b'passed', b'stool', b'.', b'[END]']           ,
 [b'[START]', b'be', b'sure', b'fresh', b'water', b'is', b'available',
  b'at', b'all', b'times', b'.', b'[END]']                            ,
 [b'[START]', b'the', b'provisions', b'of', b'the', b'convention', b'on',
  b'withdrawal', b'from', b'the', b'agency', b',', b'suspension', b'of',
  b'membership', b'and', b'cessation', b'of', b'operations', b'are',
  b'generally', b'patterned', b'on', b'those', b'of', b'the', b'bank', b'.',
  b'[END]']                                                                 ,
 [b'[START]', b'to', b'verify', b'that', b'there', b'really', b'is', b'a',
  b'database', b'there', b',', b'type', b'in', b'this', b'query', b'[END]'],
 [b'[START]', b'for', b'information', b'about', b'how', b'to', b'contact',
  b'hardware', b'manufacturers', b',', b'click', b'the', b'appropriate',
  b'article', b'number', b'in', b'the', b'following', b'list', b'to',
  b'view', b'the', b'article', b'in', b'the', b'microsoft', b'knowledge',
  b'base', b'[END]']                                                      ,
 [b'[START]', b'incisions', b'must', b'be', b'made', b'in', b'nearly',
  b'invisible', b'places', b'while', b'plastic', b'surgery', b'[END]'],
 [b'[START]', b'if', b'you', b'have', b'a', b'content', b'which', b'is',
  b'unique', b'and', b'necessary', b'to', b'users', b'it', b'can', b'be',
  b'spread', b'in', b'fail', b'changing', b'work', b'and', b'to',
  b'publish', b'the', b'reference', b'in', b'forums', b'and',
  b'communities', b'.', b'[END]']                                        ,
 [b'[START]', b'we', b'must', b'take', b'part', b'in', b'regional',
  b'elections', b'.', b'[END]']                                    ,
 [b'[START]', b'sudan', b'is', b'the', b'largest', b'country', b'in',
  b'africa', b'in', b'area', b'.', b'[END]']                         ,
 [b'[START]', b'more', b'detailed', b'information', b'on', b'password',
  b'change', b'is', b'provided', b'in', b'section', b'of', b'this',
  b'guide', b'.', b'[END]']                                            ,
 [b'[START]', b'we', b'can', b'see', b'more', b'holistically', b'how',
  b'projects', b'in', b'different', b'sectors', b'such', b'as', b'economic',
  b'growth', b',', b'agriculture', b',', b'natural', b'resource',
  b'management', b',', b'health', b'and', b'governance', b'can', b'be',
  b'linked', b'to', b'reinforce', b'each', b'other', b',', b'she', b'added',
  b'.', b'[END]']                                                           ,
 [b'[START]', b'at', b'least', b'for', b'the', b'topographic', b'accuracy',
  b'.', b'[END]']                                                          ,
 [b'[START]', b'wharram', b',', b'tiki', b',', b'sailing', b'yacht', b',',
  b',', b',', b'catamaran', b',', b'wood', b',', b'tiller', b',', b',',
  b'other', b'-', b',', b'm', b',', b'ft', b'[END]']                      ,
 [b'[START]', b'keep', b'the', b'send', b'to', b'back', b'set', b'as',
  b'background', b'opacity', b'left', b'mouse', b'button', b'pressed',
  b'and', b'drag', b'left', b'low', b'or', b'right', b'high', b'to',
  b'adjust', b'the', b'opacity', b'of', b'your', b'selected', b'photo',
  b'.', b'[END]']                                                      ,
 [b'[START]', b'in', b'his', b'diary', b'sven', b'hedin', b'made', b'a',
  b'note', b'portal', b'of', b'the', b'mosque', b'-mausoleum', b'is',
  b'extremely', b'high', b'and', b'decorated', b'with', b'two',
  b'picturesque', b'towers', b',', b'and', b'the', b'mosque', b'itself',
  b'is', b'topped', b'with', b'a', b'number', b'of', b'cupolas', b'.',
  b'[END]']                                                             ,
 [b'[START]', b'the', b'doctor', b'never', b'loses', b',', b'though', b'he',
  b'plays', b'both', b'sides', b'against', b'the', b'middle', b'and',
  b'takes', b'bigger', b'risks', b'than', b'necessary', b'.', b'[END]']     ,
 [b'[START]', b'there', b'are', b'two', b'important', b'landmarks', b'in',
  b'the', b'history', b'of', b'bratislava', b'the', b'year', b'when',
  b'the', b'city', b'obtained', b'the', b'status', b'of', b'a', b'free',
  b'royal', b'city', b',', b'and', b'the', b'period', b'between', b'and',
  b'when', b'bratislava', b'was', b'the', b'coronation', b'city', b'of',
  b'the', b'kingdom', b'of', b'hungary', b'.', b'[END]']                  ,
 [b'[START]', b'no', b'proceedings', b'for', b'the', b'forfeiture', b'or',
  b'revocation', b'of', b'a', b'patent', b'may', b'be', b'instituted',
  b'before', b'the', b'expiration', b'of', b'two', b'years', b'from',
  b'the', b'grant', b'of', b'the', b'first', b'compulsory', b'license',
  b'.', b'[END]']                                                         ,
 [b'[START]', b'the', b'corporation', b'takes', b'part', b'in', b'creating',
  b'a', b'nano', b'-technologic', b'infrastructure', b',', b'for',
  b'example', b',', b'collective', b'use', b'centers', b',', b'business',
  b'incubators', b',', b'and', b'early', b'investment', b'funds', b'.',
  b'[END]']                                                                 ,
 [b'[START]', b'apart', b'from', b'this', b'there', b'were', b'over', b',',
  b'cases', b'of', b'administrative', b'influence', b'.', b'[END]']        ,
 [b'[START]', b'after', b'opening', b'of', b'the', b'subsidiaries', b'in',
  b'st', b'.', b'petersburg', b'and', b'magadan', b'aeroflot', b'pays',
  b'taxes', b'to', b'the', b'budgets', b'of', b'these', b'cities', b'as',
  b'well', b'.', b'[END]']                                                ,
 [b'[START]', b'russia', b'could', b'not', b'do', b'that', b',', b'and',
  b'so', b'took', b'the', b'only', b'correct', b'decision', b'to',
  b'defend', b'the', b'peoples', b'of', b'abkhazia', b'and', b'south',
  b'ossetia', b',', b'and', b'to', b'recognise', b'their', b'independence',
  b'.', b'[END]']                                                          ,
 [b'[START]', b'loading', b'audio', b'data', b'from', b'file', b'[END]'],
 [b'[START]', b'the', b'javadoc', b'manager', b'was', b'added', b'to',
  b'the', b'tools', b'menu', b'to', b'enable', b'mounting', b'of',
  b'filesystems', b'of', b'documentation', b'generated', b'with', b'the',
  b'javadoc', b'tool', b'.', b'[END]']                                   ,
 [b'[START]', b'but', b',', b'if', b'an', b'athlete', b'has', b'one', b'or',
  b'more', b'distances', b'to', b'take', b',', b'then', b'a', b'dance',
  b'tournament', b'finalist', b'performs', b'about', b'dances', b'.',
  b'[END]']                                                                 ,
 [b'[START]', b'in', b'our', b'opinion', b',', b'the', b'entire',
  b'problem', b'is', b'that', b'the', b'founder', b'of', b'the', b'modern',
  b'turkey', b'has', b'forced', b'the', b'european', b'dressing', b'style',
  b'and', b'the', b'republican', b'government', b'.', b'[END]']            ,
 [b'[START]', b'why', b'would', b'the', b'united', b'states', b'welcome',
  b'another', b'close', b'ally', b',', b'akhmadov', b',', b'as', b'it',
  b'has', b'recently', b'?', b'[END]']                                   ,
 [b'[START]', b'ea', b'sports', b'unveils', b'creation', b'centre', b'for',
  b'fifa', b'[END]']                                                       ]>

In [151]:
example_tokens_c.to_tensor()

<tf.Tensor: shape=(64, 72), dtype=int64, numpy=
array([[   5,  181, 3774, ...,    0,    0,    0],
       [   5,   10,   23, ...,    0,    0,    0],
       [   5,   46,   22, ...,    0,    0,    0],
       ...,
       [   5,    2,  352, ...,    0,    0,    0],
       [   5,  517,   51, ...,    0,    0,    0],
       [   5,  158,    3, ...,    0,    0,    0]])>

In [152]:
example_target_strings[:3]

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'\xd0\x9f\xd0\xbe\xd1\x81\xd0\xbb\xd0\xb5 \xd0\xbf\xd0\xbe\xd0\xb1\xd0\xb5\xd0\xb4\xd1\x8b \xd0\xbd\xd0\xb0 \xd1\x87\xd0\xb5\xd0\xbc\xd0\xbf\xd0\xb8\xd0\xbe\xd0\xbd\xd0\xb0\xd1\x82\xd0\xb5 \xd0\x95\xd0\xb2\xd1\x80\xd0\xbe\xd0\xbf\xd1\x8b 2005 \xd0\xb3\xd0\xbe\xd0\xb4\xd0\xb0 \xd0\xb6\xd0\xb5\xd0\xbd\xd1\x81\xd0\xba\xd0\xb0\xd1\x8f \xd1\x81\xd0\xb1\xd0\xbe\xd1\x80\xd0\xbd\xd0\xb0\xd1\x8f \xd0\x93\xd0\xb5\xd1\x80\xd0\xbc\xd0\xb0\xd0\xbd\xd0\xb8\xd0\xb8 \xd0\xbf\xd0\xbe \xd1\x84\xd1\x83\xd1\x82\xd0\xb1\xd0\xbe\xd0\xbb\xd1\x83 \xd0\xb2\xd1\x8b\xd1\x88\xd0\xbb\xd0\xb0 \xd0\xbd\xd0\xb0 \xd0\xbf\xd0\xb5\xd1\x80\xd0\xb2\xd0\xbe\xd0\xb5 \xd0\xbc\xd0\xb5\xd1\x81\xd1\x82\xd0\xbe \xd0\xb2 \xd0\xbc\xd0\xb8\xd1\x80\xd0\xbe\xd0\xb2\xd0\xbe\xd0\xbc \xd1\x80\xd0\xb5\xd0\xb9\xd1\x82\xd0\xb8\xd0\xbd\xd0\xb3\xd0\xb5 \xd0\xa4\xd0\x98\xd0\xa4\xd0\x90.\n',
       b'\xd0\xa2\xd0\xb0\xd0\xba \xd0\xb6\xd0\xb5 \xd0\xb8 \xd0\xb2 \xd1\x8d\xd1\x82\xd0\xbe\xd0\xbc

In [153]:
example_tokens_t = target_text_processor(example_target_strings)
example_tokens_t[:3, :]

<tf.RaggedTensor [[4, 59, 3582, 8, 9730, 484, 50, 20981, 21585, 870, 14, 16901, 5502, 8,
  1268, 213, 6, 5203, 10105, 37349, 3, 5]                              ,
 [4, 40, 52, 7, 6, 71, 102944, 1031, 1, 72, 8908, 1031, 25571, 2, 99, 9714,
  1, 106904, 15, 11104, 7968, 369, 3732, 2, 20, 10, 1, 118562, 70, 6660, 7,
  2623, 2, 7, 16822, 5721, 21, 19147, 46345, 7, 1, 10297, 197, 769, 123591,
  2, 52374, 3486, 2, 5429, 121423, 3190, 8550, 2024, 13483, 3732, 3, 5]    ,
 [4, 30, 17492, 914, 18803, 53, 44, 9684, 13766, 17, 1, 2, 5627, 41, 12123,
  2701, 33142, 3, 5]                                                       ]>

In [154]:
context_vocab = np.array(context_text_processor.get_vocabulary())
tokens_c = context_vocab[example_tokens_c[0].numpy()]
' '.join(tokens_c)

'[START] since winning the european championship in , the german national womens soccer team has moved to top position in the fifa world rankings . [END]'

In [155]:
target_vocab = np.array(target_text_processor.get_vocabulary())
tokens_t = target_vocab[example_tokens_t[0].numpy()]
' '.join(tokens_t)

'[START] после победы на чемпионате европы года женская сборная германии по футболу вышла на первое место в мировом рейтинге фифа . [END]'

## Processing the dataset

The `Datasets` of strings are transformed into 0-padded tensors of token IDs via the `process_text` function listed below. For training with `keras.Model.fit,` it also changes from a `((context, target))` pair to a `((context, target_in), target_out))` pair. `(inputs, labels)` pairs are what Keras anticipates; the inputs are `((context, target_in))` and the labels are `target_out`. Target_in and Target_out differ from each other in that they are moved apart by one step, making the label the subsequent token at each position.

#### Processing with subword tokenization

In [None]:
def process_text(context, target):
  context = context_subword_processor.encode(context).to_tensor()
  target = target_subword_processor.encode(target)
  targ_in = target[:,:-1].to_tensor()
  targ_out = target[:,1:].to_tensor()
  return (context, targ_in), targ_out


train_ds = train_raw.map(process_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, tf.data.AUTOTUNE)

In [None]:
en_vocab_size = context_subword_processor.vocab_size
ru_vocab_size = target_subword_processor.vocab_size

#### Processing with word tokenization

##### Processing

In [156]:
def process_text(context, target):
  context = context_text_processor(context).to_tensor()
  target = target_text_processor(target)
  targ_in = target[:,:-1].to_tensor()
  targ_out = target[:,1:].to_tensor()
  return (context, targ_in), targ_out


train_ds = train_raw.map(process_text, tf.data.AUTOTUNE)
val_ds = val_raw.map(process_text, tf.data.AUTOTUNE)

targ_in for all words except last one

targ_out for all except first one


In [157]:
for (ex_context_tok, ex_tar_in), ex_tar_out in train_ds.take(1):
  print(ex_context_tok[0, :].numpy())
  print()
  print(ex_tar_in[0, :].numpy())
  print(ex_tar_out[0, :].numpy())

2023-06-15 19:47:48.348446: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_18' with dtype int64
	 [[{{node Placeholder/_18}}]]
2023-06-15 19:47:48.349240: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_18' with dtype int64
	 [[{{node Placeholder/_18}}]]


[    5    33   253    47 15562  1099     7   137    15     2   557    13
    94    31   309    20  3220     4     6     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0]

[     4     26  69385  13174  76241   2431      8   9465      2    717
      2    307     10     49 122527      3      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0]
[    26  69385  13174  76241   2431      8   9465      2    717      2
    307     10     49 122

In [158]:
en_vocab_size = len(context_text_processor.get_vocabulary())
ru_vocab_size = len(target_text_processor.get_vocabulary())

## Structure and What are transformers and attention used for?

##### Positional Embedding

In [159]:
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis] # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth # (1, depth)

  angle_rates = 1 / (10000**depths) # (1, depth)
  angle_rads = positions * angle_rates  # seq, depth


  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1
  )

  return tf.cast(pos_encoding, dtype=tf.float32)

In [160]:
pos_encoding = positional_encoding(length=DENSE_LAYER_NEURONS, depth=UNITS)

# Check the shape.
print(pos_encoding.shape)

# Plot the dimensions.
# plt.pcolormesh(pos_encoding.numpy().T, cmap='RdBu')
# plt.ylabel('Depth')
# plt.xlabel('Position')
# plt.colorbar()
# plt.show()
# pos_encoding

(2048, 256)


In [161]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
    self.pos_encoding = positional_encoding(length=DENSE_LAYER_NEURONS, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)

    x*= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

In [162]:
pos_encoding

<tf.Tensor: shape=(2048, 256), dtype=float32, numpy=
array([[ 0.        ,  0.        ,  0.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 0.84147096,  0.8019618 ,  0.7617204 , ...,  1.        ,
         1.        ,  1.        ],
       [ 0.9092974 ,  0.95814437,  0.98704624, ...,  0.99999994,
         1.        ,  1.        ],
       ...,
       [ 0.17589758, -0.7070546 , -0.819888  , ...,  0.9679724 ,
         0.9722453 ,  0.97595036],
       [-0.7333133 ,  0.1447375 , -0.09510706, ...,  0.9679412 ,
         0.9722182 ,  0.97592694],
       [-0.9683193 ,  0.8799798 ,  0.69664717, ...,  0.96791005,
         0.9721912 ,  0.9759035 ]], dtype=float32)>

In [163]:
for (en, ru_in), ru_out in train_ds.take(1):
  # print(example_context_strings[1])
  # print(example_target_strings[1])
  break
print(en.shape)
print(ru_in.shape)
print(ru_out.shape)

2023-06-15 19:47:52.734198: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_14' with dtype int64
	 [[{{node Placeholder/_14}}]]
2023-06-15 19:47:52.734198: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_18' with dtype int64
	 [[{{node Placeholder/_18}}]]


(64, 68)
(64, 76)
(64, 76)


In [164]:
embed_en = PositionalEmbedding(vocab_size=en_vocab_size, d_model=UNITS)
embed_ru = PositionalEmbedding(vocab_size=ru_vocab_size, d_model=UNITS)

en_emb = embed_en(en)
ru_emb = embed_ru(ru_in)

In [165]:
print(en_emb._keras_mask[0])
print(ru_emb._keras_mask[0])

tf.Tensor(
[ True  True  True  True  True  True  True  True  True  True  True  True
  True False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False], shape=(68,), dtype=bool)
tf.Tensor(
[ True  True  True  True  True  True  True  True  True  True  True False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False], shape=(76,), dtype=bool)


#### The Encoder/Decoder


The goal of the encoder is to process the context sequence into a sequence of vectors that are useful for the decoder as it attempts to predict the next output for each timestep. Since the context sequence is constant, there is no restriction on how information can flow in the encoder, so use a bidirectional-RNN to do the processing:

Takes a list of token IDs (from context_text_processor).
Looks up an embedding vector for each token (Using a layers.Embedding).
Processes the embeddings into a new sequence (Using a bidirectional layers.GRU).
Returns the processed sequence. This will be passed to the attention head.

The Attention Layer

The attention layer lets the decoder access the information extracted by the encoder. It computes a vector from the entire context sequence, and adds that to the decoder's output.

##### FeedForwarding

In [166]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=DROPOUT_RATE):
    super().__init__()
    self.seq = tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model),
        tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    # self.layer_norm = tf.keras.layers.LayerNormalization()
    self.batch_norm = tf.keras.layers.BatchNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.batch_norm(x)
    return x

##### Attention

In [167]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.batch_norm = tf.keras.layers.BatchNormalization()
    self.add = tf.keras.layers.Add()

In [168]:
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.batch_norm(x)

    return x

In [169]:
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x
    )
    x = self.add([x, attn_output])
    return x

In [170]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask=True
    )
    x = self.add([x, attn_output])
    return x

The Decoder

The decoder's job is to generate predictions for the next token at each location in the target sequence.

1. It looks up embeddings for each token in the target sequence.
2. It uses an RNN to process the target sequence, and keep track of what it has generated so far.
3. It uses RNN output as the "query" to the attention layer, when attending to the encoder's output.
4. At each location in the output it predicts the next token.

When training, the model predicts the next word at each location. So it's important that the information only flows in one direction through the model. The decoder uses a unidirectional (not bidirectional) RNN to process the target sequence.

When running inference with this model it produces one word at a time, and those are fed back into the model.


##### Encoder

In [171]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=DROPOUT_RATE):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [172]:
sample = EncoderLayer(d_model=UNITS, num_heads=NUM_HEADS, dff=DENSE_LAYER_NEURONS)
print(ru_emb.shape)
print(sample(ru_emb).shape)

(64, 76, 256)
(64, 76, 256)


In [173]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=DROPOUT_RATE):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.

In [174]:
# Instantiate the encoder.
sample_encoder = Encoder(num_layers=NUM_LAYER,
                         d_model=UNITS,
                         num_heads=NUM_HEADS,
                         dff=DENSE_LAYER_NEURONS,
                         vocab_size=MAX_VOCAB_SIZE)

sample_encoder_output = sample_encoder(en, training=False)

# Print the shape.
print(en.shape)
print(sample_encoder_output.shape)  # Shape `(batch_size, input_seq_len, d_model)`.

(64, 68)
(64, 68, 256)


##### Decoder

In [175]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

In [176]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=DROPOUT_RATE):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x

In [177]:
# Instantiate the decoder.
sample_decoder = Decoder(num_layers=NUM_LAYER,
                         d_model=UNITS,
                         num_heads=NUM_HEADS,
                         dff=DENSE_LAYER_NEURONS,
                         vocab_size=MAX_VOCAB_SIZE)

output = sample_decoder(
    x=ru_in,
    context=en_emb)

# Print the shapes.
print(ru_in.shape)
print(en_emb.shape)
print(output.shape)

(64, 76)
(64, 68, 256)
(64, 76, 256)


In [178]:
sample_decoder.last_attn_scores.shape  # (batch, heads, target_seq, input_seq)

TensorShape([64, 8, 76, 68])

##### Transformer

In [179]:
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=DROPOUT_RATE):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    context, x  = inputs

    context = self.encoder(context)  # (batch_size, context_len, d_model)

    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      pass

    # Return the final output and the attention weights.
    return logits

##### Combination

In [180]:
transformer = Transformer(
    num_layers=NUM_LAYER,
    d_model=UNITS,
    num_heads=NUM_HEADS,
    dff=DENSE_LAYER_NEURONS,
    input_vocab_size=en_vocab_size,
    target_vocab_size=ru_vocab_size,
    dropout_rate=DROPOUT_RATE
)

In [None]:
output = transformer((en, ru_in))

print(ru_in.shape)
print(en.shape)
print(output.shape)

In [None]:
attn_scores = transformer.decoder.dec_layers[-1].last_attn_scores
print(attn_scores.shape)

In [None]:
transformer.summary()

### Training

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
learning_rate = CustomSchedule(UNITS)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,epsilon=1e-9)

In [None]:
plt.plot(learning_rate(tf.range(40000, dtype=tf.float32)))
plt.ylabel('Learning Rate')
plt.xlabel('Train Step')

In [None]:
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [None]:
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [None]:
transformer.fit(train_ds,
                epochs=20,
                validation_data=val_ds)

In [118]:
# history = transformer.fit(
#     train_ds,
#     epochs=100,
#     steps_per_epoch = 100,
#     validation_data=val_ds,
#     validation_steps = 20,
#     callbacks=[
#         tf.keras.callbacks.EarlyStopping(patience=3)])

Epoch 1/100


2023-06-15 19:34:46.790288: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_16' with dtype int64
	 [[{{node Placeholder/_16}}]]
2023-06-15 19:34:46.791411: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_14' with dtype int64
	 [[{{node Placeholder/_14}}]]
2023-06-15 19:35:00.084334: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 5.22GiB (rounded to 5606400000)requested by op transformer_1/dense_101/Tensordot/MatMul
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve th

ResourceExhaustedError: Graph execution error:

Detected at node 'transformer_1/dense_101/Tensordot/MatMul' defined at (most recent call last):
    File "/usr/lib/python3.9/runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/lib/python3.9/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/usr/local/lib/python3.9/dist-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/usr/local/lib/python3.9/dist-packages/traitlets/config/application.py", line 1041, in launch_instance
      app.start()
    File "/usr/local/lib/python3.9/dist-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/usr/local/lib/python3.9/dist-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
      self._run_once()
    File "/usr/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once
      handle._run()
    File "/usr/lib/python3.9/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/usr/local/lib/python3.9/dist-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/usr/local/lib/python3.9/dist-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/usr/local/lib/python3.9/dist-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 2885, in run_cell
      result = self._run_cell(
    File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 2940, in _run_cell
      return runner(coro)
    File "/usr/local/lib/python3.9/dist-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 3139, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 3318, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/usr/local/lib/python3.9/dist-packages/IPython/core/interactiveshell.py", line 3378, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_1006/641152342.py", line 1, in <module>
      history = transformer.fit(
    File "/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.9/dist-packages/keras/engine/training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "/usr/local/lib/python3.9/dist-packages/keras/engine/training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "/usr/local/lib/python3.9/dist-packages/keras/engine/training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.9/dist-packages/keras/engine/training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "/usr/local/lib/python3.9/dist-packages/keras/engine/training.py", line 1050, in train_step
      y_pred = self(x, training=True)
    File "/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.9/dist-packages/keras/engine/training.py", line 558, in __call__
      return super().__call__(*args, **kwargs)
    File "/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.9/dist-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/tmp/ipykernel_1006/2399977420.py", line 27, in call
      logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)
    File "/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.9/dist-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/usr/local/lib/python3.9/dist-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/usr/local/lib/python3.9/dist-packages/keras/layers/core/dense.py", line 244, in call
      outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]])
Node: 'transformer_1/dense_101/Tensordot/MatMul'
OOM when allocating tensor with shape[4672,300000] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node transformer_1/dense_101/Tensordot/MatMul}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_148897]

In [None]:
transformer.evaluate(val_ds, steps=20, return_dict=True)

In [None]:
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch #')
plt.ylabel('CE/token')
plt.legend()

In [None]:
history.history.keys()

In [None]:
plt.plot(history.history['masked_accuracy'], label='accuracy')
plt.plot(history.history['val_masked_accuracy'], label='val_accuracy')
plt.ylim([0, max(plt.ylim())])
plt.xlabel('Epoch #')
plt.ylabel('CE/token')
plt.legend()

### Translator

In [None]:
class Translator(tf.Module):
  @classmethod
  def add_method(cls, fun):
    setattr(cls, fun.__name__, fun)
    return fun
  def __init__(self, context_text_processor, target_text_processor, transformer):
    self.context_text_processor = context_text_processor
    self.target_text_processor = target_text_processor
    self.transformer = transformer

  def __call__(self, sentence, max_length=max_vocab_size):

    assert isinstance(sentence, tf.Tensor)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]


    sentence = self.context_text_processor(sentence).to_tensor()

    encoder_input = sentence


    start = tf.constant(target_text_processor.get_vocabulary().index("[START]"), dtype=tf.int64)[tf.newaxis]
    end = tf.constant(target_text_processor.get_vocabulary().index("[END]"), dtype=tf.int64)[tf.newaxis]


    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())
      predictions = self.transformer([encoder_input, output], training=False)

      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)


      pre_id = tf.cast(predicted_id[0], dtype=tf.int64)
      output_array = output_array.write(i+1, pre_id)

      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())


    target_vocab = np.array(target_text_processor.get_vocabulary())
    tokens_t = target_vocab[output]


    text = ' '.join(tokens_t[1:len(tokens_t)-1])
    tokens = tokens_t


    self.transformer([encoder_input, output[:,:-1]], training=False)
    attention_weights = self.transformer.decoder.last_attn_scores

    return text, tokens, attention_weights

In [None]:
@Translator.add_method
def plot_attention(self, text, **kwargs):
  assert isinstance(text, str)
  output = self.translate([text], **kwargs)
  output = output[0].numpy().decode()

  attention = self.last_attention_weights[0]

  context = tf_lower_and_split_punct(text)
  context = context.numpy().decode().split()

  output = tf_lower_and_split_punct(output)
  output = output.numpy().decode().split()[1:]

  fig = plt.figure(figsize=(10, 10))
  ax = fig.add_subplot(1, 1, 1)

  ax.matshow(attention, cmap='viridis', vmin=0.0)

  fontdict = {'fontsize': 14}

  ax.set_xticklabels([''] + context, fontdict=fontdict, rotation=90)
  ax.set_yticklabels([''] + output, fontdict=fontdict)

  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

  ax.set_xlabel('Input text')
  ax.set_ylabel('Output text')

In [None]:
translator = Translator(context_text_processor, target_text_processor, transformer)

In [None]:
translator.plot_attention(' are you still at home? ')

In [None]:
def print_translation(sentence, tokens, ground_truth):
  print(f'{"Input:":15s}: {sentence}')
  print(f'{"Prediction":15s}: {tokens}')
  print(f'{"Ground truth":15s}: {ground_truth}')

In [None]:
sentence = ' tom likes you '
ground_truth = "  "

translated_text, translated_tokens, attention_weights = translator(
    tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)

In [None]:
class ExportTranslator(tf.Module):
  def __init__(self, translator):
    self.translator = translator

  @tf.function(input_signature=[tf.TensorSpec(shape=[], dtype=tf.string)])
  def __call__(self, sentence):
    (result,
     tokens,
     attention_weights) = self.translator(sentence)

    return result

In [None]:
translator = ExportTranslator(translator)

In [None]:
translator(' tom was very tired . he likes you . ')

In [None]:
tf.saved_model.save(translator, export_dir='translator')

In [None]:
reloaded = tf.saved_model.load('translator')

In [None]:
# reloaded()

## Evaluation methods and benchmarking techniques

### Hyperparameter Tuning:
Tune the hyperparameters of the model, such as the learning rate, batch size, and number of training epochs, to optimize its performance.


### Evaluation:
Evaluate the performance of the model on the test set using standard metrics such as BLEU, ROUGE, or METEOR.

In [None]:
from datasets import load_metric

bleu = load_metric("bleu")
predictions = [["the", "picture", "the", "picture",
				"by", "me"]]
references = [
	[["the", "picture", "is", "clicked", "by", "me"],
	["this", "picture", "was", "clicked", "by", "me"]]
]
print(bleu.compute(predictions=predictions, references=references))


In [None]:
from rouge import Rouge

rouge = Rouge()
scores = rouge.get_scores(predictions, references)
rouge_l_score = scores[0]["rouge-l"]["f"]
print(rouge_l_score)

### Deployment:
Deploy the model for use in production by integrating it into a web or mobile application.

## Reference


https://www.tensorflow.org/text/tutorials/transformer

https://www.tensorflow.org/text/tutorials/nmt_with_attention

https://www.tensorflow.org/text/tutorials/bert_glue