In [1]:
import numpy as np
import typing
from typing import Any, Tuple
import tensorflow as tf
import tensorflow_text
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

In [2]:
# Notebook-level switch; not referenced by the preprocessing cells shown here.
use_builtins = True

# Punctuation marks that the standardizer keeps and later pads with spaces.
punctuation = '.?!,'

# Whitelist of characters that survive standardization: the punctuation above,
# the space, and the lower-case Latin and Ukrainian letters. It is NFKD
# normalized so that it matches text that has gone through the same
# normalization inside the standardizer.
acceptable_chars = tensorflow_text.normalize_utf8(
    punctuation + ' abcdefghijklmnopqrstuvwxyzабвгґдеєжзиіїйклмнопрстуфхцчшщьюя',
    'NFKD')

In [4]:
def load_data(path):
    """Read a tab-separated parallel corpus and return lower-cased sentences.

    Every non-blank line must contain at least two tab-separated fields. The
    first field is returned as the target sentence and the second field as
    the input sentence; any further fields (for example an attribution
    column) are ignored. Whitespace-only lines are skipped.

    Args:
        path: Path to a UTF-8 encoded text file.

    Returns:
        A tuple ``(inp, targ)`` of two equally long lists of lower-cased
        strings, in file order.

    Raises:
        ValueError: If a non-blank line has fewer than two tab-separated
            fields.
    """
    # The context manager guarantees the file handle is closed after reading.
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()

    inp, targ = [], []
    for line_no, line in enumerate(text.splitlines(), start=1):
        if not line.strip():
            # A trailing or stray blank line is not a sentence pair.
            continue
        fields = line.split('\t')
        if len(fields) < 2:
            raise ValueError(
                'line %d of %r has %d tab-separated field(s); expected at least 2'
                % (line_no, path, len(fields)))
        targ.append(fields[0].lower())
        inp.append(fields[1].lower())

    return inp, targ

In [5]:
def tf_lower_and_split_punct(text):
    """Standardize raw sentences before they are tokenized.

    The text is NFKD-normalized, lower-cased, filtered down to the characters
    in the notebook-level ``acceptable_chars`` whitelist, has every mark from
    the notebook-level ``punctuation`` padded with spaces, is stripped of
    surrounding whitespace, and is finally wrapped in ``[START]`` / ``[END]``
    markers.

    Args:
        text: String tensor of sentences (this function is used as the
            ``standardize`` callable of a ``TextVectorization`` layer).

    Returns:
        String tensor holding the standardized sentences.
    """
    # Split accented characters into base letter + combining mark.
    text = tensorflow_text.normalize_utf8(text, 'NFKD')
    # encoding='utf-8' is required: the default lower-cases ASCII only, so
    # upper-case Cyrillic letters would stay upper-case here and then be
    # deleted by the whitelist filter below, which lists lower-case letters.
    text = tf.strings.lower(text, encoding='utf-8')
    # Keep only whitelisted characters (space, letters, select punctuation).
    text = tf.strings.regex_replace(text, '[^' + acceptable_chars + ']', '')
    # Add spaces around punctuation so each mark becomes its own token.
    text = tf.strings.regex_replace(text, '[' + punctuation + ']', r' \0 ')
    # Strip whitespace.
    text = tf.strings.strip(text)

    # Mark sentence boundaries with explicit start/end tokens.
    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
    return text

In [6]:
# Load the raw sentence pairs from disk.
# load_data returns two parallel lists of lower-cased sentences: `inp` (second
# tab-separated column of the file) and `targ` (first column).
path_to_data = 'ukr-eng/ukr.txt'
inp, targ = load_data(path_to_data)

In [7]:
# Build a shuffled, batched tf.data.Dataset of (input, target) sentence pairs.
BATCH_SIZE = 64
BUFFER_SIZE = len(inp)  # whole dataset size, so the shuffle covers every pair

dataset = (
    tf.data.Dataset.from_tensor_slices((inp, targ))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

In [10]:
# Text vectorization: map each input sentence to a sequence of token ids.
max_vocab_size = 5000  # upper bound on the number of vocabulary entries

input_text_processor = tf.keras.layers.TextVectorization(
    max_tokens=max_vocab_size,
    standardize=tf_lower_and_split_punct,
)
# Learn the vocabulary from the input-side sentences.
input_text_processor.adapt(inp)

In [34]:
# Sanity check: vectorize one batch and decode its first sentence back to text.
# The loop variables are deliberately left in the notebook namespace; later
# cells reuse `example_tokens`.
for example_input_batch, example_target_batch in dataset.take(1):
    example_tokens = input_text_processor(example_input_batch)
    print(example_tokens)

    # Vocabulary as an array so it can be indexed by a vector of token ids.
    input_vocab = np.array(input_text_processor.get_vocabulary())
    print(len(input_vocab))

    first_sentence_ids = example_tokens[0].numpy()
    tokens = input_vocab[first_sentence_ids]
    print(' '.join(tokens))

tf.Tensor(
[[  2   1  36 ...   0   0   0]
 [  2   6 137 ...   0   0   0]
 [  2   7   8 ...   0   0   0]
 ...
 [  2  24  63 ...   0   0   0]
 [  2   6 800 ...   0   0   0]
 [  2  10  43 ...   0   0   0]], shape=(64, 20), dtype=int64)
5000
[START] [UNK] за [UNK] тома . [END]             


In [39]:
# Size of the learned vocabulary; it fixes the number of embedding rows.
input_vocab_size = input_text_processor.vocabulary_size()

embedding_dim = 256  # width of each token embedding
units = 1024         # hidden size used for the recurrent layer

embedding_layer = tf.keras.layers.Embedding(
    input_dim=input_vocab_size,
    output_dim=embedding_dim,
)

In [45]:
# Look up an embedding vector for every token id of the example batch.
# `example_tokens` comes from the preview loop over `dataset.take(1)`, so that
# cell has to be run first.
# Presumably shaped (batch, sequence, embedding_dim) — TODO confirm.
vectors = embedding_layer(example_tokens)

In [44]:
# What does GRU do? Build a standalone layer to inspect what it returns.
gru = tf.keras.layers.GRU(
    units=units,
    return_sequences=True,   # return the output for every step of the sequence
    return_state=True,       # ...and additionally return the state
    recurrent_initializer='glorot_uniform',
)

In [47]:
# Run the embedded example batch through the GRU. The layer was built with
# return_sequences=True and return_state=True, so the call yields two values:
# the output sequence and the state. initial_state=None passes no explicit
# starting state.
output, state = gru(vectors, initial_state=None)

In [51]:
# Display the GRU output sequence; the recorded result below has shape
# (64, 20, 1024).
output

<tf.Tensor: shape=(64, 20, 1024), dtype=float32, numpy=
array([[[-0.00240274, -0.00157611, -0.00623834, ...,  0.00461239,
         -0.00182369, -0.00567733],
        [ 0.00060355, -0.00030449, -0.00941917, ...,  0.00234954,
         -0.00698494,  0.00062927],
        [ 0.01221675, -0.00981983, -0.01610171, ...,  0.00381916,
         -0.00231389, -0.00057539],
        ...,
        [-0.02529003,  0.00509631,  0.00576039, ...,  0.02044816,
         -0.0259783 ,  0.00776972],
        [-0.02525885,  0.00508721,  0.00577477, ...,  0.02048077,
         -0.02598366,  0.00773164],
        [-0.02523578,  0.0050831 ,  0.00578738, ...,  0.0205014 ,
         -0.02598871,  0.0077064 ]],

       [[-0.00240274, -0.00157611, -0.00623834, ...,  0.00461239,
         -0.00182369, -0.00567733],
        [ 0.00207801,  0.00336301, -0.00178579, ...,  0.00623163,
         -0.0074925 , -0.00235636],
        [ 0.00728533, -0.00611938,  0.00788238, ...,  0.00084702,
         -0.01025532,  0.00286658],
        ...