<a href="https://colab.research.google.com/github/nahbos/AUT-Neural-Networks/blob/main/HW08/HW08.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- Sobhan Moradian Daghigh
- 7/12/2022
- ANN - HW08

In [8]:
# !pip install -U 'tensorflow-text==2.8.*'

In [9]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text
import time

# Load data

In [10]:
# Load the Portuguese -> English TED-talk corpus (downloaded on first use).
# as_supervised=True makes every example a (portuguese, english) pair.
dataset = tfds.load('ted_hrlr_translate/pt_to_en', as_supervised=True)
x_train = dataset['train']
x_test = dataset['test']
x_val = dataset['validation']

[1mDownloading and preparing dataset ted_hrlr_translate/pt_to_en/1.0.0 (download: 124.94 MiB, generated: Unknown size, total: 124.94 MiB) to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]






0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompleteJXTP6B/ted_hrlr_translate-train.tfrecord


  0%|          | 0/51785 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompleteJXTP6B/ted_hrlr_translate-validation.tfrecord


  0%|          | 0/1193 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompleteJXTP6B/ted_hrlr_translate-test.tfrecord


  0%|          | 0/1803 [00:00<?, ? examples/s]

[1mDataset ted_hrlr_translate downloaded and prepared to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0. Subsequent calls will reuse this data.[0m


### Preview

In [11]:
# Peek at the first five training pairs. Each sentence arrives as a UTF-8
# byte string, so it is decoded before being printed.
for pt_sentences, en_sentences in x_train.batch(5).take(1):
  pt_batch, en_batch = pt_sentences.numpy(), en_sentences.numpy()
  for pt, en in zip(pt_batch, en_batch):
    print('Portuguese: [{}]'.format(pt.decode('utf-8')))
    print('English:    [{}]'.format(en.decode('utf-8')))
    print('---------------------------------')

Portuguese: [e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .]
English:    [and when you improve searchability , you actually take away the one advantage of print , which is serendipity .]
---------------------------------
Portuguese: [mas e se estes fatores fossem ativos ?]
English:    [but what if it were active ?]
---------------------------------
Portuguese: [mas eles não tinham a curiosidade de me testar .]
English:    [but they did n't test for curiosity .]
---------------------------------
Portuguese: [e esta rebeldia consciente é a razão pela qual eu , como agnóstica , posso ainda ter fé .]
English:    [and this conscious defiance is why i , as an agnostic , can still have faith .]
---------------------------------
Portuguese: [`` `` '' podem usar tudo sobre a mesa no meu corpo . '']
English:    [you can use everything on the table on me .]
---------------------------------


# Text tokenization
Download a ready-made tokenizer model that converts the sentences into token ids.

In [12]:
# Fetch the zipped tokenizer model into the current directory and extract it
# (extract=True). The call is left as the last expression so the path of the
# downloaded archive is shown as the cell output.
name = 'ted_hrlr_translate_pt_en_converter'
archive_url = f'https://storage.googleapis.com/download.tensorflow.org/models/{name}.zip'
tf.keras.utils.get_file(
    f'{name}.zip',
    archive_url,
    cache_dir='.',
    cache_subdir='',
    extract=True)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/models/ted_hrlr_translate_pt_en_converter.zip


'./ted_hrlr_translate_pt_en_converter.zip'

In [13]:
# Load the tokenizer SavedModel from the directory named `name`; other cells
# use its `.pt` and `.en` members to tokenize Portuguese and English text.
tokenizers = tf.saved_model.load(name)

### Let's see the results

In [14]:
# Take two English training sentences, print the raw text, then print the
# token ids the English tokenizer produces for each of them.
for pt_sentences, en_sentences in x_train.batch(2).take(1):
  raw_english = en_sentences.numpy()
  for en in raw_english:
    print('English:   [{}]'.format(en.decode('utf-8')))

  # tokenize() works on the whole batch; to_list() yields one id list per sentence.
  tokenized = tokenizers.en.tokenize(en_sentences)
  token_rows = tokenized.to_list()
  for sentence in token_rows:
    print('Tokenized: [{}]'.format(sentence))

English:   [and when you improve searchability , you actually take away the one advantage of print , which is serendipity .]
English:   [but what if it were active ?]
Tokenized: [[2, 72, 117, 79, 1259, 1491, 2362, 13, 79, 150, 184, 311, 71, 103, 2308, 74, 2679, 13, 148, 80, 55, 4840, 1434, 2423, 540, 15, 3]]
Tokenized: [[2, 87, 90, 107, 76, 129, 1852, 30, 3]]


# Setup input pipeline
* Now let's set up a filter that keeps only sentences shorter than a token-count threshold
* Then make batches

In [15]:
# Input-pipeline settings read by filter_max_tokens and make_batches.
MAX_TOKENS  = 128    # filter_max_tokens keeps only inputs with fewer tokens (axis 1) than this
BUFFER_SIZE = 20000  # buffer size handed to Dataset.shuffle
BATCH_SIZE  = 64     # number of sentence pairs grouped by Dataset.batch

In [16]:
def filter_max_tokens(pt, en):
  """Tell whether both token tensors are shorter than MAX_TOKENS.

  Args:
    pt: tokenized Portuguese tensor; its axis-1 size is taken as the length.
    en: tokenized English tensor; its axis-1 size is taken as the length.

  Returns:
    A boolean scalar tensor, True when the longer of the two axis-1 sizes
    is strictly below MAX_TOKENS.
  """
  pt_length = tf.shape(pt)[1]
  en_length = tf.shape(en)[1]
  longest = tf.maximum(pt_length, en_length)
  return longest < MAX_TOKENS

In [17]:
def tokenize_pairs(pt, en):
    """Tokenize a batch of Portuguese/English sentences into dense tensors.

    Each language is run through its own tokenizer and the result is
    converted with ``.to_tensor()`` so both outputs are regular tensors
    (presumably padded to the longest sentence in the batch -- see the
    RaggedTensor documentation).

    Args:
        pt: batch of Portuguese sentences.
        en: batch of English sentences.

    Returns:
        A ``(pt_tokens, en_tokens)`` tuple of dense token-id tensors.
    """
    pt_tokens = tokenizers.pt.tokenize(pt).to_tensor()
    en_tokens = tokenizers.en.tokenize(en).to_tensor()
    return pt_tokens, en_tokens

In [18]:
def make_batches(ds):
  """Turn a dataset of raw sentence pairs into tokenized, filtered batches.

  Stages, in order: cache -> shuffle(BUFFER_SIZE) -> batch(BATCH_SIZE) ->
  tokenize_pairs -> filter_max_tokens -> prefetch.

  Because filtering happens after batching, filter_max_tokens is evaluated
  once per batch and a rejected batch is dropped as a whole.

  Args:
    ds: dataset yielding (portuguese, english) sentence pairs.

  Returns:
    The resulting dataset of (pt_tokens, en_tokens) batches.
  """
  shuffled = ds.cache().shuffle(BUFFER_SIZE)
  batched = shuffled.batch(BATCH_SIZE)
  tokenized = batched.map(tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE)
  short_enough = tokenized.filter(filter_max_tokens)
  return short_enough.prefetch(tf.data.AUTOTUNE)

In [19]:
# Run the training and validation splits through the same batching pipeline.
train_batches = make_batches(x_train)
val_batches   = make_batches(x_val)

# Positional Embedding

In [20]:
def get_angles(pos, i, d_model):
  """Compute the positional-encoding angle for position/dimension indices.

  The rate for dimension ``i`` is ``1 / 10000 ** (2 * (i // 2) / d_model)``,
  so each even index and the odd index after it share the same rate.

  Args:
    pos: position index, or an array of them.
    i: encoding-dimension index, or an array of them.
    d_model: encoding width used to normalise the exponent.

  Returns:
    ``pos`` multiplied by the per-dimension rate (NumPy broadcasting applies
    when arrays are passed).
  """
  pair_exponent = (2 * (i // 2)) / np.float32(d_model)
  rate = 1 / np.power(10000, pair_exponent)
  return pos * rate

In [21]:
def positional_encoding(position, d_model):
  """Build a sinusoidal positional-encoding table.

  Args:
    position: number of positions to encode (rows 0 .. position - 1).
    d_model: width of each encoding (number of columns).

  Returns:
    A float32 tensor of shape (1, position, d_model) whose even columns hold
    the sine of the angle and whose odd columns hold the cosine.
  """
  row_positions = np.arange(position)[:, np.newaxis]  # shape (position, 1)
  column_indices = np.arange(d_model)[np.newaxis, :]  # shape (1, d_model)
  table = get_angles(row_positions, column_indices, d_model)

  # Replace the raw angles in place: sine on even columns, cosine on odd ones.
  table[:, 0::2] = np.sin(table[:, 0::2])
  table[:, 1::2] = np.cos(table[:, 1::2])

  # Add a leading axis of size 1 before casting to a float32 tensor.
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [22]:
# Build an example encoding table: 2048 positions, 512 dimensions each.
n = 2048
d = 512
pos_encoding = positional_encoding(n, d)
print(pos_encoding.shape)
# Drop the leading size-1 axis so the table is indexed as [position, dimension].
pos_encoding = pos_encoding[0]

(1, 2048, 512)
