<a href="https://colab.research.google.com/github/matejkvassay/colab-notebooks/blob/master/bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install -q tensorflow
import tensorflow_datasets as tfds
import tensorflow as tf

import time
import numpy as np
import matplotlib.pyplot as plt

# 1 - Data Preparation

## Prepare data pipeline 

In [0]:
# Load the TED-talk Portuguese -> English translation corpus.
# as_supervised=True makes every element a (pt, en) pair; with_info=True
# additionally returns the dataset metadata object.
examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)

# Keep handles to the two splits used in the rest of the notebook.
train_examples = examples['train']
val_examples = examples['validation']

#### Inspect



In [3]:
# Pull a single (Portuguese, English) sentence pair to eyeball the raw data.
first_pair = next(iter(train_examples.take(1)))
pt_ex, en_ex = first_pair

print('PT example: {}'.format(pt_ex))
print('EN example: {}'.format(en_ex))


PT example: b'e quando melhoramos a procura , tiramos a \xc3\xbanica vantagem da impress\xc3\xa3o , que \xc3\xa9 a serendipidade .'
EN example: b'and when you improve searchability , you actually take away the one advantage of print , which is serendipity .'


## Train tokenizers

In [4]:
%%time
# Build one subword vocabulary per language from the training split
# (target size 2**13 = 8192 entries each). Each call iterates over
# train_examples once, so this cell makes two passes over the data.
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (en_sent.numpy() for _, en_sent in train_examples),
    target_vocab_size=2**13,
)

tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (pt_sent.numpy() for pt_sent, _ in train_examples),
    target_vocab_size=2**13,
)

CPU times: user 2min 35s, sys: 7.75 s, total: 2min 43s
Wall time: 2min 26s


#### Inspect

In [5]:
# Run the English example through the tokenizer to see how it is split.
sample_string = en_ex.numpy().decode('utf8')
print('Sent:\n{}'.format(sample_string))

tokenized_string = tokenizer_en.encode(sample_string)
print('Token IDs:\n{}'.format(tokenized_string))

# Decode every ID on its own to show which piece of text it stands for.
for token_id in tokenized_string:
  subword = tokenizer_en.decode([token_id])
  print('{} ----> {}'.format(token_id, subword))

Sent:
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
Token IDs:
[4, 59, 15, 1792, 6561, 3060, 7952, 1, 15, 103, 134, 378, 3, 47, 6122, 6, 5311, 1, 91, 13, 1849, 559, 1609, 894, 2]
4 ----> and 
59 ----> when 
15 ----> you 
1792 ----> improve 
6561 ----> search
3060 ----> abilit
7952 ----> y
1 ---->  , 
15 ----> you 
103 ----> actually 
134 ----> take 
378 ----> away 
3 ----> the 
47 ----> one 
6122 ----> advantage 
6 ----> of 
5311 ----> print
1 ---->  , 
91 ----> which 
13 ----> is 
1849 ----> ser
559 ----> end
1609 ----> ip
894 ----> ity
2 ---->  .


## Preprocess data
- add start & end tokens 
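- filter out examples longer than 40 tokens (subword IDs, counting the start & end tokens)
- prepare batches (pad shorter sequences with zeros to the length of the longest sequence in the batch)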

In [0]:
# Tunable preprocessing parameters.
BUFFER_SIZE = 200_000  # passed to Dataset.shuffle() as the shuffle buffer size
MAX_LEN = 40           # default max_length of filter_max_length (in token IDs)
BATCH_SIZE = 64        # number of sentence pairs per padded batch

def encode(lang1, lang2):
  """Tokenize one sentence pair and wrap each side in start/end markers.

  Args:
    lang1: Portuguese sentence; must expose `.numpy()` (an eager tensor).
    lang2: English sentence; must expose `.numpy()` (an eager tensor).

  Returns:
    A `(pt_ids, en_ids)` tuple of Python lists of token IDs. Each list starts
    with that tokenizer's `vocab_size` (start marker) and ends with
    `vocab_size + 1` (end marker), i.e. the two IDs right after the
    tokenizer's own vocabulary.
  """
  pt_start = tokenizer_pt.vocab_size
  pt_end = tokenizer_pt.vocab_size + 1
  en_start = tokenizer_en.vocab_size
  en_end = tokenizer_en.vocab_size + 1

  pt_ids = [pt_start, *tokenizer_pt.encode(lang1.numpy()), pt_end]
  en_ids = [en_start, *tokenizer_en.encode(lang2.numpy()), en_end]

  return pt_ids, en_ids

def tf_encode(pt, en):
  """Wrap `encode` so it can be used inside `Dataset.map`.

  `encode` calls `.numpy()` on its inputs, so it is routed through
  `tf.py_function`, which runs it as ordinary Python on eager tensors.

  Args:
    pt: Portuguese sentence tensor.
    en: English sentence tensor.

  Returns:
    A `(pt_ids, en_ids)` pair of int64 tensors, each declared as rank-1 with
    unknown length.
  """
  pt_ids, en_ids = tf.py_function(
      func=encode,
      inp=[pt, en],
      Tout=[tf.int64, tf.int64],
  )

  # The static shape is re-declared here as a 1-D vector of unknown length
  # for each output before handing them to the rest of the pipeline.
  pt_ids.set_shape([None])
  en_ids.set_shape([None])

  return pt_ids, en_ids

def filter_max_length(x, y, max_length=MAX_LEN):
  """Predicate for `Dataset.filter`: keep pairs where both sides are short enough.

  Args:
    x: First tensor of the pair.
    y: Second tensor of the pair.
    max_length: Largest allowed number of elements per tensor (inclusive).
      In this notebook the tensors are encoded token-ID sequences, so the
      limit counts tokens (start/end markers included), not words.

  Returns:
    A scalar boolean tensor, true only when neither `x` nor `y` has more
    than `max_length` elements.
  """
  x_fits = tf.size(x) <= max_length
  y_fits = tf.size(y) <= max_length
  return tf.logical_and(x_fits, y_fits)
  
# Training split: tokenize -> drop over-long pairs -> cache -> shuffle.
# The cache keeps the dataset in memory to get a speedup while reading from it.
train_preprocessed = (
    train_examples
    .map(tf_encode)
    .filter(filter_max_length)
    .cache()
    .shuffle(BUFFER_SIZE)
)

# Validation split: same tokenization and length filter, no cache or shuffle.
val_preprocessed = (
    val_examples
    .map(tf_encode)
    .filter(filter_max_length)
)

# Pad & batch: group BATCH_SIZE pairs per batch with padded_batch; the
# training pipeline additionally prefetches batches.
train_dataset = (
    train_preprocessed
    .padded_batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

val_dataset = val_preprocessed.padded_batch(BATCH_SIZE)

#### Inspect

In [15]:
# Slow on first run: the training pipeline caches the encoded train set.
batch_iterator = iter(train_dataset)
pt_batch, en_batch = next(batch_iterator)

# Show both padded ID tensors as the cell output.
pt_batch, en_batch

(<tf.Tensor: shape=(64, 37), dtype=int64, numpy=
 array([[8214,   42,   13, ...,    0,    0,    0],
        [8214,  270, 4880, ...,    0,    0,    0],
        [8214,  278,    5, ...,    0,    0,    0],
        ...,
        [8214,    3, 2200, ...,    0,    0,    0],
        [8214,   25,  422, ...,    0,    0,    0],
        [8214,  164, 4010, ...,    0,    0,    0]])>,
 <tf.Tensor: shape=(64, 37), dtype=int64, numpy=
 array([[8087,    4,   16, ...,    0,    0,    0],
        [8087,   14,  124, ...,    0,    0,    0],
        [8087,   12,   84, ...,    0,    0,    0],
        ...,
        [8087,    3,  132, ...,    0,    0,    0],
        [8087,   15,  101, ...,    0,    0,    0],
        [8087,   12,   24, ...,    0,    0,    0]])>)

# 2 - Transformer Model

## Positional encoding
https://github.com/tensorflow/examples/blob/master/community/en/position_encoding.ipynb