# Depth Growing For Neural Machine Translation  

TensorFlow implementation of the paper https://arxiv.org/abs/1907.01968  
The implementation is based on the TensorFlow tutorial on NMT https://www.tensorflow.org/tutorials/text/nmt_with_attention  


In [0]:
import sys
!{sys.executable} -m pip install subword-nmt

Collecting subword-nmt
  Downloading https://files.pythonhosted.org/packages/26/08/58267cb3ac00f5f895457777ed9e0d106dbb5e6388fa7923d8663b04b849/subword_nmt-0.3.6-py2.py3-none-any.whl
Installing collected packages: subword-nmt
Successfully installed subword-nmt-0.3.6


In [0]:
# Colab-only cell: mounts Google Drive using '/content/drive' as the mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!pip install -q tensorflow tensorflow-datasets matplotlib

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

import tensorflow_datasets as tfds

In [0]:
from keras import backend as K
from tensorflow.keras import layers as layers
import time

**Implementation of the Transformer module**


---


![Transformer](https://camo.githubusercontent.com/4b80977ac0757d1d18eb7be4d0238e92673bfaba/68747470733a2f2f6c696c69616e77656e672e6769746875622e696f2f6c696c2d6c6f672f6173736574732f696d616765732f7472616e73666f726d65722e706e67)

In [0]:
def scaledDotAttentionWrapper(v, k, q, mask=None):
  """Scaled dot-product attention.

  Computes ``softmax(q @ k^T / sqrt(d_k) + mask * -1e9) @ v`` where ``d_k``
  is the size of the last axis of `k`.

  Args:
    v: values tensor.
    k: keys tensor.
    q: queries tensor.
    mask: optional tensor added to the scaled logits after being multiplied
      by -1e9, so positions where it is 1 receive (almost) zero weight.

  Returns:
    A tuple ``(output, attention_weights)``.
  """
  key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
  logits = tf.matmul(q, k, transpose_b=True) / K.sqrt(key_depth)
  if mask is not None:
    # A large negative logit drives the softmax weight of masked slots to ~0.
    logits = logits + (mask * -1e9)
  weights = K.softmax(logits)
  return tf.matmul(weights, v), weights

In [0]:
# Sanity check of scaledDotAttentionWrapper on a tiny hand-written example.
np.set_printoptions(suppress=True)

temp_k = K.constant([[10,0,0],
                      [0,10,0],
                      [0,0,10],
                      [0,0,10]], dtype=tf.float32)  # (4, 3)

temp_v = K.constant([[   1,0],
                      [  10,0],
                      [ 100,5],
                      [1000,6]], dtype=tf.float32)  # (4, 2)

# This `query` aligns with the second `key`,
# so the second `value` is returned.
temp_q = K.constant([[0, 10, 0]], dtype=tf.float32)  # (1, 3)
temp_out, temp_attn = scaledDotAttentionWrapper(
      temp_v, temp_k, temp_q)
print ('Attention weights are:')
print (K.eval(temp_attn))
print ('Output is:')
# Printed twice: first the tensor object itself, then its evaluated value.
print (temp_out)
print (K.eval(temp_out))

In [0]:
def multiHeadAttentionWrapper(v, k, q, mask=None, d_model=512, num_heads=8):
  """Functional multi-head attention.

  Projects `v`, `k` and `q` with separate Dense layers, splits the result
  into `num_heads` heads, runs scaled dot-product attention per head, merges
  the heads back together and applies a final Dense projection.

  Note: every call creates brand-new Dense layers, so this helper carries no
  persistent weights between calls.

  Args:
    v: values tensor; reshaped to (batch, -1, num_heads, depth) after projection.
    k: keys tensor, same treatment as `v`.
    q: queries tensor; its first axis is used as the batch size.
    mask: optional mask forwarded to `scaledDotAttentionWrapper`.
    d_model: width of every projection.
    num_heads: number of attention heads; `d_model // num_heads` is the
      per-head depth.

  Returns:
    A tuple ``(output, attention_weights)``.
  """
  depth = d_model // num_heads
  batch_size = tf.shape(q)[0]

  def split_heads(x):
    # (batch, seq, d_model) -> (batch, num_heads, seq, depth)
    x = tf.reshape(x, (batch_size, -1, num_heads, depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  wv_splitted = split_heads(layers.Dense(d_model)(v))
  wk_splitted = split_heads(layers.Dense(d_model)(k))
  wq_splitted = split_heads(layers.Dense(d_model)(q))

  # The mask used to be dropped here, which silently disabled both the
  # look-ahead and the padding masks for every caller.
  scaled_attention, att_weights = scaledDotAttentionWrapper(
      wv_splitted, wk_splitted, wq_splitted, mask=mask)
  # (batch, num_heads, seq, depth) -> (batch, seq, d_model)
  scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
  concat_attention = tf.reshape(scaled_attention,
                                (batch_size, -1, d_model))
  out = layers.Dense(d_model)(concat_attention)
  return out, att_weights

In [0]:
class MultiHeadAttention(layers.Layer):
  """Multi-head attention layer with persistent, independent projections.

  The query, key, value and output projections are separate Dense layers
  created once in `__init__`, so their weights are tracked by Keras and
  reused on every call. (Previously a single Dense layer was created inside
  `call` and shared by all four projections, which re-initialised the
  weights on every forward pass and tied the projections together.)
  """

  def __init__(self, d_model=512, num_heads=8):
    """
    Args:
      d_model: width of the projections and of the output.
      num_heads: number of attention heads; `d_model // num_heads` is the
        per-head depth.
    """
    super(MultiHeadAttention, self).__init__()
    self.depth = d_model // num_heads
    self.num_heads = num_heads
    self.d_model = d_model

    self.wq = layers.Dense(d_model)
    self.wk = layers.Dense(d_model)
    self.wv = layers.Dense(d_model)
    self.dense = layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask=None):
    """Run attention and return ``(output, attention_weights)``.

    Args:
      v: values tensor.
      k: keys tensor.
      q: queries tensor; its first axis is used as the batch size.
      mask: optional mask forwarded to `scaledDotAttentionWrapper`.
    """
    batch_size = tf.shape(q)[0]
    wv_splitted = self.split_heads(self.wv(v), batch_size)
    wk_splitted = self.split_heads(self.wk(k), batch_size)
    wq_splitted = self.split_heads(self.wq(q), batch_size)
    att_out, att_weights = scaledDotAttentionWrapper(
        wv_splitted, wk_splitted, wq_splitted, mask=mask)
    # (batch, num_heads, seq, depth) -> (batch, seq, d_model)
    att_out = tf.transpose(att_out, perm=[0, 2, 1, 3])
    att_out = tf.reshape(att_out, (batch_size, -1, self.d_model))
    att_out = self.dense(att_out)
    return att_out, att_weights

In [0]:
# Smoke test: self-attention with the functional wrapper on a random tensor.
y = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = multiHeadAttentionWrapper(y, k=y, q=y, mask=None, d_model=512, num_heads=8)
# Last expression of the cell, so the notebook displays both shapes.
out.shape, attn.shape

In [0]:
# Smoke test: the same self-attention check using the MultiHeadAttention layer
# with its default arguments.
y = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = MultiHeadAttention()(y, y, y)
out.shape, attn.shape

In [0]:
def encoder_layer(X, training, d_model=512, num_heads=8, mask=None, dff=2048 ,dropout_rate=0.1):
  """Functional encoder block: self-attention followed by a feed-forward net.

  Each of the two sub-blocks is followed by dropout, a residual connection
  and layer normalisation. All layers are created on the fly inside the call.

  Args:
    X: input tensor; also used as the residual branch.
    training: forwarded to the Dropout layers.
    d_model: attention width and output width of the feed-forward net.
    num_heads: number of attention heads.
    mask: forwarded to `multiHeadAttentionWrapper`.
    dff: hidden width of the feed-forward net.
    dropout_rate: rate used by both Dropout layers.

  Returns:
    The transformed tensor.
  """
  # Self-attention sub-block.
  attended, _ = multiHeadAttentionWrapper(
      X, k=X, q=X, mask=mask, d_model=d_model, num_heads=num_heads)
  attended = layers.Dropout(dropout_rate)(attended, training)
  normed = layers.LayerNormalization(epsilon=1e-6)(attended + X)

  # Position-wise feed-forward sub-block.
  hidden = layers.Dense(dff, activation='relu')(normed)
  projected = layers.Dense(d_model)(hidden)
  projected = layers.Dropout(dropout_rate)(projected, training)
  return layers.LayerNormalization(epsilon=1e-6)(normed + projected)

In [0]:
# Smoke test of the functional encoder block with training=False.
# The result is reused as `enc_output` by the decoder-layer smoke tests below.
sample_encoder_layer_output = encoder_layer(tf.random.uniform((64, 43, 512)), False)
sample_encoder_layer_output.shape  # (batch_size, input_seq_len, d_model)

In [0]:
class EncoderLayer(layers.Layer):
  """Transformer encoder block: self-attention followed by a feed-forward net.

  All sublayers are built once in `__init__` so that their weights persist
  across calls and are tracked by Keras. (They used to be instantiated inside
  `call`, which produced freshly initialised weights on every forward pass.)
  """

  def __init__(self, d_model=512, num_heads=8, dff=2048 ,dropout_rate=0.1):
    """
    Args:
      d_model: attention width and output width of the feed-forward net.
      num_heads: number of attention heads.
      dff: hidden width of the feed-forward net.
      dropout_rate: rate used by both Dropout layers.
    """
    super(EncoderLayer, self).__init__()
    self.dm = d_model
    self.nh = num_heads
    self.dff = dff
    self.dr = dropout_rate

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn_hidden = layers.Dense(dff, activation='relu')
    self.ffn_out = layers.Dense(d_model)
    self.norm1 = layers.LayerNormalization(epsilon=1e-6)
    self.norm2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(dropout_rate)
    self.dropout2 = layers.Dropout(dropout_rate)

  def call(self, X, training, mask=None):
    """Apply the block to `X`.

    Args:
      X: input tensor; also used as the residual branch.
      training: forwarded to the Dropout layers.
      mask: optional mask forwarded to the attention layer.

    Returns:
      The transformed tensor.
    """
    out, _ = self.mha(X, X, X, mask)
    out = self.dropout1(out, training=training)
    out = self.norm1(out + X)
    feed_forward = self.ffn_out(self.ffn_hidden(out))
    feed_forward = self.dropout2(feed_forward, training=training)
    out = self.norm2(out + feed_forward)
    return out

In [0]:
# Smoke test of the EncoderLayer class with default arguments and training=False.
sample_encoder_layer_output = EncoderLayer()(tf.random.uniform((64, 43, 512)), False)
sample_encoder_layer_output.shape  # (batch_size, input_seq_len, d_model)

In [0]:
def decoder_layer(X, enc_output, training, d_model=512, num_heads=8, look_ahead_mask=None, padding_mask=None, dff=2048, dropout_rate=0.1):
  """Functional decoder block.

  Runs self-attention over `X`, then attention over `enc_output` with the
  result as query, then a feed-forward net. Each sub-block is followed by
  dropout, a residual connection and layer normalisation. All layers are
  created on the fly inside the call.

  Args:
    X: decoder input tensor.
    enc_output: encoder output used as keys and values of the second attention.
    training: forwarded to the Dropout layers.
    d_model: attention width and output width of the feed-forward net.
    num_heads: number of attention heads.
    look_ahead_mask: forwarded to the first attention call.
    padding_mask: forwarded to the second attention call.
    dff: hidden width of the feed-forward net.
    dropout_rate: rate used by every Dropout layer.

  Returns:
    A tuple ``(output, self_attention_weights, cross_attention_weights)``.
  """
  # Self-attention over the decoder input.
  self_att, att_w1 = multiHeadAttentionWrapper(
      X, k=X, q=X, mask=look_ahead_mask, d_model=d_model, num_heads=num_heads)
  self_att = layers.Dropout(dropout_rate)(self_att, training)
  state = layers.LayerNormalization(epsilon=1e-6)(self_att + X)

  # Attention over the encoder output, queried by the decoder state.
  cross_att, att_w2 = multiHeadAttentionWrapper(
      enc_output, k=enc_output, q=state, mask=padding_mask, d_model=d_model, num_heads=num_heads)
  cross_att = layers.Dropout(dropout_rate)(cross_att, training)
  state = layers.LayerNormalization(epsilon=1e-6)(cross_att + state)

  # Position-wise feed-forward sub-block.
  hidden = layers.Dense(dff, activation='relu')(state)
  projected = layers.Dense(d_model)(hidden)
  projected = layers.Dropout(dropout_rate)(projected, training)
  state = layers.LayerNormalization(epsilon=1e-6)(state + projected)
  return state, att_w1, att_w2

In [0]:
# Smoke test of the functional decoder block; reuses the encoder-layer output
# computed in an earlier cell as `enc_output`.
sample_decoder_layer_output, _ , _ = decoder_layer(tf.random.uniform((64, 50, 512)), sample_encoder_layer_output, False)
sample_decoder_layer_output.shape  # (batch_size, target_seq_len, d_model)

In [0]:
class DecoderLayer(layers.Layer):
  """Transformer decoder block.

  Self-attention, attention over the encoder output, then a feed-forward
  net; each sub-block is followed by dropout, a residual connection and layer
  normalisation. All sublayers are built once in `__init__` so that their
  weights persist across calls. (They used to be instantiated inside `call`,
  which produced freshly initialised weights on every forward pass.)
  """

  def __init__(self, d_model=512, num_heads=8, dff=2048 ,dropout_rate=0.1):
    """
    Args:
      d_model: attention width and output width of the feed-forward net.
      num_heads: number of attention heads.
      dff: hidden width of the feed-forward net.
      dropout_rate: rate used by every Dropout layer.
    """
    super(DecoderLayer, self).__init__()
    self.dm = d_model
    self.nh = num_heads
    self.dff = dff
    self.dr = dropout_rate

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)
    self.ffn_hidden = layers.Dense(dff, activation='relu')
    self.ffn_out = layers.Dense(d_model)
    self.norm1 = layers.LayerNormalization(epsilon=1e-6)
    self.norm2 = layers.LayerNormalization(epsilon=1e-6)
    self.norm3 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(dropout_rate)
    self.dropout2 = layers.Dropout(dropout_rate)
    self.dropout3 = layers.Dropout(dropout_rate)

  def call(self, X, enc_output, training, look_ahead_mask=None, padding_mask=None):
    """Apply the block.

    Args:
      X: decoder input tensor.
      enc_output: encoder output used as keys/values of the second attention.
      training: forwarded to the Dropout layers.
      look_ahead_mask: optional mask for the self-attention.
      padding_mask: optional mask for the attention over `enc_output`.

    Returns:
      A tuple ``(output, self_attention_weights, cross_attention_weights)``.
    """
    mha1, att_w1 = self.mha1(X, X, X, look_ahead_mask)
    mha1 = self.dropout1(mha1, training=training)
    out = self.norm1(mha1 + X)

    mha2, att_w2 = self.mha2(enc_output, enc_output, out, padding_mask)
    mha2 = self.dropout2(mha2, training=training)
    out = self.norm2(out + mha2)

    feed_forward = self.ffn_out(self.ffn_hidden(out))
    feed_forward = self.dropout3(feed_forward, training=training)
    out = self.norm3(out + feed_forward)
    return out, att_w1, att_w2

In [0]:
# Smoke test of the DecoderLayer class with default arguments and training=False.
sample_decoder_layer_output, _ , _ = DecoderLayer()(tf.random.uniform((64, 50, 512)), sample_encoder_layer_output, False)
sample_decoder_layer_output.shape  # (batch_size, target_seq_len, d_model)

In [0]:
def positional_encoding(max_position, d_model):
  """Build the sinusoidal positional-encoding table.

  Implements
    PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
  so each sin/cos pair of neighbouring columns shares one frequency.

  The earlier version used base 1000 and the raw column index in the
  exponent, which does not match this formula.

  Args:
    max_position: number of positions (rows) to generate.
    d_model: embedding width (columns).

  Returns:
    A NumPy array of shape (1, max_position, d_model).
  """
  positions = np.arange(max_position)[:, np.newaxis]
  dims = np.arange(d_model)[np.newaxis, :]
  # `dims // 2` maps columns 2i and 2i+1 onto the same frequency index i.
  angle_rates = 1 / np.power(10000, (2 * (dims // 2)) / np.float32(d_model))
  angles = positions * angle_rates

  pos_encodings = np.zeros((max_position, d_model))
  pos_encodings[:, 0::2] = np.sin(angles[:, 0::2])
  pos_encodings[:, 1::2] = np.cos(angles[:, 1::2])
  # Leading axis of size 1 so the table broadcasts over the batch.
  return pos_encodings[np.newaxis, ...]

In [0]:
# Visualise the positional-encoding table for 50 positions and d_model=512.
pos_encoding = positional_encoding(50, 512)
print (pos_encoding.shape)

# Drop the leading axis of size 1 and plot position (y) against depth (x).
plt.pcolormesh(pos_encoding[0], cmap='RdBu')
plt.xlabel('Depth')
plt.xlim((0, 512))
plt.ylabel('Position')
plt.colorbar()
plt.show()

In [0]:
def encoder(X, input_vocab_size, maximum_position_encoding, training, num_layers=6, d_model=512, num_heads=8, dff=2048, mask=None ,dropout_rate=0.1):
  """Functional encoder stack.

  Embeds `X`, scales the embeddings by sqrt(d_model), adds the first
  `seq_len` rows of the positional-encoding table, applies dropout and then
  runs `num_layers` functional encoder blocks. All layers are created on the
  fly inside the call.

  Args:
    X: input token tensor; axis 1 is taken as the sequence length.
    input_vocab_size: vocabulary size of the Embedding layer.
    maximum_position_encoding: number of rows of the positional table.
    training: forwarded to Dropout and to every encoder block.
    num_layers: number of encoder blocks.
    d_model: model width.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward nets.
    mask: forwarded to every encoder block.
    dropout_rate: dropout rate.

  Returns:
    The encoded tensor.
  """
  seq_len = tf.shape(X)[1]
  embedded = layers.Embedding(input_vocab_size, d_model)(X)

  pos_encoding = positional_encoding(maximum_position_encoding, d_model)
  positions = tf.slice(
      tf.cast(pos_encoding, tf.float32),
      [0, 0, 0],
      [pos_encoding.shape[0], seq_len, pos_encoding.shape[2]])

  hidden = embedded * tf.math.sqrt(tf.cast(d_model, tf.float32)) + positions
  hidden = layers.Dropout(dropout_rate)(hidden, training)
  for _ in range(num_layers):
    hidden = encoder_layer(hidden, training, d_model, num_heads, mask, dff ,dropout_rate)
  return hidden

In [0]:
# Smoke test of the functional encoder stack with training=False.
sample_encoder_output = encoder(tf.random.uniform((64, 62)), 8500, maximum_position_encoding=10000, training=False)
print (sample_encoder_output.shape)  # (batch_size, input_seq_len, d_model)

In [0]:
class Encoder(layers.Layer):
  """Transformer encoder: embedding + positional encoding + encoder blocks.

  The embedding, dropout and every `EncoderLayer` are created once in
  `__init__` so that their weights persist across calls, and the positional
  table is computed once instead of on every forward pass.
  """

  def __init__(
      self,
      input_vocab_size: int,
      maximum_position_encoding: int,
      num_layers: int = 6,
      d_model: int = 512,
      num_heads: int = 8,
      dff: int = 2048,
      dropout_rate: float = 0.1
  ):
    """
    Args:
      input_vocab_size: vocabulary size of the Embedding layer.
      maximum_position_encoding: number of rows of the positional table.
      num_layers: number of encoder blocks.
      d_model: model width.
      num_heads: number of attention heads.
      dff: hidden width of the feed-forward nets.
      dropout_rate: dropout rate.
    """
    super(Encoder, self).__init__()
    self.input_vocab_size = input_vocab_size
    self.maximum_position_encoding = maximum_position_encoding
    self.num_layers = num_layers
    self.d_model = d_model
    self.num_heads = num_heads
    self.dff = dff
    self.dropout_rate = dropout_rate

    self.embedding = layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = tf.cast(
        positional_encoding(maximum_position_encoding, d_model), tf.float32)
    self.dropout = layers.Dropout(dropout_rate)
    self.enc_layers = [
        EncoderLayer(d_model, num_heads, dff, dropout_rate)
        for _ in range(num_layers)
    ]

  def call(self, X, training, mask=None):
    """Encode `X`.

    Args:
      X: input token tensor; axis 1 is taken as the sequence length.
      training: forwarded to Dropout and to every encoder block.
      mask: optional mask forwarded to every encoder block.

    Returns:
      The encoded tensor.
    """
    seq_len = tf.shape(X)[1]
    X = self.embedding(X)
    X *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    # Only the first `seq_len` rows of the positional table are needed.
    X += self.pos_encoding[:, :seq_len, :]
    X = self.dropout(X, training=training)
    for enc_layer in self.enc_layers:
      X = enc_layer(X, training, mask)
    return X
    

In [0]:
# Smoke test of the Encoder class; the result is reused as `enc_output` by the
# decoder smoke tests below.
sample_encoder_output = Encoder(8500, maximum_position_encoding=10000)(tf.random.uniform((64, 62)), training=False)
print (sample_encoder_output.shape)  # (batch_size, input_seq_len, d_model)

In [0]:
def decoder(X, enc_output, target_vocab_size, maximum_position_encoding, training, num_layers=6, d_model=512, num_heads=8, dff=2048, look_ahead_mask=None, padding_mask=None, dropout_rate=0.1):
  """Functional decoder stack.

  Embeds `X`, scales by sqrt(d_model), adds the first `seq_len` rows of the
  positional-encoding table, applies dropout and runs `num_layers` functional
  decoder blocks against `enc_output`. All layers are created on the fly.

  Args:
    X: target token tensor; axis 1 is taken as the sequence length.
    enc_output: encoder output passed to every decoder block.
    target_vocab_size: vocabulary size of the Embedding layer.
    maximum_position_encoding: number of rows of the positional table.
    training: forwarded to Dropout and to every decoder block.
    num_layers: number of decoder blocks.
    d_model: model width.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward nets.
    look_ahead_mask: forwarded to every decoder block.
    padding_mask: forwarded to every decoder block.
    dropout_rate: dropout rate.

  Returns:
    A tuple ``(output, attention_weights)``; the dict holds the two attention
    maps of each block under 'decoder_layer{i}_block1' / '..._block2'.
  """
  seq_len = tf.shape(X)[1]
  embedded = layers.Embedding(target_vocab_size, d_model)(X)

  pos_encoding = positional_encoding(maximum_position_encoding, d_model)
  positions = tf.slice(
      tf.cast(pos_encoding, tf.float32),
      [0, 0, 0],
      [pos_encoding.shape[0], seq_len, pos_encoding.shape[2]])

  hidden = embedded * tf.math.sqrt(tf.cast(d_model, tf.float32)) + positions
  hidden = layers.Dropout(dropout_rate)(hidden, training)

  attention_weights = {}
  for layer_no in range(1, num_layers + 1):
    hidden, self_att, cross_att = decoder_layer(
        hidden, enc_output, training, d_model, num_heads,
        look_ahead_mask, padding_mask, dff, dropout_rate)
    attention_weights['decoder_layer{}_block1'.format(layer_no)] = self_att
    attention_weights['decoder_layer{}_block2'.format(layer_no)] = cross_att
  return hidden, attention_weights

In [0]:
# Smoke test of the functional decoder stack against the encoder output
# computed in an earlier cell.
output, attn = decoder(tf.random.uniform((64, 26)), enc_output=sample_encoder_output, training=False, target_vocab_size=8000, maximum_position_encoding=5000)
output.shape, attn['decoder_layer2_block2'].shape

In [0]:
class Decoder(layers.Layer):
  """Transformer decoder: embedding + positional encoding + decoder blocks.

  The embedding, dropout and every `DecoderLayer` are created once in
  `__init__` so that their weights persist across calls, and the positional
  table is computed once instead of on every forward pass.
  """

  def __init__(
      self,
      target_vocab_size: int,
      maximum_position_encoding: int,
      num_layers: int = 6,
      d_model: int = 512,
      num_heads: int = 8,
      dff: int = 2048,
      dropout_rate: float = 0.1
  ):
    """
    Args:
      target_vocab_size: vocabulary size of the Embedding layer.
      maximum_position_encoding: number of rows of the positional table.
      num_layers: number of decoder blocks.
      d_model: model width.
      num_heads: number of attention heads.
      dff: hidden width of the feed-forward nets.
      dropout_rate: dropout rate.
    """
    super(Decoder, self).__init__()
    self.target_vocab_size = target_vocab_size
    self.maximum_position_encoding = maximum_position_encoding
    self.num_layers = num_layers
    self.d_model = d_model
    self.num_heads = num_heads
    self.dff = dff
    self.dropout_rate = dropout_rate

    self.embedding = layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = tf.cast(
        positional_encoding(maximum_position_encoding, d_model), tf.float32)
    self.dropout = layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model, num_heads, dff, dropout_rate)
        for _ in range(num_layers)
    ]

  def call(self, X, enc_output, training, look_ahead_mask=None, padding_mask=None):
    """Decode `X` against `enc_output`.

    Args:
      X: target token tensor; axis 1 is taken as the sequence length.
      enc_output: encoder output passed to every decoder block.
      training: forwarded to Dropout and to every decoder block.
      look_ahead_mask: optional mask forwarded to every decoder block.
      padding_mask: optional mask forwarded to every decoder block.

    Returns:
      A tuple ``(output, attention_weights)``; the dict holds the two
      attention maps of each block under 'decoder_layer{i}_block1' and
      'decoder_layer{i}_block2'.
    """
    seq_len = tf.shape(X)[1]
    X = self.embedding(X)
    X *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    # Only the first `seq_len` rows of the positional table are needed.
    X += self.pos_encoding[:, :seq_len, :]
    X = self.dropout(X, training=training)
    attention_weights = {}
    for i, dec_layer in enumerate(self.dec_layers):
      X, block1, block2 = dec_layer(X, enc_output, training, look_ahead_mask, padding_mask)
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    return X, attention_weights

In [0]:
# Smoke test of the Decoder class against the encoder output computed in an
# earlier cell.
output, attn = Decoder(target_vocab_size=8000, maximum_position_encoding=5000)(tf.random.uniform((64, 26)), enc_output=sample_encoder_output, training=False)
output.shape, attn['decoder_layer2_block2'].shape

In [0]:
def transformer(
    inp, tar, input_vocab_size, target_vocab_size, pe_input, pe_target, training,
    num_layers=6, d_model=512, num_heads=8, dff=2048, dropout_rate=0.1, enc_mask=None, look_ahead_mask=None, padding_mask=None):
  """Functional Transformer: encoder stack, decoder stack, vocabulary projection.

  Args:
    inp: source token tensor.
    tar: target token tensor.
    input_vocab_size: source vocabulary size.
    target_vocab_size: target vocabulary size; also the width of the output.
    pe_input: positional-table length for the encoder.
    pe_target: positional-table length for the decoder.
    training: forwarded to the encoder and decoder.
    num_layers: number of blocks in each stack.
    d_model: model width.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward nets.
    dropout_rate: dropout rate.
    enc_mask: mask forwarded to the encoder.
    look_ahead_mask: mask forwarded to the decoder.
    padding_mask: mask forwarded to the decoder.

  Returns:
    A tuple ``(logits, attention_weights)``.
  """
  memory = encoder(
      inp, input_vocab_size, pe_input, training,
      num_layers, d_model, num_heads, dff, enc_mask ,dropout_rate)
  decoded, attention_weights = decoder(
      tar, memory, target_vocab_size, pe_target, training,
      num_layers, d_model, num_heads, dff, look_ahead_mask, padding_mask, dropout_rate)
  logits = layers.Dense(target_vocab_size)(decoded)
  return logits, attention_weights

In [0]:
class Transformer(layers.Layer):
  """Transformer: `Encoder`, `Decoder` and a final vocabulary projection.

  The sub-modules are created once in `__init__` so that their weights
  persist across calls. The decoder is sized with `pe_target`; it was
  previously (incorrectly) built with `pe_input`.
  """

  def __init__(
      self,
      input_vocab_size: int,
      target_vocab_size: int,
      pe_input: int,
      pe_target: int,
      num_layers: int=6,
      d_model: int=512,
      num_heads: int=8,
      dff: int=2048,
      dropout_rate: float=0.1
  ):
    """
    Args:
      input_vocab_size: source vocabulary size.
      target_vocab_size: target vocabulary size; also the width of the output.
      pe_input: positional-table length for the encoder.
      pe_target: positional-table length for the decoder.
      num_layers: number of blocks in each stack.
      d_model: model width.
      num_heads: number of attention heads.
      dff: hidden width of the feed-forward nets.
      dropout_rate: dropout rate.
    """
    super(Transformer, self).__init__()
    self.input_vocab_size = input_vocab_size
    self.target_vocab_size = target_vocab_size
    self.pe_input = pe_input
    self.pe_target = pe_target
    self.num_layers = num_layers
    self.d_model = d_model
    self.num_heads = num_heads
    self.dff = dff
    self.dropout_rate = dropout_rate

    self.encoder = Encoder(input_vocab_size, pe_input, num_layers, d_model, num_heads, dff, dropout_rate)
    self.decoder = Decoder(target_vocab_size, pe_target, num_layers, d_model, num_heads, dff, dropout_rate)
    self.final_layer = layers.Dense(target_vocab_size)

  def call(self, inp, tar, training, enc_mask=None, look_ahead_mask=None, padding_mask=None):
    """Run the full model.

    Args:
      inp: source token tensor.
      tar: target token tensor.
      training: forwarded to the encoder and decoder.
      enc_mask: optional mask forwarded to the encoder.
      look_ahead_mask: optional mask forwarded to the decoder.
      padding_mask: optional mask forwarded to the decoder.

    Returns:
      A tuple ``(logits, attention_weights)``.
    """
    enc_output = self.encoder(inp, training, enc_mask)
    dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, padding_mask)
    final_output = self.final_layer(dec_output)
    return final_output, attention_weights


In [0]:
# Smoke test of the functional transformer with two layers per stack.
temp_input = tf.random.uniform((64, 62))
temp_target = tf.random.uniform((64, 26))

fn_out, _ = transformer(
    temp_input, temp_target, training=False,
    num_layers=2, d_model=512, num_heads=8, dff=2048, 
    input_vocab_size=8500, target_vocab_size=8000, 
    pe_input=10000, pe_target=6000)
fn_out.shape

In [0]:
# Smoke test of the Transformer class with two layers per stack.
temp_input = tf.random.uniform((64, 62))
temp_target = tf.random.uniform((64, 26))

fn_out, _ = Transformer(num_layers=2, d_model=512, num_heads=8, dff=2048, 
    input_vocab_size=8500, target_vocab_size=8000, 
    pe_input=10000, pe_target=6000)(temp_input, temp_target, training=False)
fn_out.shape

**Depth Growing for NMT implementation** https://arxiv.org/abs/1907.01968


---

![alt text](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQI5Xun1a2X6Z81uf3qUmQLVeP4Ls-uYIG8sBGiVSuVQSRQVm0kJg)

In [0]:
class DepthGrowingTransformer(layers.Layer):
  """Two-stage Transformer with `n` bottom and `m` top encoder/decoder blocks.

  Forward pass, as implemented here:
    1. Embed source and target (scaled embedding + positional encoding +
       dropout).
    2. Run `n` bottom encoder blocks and `n` bottom decoder blocks.
    3. Add the embedded inputs back onto the bottom outputs and project the
       decoder side to the vocabulary (`train_output`).
    4. Run `m` top encoder blocks and `m` top decoder blocks on those sums
       and project the decoder side to the vocabulary (`final_output`).

  All sublayers are created once in `__init__` so that their weights persist
  across calls. (They used to be instantiated inside `call`, which produced
  freshly initialised weights on every forward pass.)
  """

  def __init__(
      self, 
      input_vocab_size: int,
      target_vocab_size: int,
      pe_input: int,
      pe_target: int,
      n=6, 
      m=2, 
      d_model=1024, 
      dff=4096, 
      num_heads=16, 
      dropout_rate=0.3,
  ):
    """
    Args:
      input_vocab_size: source vocabulary size.
      target_vocab_size: target vocabulary size; also the width of both outputs.
      pe_input: positional-table length for the source side.
      pe_target: positional-table length for the target side.
      n: number of bottom encoder blocks and bottom decoder blocks.
      m: number of top encoder blocks and top decoder blocks.
      d_model: model width.
      dff: hidden width of the feed-forward nets.
      num_heads: number of attention heads.
      dropout_rate: dropout rate.
    """
    super(DepthGrowingTransformer, self).__init__()
    self.input_vocab_size = input_vocab_size
    self.target_vocab_size = target_vocab_size
    self.pe_input = pe_input
    self.pe_target = pe_target
    self.n = n
    self.m = m
    self.d_model = d_model
    self.num_heads = num_heads
    self.dff = dff
    self.dropout_rate = dropout_rate

    # Input pipelines (embedding, positional table, dropout) for both sides.
    self.enc_embedding = layers.Embedding(input_vocab_size, d_model)
    self.dec_embedding = layers.Embedding(target_vocab_size, d_model)
    self.enc_pos_encoding = tf.cast(positional_encoding(pe_input, d_model), tf.float32)
    self.dec_pos_encoding = tf.cast(positional_encoding(pe_target, d_model), tf.float32)
    self.enc_dropout = layers.Dropout(dropout_rate)
    self.dec_dropout = layers.Dropout(dropout_rate)

    # Bottom (n blocks) and top (m blocks) stacks.
    self.bottom_enc_layers = [
        EncoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(n)]
    self.bottom_dec_layers = [
        DecoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(n)]
    self.top_enc_layers = [
        EncoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(m)]
    self.top_dec_layers = [
        DecoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(m)]

    # One vocabulary projection per output.
    self.train_projection = layers.Dense(target_vocab_size)
    self.final_projection = layers.Dense(target_vocab_size)

  def _embed(self, tokens, embedding, pos_encoding, dropout, training):
    """Scaled embedding + positional encoding + dropout for one side."""
    seq_len = tf.shape(tokens)[1]
    x = embedding(tokens)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    # Only the first `seq_len` rows of the positional table are needed.
    x += pos_encoding[:, :seq_len, :]
    return dropout(x, training=training)

  def call(self, inp, tar, training, enc_mask=None, look_ahead_mask=None, padding_mask=None):
    """Run both stages.

    Args:
      inp: source token tensor.
      tar: target token tensor.
      training: forwarded to every Dropout and every block.
      enc_mask: optional mask forwarded to every encoder block.
      look_ahead_mask: optional mask forwarded to every decoder block.
      padding_mask: optional mask forwarded to every decoder block.

    Returns:
      A tuple ``(final_output, train_output, attention_weights)``. The dict
      stores bottom-stack attention maps under '..._block1'/'..._block2' and
      top-stack attention maps under '..._block3'/'..._block4'.
    """
    enc_out = self._embed(inp, self.enc_embedding, self.enc_pos_encoding, self.enc_dropout, training)
    enc_res = tf.identity(enc_out)
    for enc_layer in self.bottom_enc_layers:
      enc_out = enc_layer(enc_out, training, enc_mask)

    dec_out = self._embed(tar, self.dec_embedding, self.dec_pos_encoding, self.dec_dropout, training)
    dec_res = tf.identity(dec_out)
    attention_weights = {}
    for i, dec_layer in enumerate(self.bottom_dec_layers):
      dec_out, block1, block2 = dec_layer(dec_out, tf.identity(enc_out), training, look_ahead_mask, padding_mask)
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

    # Residual connection from the embedded inputs to the bottom outputs.
    enc_out = enc_out + enc_res
    dec_out = dec_out + dec_res
    train_output = self.train_projection(dec_out)

    for enc_layer in self.top_enc_layers:
      enc_out = enc_layer(enc_out, training, enc_mask)
    for i, dec_layer in enumerate(self.top_dec_layers):
      dec_out, block3, block4 = dec_layer(dec_out, enc_out, training, look_ahead_mask, padding_mask)
      attention_weights['decoder_layer{}_block3'.format(i+1)] = block3
      attention_weights['decoder_layer{}_block4'.format(i+1)] = block4

    final_output = self.final_projection(dec_out)
    return final_output, train_output, attention_weights


In [0]:
# Load the WMT14 de-en dataset as supervised pairs, together with its metadata,
# and pick the train and validation splits.
examples, metadata = tfds.load(name="wmt14_translate/de-en", with_info=True,
                               as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']

In [0]:
# Build one subword tokenizer per language from the training split.
# Each generator below iterates over the whole of `train_examples`.
tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for de, en in train_examples), target_vocab_size=2**13)

tokenizer_de = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (de.numpy() for de, en in train_examples), target_vocab_size=2**13)

In [0]:
def encode(lang1, lang2):
  """Tokenise a sentence pair and wrap each side in start/end ids.

  `lang1` is encoded with `tokenizer_de` and `lang2` with `tokenizer_en`.
  The ids `vocab_size` and `vocab_size + 1` of the respective tokenizer are
  used as start and end markers.

  Args:
    lang1: object exposing `.numpy()`; encoded with `tokenizer_de`.
    lang2: object exposing `.numpy()`; encoded with `tokenizer_en`.

  Returns:
    A tuple ``(lang1_ids, lang2_ids)`` of lists.
  """
  de_start = tokenizer_de.vocab_size
  de_ids = [de_start, *tokenizer_de.encode(lang1.numpy()), tokenizer_de.vocab_size+1]

  en_start = tokenizer_en.vocab_size
  en_ids = [en_start, *tokenizer_en.encode(lang2.numpy()), tokenizer_en.vocab_size+1]

  return de_ids, en_ids

In [0]:
# Pipeline constants: longest kept example, shuffle buffer size, batch size.
MAX_LENGTH = 40
BUFFER_SIZE = 20000
BATCH_SIZE = 64


def filter_max_length(x, y, max_length=MAX_LENGTH):
  """Return a boolean tensor: do both `x` and `y` have at most `max_length` elements?"""
  x_fits = tf.size(x) <= max_length
  y_fits = tf.size(y) <= max_length
  return tf.logical_and(x_fits, y_fits)

In [0]:
def tf_encode(pt, en):
  """Wrap the Python-level `encode` in `tf.py_function` so it can be used in `Dataset.map`.

  Returns two int64 tensors, one per language.
  """
  return tf.py_function(func=encode, inp=[pt, en], Tout=[tf.int64, tf.int64])

In [0]:
# Training pipeline: tokenise, drop long pairs, cache, shuffle, pad-batch, prefetch.
train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(filter_max_length)
# cache the dataset to memory to get a speedup while reading from it.
train_dataset = train_dataset.cache()
train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]))
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)


# Validation pipeline: same tokenisation and filtering, no shuffling or caching.
val_dataset = val_examples.map(tf_encode)
val_dataset = val_dataset.filter(filter_max_length).padded_batch(
    BATCH_SIZE, padded_shapes=([-1], [-1]))
# A tf.data.Dataset has no `.shape` attribute; `element_spec` describes the
# dtype and (possibly unknown) shape of each component of a batch.
print(train_dataset.element_spec)

In [0]:
# Build and train a Keras model around DepthGrowingTransformer.
# `K` in this notebook is `keras.backend`, which does not provide Input, Model,
# optimizers or losses, so the `tf.keras` namespace is used instead.
# NOTE(review): `x_train` is not defined in any visible cell, and `fn_out` /
# `t_out` are only assigned by other cells — confirm the intended training data
# and what should be printed here.
# NOTE(review): the layer returns (final_output, train_output, attention dict);
# a single MeanSquaredError loss over those outputs looks like a placeholder —
# confirm the intended loss and targets.
inp = tf.keras.Input(shape=(1,), name='en')
tar = tf.keras.Input(shape=(1,), name='de')
out = DepthGrowingTransformer( input_vocab_size=46675, target_vocab_size=63658, 
    pe_input=46675, pe_target=63658)(inp, tar, True)
dgt = tf.keras.Model(inputs=[inp, tar], outputs=out)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
dgt.compile(optimizer, loss=tf.keras.losses.MeanSquaredError())
dgt.fit(x_train, x_train, epochs=20, batch_size=64)
print(fn_out)
print(t_out)

In [0]:
# Smoke test of DepthGrowingTransformer with its default depth and width;
# prints both the final output and the intermediate `train_output`.
temp_input = tf.random.uniform((64, 62))
temp_target = tf.random.uniform((64, 26))

fn_out, t_out, _ = DepthGrowingTransformer( input_vocab_size=8500, target_vocab_size=8000, 
    pe_input=10000, pe_target=6000)(temp_input, temp_target, training=False)
print(fn_out)
print(t_out)

In [0]:
# Load the training split on its own (no `as_supervised`) and check its type.
wmt_train = tfds.load(name="wmt14_translate/de-en", split="train")
assert isinstance(wmt_train, tf.data.Dataset)

In [0]:
# Load the validation and test splits.
wmt_validation = tfds.load(name="wmt14_translate/de-en", split="validation")
wmt_test = tfds.load(name="wmt14_translate/de-en", split="test")

In [0]:
# Hard-coded paths to the extracted news-commentary-v9 de-en text files;
# they are interpolated into the subword-nmt shell commands in the cells below.
# NOTE(review): the paths point into /root/tensorflow_datasets/downloads and
# contain a hashed directory name — presumably specific to one download; verify
# before reuse.
de="/root/tensorflow_datasets/downloads/extracted/TAR_GZ.statmt.org_wmt14_training-parallel-nc-v9y4lT8pIpjmh3rkM8mJErknywdmswP0VAUS3dKGx0hIU.tgz/training/news-commentary-v9.de-en.de"
en="/root/tensorflow_datasets/downloads/extracted/TAR_GZ.statmt.org_wmt14_training-parallel-nc-v9y4lT8pIpjmh3rkM8mJErknywdmswP0VAUS3dKGx0hIU.tgz/training/news-commentary-v9.de-en.en"


In [0]:
!subword-nmt learn-joint-bpe-and-vocab --input $de $en -s 89500 -o subwordBPE --write-vocabulary vocab.de vocab.en

In [0]:
!subword-nmt apply-bpe -c subwordBPE --vocabulary vocab.en --vocabulary-threshold 50 < {en} > train.BPE.en
!subword-nmt apply-bpe -c subwordBPE --vocabulary vocab.de --vocabulary-threshold 50 < {de} > train.BPE.de