In [None]:
import tensorflow as tf
 
import time
import numpy as np

In [None]:
# Colab-only step: mount Google Drive at /gdrive; a later cell copies the
# project files from /gdrive/MyDrive into the local working directory
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
import os
import re

In [None]:
# exist_ok keeps this cell re-runnable: a plain mkdir raises FileExistsError
# once the directory has been created by an earlier run
os.makedirs('second_randomized', exist_ok=True)

In [None]:
# relative path: resolved against the current working directory, so this cell
# is presumably not safe to run twice in the same session — TODO confirm
os.chdir('second_randomized')

In [None]:
!cp -rav /gdrive/MyDrive/ywl_transformer/second_randomized/* /content/second_randomized/

'/gdrive/MyDrive/ywl_transformer/second_randomized/checkpoints' -> '/content/second_randomized/checkpoints'
'/gdrive/MyDrive/ywl_transformer/second_randomized/checkpoints/baseline' -> '/content/second_randomized/checkpoints/baseline'
'/gdrive/MyDrive/ywl_transformer/second_randomized/checkpoints/baseline/train' -> '/content/second_randomized/checkpoints/baseline/train'
'/gdrive/MyDrive/ywl_transformer/second_randomized/checkpoints/baseline/train/ckpt-37.index' -> '/content/second_randomized/checkpoints/baseline/train/ckpt-37.index'
'/gdrive/MyDrive/ywl_transformer/second_randomized/checkpoints/baseline/train/ckpt-38.index' -> '/content/second_randomized/checkpoints/baseline/train/ckpt-38.index'
'/gdrive/MyDrive/ywl_transformer/second_randomized/checkpoints/baseline/train/ckpt-36.index' -> '/content/second_randomized/checkpoints/baseline/train/ckpt-36.index'
'/gdrive/MyDrive/ywl_transformer/second_randomized/checkpoints/baseline/train/ckpt-39.index' -> '/content/second_randomized/checkp

In [None]:
# three types of sliding window:
#  1) left_slide: left-aligned with sliding right edge
#  2) right_slide: right-aligned with sliding left edge
#  3) lim_slide: sliding window absolutely throughout to end of each text
#     lim: size of sliding window
#     split_comma: split on commas rather than periods
 
def retrieve_sents(data_stream,
                   left_slide=False,
                   right_slide=False,
                   lim_slide=False,
                   split_comma=False,
                   lim=0):
  """Group a stream of items into sentence-like units or fixed-size windows.

  Args:
    data_stream: iterable of items (e.g. the lines of a preprocessed text).
      In the separator-based modes the first character of each item is
      compared against the separators.
    left_slide: for units of three or more items, additionally emit every
      prefix of the unit (lengths len - 1 down to 2) before the unit itself.
    right_slide: for units of three or more items, additionally emit every
      suffix of the unit (lengths len - 1 down to 2) before the unit itself.
    lim_slide: ignore separators and emit every window of exactly `lim`
      consecutive items; takes precedence over the other modes.
    split_comma: end units on ',' as well as on '.'.
    lim: window size used when `lim_slide` is set.

  Returns:
    A list of lists of items.

  Raises:
    ValueError: if `lim_slide` is set and `lim` is smaller than 1.
  """
  if lim_slide:
    return _fixed_size_windows(data_stream, lim)
  return _separator_units(data_stream, left_slide, right_slide, split_comma)


def _fixed_size_windows(data_stream, lim):
  # Return every run of exactly `lim` consecutive items, in stream order.
  from collections import deque

  if lim < 1:
    raise ValueError('lim must be a positive integer when lim_slide is set')
  # a bounded deque drops its oldest item on append, so it always holds the
  # most recent `lim` items once it has filled up
  window = deque(maxlen=lim)
  windows = []
  for item in data_stream:
    window.append(item)
    if len(window) == lim:
      windows.append(list(window))
  return windows


def _separator_units(data_stream, left_slide, right_slide, split_comma):
  # Cut the stream after every item whose first character is a separator.
  separators = ('.', ',') if split_comma else ('.',)
  sents = []
  current_sent = []
  for item in data_stream:
    current_sent.append(item)
    # an empty item cannot start with a separator (and item[0] would raise)
    if not item or item[0] not in separators:
      continue
    size = len(current_sent)
    if size >= 3:
      # aligned windows come before the full unit, left-aligned ones first
      if left_slide:
        sents.extend(current_sent[:end] for end in range(size - 1, 1, -1))
      if right_slide:
        sents.extend(current_sent[start:] for start in range(1, size - 1))
    # a lone separator is discarded instead of being kept as a unit
    if size >= 2:
      sents.append(current_sent)
    current_sent = []
  # keep whatever follows the last separator of the text
  if current_sent:
    sents.append(current_sent)
  return sents
 
def join_sents(sent_list):
  """Turn units of tab-separated items into parallel input/output strings.

  Each item looks like '.\t.\t[punc]\t100%\n': the first tab-separated field
  goes to the input side, the second field (when present) to the output side.
  Fields are joined with single spaces; on the input side every '&', every
  '% ' and then every remaining '%' is removed.

  Returns:
    (input_sents, output_sents), two lists with one string per unit.
  """
  input_sents, output_sents = [], []
  for sent in sent_list:
    rows = [word.strip('\n').split('\t') for word in sent]
    source_text = ' '.join(row[0] for row in rows)
    target_text = ' '.join(row[1] for row in rows if len(row) > 1)
    # order matters: '% ' has to go before the bare '%'
    for marker in ('&', '% ', '%'):
      source_text = source_text.replace(marker, '')
    input_sents.append(source_text)
    output_sents.append(target_text)
  return input_sents, output_sents

In [None]:
# sliding data could be of many different sizes
data = []
left_window_data = []
right_window_data = []
sliding_3_data = []
sliding_5_data = []
sliding_7_data = []
sliding_9_data = []
sliding_11_data = []
sliding_13_data = []
test_data = []

# texts held out for testing; every other text numbered 1-20 is used for training
test_text_indices = [5, 10, 17, 18]
training_text_indices = [i for i in range(1, 21) if i not in test_text_indices]


for i in training_text_indices:
    # the context manager closes the file even if reading fails
    with open('text ' + str(i) + ' preprocess.txt', 'r') as f:
        item_stream = f.readlines()

    # comma/period-delimited units, plain and with aligned windows
    data += retrieve_sents(item_stream, split_comma=True)
    left_window_data += retrieve_sents(item_stream, left_slide=True, split_comma=True)
    right_window_data += retrieve_sents(item_stream, right_slide=True, split_comma=True)
    # fixed-size sliding windows over the whole text
    sliding_3_data += retrieve_sents(item_stream, lim_slide=True, lim=3)
    sliding_5_data += retrieve_sents(item_stream, lim_slide=True, lim=5)
    sliding_7_data += retrieve_sents(item_stream, lim_slide=True, lim=7)
    sliding_9_data += retrieve_sents(item_stream, lim_slide=True, lim=9)
    sliding_11_data += retrieve_sents(item_stream, lim_slide=True, lim=11)
    sliding_13_data += retrieve_sents(item_stream, lim_slide=True, lim=13)

for i in test_text_indices:
    with open('text ' + str(i) + ' preprocess.txt', 'r') as f:
        item_stream = f.readlines()

    # test texts are only split into plain units, without any windowing
    test_data += retrieve_sents(item_stream, split_comma=True)

In [None]:
# one (input strings, output strings) pair per data set
train_input, train_output = join_sents(sent_list=data)
left_window_input, left_window_output = join_sents(sent_list=left_window_data)
right_window_input, right_window_output = join_sents(sent_list=right_window_data)
sliding_3_input, sliding_3_output = join_sents(sent_list=sliding_3_data)
sliding_5_input, sliding_5_output = join_sents(sent_list=sliding_5_data)
sliding_7_input, sliding_7_output = join_sents(sent_list=sliding_7_data)
sliding_9_input, sliding_9_output = join_sents(sent_list=sliding_9_data)
sliding_11_input, sliding_11_output = join_sents(sent_list=sliding_11_data)
sliding_13_input, sliding_13_output = join_sents(sent_list=sliding_13_data)
test_input, test_output = join_sents(sent_list=test_data)

In [None]:
# inputs are tokenised per character, outputs per space-separated word
train_in_tokens = [list(text) for text in train_input]
train_out_tokens = [text.split(' ') for text in train_output]
left_window_in_tokens = [list(text) for text in left_window_input]
left_window_out_tokens = [text.split(' ') for text in left_window_output]
right_window_in_tokens = [list(text) for text in right_window_input]
right_window_out_tokens = [text.split(' ') for text in right_window_output]
sliding_in_3_tokens = [list(text) for text in sliding_3_input]
sliding_out_3_tokens = [text.split(' ') for text in sliding_3_output]
sliding_in_5_tokens = [list(text) for text in sliding_5_input]
sliding_out_5_tokens = [text.split(' ') for text in sliding_5_output]
sliding_in_7_tokens = [list(text) for text in sliding_7_input]
sliding_out_7_tokens = [text.split(' ') for text in sliding_7_output]
sliding_in_9_tokens = [list(text) for text in sliding_9_input]
sliding_out_9_tokens = [text.split(' ') for text in sliding_9_output]
sliding_in_11_tokens = [list(text) for text in sliding_11_input]
sliding_out_11_tokens = [text.split(' ') for text in sliding_11_output]
sliding_in_13_tokens = [list(text) for text in sliding_13_input]
sliding_out_13_tokens = [text.split(' ') for text in sliding_13_output]
test_in_tokens = [list(text) for text in test_input]
test_out_tokens = [text.split(' ') for text in test_output]

In [None]:
# drop empty-string tokens from the training targets
# NOTE(review): only train_out_tokens is filtered in this cell; the window and
# test token lists are left as they are — confirm that this is intended
train_out_tokens = [[token for token in tokens if token != '']
                    for tokens in train_out_tokens]

In [None]:
# mean number of output tokens per test unit
# with 20 documents
# 6.6534 repeating
# with randomized results: 7.751824817 repeating
sum(map(len, test_out_tokens)) / len(test_out_tokens)

7.455830388692579

In [None]:
# train and test units together: characters on the input side, words on the output side
input_tokens = [list(text) for text in train_input + test_input]
output_tokens = [text.split(' ') for text in train_output + test_output]

In [None]:
# generates vocabulary set
import io
import pandas as pd

# tab-separated vocabulary file; only the non-missing entries of its 'word' column are used
vocab_df = pd.read_csv('vocab.csv', delimiter='\t')
word_column = vocab_df['word'].dropna()
vocab = word_column.to_numpy().tolist()


In [None]:
# every character seen on the input side, every word seen on the output side,
# plus the external vocabulary list
charset = set([c for line in train_input+test_input for c in line])
wordset = list(set([w for line in train_output+test_output for w in line.split(' ')])) + vocab

complete_set = charset.union(wordset)

# sort the entries so token ids are the same on every run: the iteration order
# of a set of strings changes between interpreter sessions (hash
# randomization), which would otherwise renumber the vocabulary each time;
# key=str keeps the sort usable if the csv produced non-string entries
complete_set = ['<pad>'] + sorted(complete_set, key=str) + ['<start>', '<end>']

# token -> id, with '<pad>' at id 0 and '<start>'/'<end>' as the last two ids
total_vocab = {k: i for (i, k) in enumerate(complete_set)}

# id -> token
inv_total_vocab = {i: k for (k, i) in total_vocab.items()}

In [None]:
import pickle

In [None]:
# loads pregenerated vocabulary (replaces any total_vocab built earlier in the session)
# the context manager closes the file instead of leaving it to the garbage collector
with open('total_vocab_20.pkl', 'rb') as vocab_file:
    total_vocab = pickle.load(vocab_file)

In [None]:
# id -> token lookup, rebuilt from the current total_vocab
inv_total_vocab = dict(zip(total_vocab.values(), total_vocab.keys()))

In [None]:
# number of examples per batch; read by create_tf_dataset when batching
BATCH_SIZE = 64

In [None]:
# add a start and end token to the input and target
def encode(lang1, lang2, vocab=None):
  """Map two token sequences to id lists wrapped in start/end ids.

  Args:
    lang1: sequence of input-side tokens.
    lang2: sequence of output-side tokens.
    vocab: token -> id mapping containing '<start>' and '<end>'; defaults to
      the notebook-level `total_vocab`.

  Returns:
    (ids1, ids2): each is [start id] + token ids + [end id].

  Raises:
    KeyError: if a token (or '<start>'/'<end>') is missing from the mapping.
  """
  if vocab is None:
    # fall back to the global mapping so existing two-argument calls still work
    vocab = total_vocab
  start_token = vocab['<start>']
  end_token = vocab['<end>']

  def _wrap(tokens):
    # ids of the tokens, framed by the start and end ids
    return [start_token] + [vocab[w] for w in tokens] + [end_token]

  return _wrap(lang1), _wrap(lang2)

In [None]:
# encode every (input tokens, output tokens) pair, then regroup the results
# into one tuple of encoded inputs and one tuple of encoded outputs per data set
train_in_encoded, train_out_encoded = zip(*map(encode, train_in_tokens, train_out_tokens))
left_window_in_encoded, left_window_out_encoded = zip(*map(encode, left_window_in_tokens, left_window_out_tokens))
right_window_in_encoded, right_window_out_encoded = zip(*map(encode, right_window_in_tokens, right_window_out_tokens))
sliding_in_3_encoded, sliding_out_3_encoded = zip(*map(encode, sliding_in_3_tokens, sliding_out_3_tokens))
sliding_in_5_encoded, sliding_out_5_encoded = zip(*map(encode, sliding_in_5_tokens, sliding_out_5_tokens))
sliding_in_7_encoded, sliding_out_7_encoded = zip(*map(encode, sliding_in_7_tokens, sliding_out_7_tokens))
sliding_in_9_encoded, sliding_out_9_encoded = zip(*map(encode, sliding_in_9_tokens, sliding_out_9_tokens))
sliding_in_11_encoded, sliding_out_11_encoded = zip(*map(encode, sliding_in_11_tokens, sliding_out_11_tokens))
sliding_in_13_encoded, sliding_out_13_encoded = zip(*map(encode, sliding_in_13_tokens, sliding_out_13_tokens))
test_in_encoded, test_out_encoded = zip(*map(encode, test_in_tokens, test_out_tokens))

In [None]:
# longest encoded input / output over the plain, left-window, right-window and
# test sets (the fixed-size sliding sets are not part of this maximum)
INPUT_LEN = max(
    len(line)
    for encoded_set in (train_in_encoded,
                        left_window_in_encoded,
                        right_window_in_encoded,
                        test_in_encoded)
    for line in encoded_set)
OUTPUT_LEN = max(
    len(line)
    for encoded_set in (train_out_encoded,
                        left_window_out_encoded,
                        right_window_out_encoded,
                        test_out_encoded)
    for line in encoded_set)

In [None]:
# show both limits, one per line
print(INPUT_LEN, OUTPUT_LEN, sep='\n')

273
36


In [None]:
# for use in later testing
# hard-coded copies of the two values printed by the previous cell; they
# overwrite the computed limits — presumably so the encoding cells can be
# skipped when only the pickled data is loaded (TODO confirm)
INPUT_LEN = 273
OUTPUT_LEN = 36

In [None]:
# pad sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def _pad_post(sequences, maxlen):
  """Pad `sequences` to `maxlen` with the '<pad>' id, adding the padding at the end."""
  return pad_sequences(sequences,
                       maxlen=maxlen,
                       padding='post',
                       value=total_vocab['<pad>'])

# encoder inputs, all padded to INPUT_LEN
train_in_padded = _pad_post(train_in_encoded, INPUT_LEN)
left_window_in_padded = _pad_post(left_window_in_encoded, INPUT_LEN)
right_window_in_padded = _pad_post(right_window_in_encoded, INPUT_LEN)
sliding_in_3_padded = _pad_post(sliding_in_3_encoded, INPUT_LEN)
sliding_in_5_padded = _pad_post(sliding_in_5_encoded, INPUT_LEN)
sliding_in_7_padded = _pad_post(sliding_in_7_encoded, INPUT_LEN)
sliding_in_9_padded = _pad_post(sliding_in_9_encoded, INPUT_LEN)
sliding_in_11_padded = _pad_post(sliding_in_11_encoded, INPUT_LEN)
sliding_in_13_padded = _pad_post(sliding_in_13_encoded, INPUT_LEN)
test_in_padded = _pad_post(test_in_encoded, INPUT_LEN)

# decoder targets, all padded to OUTPUT_LEN
train_out_padded = _pad_post(train_out_encoded, OUTPUT_LEN)
left_window_out_padded = _pad_post(left_window_out_encoded, OUTPUT_LEN)
right_window_out_padded = _pad_post(right_window_out_encoded, OUTPUT_LEN)
sliding_out_3_padded = _pad_post(sliding_out_3_encoded, OUTPUT_LEN)
sliding_out_5_padded = _pad_post(sliding_out_5_encoded, OUTPUT_LEN)
sliding_out_7_padded = _pad_post(sliding_out_7_encoded, OUTPUT_LEN)
sliding_out_9_padded = _pad_post(sliding_out_9_encoded, OUTPUT_LEN)
sliding_out_11_padded = _pad_post(sliding_out_11_encoded, OUTPUT_LEN)
sliding_out_13_padded = _pad_post(sliding_out_13_encoded, OUTPUT_LEN)
test_out_padded = _pad_post(test_out_encoded, OUTPUT_LEN)

In [None]:
# pickle datasets
# each array is written to '<variable name>_20_corr.pkl'
padded_arrays = {
    'train_in_padded': train_in_padded,
    'train_out_padded': train_out_padded,
    'left_window_in_padded': left_window_in_padded,
    'left_window_out_padded': left_window_out_padded,
    'right_window_in_padded': right_window_in_padded,
    'right_window_out_padded': right_window_out_padded,
    'sliding_in_3_padded': sliding_in_3_padded,
    'sliding_out_3_padded': sliding_out_3_padded,
    'sliding_in_5_padded': sliding_in_5_padded,
    'sliding_out_5_padded': sliding_out_5_padded,
    'sliding_in_7_padded': sliding_in_7_padded,
    'sliding_out_7_padded': sliding_out_7_padded,
    'sliding_in_9_padded': sliding_in_9_padded,
    'sliding_out_9_padded': sliding_out_9_padded,
    'sliding_in_11_padded': sliding_in_11_padded,
    'sliding_out_11_padded': sliding_out_11_padded,
    'sliding_in_13_padded': sliding_in_13_padded,
    'sliding_out_13_padded': sliding_out_13_padded,
    'test_in_padded': test_in_padded,
    'test_out_padded': test_out_padded,
}
for array_name, padded_array in padded_arrays.items():
    # the context manager flushes and closes each file as soon as it is written
    with open(array_name + '_20_corr.pkl', 'wb') as pkl_file:
        pickle.dump(padded_array, pkl_file)

In [None]:
# load pickled datasets
def _load_pickle(path):
    """Return the object unpickled from `path`; the file is closed on exit."""
    # NOTE: unpickling runs code stored in the file, so only load files that
    # were produced by this notebook
    with open(path, 'rb') as pkl_file:
        return pickle.load(pkl_file)

train_in_padded = _load_pickle('train_in_padded_20_corr.pkl')
train_out_padded = _load_pickle('train_out_padded_20_corr.pkl')
left_window_in_padded = _load_pickle('left_window_in_padded_20_corr.pkl')
left_window_out_padded = _load_pickle('left_window_out_padded_20_corr.pkl')
right_window_in_padded = _load_pickle('right_window_in_padded_20_corr.pkl')
right_window_out_padded = _load_pickle('right_window_out_padded_20_corr.pkl')
sliding_in_3_padded = _load_pickle('sliding_in_3_padded_20_corr.pkl')
sliding_out_3_padded = _load_pickle('sliding_out_3_padded_20_corr.pkl')
sliding_in_5_padded = _load_pickle('sliding_in_5_padded_20_corr.pkl')
sliding_out_5_padded = _load_pickle('sliding_out_5_padded_20_corr.pkl')
sliding_in_7_padded = _load_pickle('sliding_in_7_padded_20_corr.pkl')
sliding_out_7_padded = _load_pickle('sliding_out_7_padded_20_corr.pkl')
sliding_in_9_padded = _load_pickle('sliding_in_9_padded_20_corr.pkl')
sliding_out_9_padded = _load_pickle('sliding_out_9_padded_20_corr.pkl')
sliding_in_11_padded = _load_pickle('sliding_in_11_padded_20_corr.pkl')
sliding_out_11_padded = _load_pickle('sliding_out_11_padded_20_corr.pkl')
sliding_in_13_padded = _load_pickle('sliding_in_13_padded_20_corr.pkl')
sliding_out_13_padded = _load_pickle('sliding_out_13_padded_20_corr.pkl')
test_in_padded = _load_pickle('test_in_padded_20_corr.pkl')
test_out_padded = _load_pickle('test_out_padded_20_corr.pkl')

In [None]:
# create TensorFlow datasets with data generated with different window types
# TODO: refactor the code in this cell
def create_tf_dataset(encoder_data, decoder_data):
  """Build a cached, batched and prefetched dataset of (encoder, decoder) pairs.

  Both arguments are converted to int64 numpy arrays and sliced along their
  first axis; the two slice datasets are zipped, cached, batched with
  padded_batch(BATCH_SIZE) and prefetched with AUTOTUNE.
  """
  encoder_slices = tf.data.Dataset.from_tensor_slices(
      np.asarray(encoder_data, dtype=np.int64))
  decoder_slices = tf.data.Dataset.from_tensor_slices(
      np.asarray(decoder_data, dtype=np.int64))

  paired = tf.data.Dataset.zip((encoder_slices, decoder_slices))

  return (paired
          .cache()
          .padded_batch(BATCH_SIZE)
          .prefetch(tf.data.experimental.AUTOTUNE))
 
# sliding datasets: one dataset per fixed window size
sliding_3_dataset = create_tf_dataset(encoder_data=sliding_in_3_padded,
                                      decoder_data=sliding_out_3_padded)
sliding_5_dataset = create_tf_dataset(encoder_data=sliding_in_5_padded,
                                      decoder_data=sliding_out_5_padded)
sliding_7_dataset = create_tf_dataset(encoder_data=sliding_in_7_padded,
                                      decoder_data=sliding_out_7_padded)
sliding_9_dataset = create_tf_dataset(encoder_data=sliding_in_9_padded,
                                      decoder_data=sliding_out_9_padded)
sliding_11_dataset = create_tf_dataset(encoder_data=sliding_in_11_padded,
                                       decoder_data=sliding_out_11_padded)
sliding_13_dataset = create_tf_dataset(encoder_data=sliding_in_13_padded,
                                       decoder_data=sliding_out_13_padded)
 
# cache the dataset to memory to get a speedup while reading from it.
 
# non-window training data
enc_training_numpy = np.asarray(train_in_padded, dtype=np.int64)
dec_training_numpy = np.asarray(train_out_padded, dtype=np.int64)
enc_train_dataset = tf.data.Dataset.from_tensor_slices(enc_training_numpy)
dec_train_dataset = tf.data.Dataset.from_tensor_slices(dec_training_numpy)

# pair encoder and decoder rows, then cache, batch and prefetch
train_dataset = (
    tf.data.Dataset.zip((enc_train_dataset, dec_train_dataset))
    .cache()
    .padded_batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE))

# left window data
enc_left_window_numpy = np.asarray(left_window_in_padded, dtype=np.int64)
dec_left_window_numpy = np.asarray(left_window_out_padded, dtype=np.int64)
enc_left_window_dataset = tf.data.Dataset.from_tensor_slices(enc_left_window_numpy)
dec_left_window_dataset = tf.data.Dataset.from_tensor_slices(dec_left_window_numpy)

left_window_dataset = (
    tf.data.Dataset.zip((enc_left_window_dataset, dec_left_window_dataset))
    .cache()
    .padded_batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE))

# right window data
enc_right_window_numpy = np.asarray(right_window_in_padded, dtype=np.int64)
dec_right_window_numpy = np.asarray(right_window_out_padded, dtype=np.int64)
enc_right_window_dataset = tf.data.Dataset.from_tensor_slices(enc_right_window_numpy)
dec_right_window_dataset = tf.data.Dataset.from_tensor_slices(dec_right_window_numpy)

right_window_dataset = (
    tf.data.Dataset.zip((enc_right_window_dataset, dec_right_window_dataset))
    .cache()
    .padded_batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE))
 
# hybrid datasets base + other
# each hybrid stacks the plain training rows on top of the rows of one windowed set
base_left_window_padded_enc = np.concatenate([train_in_padded, left_window_in_padded], axis=0)
base_left_window_padded_dec = np.concatenate([train_out_padded, left_window_out_padded], axis=0)
base_left_window_dataset = create_tf_dataset(encoder_data=base_left_window_padded_enc,
                                             decoder_data=base_left_window_padded_dec)

base_right_window_padded_enc = np.concatenate([train_in_padded, right_window_in_padded], axis=0)
base_right_window_padded_dec = np.concatenate([train_out_padded, right_window_out_padded], axis=0)
base_right_window_dataset = create_tf_dataset(encoder_data=base_right_window_padded_enc,
                                              decoder_data=base_right_window_padded_dec)

base_sliding_3_padded_enc = np.concatenate([train_in_padded, sliding_in_3_padded], axis=0)
base_sliding_3_padded_dec = np.concatenate([train_out_padded, sliding_out_3_padded], axis=0)
base_sliding_3_dataset = create_tf_dataset(encoder_data=base_sliding_3_padded_enc,
                                           decoder_data=base_sliding_3_padded_dec)

base_sliding_5_padded_enc = np.concatenate([train_in_padded, sliding_in_5_padded], axis=0)
base_sliding_5_padded_dec = np.concatenate([train_out_padded, sliding_out_5_padded], axis=0)
base_sliding_5_dataset = create_tf_dataset(encoder_data=base_sliding_5_padded_enc,
                                           decoder_data=base_sliding_5_padded_dec)

base_sliding_7_padded_enc = np.concatenate([train_in_padded, sliding_in_7_padded], axis=0)
base_sliding_7_padded_dec = np.concatenate([train_out_padded, sliding_out_7_padded], axis=0)
base_sliding_7_dataset = create_tf_dataset(encoder_data=base_sliding_7_padded_enc,
                                           decoder_data=base_sliding_7_padded_dec)

base_sliding_9_padded_enc = np.concatenate([train_in_padded, sliding_in_9_padded], axis=0)
base_sliding_9_padded_dec = np.concatenate([train_out_padded, sliding_out_9_padded], axis=0)
base_sliding_9_dataset = create_tf_dataset(encoder_data=base_sliding_9_padded_enc,
                                           decoder_data=base_sliding_9_padded_dec)

base_sliding_11_padded_enc = np.concatenate([train_in_padded, sliding_in_11_padded], axis=0)
base_sliding_11_padded_dec = np.concatenate([train_out_padded, sliding_out_11_padded], axis=0)
base_sliding_11_dataset = create_tf_dataset(encoder_data=base_sliding_11_padded_enc,
                                            decoder_data=base_sliding_11_padded_dec)

base_sliding_13_padded_enc = np.concatenate([train_in_padded, sliding_in_13_padded], axis=0)
base_sliding_13_padded_dec = np.concatenate([train_out_padded, sliding_out_13_padded], axis=0)
base_sliding_13_dataset = create_tf_dataset(encoder_data=base_sliding_13_padded_enc,
                                            decoder_data=base_sliding_13_padded_dec)
 
# hybrid datasets right_window + other
# right-window rows come first, followed by the rows of the second set
right_window_left_window_padded_enc = np.concatenate([right_window_in_padded, left_window_in_padded], axis=0)
right_window_left_window_padded_dec = np.concatenate([right_window_out_padded, left_window_out_padded], axis=0)
right_window_left_window_dataset = create_tf_dataset(encoder_data=right_window_left_window_padded_enc,
                                                     decoder_data=right_window_left_window_padded_dec)

right_window_sliding_3_padded_enc = np.concatenate([right_window_in_padded, sliding_in_3_padded], axis=0)
right_window_sliding_3_padded_dec = np.concatenate([right_window_out_padded, sliding_out_3_padded], axis=0)
right_window_sliding_3_dataset = create_tf_dataset(encoder_data=right_window_sliding_3_padded_enc,
                                                   decoder_data=right_window_sliding_3_padded_dec)

right_window_sliding_5_padded_enc = np.concatenate([right_window_in_padded, sliding_in_5_padded], axis=0)
right_window_sliding_5_padded_dec = np.concatenate([right_window_out_padded, sliding_out_5_padded], axis=0)
right_window_sliding_5_dataset = create_tf_dataset(encoder_data=right_window_sliding_5_padded_enc,
                                                   decoder_data=right_window_sliding_5_padded_dec)

right_window_sliding_7_padded_enc = np.concatenate([right_window_in_padded, sliding_in_7_padded], axis=0)
right_window_sliding_7_padded_dec = np.concatenate([right_window_out_padded, sliding_out_7_padded], axis=0)
right_window_sliding_7_dataset = create_tf_dataset(encoder_data=right_window_sliding_7_padded_enc,
                                                   decoder_data=right_window_sliding_7_padded_dec)

right_window_sliding_9_padded_enc = np.concatenate([right_window_in_padded, sliding_in_9_padded], axis=0)
right_window_sliding_9_padded_dec = np.concatenate([right_window_out_padded, sliding_out_9_padded], axis=0)
right_window_sliding_9_dataset = create_tf_dataset(encoder_data=right_window_sliding_9_padded_enc,
                                                   decoder_data=right_window_sliding_9_padded_dec)

right_window_sliding_11_padded_enc = np.concatenate([right_window_in_padded, sliding_in_11_padded], axis=0)
right_window_sliding_11_padded_dec = np.concatenate([right_window_out_padded, sliding_out_11_padded], axis=0)
right_window_sliding_11_dataset = create_tf_dataset(encoder_data=right_window_sliding_11_padded_enc,
                                                    decoder_data=right_window_sliding_11_padded_dec)

right_window_sliding_13_padded_enc = np.concatenate([right_window_in_padded, sliding_in_13_padded], axis=0)
right_window_sliding_13_padded_dec = np.concatenate([right_window_out_padded, sliding_out_13_padded], axis=0)
right_window_sliding_13_dataset = create_tf_dataset(encoder_data=right_window_sliding_13_padded_enc,
                                                    decoder_data=right_window_sliding_13_padded_dec)
 
# hybrid datasets sliding 7 + other
# size-7 window rows come first, followed by the rows of the second set
sliding_7_left_window_padded_enc = np.concatenate([sliding_in_7_padded, left_window_in_padded], axis=0)
sliding_7_left_window_padded_dec = np.concatenate([sliding_out_7_padded, left_window_out_padded], axis=0)
sliding_7_left_window_dataset = create_tf_dataset(encoder_data=sliding_7_left_window_padded_enc,
                                                  decoder_data=sliding_7_left_window_padded_dec)

sliding_7_sliding_3_padded_enc = np.concatenate([sliding_in_7_padded, sliding_in_3_padded], axis=0)
sliding_7_sliding_3_padded_dec = np.concatenate([sliding_out_7_padded, sliding_out_3_padded], axis=0)
sliding_7_sliding_3_dataset = create_tf_dataset(encoder_data=sliding_7_sliding_3_padded_enc,
                                                decoder_data=sliding_7_sliding_3_padded_dec)

sliding_7_sliding_5_padded_enc = np.concatenate([sliding_in_7_padded, sliding_in_5_padded], axis=0)
sliding_7_sliding_5_padded_dec = np.concatenate([sliding_out_7_padded, sliding_out_5_padded], axis=0)
sliding_7_sliding_5_dataset = create_tf_dataset(encoder_data=sliding_7_sliding_5_padded_enc,
                                                decoder_data=sliding_7_sliding_5_padded_dec)

sliding_7_sliding_9_padded_enc = np.concatenate([sliding_in_7_padded, sliding_in_9_padded], axis=0)
sliding_7_sliding_9_padded_dec = np.concatenate([sliding_out_7_padded, sliding_out_9_padded], axis=0)
sliding_7_sliding_9_dataset = create_tf_dataset(encoder_data=sliding_7_sliding_9_padded_enc,
                                                decoder_data=sliding_7_sliding_9_padded_dec)

sliding_7_sliding_11_padded_enc = np.concatenate([sliding_in_7_padded, sliding_in_11_padded], axis=0)
sliding_7_sliding_11_padded_dec = np.concatenate([sliding_out_7_padded, sliding_out_11_padded], axis=0)
sliding_7_sliding_11_dataset = create_tf_dataset(encoder_data=sliding_7_sliding_11_padded_enc,
                                                 decoder_data=sliding_7_sliding_11_padded_dec)

sliding_7_sliding_13_padded_enc = np.concatenate([sliding_in_7_padded, sliding_in_13_padded], axis=0)
sliding_7_sliding_13_padded_dec = np.concatenate([sliding_out_7_padded, sliding_out_13_padded], axis=0)
sliding_7_sliding_13_dataset = create_tf_dataset(encoder_data=sliding_7_sliding_13_padded_enc,
                                                 decoder_data=sliding_7_sliding_13_padded_dec)
 
# hybrid datasets sliding 9 + other
# size-9 window rows come first, followed by the rows of the second set
sliding_9_left_window_padded_enc = np.concatenate([sliding_in_9_padded, left_window_in_padded], axis=0)
sliding_9_left_window_padded_dec = np.concatenate([sliding_out_9_padded, left_window_out_padded], axis=0)
sliding_9_left_window_dataset = create_tf_dataset(encoder_data=sliding_9_left_window_padded_enc,
                                                  decoder_data=sliding_9_left_window_padded_dec)

sliding_9_sliding_3_padded_enc = np.concatenate([sliding_in_9_padded, sliding_in_3_padded], axis=0)
sliding_9_sliding_3_padded_dec = np.concatenate([sliding_out_9_padded, sliding_out_3_padded], axis=0)
sliding_9_sliding_3_dataset = create_tf_dataset(encoder_data=sliding_9_sliding_3_padded_enc,
                                                decoder_data=sliding_9_sliding_3_padded_dec)

sliding_9_sliding_5_padded_enc = np.concatenate([sliding_in_9_padded, sliding_in_5_padded], axis=0)
sliding_9_sliding_5_padded_dec = np.concatenate([sliding_out_9_padded, sliding_out_5_padded], axis=0)
sliding_9_sliding_5_dataset = create_tf_dataset(encoder_data=sliding_9_sliding_5_padded_enc,
                                                decoder_data=sliding_9_sliding_5_padded_dec)

sliding_9_sliding_7_padded_enc = np.concatenate([sliding_in_9_padded, sliding_in_7_padded], axis=0)
sliding_9_sliding_7_padded_dec = np.concatenate([sliding_out_9_padded, sliding_out_7_padded], axis=0)
sliding_9_sliding_7_dataset = create_tf_dataset(encoder_data=sliding_9_sliding_7_padded_enc,
                                                decoder_data=sliding_9_sliding_7_padded_dec)

sliding_9_sliding_11_padded_enc = np.concatenate([sliding_in_9_padded, sliding_in_11_padded], axis=0)
sliding_9_sliding_11_padded_dec = np.concatenate([sliding_out_9_padded, sliding_out_11_padded], axis=0)
sliding_9_sliding_11_dataset = create_tf_dataset(encoder_data=sliding_9_sliding_11_padded_enc,
                                                 decoder_data=sliding_9_sliding_11_padded_dec)

sliding_9_sliding_13_padded_enc = np.concatenate([sliding_in_9_padded, sliding_in_13_padded], axis=0)
sliding_9_sliding_13_padded_dec = np.concatenate([sliding_out_9_padded, sliding_out_13_padded], axis=0)
sliding_9_sliding_13_dataset = create_tf_dataset(encoder_data=sliding_9_sliding_13_padded_enc,
                                                 decoder_data=sliding_9_sliding_13_padded_dec)

# hybrid datasets sliding 11 + other
sliding_11_left_window_padded_enc = np.concatenate((sliding_in_11_padded,
                                                      left_window_in_padded))
sliding_11_left_window_padded_dec = np.concatenate((sliding_out_11_padded,
                                                      left_window_out_padded))
sliding_11_left_window_dataset = create_tf_dataset(sliding_11_left_window_padded_enc,
                                                     sliding_11_left_window_padded_dec)
 
sliding_11_sliding_3_padded_enc = np.concatenate((sliding_in_11_padded,
                                                    sliding_in_3_padded))
sliding_11_sliding_3_padded_dec = np.concatenate((sliding_out_11_padded,
                                                    sliding_out_3_padded))
sliding_11_sliding_3_dataset = create_tf_dataset(sliding_11_sliding_3_padded_enc,
                                                   sliding_11_sliding_3_padded_dec)
 
sliding_11_sliding_5_padded_enc = np.concatenate((sliding_in_11_padded,
                                                    sliding_in_5_padded))
sliding_11_sliding_5_padded_dec = np.concatenate((sliding_out_11_padded,
                                                    sliding_out_5_padded))
sliding_11_sliding_5_dataset = create_tf_dataset(sliding_11_sliding_5_padded_enc,
                                                   sliding_11_sliding_5_padded_dec)
 
sliding_11_sliding_7_padded_enc = np.concatenate((sliding_in_11_padded, 
                                                    sliding_in_7_padded))
sliding_11_sliding_7_padded_dec = np.concatenate((sliding_out_11_padded, 
                                                    sliding_out_7_padded))
sliding_11_sliding_7_dataset = create_tf_dataset(sliding_11_sliding_7_padded_enc, 
                                                   sliding_11_sliding_7_padded_dec)
 
sliding_11_sliding_9_padded_enc = np.concatenate((sliding_in_11_padded, 
                                                    sliding_in_9_padded))
sliding_11_sliding_9_padded_dec = np.concatenate((sliding_out_11_padded, 
                                                    sliding_out_9_padded))
sliding_11_sliding_9_dataset = create_tf_dataset(sliding_11_sliding_9_padded_enc, 
                                                   sliding_11_sliding_9_padded_dec)
 
sliding_11_sliding_13_padded_enc = np.concatenate((sliding_in_11_padded, 
                                                     sliding_in_13_padded))
sliding_11_sliding_13_padded_dec = np.concatenate((sliding_out_11_padded, 
                                                     sliding_out_13_padded))
sliding_11_sliding_13_dataset = create_tf_dataset(sliding_11_sliding_13_padded_enc, 
                                                    sliding_11_sliding_13_padded_dec)
 
# right window + sliding 7 + other
right_sliding_7_left_window_padded_enc = np.concatenate((right_window_in_padded,
                                                         sliding_in_7_padded,
                                                         left_window_in_padded))
right_sliding_7_left_window_padded_dec = np.concatenate((right_window_out_padded,
                                                        sliding_out_7_padded,
                                                        left_window_out_padded))
right_sliding_7_left_window_dataset = create_tf_dataset(right_sliding_7_left_window_padded_enc,
                                                     right_sliding_7_left_window_padded_dec)
 
right_sliding_7_sliding_3_padded_enc = np.concatenate((right_window_in_padded,
                                                       sliding_in_7_padded,
                                                       sliding_in_3_padded))
right_sliding_7_sliding_3_padded_dec = np.concatenate((right_window_out_padded,
                                                       sliding_out_7_padded,
                                                       sliding_out_3_padded))
right_sliding_7_sliding_3_dataset = create_tf_dataset(right_sliding_7_sliding_3_padded_enc,
                                                   right_sliding_7_sliding_3_padded_dec)
 
right_sliding_7_sliding_5_padded_enc = np.concatenate((right_window_in_padded,
                                                       sliding_in_7_padded,
                                                       sliding_in_5_padded))
right_sliding_7_sliding_5_padded_dec = np.concatenate((right_window_out_padded,
                                                       sliding_out_7_padded,
                                                       sliding_out_5_padded))
right_sliding_7_sliding_5_dataset = create_tf_dataset(right_sliding_7_sliding_5_padded_enc,
                                                   right_sliding_7_sliding_5_padded_dec)
 
right_sliding_7_sliding_9_padded_enc = np.concatenate((right_window_in_padded,
                                                       sliding_in_7_padded, 
                                                       sliding_in_9_padded))
right_sliding_7_sliding_9_padded_dec = np.concatenate((right_window_out_padded,
                                                       sliding_out_7_padded, 
                                                       sliding_out_9_padded))
right_sliding_7_sliding_9_dataset = create_tf_dataset(right_sliding_7_sliding_9_padded_enc, 
                                                   right_sliding_7_sliding_9_padded_dec)
 
right_sliding_7_sliding_11_padded_enc = np.concatenate((right_window_in_padded,
                                                        sliding_in_7_padded, 
                                                        sliding_in_11_padded))
right_sliding_7_sliding_11_padded_dec = np.concatenate((right_window_out_padded,
                                                        sliding_out_7_padded, 
                                                        sliding_out_11_padded))
right_sliding_7_sliding_11_dataset = create_tf_dataset(right_sliding_7_sliding_11_padded_enc, 
                                                    right_sliding_7_sliding_11_padded_dec)
 
right_sliding_7_sliding_13_padded_enc = np.concatenate((right_window_in_padded,
                                                        sliding_in_7_padded, 
                                                        sliding_in_13_padded))
right_sliding_7_sliding_13_padded_dec = np.concatenate((right_window_out_padded,
                                                        sliding_out_7_padded, 
                                                        sliding_out_13_padded))
right_sliding_7_sliding_13_dataset = create_tf_dataset(right_sliding_7_sliding_13_padded_enc, 
                                                    right_sliding_7_sliding_13_padded_dec)
 
# right window + sliding 13 + other
right_sliding_13_left_window_padded_enc = np.concatenate((right_window_in_padded,
                                                         sliding_in_13_padded,
                                                         left_window_in_padded))
right_sliding_13_left_window_padded_dec = np.concatenate((right_window_out_padded,
                                                        sliding_out_13_padded,
                                                        left_window_out_padded))
right_sliding_13_left_window_dataset = create_tf_dataset(right_sliding_13_left_window_padded_enc,
                                                     right_sliding_13_left_window_padded_dec)
 
right_sliding_13_sliding_3_padded_enc = np.concatenate((right_window_in_padded,
                                                       sliding_in_13_padded,
                                                       sliding_in_3_padded))
right_sliding_13_sliding_3_padded_dec = np.concatenate((right_window_out_padded,
                                                       sliding_out_13_padded,
                                                       sliding_out_3_padded))
right_sliding_13_sliding_3_dataset = create_tf_dataset(right_sliding_13_sliding_3_padded_enc,
                                                   right_sliding_13_sliding_3_padded_dec)
 
right_sliding_13_sliding_5_padded_enc = np.concatenate((right_window_in_padded,
                                                       sliding_in_13_padded,
                                                       sliding_in_5_padded))
right_sliding_13_sliding_5_padded_dec = np.concatenate((right_window_out_padded,
                                                       sliding_out_13_padded,
                                                       sliding_out_5_padded))
right_sliding_13_sliding_5_dataset = create_tf_dataset(right_sliding_13_sliding_5_padded_enc,
                                                   right_sliding_13_sliding_5_padded_dec)
 
right_sliding_13_sliding_9_padded_enc = np.concatenate((right_window_in_padded,
                                                       sliding_in_13_padded, 
                                                       sliding_in_9_padded))
right_sliding_13_sliding_9_padded_dec = np.concatenate((right_window_out_padded,
                                                       sliding_out_13_padded, 
                                                       sliding_out_9_padded))
right_sliding_13_sliding_9_dataset = create_tf_dataset(right_sliding_13_sliding_9_padded_enc, 
                                                   right_sliding_13_sliding_9_padded_dec)
 
right_sliding_13_sliding_11_padded_enc = np.concatenate((right_window_in_padded,
                                                        sliding_in_13_padded, 
                                                        sliding_in_11_padded))
right_sliding_13_sliding_11_padded_dec = np.concatenate((right_window_out_padded,
                                                        sliding_out_13_padded, 
                                                        sliding_out_11_padded))
right_sliding_13_sliding_11_dataset = create_tf_dataset(right_sliding_13_sliding_11_padded_enc, 
                                                    right_sliding_13_sliding_11_padded_dec)
 
right_sliding_13_sliding_7_padded_enc = np.concatenate((right_window_in_padded,
                                                        sliding_in_13_padded, 
                                                        sliding_in_7_padded))
right_sliding_13_sliding_7_padded_dec = np.concatenate((right_window_out_padded,
                                                        sliding_out_13_padded, 
                                                        sliding_out_7_padded))
right_sliding_13_sliding_7_dataset = create_tf_dataset(right_sliding_13_sliding_7_padded_enc, 
                                                    right_sliding_13_sliding_7_padded_dec)

# right window + sliding 7 window + left window + other
right_sliding_7_left_window_sliding_3_padded_enc = np.concatenate((right_window_in_padded,
                                                         sliding_in_7_padded,
                                                         left_window_in_padded,
                                                         sliding_in_3_padded))
right_sliding_7_left_window_sliding_3_padded_dec = np.concatenate((right_window_out_padded,
                                                        sliding_out_7_padded,
                                                        left_window_out_padded,
                                                         sliding_out_3_padded))
right_sliding_7_left_window_sliding_3_dataset = create_tf_dataset(right_sliding_7_left_window_sliding_3_padded_enc,
                                                     right_sliding_7_left_window_sliding_3_padded_dec)
 
right_sliding_7_left_window_sliding_5_padded_enc = np.concatenate((right_window_in_padded,
                                                         sliding_in_7_padded,
                                                         left_window_in_padded,
                                                         sliding_in_5_padded))
right_sliding_7_left_window_sliding_5_padded_dec = np.concatenate((right_window_out_padded,
                                                        sliding_out_7_padded,
                                                        left_window_out_padded,
                                                        sliding_out_5_padded))
right_sliding_7_left_window_sliding_5_dataset = create_tf_dataset(right_sliding_7_left_window_sliding_5_padded_enc,
                                                     right_sliding_7_left_window_sliding_5_padded_dec)
 
right_sliding_7_left_window_sliding_9_padded_enc = np.concatenate((right_window_in_padded,
                                                         sliding_in_7_padded,
                                                         left_window_in_padded,
                                                         sliding_in_9_padded))
right_sliding_7_left_window_sliding_9_padded_dec = np.concatenate((right_window_out_padded,
                                                        sliding_out_7_padded,
                                                        left_window_out_padded,
                                                        sliding_out_9_padded))
right_sliding_7_left_window_sliding_9_dataset = create_tf_dataset(right_sliding_7_left_window_sliding_9_padded_enc,
                                                     right_sliding_7_left_window_sliding_9_padded_dec)
 
right_sliding_7_left_window_sliding_11_padded_enc = np.concatenate((right_window_in_padded,
                                                         sliding_in_7_padded,
                                                         left_window_in_padded,
                                                         sliding_in_11_padded))
right_sliding_7_left_window_sliding_11_padded_dec = np.concatenate((right_window_out_padded,
                                                        sliding_out_7_padded,
                                                        left_window_out_padded,
                                                        sliding_out_11_padded))
right_sliding_7_left_window_sliding_11_dataset = create_tf_dataset(right_sliding_7_left_window_sliding_11_padded_enc,
                                                     right_sliding_7_left_window_sliding_11_padded_dec)
 
right_sliding_7_left_window_sliding_13_padded_enc = np.concatenate((right_window_in_padded,
                                                         sliding_in_7_padded,
                                                         left_window_in_padded,
                                                         sliding_in_13_padded))
right_sliding_7_left_window_sliding_13_padded_dec = np.concatenate((right_window_out_padded,
                                                        sliding_out_7_padded,
                                                        left_window_out_padded,
                                                        sliding_out_13_padded))
right_sliding_7_left_window_sliding_13_dataset = create_tf_dataset(right_sliding_7_left_window_sliding_13_padded_enc,
                                                     right_sliding_7_left_window_sliding_13_padded_dec)
 
# test dataset
# Held-out test split: convert both padded sides to int64 arrays, slice each
# per example, then zip them into (encoder, decoder) pairs.
enc_test_numpy = np.asarray(test_in_padded, dtype=np.int64)
dec_test_numpy = np.asarray(test_out_padded, dtype=np.int64)

enc_test_dataset = tf.data.Dataset.from_tensor_slices(enc_test_numpy)
dec_test_dataset = tf.data.Dataset.from_tensor_slices(dec_test_numpy)

test_pair = (enc_test_dataset, dec_test_dataset)
test_dataset = tf.data.Dataset.zip(test_pair)

In [None]:
# the following set of cells contains the definitions for the Transformer architecture
# the code in this section is from https://www.tensorflow.org/tutorials/text/transformer
def get_angles(pos, i, d_model):
  """Return the positional-encoding angle `pos / 10000**(2*(i//2)/d_model)`.

  Args:
    pos: position index (scalar or array).
    i: embedding-dimension index (scalar or array); must broadcast with `pos`.
    d_model: embedding width that scales the exponent.

  Returns:
    `pos` multiplied by the per-dimension angle rate.
  """
  # Dimensions 2k and 2k+1 share one frequency, hence the integer division.
  exponent = (2 * (i//2)) / np.float32(d_model)
  return pos * (1 / np.power(10000, exponent))

In [None]:
def positional_encoding(position, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    position: number of positions (rows) to generate.
    d_model: embedding width (columns).

  Returns:
    A float32 tensor of shape (1, position, d_model); even columns hold the
    sine of the angle, odd columns the cosine.
  """
  positions = np.arange(position)[:, np.newaxis]   # column vector of positions
  dimensions = np.arange(d_model)[np.newaxis, :]   # row vector of dimension indices
  angle_rads = get_angles(positions, dimensions, d_model)

  # sin on even columns (2i), cos on odd columns (2i+1), written in place
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

  # the leading axis of size 1 lets the table broadcast over the batch
  return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

In [None]:
# create masks
def create_padding_mask(seq):
  """Mark padding tokens (id 0) with 1.0 and every other token with 0.0.

  Returns:
    A float32 tensor with two singleton axes inserted after the batch axis,
    i.e. (batch_size, 1, 1, seq_len), so it can be added to attention logits.
  """
  is_padding = tf.math.equal(seq, 0)
  padding_mask = tf.cast(is_padding, tf.float32)
  return padding_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [None]:
def create_look_ahead_mask(size):
  """Return a (size, size) mask with 1s strictly above the diagonal.

  A 1 at [row, col] marks a position `col` that lies after `row`.
  """
  # band_part(-1, 0) keeps the lower triangle including the diagonal
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle  # (seq_len, seq_len)

In [None]:
# build machinery for transformer
def scaled_dot_product_attention(q, k, v, mask):
  """Compute softmax(q @ k^T / sqrt(depth)) @ v together with the weights.

  q, k, v must have matching leading dimensions, and k and v must agree on
  their penultimate dimension (seq_len_k == seq_len_v).

  Args:
    q: query, shape (..., seq_len_q, depth)
    k: key, shape (..., seq_len_k, depth)
    v: value, shape (..., seq_len_v, depth_v)
    mask: float tensor broadcastable to (..., seq_len_q, seq_len_k), where 1
          marks a position to suppress; pass None to skip masking.

  Returns:
    output, attention_weights
  """
  raw_scores = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

  # scale by sqrt of the key depth
  key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
  logits = raw_scores / tf.math.sqrt(key_depth)

  # masked positions get a large negative logit, so softmax sends them to ~0
  if mask is not None:
    logits = logits + (mask * -1e9)

  # normalise over the key axis so each query's weights sum to 1
  attention_weights = tf.nn.softmax(logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  return tf.matmul(attention_weights, v), attention_weights  # output: (..., seq_len_q, depth_v)

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention: project q/k/v, attend per head, merge, project."""

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    # d_model is divided evenly between the heads
    assert d_model % self.num_heads == 0
    self.depth = d_model // self.num_heads

    # input projections for query, key and value
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # output projection applied after the heads are merged
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch_size, seq_len, d_model) to (batch_size, num_heads, seq_len, depth)."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Attend from `q` over `k`/`v`; returns (output, attention_weights)."""
    batch_size = tf.shape(q)[0]

    # project to (batch_size, seq_len, d_model), then split into heads:
    # (batch_size, num_heads, seq_len, depth)
    q = self.split_heads(self.wq(q), batch_size)
    k = self.split_heads(self.wk(k), batch_size)
    v = self.split_heads(self.wv(v), batch_size)

    # per_head.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    per_head, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    # move the head axis next to depth, then merge the two back into d_model
    heads_last = tf.transpose(per_head, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
    merged = tf.reshape(heads_last, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    return self.dense(merged), attention_weights  # output: (batch_size, seq_len_q, d_model)

In [None]:
def point_wise_feed_forward_network(d_model, dff):
  """Return a two-layer network: Dense(dff, relu) followed by Dense(d_model)."""
  expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([expand, project])

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and a feed-forward network, each
  followed by dropout, a residual connection and layer normalisation."""

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Run the block on `x`; the output keeps the shape (batch_size, input_seq_len, d_model)."""
    # sub-layer 1: self-attention (the attention weights are discarded)
    attended, _ = self.mha(x, x, x, mask)
    attended = self.dropout1(attended, training=training)
    attn_block_out = self.layernorm1(x + attended)

    # sub-layer 2: position-wise feed-forward
    fed_forward = self.ffn(attn_block_out)
    fed_forward = self.dropout2(fed_forward, training=training)
    return self.layernorm2(attn_block_out + fed_forward)

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: masked self-attention, attention over the encoder
  output, and a feed-forward network, each followed by dropout, a residual
  connection and layer normalisation."""

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
    """Return (output, self_attention_weights, encoder_attention_weights).

    enc_output.shape == (batch_size, input_seq_len, d_model); the output has
    shape (batch_size, target_seq_len, d_model).
    """
    # sub-layer 1: self-attention over the target, limited by look_ahead_mask
    self_attn, self_attn_weights = self.mha1(x, x, x, look_ahead_mask)
    self_attn = self.dropout1(self_attn, training=training)
    self_block_out = self.layernorm1(self_attn + x)

    # sub-layer 2: queries come from the decoder, keys/values from the encoder
    cross_attn, cross_attn_weights = self.mha2(
        enc_output, enc_output, self_block_out, padding_mask)
    cross_attn = self.dropout2(cross_attn, training=training)
    cross_block_out = self.layernorm2(cross_attn + self_block_out)

    # sub-layer 3: position-wise feed-forward
    fed_forward = self.ffn(cross_block_out)
    fed_forward = self.dropout3(fed_forward, training=training)
    layer_out = self.layernorm3(fed_forward + cross_block_out)

    return layer_out, self_attn_weights, cross_attn_weights

In [None]:
class Encoder(tf.keras.layers.Layer):
  """Token embedding plus positional encoding, followed by a stack of EncoderLayers."""

  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding,
                                            self.d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Encode token ids `x`; returns (batch_size, input_seq_len, d_model)."""
    seq_len = tf.shape(x)[1]

    # scale the embeddings by sqrt(d_model), then add the first seq_len
    # rows of the positional-encoding table
    embedded = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    embedded = embedded * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    embedded = embedded + self.pos_encoding[:, :seq_len, :]

    hidden = self.dropout(embedded, training=training)

    for encoder_layer in self.enc_layers:
      hidden = encoder_layer(hidden, training, mask)

    return hidden

In [None]:
class Decoder(tf.keras.layers.Layer):
  """Token embedding plus positional encoding, followed by a stack of DecoderLayers."""

  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
    """Decode token ids `x` against `enc_output`.

    Returns:
      (output, attention_weights): output has shape
      (batch_size, target_seq_len, d_model); attention_weights maps
      'decoder_layer<N>_block1' / 'decoder_layer<N>_block2' (N counted
      from 1) to the weights of each layer's two attention blocks.
    """
    seq_len = tf.shape(x)[1]

    # scale the embeddings by sqrt(d_model), then add the first seq_len
    # rows of the positional-encoding table
    embedded = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    embedded = embedded * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    embedded = embedded + self.pos_encoding[:, :seq_len, :]

    hidden = self.dropout(embedded, training=training)

    attention_weights = {}
    for layer_number, decoder_layer in enumerate(self.dec_layers, start=1):
      hidden, block1, block2 = decoder_layer(hidden, enc_output, training,
                                             look_ahead_mask, padding_mask)
      attention_weights['decoder_layer{}_block1'.format(layer_number)] = block1
      attention_weights['decoder_layer{}_block2'.format(layer_number)] = block2

    return hidden, attention_weights

In [None]:
class Transformer(tf.keras.Model):
  """Encoder-decoder model with a final Dense projection onto the target vocabulary."""

  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               target_vocab_size, pe_input, pe_target, rate=0.1):
    super().__init__()

    # pe_input / pe_target size the two positional-encoding tables
    self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                           input_vocab_size, pe_input, rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                           target_vocab_size, pe_target, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inp, tar, training, enc_padding_mask,
           look_ahead_mask, dec_padding_mask):
    """Return (logits, attention_weights).

    logits has shape (batch_size, tar_seq_len, target_vocab_size);
    attention_weights is the dictionary produced by the decoder.
    """
    # (batch_size, inp_seq_len, d_model)
    encoded = self.encoder(inp, training, enc_padding_mask)

    # (batch_size, tar_seq_len, d_model)
    decoded, attention_weights = self.decoder(
        tar, encoded, training, look_ahead_mask, dec_padding_mask)

    logits = self.final_layer(decoded)

    return logits, attention_weights

In [None]:
# Model size
num_layers = 1     # encoder layers and decoder layers in each stack
num_heads = 8      # attention heads; must divide d_model evenly
d_model = 128      # embedding / hidden width
dff = 512          # width of the feed-forward hidden layer

# Regularisation
dropout_rate = 0.1

# Encoder and decoder are both sized from the same vocabulary
input_vocab_size = target_vocab_size = len(total_vocab)

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning-rate schedule with linear warm-up followed by 1/sqrt(step) decay.

  lrate = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

  Args:
    d_model: model width; the whole schedule is scaled by 1/sqrt(d_model).
    warmup_steps: number of steps over which the rate rises linearly before
      the inverse-square-root decay takes over.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    # kept as a float32 tensor so rsqrt can be applied to it directly
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for optimizer step `step`."""
    # Optimizers may pass the step as an integer tensor; rsqrt only accepts
    # floating-point input, so cast first (a no-op when already float32).
    step = tf.cast(step, tf.float32)

    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
# Adam whose learning rate follows the warm-up schedule, scaled by d_model
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [None]:
# reduction='none' keeps the per-token losses so the caller can apply its
# own mask before averaging; the model outputs raw logits.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    reduction='none',
    from_logits=True,
)

In [None]:
# loss and accuracy functions -- note handling of masks
def loss_function(real, pred):
  """Mean cross-entropy over the non-padding target tokens.

  Args:
    real: target token ids; id 0 is treated as padding and excluded.
    pred: logits passed to `loss_object` together with `real`.

  Returns:
    Sum of the unmasked per-token losses divided by the number of
    non-padding tokens.
  """
  per_token_loss = loss_object(real, pred)

  is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
  token_weights = tf.cast(is_real_token, dtype=per_token_loss.dtype)

  masked_loss = per_token_loss * token_weights
  return tf.reduce_sum(masked_loss)/tf.reduce_sum(token_weights)
 
 
def accuracy_function(real, pred):
  """Fraction of non-padding target tokens whose arg-max prediction is correct.

  Args:
    real: target token ids; id 0 is treated as padding and excluded.
    pred: logits; the predicted id is the arg-max over axis 2.
  """
  predicted_ids = tf.argmax(pred, axis=2)
  is_real_token = tf.math.logical_not(tf.math.equal(real, 0))

  # a hit only counts where the target is not padding
  is_hit = tf.math.logical_and(is_real_token, tf.equal(real, predicted_ids))

  hit_count = tf.reduce_sum(tf.cast(is_hit, dtype=tf.float32))
  token_count = tf.reduce_sum(tf.cast(is_real_token, dtype=tf.float32))
  return hit_count/token_count

In [None]:
# Running means of the per-batch loss and accuracy: train_step() feeds them
# one value per batch, and the training loop resets them at the start of
# every epoch.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

In [None]:
# The positional-encoding tables are sized by the vocabulary size here.
# NOTE(review): confirm that no padded sequence is longer than that.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=input_vocab_size,
    pe_target=target_vocab_size,
    rate=dropout_rate,
)

In [None]:
def create_masks(inp, tar):
  """Build the three attention masks for one (input, target) batch.

  Returns:
    (enc_padding_mask, combined_mask, dec_padding_mask):
      enc_padding_mask: padding mask of `inp`.
      combined_mask: element-wise maximum of the look-ahead mask and the
        padding mask of `tar`.
      dec_padding_mask: padding mask of `inp` again, computed separately.
  """
  # Both of these come from the input sequence: the first is for the
  # encoder, the second hides padded encoder outputs in the decoder's
  # second attention block.
  encoder_mask = create_padding_mask(inp)
  cross_attention_mask = create_padding_mask(inp)

  # For the decoder's first attention block: a target position is masked
  # if it is padding or lies in the future.
  future_mask = create_look_ahead_mask(tf.shape(tar)[1])
  target_padding_mask = create_padding_mask(tar)
  self_attention_mask = tf.maximum(target_padding_mask, future_mask)

  return encoder_mask, self_attention_mask, cross_attention_mask

In [None]:
# Checkpoint setup. Checkpoints are written straight to the mounted Drive
# folder, one sub-directory per training configuration.
# NOTE(review): the sub-directory name here ("right_sliding_13_sliding_11")
# matches the dataset iterated by the training loop; change both together
# when training on a different dataset.
gdrive_checkpoints = "/gdrive/MyDrive/ywl_transformer/second_randomized/checkpoints/"
checkpoint_path = gdrive_checkpoints + "right_sliding_13_sliding_11/train"
 
# Track both the model and the optimizer.
ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)
 
# max_to_keep=5 limits how many checkpoints the manager retains.
# NOTE(review): this cell only creates the manager; it does not restore from
# ckpt_manager.latest_checkpoint. If resuming a run is intended, confirm a
# restore happens elsewhere.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [None]:
# training: number of full passes over the training dataset
EPOCHS = 100

In [None]:
# original documentation from TensorFlow documentation page:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.
 
# Two rank-2 int64 tensors (inp, tar) with unconstrained batch size and length.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64)
    for _ in range(2)
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  """Run one optimisation step on a batch and update the running metrics.

  The decoder is fed the target without its last token and is trained to
  predict the target without its first token.
  """
  decoder_input = tar[:, :-1]
  decoder_target = tar[:, 1:]

  encoder_mask, self_attention_mask, cross_attention_mask = create_masks(
      inp, decoder_input)

  with tf.GradientTape() as tape:
    # third positional argument is `training`
    logits, _ = transformer(inp, decoder_input,
                            True,
                            encoder_mask,
                            self_attention_mask,
                            cross_attention_mask)
    batch_loss = loss_function(decoder_target, logits)

  trainable = transformer.trainable_variables
  grads = tape.gradient(batch_loss, trainable)
  optimizer.apply_gradients(zip(grads, trainable))

  train_loss(batch_loss)
  train_accuracy(accuracy_function(decoder_target, logits))

In [None]:
# training for loop: one pass over the dataset per epoch
for epoch in range(EPOCHS):
  start = time.time()

  # Clear the accumulated metric state so the values printed below cover
  # the current epoch only.
  for metric in (train_loss, train_accuracy):
    metric.reset_states()

  # inp -> chars, tar -> words
  for batch, (inp, tar) in enumerate(right_sliding_13_sliding_11_dataset):
    train_step(inp, tar)

    # Progress report every 50 batches (including batch 0).
    if batch % 50:
      continue
    print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
        epoch + 1, batch, train_loss.result(), train_accuracy.result()))

  # Save a checkpoint through ckpt_manager every fifth epoch.
  if not (epoch + 1) % 5:
    ckpt_save_path = ckpt_manager.save()
    print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))

  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
      epoch + 1, train_loss.result(), train_accuracy.result()))

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 7.4739 Accuracy 0.0025
Epoch 1 Batch 50 Loss 7.4016 Accuracy 0.0032
Epoch 1 Batch 100 Loss 7.2634 Accuracy 0.0512
Epoch 1 Batch 150 Loss 7.1090 Accuracy 0.0646
Epoch 1 Batch 200 Loss 6.9102 Accuracy 0.0749
Epoch 1 Batch 250 Loss 6.7520 Accuracy 0.0762
Epoch 1 Batch 300 Loss 6.5506 Accuracy 0.0858
Epoch 1 Batch 350 Loss 6.3532 Accuracy 0.0935
Epoch 1 Loss 6.2491 Accuracy 0.0964
Time taken for 1 epoch: 25.94491147994995 secs

Epoch 2 Batch 0 Loss 5.2122 Accuracy 0.0810
Epoch 2 Batch 50 Loss 4.5409 Accuracy 0.2020
Epoch 2 Batch 100 Loss 4.5246 Accuracy 0.2049
Epoch 2 Batch 150 Loss 4.6094 Accuracy 0.1907
Epoch 2 Batch 200 Loss 4.5845 Accuracy 0.1889
Epoch 2 Batch 250 Loss 4.6138 Accuracy 0.1832
Epoch 2 Batch 300 Loss 4.5981 Accuracy 0.1874
Epoch 2 Batch 350 Loss 4.5599 Accuracy 0.1934
Epoch 2 Loss 4.5529 Accuracy 0.1950
Time taken for 1 epoch: 22.832216501235962 secs

Epoch 3 Batch 0 Loss 4.9523 Accuracy 0.1975
Epoch 3 Batch 50 Loss 3.8810 Accuracy 0.3359
Epoch 3 Batc

In [None]:
 # Current value of the train_accuracy metric. The training loop resets it at
 # the start of every epoch, so this covers the most recent epoch that ran.
 train_accuracy.result()

<tf.Tensor: shape=(), dtype=float32, numpy=0.99833304>

In [None]:
# code for reloading checkpoints
gdrive_checkpoints = "/gdrive/MyDrive/ywl_transformer/second_randomized/checkpoints/"
checkpoint_path = gdrive_checkpoints + "base_sliding_11_window/train"
# To restore from a local copy instead, point checkpoint_path at a local
# directory, e.g. './checkpoints/right_window_sliding_7_sliding_11_again/train'.
ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)
latest = tf.train.latest_checkpoint(checkpoint_path)
print(latest)
# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint, and restoring None leaves the weights untouched without any
# error. Fail loudly so the evaluation below never runs on an un-restored
# model by accident.
if latest is None:
  raise FileNotFoundError(
      "No checkpoint found in {}".format(checkpoint_path))
ckpt.restore(latest)

/gdrive/MyDrive/ywl_transformer/second_randomized/checkpoints/base_sliding_11_window/train/ckpt-20


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa7f63d90b8>

In [None]:
# evaluation function; derived in part from TensorFlow documentation page:
def _greedy_decode(inp):
  """Greedily decode a single encoder input with the global `transformer`.

  Decoding starts from the `<start>` id and, at every step, appends the
  arg-max id predicted for the last position. It stops once `<end>` is
  produced or after OUTPUT_LEN steps.

  Args:
    inp: id tensor for one example (no batch axis).

  Returns:
    int64 tensor of generated ids with the batch axis removed, beginning
    with the `<start>` id.
  """
  input_ = tf.expand_dims(inp, 0)  # add a leading batch axis of size 1
  output = tf.cast(tf.expand_dims([total_vocab['<start>']], 0), tf.int64)

  for _ in range(OUTPUT_LEN):
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        input_, output)
    predictions, _attention_weights = transformer(input_,
                                                  output,
                                                  False,
                                                  enc_padding_mask,
                                                  combined_mask,
                                                  dec_padding_mask)
    # Only the prediction for the last position is new at this step.
    predicted_id = tf.argmax(predictions[:, -1:, :], axis=-1)
    output = tf.concat([output, predicted_id], axis=-1)

    if predicted_id == total_vocab['<end>']:
      break

  return tf.squeeze(output, axis=0)


def evaluate_test(test_data):
  """Decode every test example and score it against its reference.

  For each (inp, tar) pair the reference and the greedy prediction are
  printed, filtered of special ids, and scored with position-wise accuracy,
  BLEU and word error rate (WER).

  WER is computed on the unpadded, filtered sequences and normalised by the
  reference length, so that padding added for the accuracy computation is
  never counted as an edit error.

  Args:
    test_data: iterable of (inp, tar) pairs, one unbatched example each.

  Returns:
    Tuple (mean_accuracy, bleu_scores, mean_wer) where `bleu_scores` is the
    tuple returned by `test_bleu_function`.
  """
  # Ids removed from both sequences before scoring.
  # NOTE(review): presumably <pad>, <start> and <end> -- confirm against
  # total_vocab before reusing with a different vocabulary.
  special_ids = (0, 1779, 1780)

  accuracies = []
  bleu_real = []
  bleu_pred = []
  wer_scores = []
  for (inp, tar) in test_data:
    decoded = _greedy_decode(inp)
    print("Actual: {}".format(' '.join(inv_total_vocab[i] for i in tar.numpy())))
    print("Predicted: {}".format(' '.join(inv_total_vocab[i] for i in decoded.numpy())))

    target_ids = [int(i) for i in tar.numpy() if i not in special_ids]
    pred_ids = [int(i) for i in decoded.numpy() if i not in special_ids]

    # corpus_bleu expects a list of references per hypothesis.
    bleu_real.append([target_ids])
    bleu_pred.append(pred_ids)

    # Edit distance on the real tokens only, divided by the reference length
    # (guarded so an empty reference cannot divide by zero).
    wer_out = wer(target_ids, pred_ids)
    wer_scores.append(wer_out / max(len(target_ids), 1))

    # Position-wise accuracy needs equal lengths: right-pad the shorter
    # sequence with 0, which test_accuracy_function masks out of `real`.
    width = max(len(target_ids), len(pred_ids))
    target_padded = np.array(
        target_ids + [0] * (width - len(target_ids)), dtype=np.int64)
    pred_padded = np.array(
        pred_ids + [0] * (width - len(pred_ids)), dtype=np.int64)

    acc = test_accuracy_function(tf.convert_to_tensor(target_padded),
                                 tf.convert_to_tensor(pred_padded))
    accuracies.append(acc)

  return np.mean(np.asarray(accuracies)), test_bleu_function(bleu_real, bleu_pred), np.mean(np.asarray(wer_scores))

In [None]:
from nltk.translate.bleu_score import corpus_bleu

In [None]:
# BLEU score function
def test_bleu_function(real, pred):
  """Return corpus-level BLEU-1 through BLEU-4.

  Args:
    real: list with one entry per example, each entry being a list of
      reference token sequences (plain Python lists, as built by
      evaluate_test).
    pred: list with one hypothesis token sequence per example, aligned with
      `real`.

  Returns:
    Tuple (bleu_1, bleu_2, bleu_3, bleu_4); BLEU-n weights the n-gram orders
    1..n uniformly.
  """
  # Uniform weights 1/n, so they sum to exactly 1 for every order. The
  # former (0.33, 0.33, 0.33) summed to 0.99 and slightly inflated BLEU-3.
  return tuple(
      corpus_bleu(real, pred, weights=(1.0 / order,) * order)
      for order in range(1, 5))

In [None]:
# accuracy function, from TensorFlow documentation page
def test_accuracy_function(real, pred):
  """Fraction of non-zero positions of `real` at which `pred` equals `real`.

  Positions where `real` is 0 are excluded from both the numerator and the
  denominator.
  """
  non_pad = tf.math.logical_not(tf.math.equal(real, 0))
  hits = tf.math.logical_and(non_pad, tf.equal(real, pred))

  n_hits = tf.reduce_sum(tf.cast(hits, dtype=tf.float32))
  n_scored = tf.reduce_sum(tf.cast(non_pad, dtype=tf.float32))
  return n_hits/n_scored

In [None]:
# Word Error Rate calculation function
# adapted from: https://martin-thoma.com/word-error-rate-calculation/
def wer(r, h):
    """
    Calculation of WER with Levenshtein distance.

    Returns the minimum number of substitutions, insertions and deletions
    needed to turn the reference `r` into the hypothesis `h`. Distances are
    kept as plain Python integers, so there is no limit on sequence length
    (a fixed-width uint8 table would overflow beyond 254 elements).

    O(nm) time and O(m) space complexity, with n = len(r) and m = len(h).

    Parameters
    ----------
    r : list
        Reference sequence (any indexable sequence with a length).
    h : list
        Hypothesis sequence (any indexable sequence with a length).

    Returns
    -------
    int

    Examples
    --------
    >>> wer("who is there".split(), "is there".split())
    1
    >>> wer("who is there".split(), "".split())
    3
    >>> wer("".split(), "who is there".split())
    3
    """
    n_ref, n_hyp = len(r), len(h)

    # prev[j] is the distance between the first i - 1 items of r and the
    # first j items of h; for the empty reference prefix that is j insertions.
    prev = list(range(n_hyp + 1))

    for i in range(1, n_ref + 1):
        # Turning i reference items into an empty hypothesis is i deletions.
        curr = [i] + [0] * n_hyp
        for j in range(1, n_hyp + 1):
            if r[i - 1] == h[j - 1]:
                curr[j] = prev[j - 1]
            else:
                substitution = prev[j - 1] + 1
                insertion = curr[j - 1] + 1
                deletion = prev[j] + 1
                curr[j] = min(substitution, insertion, deletion)
        prev = curr

    return prev[n_hyp]

In [None]:
# run evaluation on test_dataset; the cell output is the tuple returned by
# evaluate_test: (mean accuracy, BLEU tuple from test_bleu_function, mean WER)
evaluate_test(test_dataset)

Actual: <start> limik' xaya: traw , <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Predicted: <start> limik' xo: traw , <end>
Actual: <start> 'alwutr' no:chro' , <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Predicted: <start> 'alwutr' no:chro' , <end>
Actual: <start> ye:tr'aw ma:gin pana: 'amak' hew'ta: , <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Predicted: <start> ye:tr'aw ma:gin pana: 'amak' hew'ta: , <end>
Actual: <start> 'ama' ye:t' pana: 'amak' tr'ama' , <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

(0.63602096,
 (0.7216080571824429,
  0.6153992604753744,
  0.5148178235898135,
  0.429310038760814),
 0.30638909932682257)

In [None]:
# copy locally saved files if necessary: recursively copies everything under
# /content/second_randomized/ into the project folder on the mounted Drive
!cp -r /content/second_randomized/* /gdrive/MyDrive/ywl_transformer/second_randomized/