In [2]:
# import logging
import time
import glob
import os
import csv
import math
from datetime import datetime

import pandas as pd
import numpy as np
import tensorflow as tf

In [3]:
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from transformers import DataCollatorWithPadding
from datasets import load_dataset

In [4]:
from tokenizers import Tokenizer, normalizers
from tokenizers.models import WordPiece
# from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer


In [4]:
import sys
import logging

so = open("data.log", 'w', 10)
sys.stdout.echo = so
sys.stderr.echo = so

get_ipython().log.handlers[0].stream = so
get_ipython().log.setLevel(logging.INFO)

In [5]:
# !pip install -q tensorflow_datasets
# !pip install -q -U tensorflow-text tensorflow

### Load and save data as TF dataset

In [7]:
for i in ['train','val','test']:
    try:
        os.system(f"rm -r ./embedding_model_data/processed_pair_data/{i}/*")
        print("Done")
    except:
        pass

Done


rm: cannot remove ‘./training_data_text_files/train/*’: No such file or directory


In [6]:
def write_data_file_to_directory(pos_file_name, neg_file_name, type_of_data, file_i, hard_data_filename=""):
    temp_pos_df = pd.read_parquet(pos_file_name, columns=['name_1','name_2'])
    temp_pos_df['label'] = 1
    temp_neg_df = pd.read_parquet(neg_file_name, columns=['name_1','name_2'])
    temp_neg_df['label'] = 0
    if hard_data_filename:
        temp_hard_df = pd.read_parquet(hard_data_filename)
        temp_hard_df['name_1'] = temp_hard_df['hard_pairs'].apply(lambda x: x[0])
        temp_hard_df['name_2'] = temp_hard_df['hard_pairs'].apply(lambda x: x[1])
        temp_hard_df['label'] = 0
        
        pos_count = temp_pos_df.shape[0]
        hard_count = temp_hard_df.shape[0]
        temp_neg_df = pd.concat([temp_neg_df.sample(pos_count - hard_count), 
                                 temp_hard_df[['name_1','name_2','label']].copy()], 
                                axis=0)
    
    temp_df = pd.concat([temp_pos_df, temp_neg_df], axis=0)
    
    temp_df = temp_df[(~temp_df['name_1'].isnull()) & 
                      (~temp_df['name_2'].isnull())].copy()
    
    
    temp_df['name_1'] = temp_df['name_1'].apply(lambda x: " ".join(x.split()))
    temp_df['name_2'] = temp_df['name_2'].apply(lambda x: " ".join(x.split()))
    temp_df.sample(temp_df.shape[0]) \
        .to_parquet(f"./embedding_model_data/processed_pair_data/{type_of_data}/{file_i}.parquet")
    

In [7]:
def get_all_data_files():
    final_df = pd.DataFrame()
    for type_of_data in ['train','val','test']:
        list_of_pos_files = glob.glob(f"./embedding_model_data/raw_paired_data/{type_of_data}/pos/*")
        list_of_pos_files.sort()
        list_of_neg_files = glob.glob(f"./embedding_model_data/raw_paired_data/{type_of_data}/neg/*")
        list_of_neg_files.sort()
        for ith, (pos_file_name, neg_file_name) in enumerate(zip(list_of_pos_files, 
                                                                 list_of_neg_files[:len(list_of_pos_files)])):
            if ith == 0:
                hard_data_filename=glob.glob(f"./embedding_model_data/raw_paired_data/{type_of_data}/neg/hard*")[0]
            else:
                hard_data_filename = ""
            print(f"{type_of_data} - {ith} - {pos_file_name} - {neg_file_name}")
            _ = write_data_file_to_directory(pos_file_name, neg_file_name, type_of_data, ith, hard_data_filename)

In [8]:
_ = get_all_data_files()

train - 0 - ./embedding_model_data/raw_paired_data/train/pos/0.parquet - ./embedding_model_data/raw_paired_data/train/neg/0.parquet
train - 1 - ./embedding_model_data/raw_paired_data/train/pos/1.parquet - ./embedding_model_data/raw_paired_data/train/neg/1.parquet
train - 2 - ./embedding_model_data/raw_paired_data/train/pos/10.parquet - ./embedding_model_data/raw_paired_data/train/neg/10.parquet
train - 3 - ./embedding_model_data/raw_paired_data/train/pos/11.parquet - ./embedding_model_data/raw_paired_data/train/neg/11.parquet
train - 4 - ./embedding_model_data/raw_paired_data/train/pos/12.parquet - ./embedding_model_data/raw_paired_data/train/neg/12.parquet
train - 5 - ./embedding_model_data/raw_paired_data/train/pos/13.parquet - ./embedding_model_data/raw_paired_data/train/neg/13.parquet
train - 6 - ./embedding_model_data/raw_paired_data/train/pos/14.parquet - ./embedding_model_data/raw_paired_data/train/neg/14.parquet
train - 7 - ./embedding_model_data/raw_paired_data/train/pos/15.pa

In [6]:
def split_file(file_name, new_num):
    temp_df = pd.read_parquet(file_name)
    temp_df.iloc[:int(temp_df.shape[0]/2), :] \
        .to_parquet(f"./embedding_model_data/new_proc_pair_data/train/{new_num}a.parquet")
    
    temp_df.iloc[int(temp_df.shape[0]/2):, :] \
        .to_parquet(f"./embedding_model_data/new_proc_pair_data/train/{new_num}b.parquet")

In [7]:
for ith, file_name in enumerate(glob.glob("./embedding_model_data/processed_pair_data/train/*")):
    _ = split_file(file_name, ith)

### Make TFRecords

In [5]:
MAX_LEN = 32
name_tokenizer = PreTrainedTokenizerFast(tokenizer_file="name_transformer_wordpiece_tokenizer")

In [6]:
# for i in ['train','val','test']:
#     try:
#         os.system(f"rm -r ./embedding_model_data/tfrecords/{i}/*")
#         print("Done")
#     except:
#         pass

In [7]:
def tf_serialize_example(f0, f1, f2, f3, f4):
    """
    Serialization function.
    """
    tf_string = tf.py_function(serialize_example, (f0, f1, f2, f3, f4), tf.string)
    return tf.reshape(tf_string, ())

In [8]:
def serialize_example(name_1_x, name_1_y, name_2_x, name_2_y, labels):
    """
    Takes in features and outputs them to a serialized string that can be written to
    a file using the TFRecord Writer.
    """
    name_1_x_list = tf.train.Int64List(value=name_1_x.numpy().tolist())
    name_1_y_list = tf.train.Int64List(value=name_1_y.numpy().tolist())
    name_2_x_list = tf.train.Int64List(value=name_2_x.numpy().tolist())
    name_2_y_list = tf.train.Int64List(value=name_2_y.numpy().tolist())
    labels_list = tf.train.Int64List(value=labels.numpy().tolist())
    
    name_1_x_feature = tf.train.Feature(int64_list = name_1_x_list)
    name_1_y_feature = tf.train.Feature(int64_list = name_1_y_list)
    name_2_x_feature = tf.train.Feature(int64_list = name_2_x_list)
    name_2_y_feature = tf.train.Feature(int64_list = name_2_y_list)
    labels_feature = tf.train.Feature(int64_list = labels_list)
    
    features_for_example = {
        'name_1_x': name_1_x_feature,
        'name_1_y': name_1_y_feature,
        'name_2_x': name_2_x_feature,
        'name_2_y': name_2_y_feature,
        'labels': labels_feature
    }
    
    example_proto = tf.train.Example(features=tf.train.Features(feature=features_for_example))
    
    return example_proto.SerializeToString()

In [9]:
def create_labeled_raw_input(tok_input, start_tok, end_tok):
    temp_tok_input = (start_tok + tok_input)[:(MAX_LEN-1)]+end_tok
    final_tok_input = temp_tok_input + [0]*(MAX_LEN-len(temp_tok_input))
    return np.asarray(final_tok_input, dtype=np.int64)

In [11]:
def create_tfrecords_dataset(filename, iter_num, dataset_type='train'):
    """
    Creates a TF Dataset that can then be saved to a file to make it faster to read
    data during training and allow for transferring of data between compute instances.
    """
    data = pd.read_parquet(filename)
    
    # Getting tokenized data
    start_token = name_tokenizer("[START]")['input_ids']
    end_token = name_tokenizer("[END]")['input_ids']
    
    print("------------tokenizing name_1")
    data['tokenized_name_1_x'] = name_tokenizer(data['name_1'].tolist())['input_ids']
    data['tokenized_name_1_x'] = data['tokenized_name_1_x'].apply(lambda x: 
                                                                  create_labeled_raw_input(x, 
                                                                                           start_token, 
                                                                                           end_token))
    data['tokenized_name_1_y'] = [np.asarray(start_token + [0]*(MAX_LEN-1), dtype=np.int64)]*data.shape[0]
    print("------------tokenizing name_2")
    data['tokenized_name_2_x'] = name_tokenizer(data['name_2'].tolist())['input_ids']
    data['tokenized_name_2_x'] = data['tokenized_name_2_x'].apply(lambda x: 
                                                                  create_labeled_raw_input(x, 
                                                                                           start_token, 
                                                                                           end_token))
    data['tokenized_name_2_y'] = [np.asarray(start_token + [0]*(MAX_LEN-1), dtype=np.int64)]*data.shape[0]
    
    print("------------getting labeled data")
    data['labels'] = data['label'].apply(lambda x: np.asarray([x], dtype=np.int64))
    
    # Creating TF Dataset
    ds = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(data['tokenized_name_1_x'].to_list()),
                              tf.data.Dataset.from_tensor_slices(data['tokenized_name_1_y'].to_list()),
                              tf.data.Dataset.from_tensor_slices(data['tokenized_name_2_x'].to_list()),
                              tf.data.Dataset.from_tensor_slices(data['tokenized_name_2_y'].to_list()),
                              tf.data.Dataset.from_tensor_slices(data['labels'].to_list())))
    
    serialized_features_dataset = ds.map(tf_serialize_example)
    
    print("------------writing to tfrecord")
    
    filename = f"./embedding_model_data/tfrecords/{dataset_type}/{str(iter_num).zfill(4)}.tfrecord"
    writer = tf.data.experimental.TFRecordWriter(filename)
    writer.write(serialized_features_dataset)

In [None]:
%%time
for i in glob.glob("./embedding_model_data/new_proc_pair_data/train/*")[4:16]:
    print(f"{i} {i.split('/')[-1].split('.parquet')[0]}")
    create_tfrecords_dataset(i, i.split('/')[-1].split('.parquet')[0], 'train')

./embedding_model_data/new_proc_pair_data/train/2a.parquet 2a
------------tokenizing name_1
------------tokenizing name_2
------------getting labeled data


2023-04-21 14:55:11.134995: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 14:55:11.136072: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 14:55:11.136284: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 14:55:11.136771: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

------------writing to tfrecord
Instructions for updating:
To write TFRecords to disk, use `tf.io.TFRecordWriter`. To save and load the contents of a dataset, use `tf.data.experimental.save` and `tf.data.experimental.load`


In [57]:
%%time
for ith, i in enumerate(glob.glob("./embedding_model_data/processed_pair_data/val/*")):
    print(f"{ith} - {i}")
    create_tfrecords_dataset(i, ith, 'val')

0 - ./embedding_model_data/processed_pair_data/val/0.parquet
------------tokenizing name_1
------------tokenizing name_2
------------getting labeled data
------------writing to tfrecord
CPU times: user 41.6 s, sys: 3.74 s, total: 45.3 s
Wall time: 36.9 s


### Read training data from tfrecord files

In [6]:
def _parse_function(example_proto):
    """
    Parses the TFRecord file.
    """
    feature_description = {
        'name_1_x': tf.io.FixedLenFeature((32,), tf.int64),
        'name_1_y': tf.io.FixedLenFeature((32,), tf.int64),
        'name_2_x': tf.io.FixedLenFeature((32,), tf.int64),
        'name_2_y': tf.io.FixedLenFeature((32,), tf.int64),
        'labels': tf.io.FixedLenFeature((1,), tf.int64)
    }

    example = tf.io.parse_single_example(example_proto, feature_description)

    name_1_x = tf.cast(example['name_1_x'], dtype=tf.int32)
    name_1_y = tf.cast(example['name_1_y'], dtype=tf.int32)
    name_2_x = tf.cast(example['name_2_x'], dtype=tf.int32)
    name_2_y = tf.cast(example['name_2_y'], dtype=tf.int32)
    labels = tf.cast(example['labels'][0], dtype=tf.int32)

    return ((name_1_x,name_1_y),(name_2_x,name_2_y)),labels

In [7]:
def get_dataset(path, data_type='train', batch_size=512):
    """
    Takes in a path to the TFRecords and returns a TF Dataset to be used for training.
    """
    if data_type=='train':
        tfrecords = [f"{path}{data_type}/{x}" for x in os.listdir(f"{path}{data_type}/") 
                     if x.endswith('tfrecord')][:5]
    else:
        tfrecords = [f"{path}{data_type}/{x}" for x in os.listdir(f"{path}{data_type}/") 
                     if x.endswith('tfrecord')]
    tfrecords.sort()
    
    32
    raw_dataset = tf.data.TFRecordDataset(tfrecords, num_parallel_reads=tf.data.AUTOTUNE)
    parsed_dataset = raw_dataset.map(_parse_function, num_parallel_calls=tf.data.AUTOTUNE)

    parsed_dataset = parsed_dataset\
        .shuffle(batch_size)\
        .batch(batch_size,drop_remainder=True) \
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    return parsed_dataset

### Model Architecture

In [8]:
def positional_encoding(length, depth):
    depth = depth/2

    positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
    depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

    angle_rates = 1 / (10000**depths)         # (1, depth)
    angle_rads = positions * angle_rates      # (pos, depth)

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1) 

    return tf.cast(pos_encoding, dtype=tf.float32)

In [9]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True) 
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)
        self.vocab_size = vocab_size

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[tf.newaxis, :length, :]
        return x
    
    def get_config(self):
        config = {
            'vocab_size': int(self.vocab_size),
            'd_model': int(self.d_model)
        }
        return config

In [10]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()
        
class CrossAttention(BaseAttention):
    def call(self, x, context):
        attn_output, attn_scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)

        # Cache the attention scores for plotting later.
        self.last_attn_scores = attn_scores

        x = self.add([x, attn_output])
        x = self.layernorm(x)

        return x
    
class GlobalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x
    
class CausalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x,
            use_causal_mask = True)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

In [11]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        self.seq = tf.keras.Sequential([
          tf.keras.layers.Dense(dff, activation='relu'),
          tf.keras.layers.Dense(d_model),
          tf.keras.layers.Dropout(dropout_rate)
        ])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()
        
        self.d_model = d_model
        self.dff = dff
        self.dropout_rate = dropout_rate

    def call(self, x):
        x = self.add([x, self.seq(x)])
        x = self.layer_norm(x) 
        return x
    
    def get_config(self):
        config = {
            'd_model': int(self.d_model),
            'dff': int(self.dff),
            'dropout_rate': float(self.dropout_rate)
        }
        return config

In [12]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()

        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.ffn = FeedForward(d_model, dff)
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.dropout_rate = dropout_rate

    def call(self, x):
        x = self.self_attention(x)
        x = self.ffn(x)
        self.context_data = x
        return x
    
    def get_config(self):
        config = {
            'd_model': int(self.d_model),
            'num_heads': int(self.num_heads),
            'dff': int(self.dff),
            'dropout_rate': float(self.dropout_rate)
        }
        return config

In [13]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, *, num_layers, d_model, num_heads,
                 dff, vocab_size, dropout_rate=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.dff = dff
        self.vocab_size = vocab_size
        self.dropout_rate = dropout_rate
        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(
            vocab_size=vocab_size, d_model=d_model)

        self.enc_layers = [
            EncoderLayer(d_model=d_model,
                         num_heads=num_heads,
                         dff=dff,
                         dropout_rate=dropout_rate)
            for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        # `x` is token-IDs shape: (batch, seq_len)
        x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

        # Add dropout.
        x = self.dropout(x)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
            
        self.last_1of4_encoder_output = self.enc_layers[-4].context_data
        self.last_2of4_encoder_output = self.enc_layers[-3].context_data
        self.last_3of4_encoder_output = self.enc_layers[-2].context_data
        self.last_4of4_encoder_output = self.enc_layers[-1].context_data
        
        self.encoder_context_data = x

        return x  # Shape `(batch_size, seq_len, d_model)`.
    
    def get_config(self):
        config = {
            'num_layers': int(self.num_layers),
            'd_model': int(self.d_model),
            'num_heads': int(self.num_heads),
            'dff': int(self.dff),
            'vocab_size': int(self.vocab_size),
            'dropout_rate': float(self.dropout_rate)
        }
        return config

In [14]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super(DecoderLayer, self).__init__()
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.dropout_rate = dropout_rate

        self.causal_self_attention = CausalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.cross_attention = CrossAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.ffn = FeedForward(d_model, dff)

    def call(self, x, context):
        x = self.causal_self_attention(x=x)
        x = self.cross_attention(x=x, context=context)

        # Cache the last attention scores for plotting later
        self.last_attn_scores = self.cross_attention.last_attn_scores

        x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
        self.context_data = x
        return x
    
    def get_config(self):
        config = {
            'd_model': int(self.d_model),
            'num_heads': int(self.num_heads),
            'dff': int(self.dff),
            'dropout_rate': float(self.dropout_rate)
        }
        return config

In [15]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
        super(Decoder, self).__init__()
        self.num_heads = num_heads
        self.dff = dff
        self.vocab_size = vocab_size
        self.dropout_rate = dropout_rate
        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                                 d_model=d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dec_layers = [
            DecoderLayer(d_model=self.d_model, num_heads=num_heads,
                         dff=self.dff, dropout_rate=dropout_rate)
            for _ in range(num_layers)]

        self.last_attn_scores = None

    def call(self, x, context):
        # `x` is token-IDs shape (batch, target_seq_len)
        x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

        x = self.dropout(x)

        for i in range(self.num_layers):
            x  = self.dec_layers[i](x, context)

        self.last_attn_scores = self.dec_layers[-1].last_attn_scores
        self.last_1of4_decoder_output = self.dec_layers[-4].context_data
        self.last_2of4_decoder_output = self.dec_layers[-3].context_data
        self.last_3of4_decoder_output = self.dec_layers[-2].context_data
        self.last_4of4_decoder_output = self.dec_layers[-1].context_data

        # The shape of x is (batch_size, target_seq_len, d_model).
        
        self.decoder_context_data = x
        return x
    
    def get_config(self):
        config = {
            'num_layers': int(self.num_layers),
            'd_model': int(self.d_model),
            'num_heads': int(self.num_heads),
            'dff': int(self.dff),
            'vocab_size': int(self.vocab_size),
            'dropout_rate': float(self.dropout_rate)
        }
        return config

In [16]:
class TransformerForSiam(tf.keras.Model):
    def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.dff = dff
        self.input_vocab_size = input_vocab_size
        self.target_vocab_size = target_vocab_size
        self.dropout_rate = dropout_rate

        self.d_model = d_model
        self.num_layers = num_layers
        self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                               num_heads=num_heads, dff=dff,
                               vocab_size=input_vocab_size,
                               dropout_rate=dropout_rate)

        self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                               num_heads=num_heads, dff=dff,
                               vocab_size=target_vocab_size,
                               dropout_rate=dropout_rate)

        self.dense_layer_1 = tf.keras.layers.Dense(1024)
        self.dense_layer_2 = tf.keras.layers.Dense(1024)
        self.pooling_layer = tf.keras.layers.GlobalAveragePooling1D()
        self.name_embedding_layer = tf.keras.layers.Dense(64)

    def call(self, inputs):
        # To use a Keras model with `.fit` you must pass all your inputs in the
        # first argument.
        context, x  = inputs

        context = self.encoder(context)  # (batch_size, context_len, d_model)

        x = self.decoder(x, context)  # (batch_size, target_len, d_model)
        
        # Adding head to get name embedding
        x = self.dense_layer_1(x)
        x = self.dense_layer_2(x)
        x = self.pooling_layer(x)
        logits = self.name_embedding_layer(x)

        try:
            del logits._keras_mask
        except AttributeError:
            pass

        return logits
    
    def get_config(self):
        config = {
            'num_layers': int(self.num_layers),
            'd_model': int(self.d_model),
            'num_heads': int(self.num_heads),
            'dff': int(self.dff),
            'input_vocab_size': int(self.input_vocab_size),
            'target_vocab_size': int(self.target_vocab_size),
            'dropout_rate': float(self.dropout_rate)
        }
        return config

In [17]:
class SiamNet(tf.keras.Model):
    def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
        super().__init__()
        self.num_heads = num_heads
        self.dff = dff
        self.input_vocab_size = input_vocab_size
        self.target_vocab_size = target_vocab_size
        self.dropout_rate = dropout_rate
        self.d_model = d_model
        self.num_layers = num_layers
        
        self.transformer = TransformerForSiam(num_layers=num_layers, d_model=d_model,
                                                num_heads=num_heads, dff=dff,
                                                input_vocab_size=input_vocab_size,
                                                target_vocab_size=target_vocab_size,
                                                dropout_rate=dropout_rate)
        
        self.final_output = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        first_name, second_name  = inputs
        
        first_name_output = self.transformer(first_name)
        second_name_output = self.transformer(second_name)
        
        emb_diff = tf.keras.layers.subtract([first_name_output, second_name_output])
        concat_vector = tf.keras.layers.concatenate([first_name_output, second_name_output, emb_diff])
        
        x = self.final_output(concat_vector)
        
        return x
    
    def get_config(self):
        config = {
            'num_layers': int(self.num_layers),
            'd_model': int(self.d_model),
            'num_heads': int(self.num_heads),
            'dff': int(self.dff),
            'input_vocab_size': int(self.input_vocab_size),
            'target_vocab_size': int(self.target_vocab_size),
            'dropout_rate': float(self.dropout_rate)
        }
        return config

### Training

In [18]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
    
    def get_config(self):
        config = {
            'd_model': int(self.d_model),
            'warmup_steps': int(self.warmup_steps),
        }
        return config

In [19]:
# def masked_loss(label, pred):
#     mask = label != 0
#     loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
#     loss = loss_object(label, pred)

#     mask = tf.cast(mask, dtype=loss.dtype)
#     loss *= mask

#     loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
#     return loss


# def masked_accuracy(label, pred):
#     pred = tf.argmax(pred, axis=2)
#     label = tf.cast(label, pred.dtype)
#     match = label == pred

#     mask = label != 0

#     match = match & mask

#     match = tf.cast(match, dtype=tf.float32)
#     mask = tf.cast(mask, dtype=tf.float32)
#     return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [19]:
def scheduler(epoch, curr_lr):
    """
    Setting up a exponentially decaying learning rate.
    """
    rampup_epochs = 2
    exp_decay = 0.17
    def lr(epoch, beg_lr, rampup_epochs, exp_decay):
        if epoch < rampup_epochs:
            return beg_lr*4*epoch
        else:
            return beg_lr * math.exp(-exp_decay * epoch)
    return lr(epoch, start_lr, rampup_epochs, exp_decay)

In [20]:
num_layers = 6
d_model = 64
dff = 128
num_heads = 8
dropout_rate = 0.15
num_epochs = 12
MAX_LEN=32
BATCH_SIZE=512

In [21]:
# Allow for use of multiple GPUs
mirrored_strategy = tf.distribute.MirroredStrategy()

with mirrored_strategy.scope():
    
    enc_weights = tf.keras.models.load_model("./transformer_data/models/20230418_8_6_64_128_15/model_epoch12ckpt/", 
                                                    compile=False).encoder.get_weights()
    
    dec_weights = tf.keras.models.load_model("./transformer_data/models/20230418_8_6_64_128_15/model_epoch12ckpt/", 
                                                    compile=False).decoder.get_weights()
    
    siamese_network = SiamNet(
        num_layers=num_layers,
        d_model=d_model,
        num_heads=num_heads,
        dff=dff,
        input_vocab_size=name_tokenizer.vocab_size,
        target_vocab_size=name_tokenizer.vocab_size,
        dropout_rate=dropout_rate)
    
    learning_rate = CustomSchedule(d_model)

    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                         epsilon=1e-9)

    siamese_network.compile(
        loss=tf.keras.losses.BinaryCrossentropy(name='binary_crossentropy'),
        optimizer=optimizer,
        metrics=[tf.keras.metrics.BinaryAccuracy()])

    curr_date = datetime.now().strftime("%Y%m%d")

    filepath_1 = f"./embedding_model_data/models/{curr_date}_{num_heads}_{num_layers}_{d_model}_{dff}_{int(dropout_rate*100)}/" \


    filepath = filepath_1 + "model_epoch{epoch:02d}ckpt"

    # Adding in checkpointing
    model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', 
                                                          verbose=0, save_best_only=False,
                                                          save_weights_only=False, mode='auto',
                                                          save_freq='epoch')

    # Adding in early stopping
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=4)

#     lr_schedule = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)

    callbacks = [model_checkpoint, early_stopping]
    
    train_batches = get_dataset("./embedding_model_data/tfrecords/", data_type='train', batch_size=BATCH_SIZE)
    val_batches = get_dataset("./embedding_model_data/tfrecords/", data_type='val', batch_size=BATCH_SIZE)
    
    for (name_1, name_2), labels in val_batches.take(1):
        pass
        
    output = siamese_network((name_1, name_2))
    
    siamese_network.transformer.encoder.set_weights(enc_weights)
    siamese_network.transformer.decoder.set_weights(dec_weights)
    siamese_network.transformer.encoder.trainable = False
    siamese_network.transformer.decoder.trainable = False
    
    
    

2023-04-21 14:38:26.227095: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 14:38:26.228167: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 14:38:26.228382: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 14:38:26.229074: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/

2023-04-21 14:38:56.922179: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8302


In [22]:
siamese_network.summary()

Model: "siam_net"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer_for_siam (Trans  multiple                 6207040   
 formerForSiam)                                                  
                                                                 
 dense_27 (Dense)            multiple                  193       
                                                                 
Total params: 6,207,233
Trainable params: 1,181,953
Non-trainable params: 5,025,280
_________________________________________________________________


In [23]:
history = siamese_network.fit(val_batches,
                epochs=num_epochs,
                validation_data=val_batches,
                callbacks=[callbacks])

Epoch 1/12


2023-04-21 14:40:04.902027: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f8a2000e7c0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-04-21 14:40:04.902099: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Tesla V100-SXM2-16GB, Compute Capability 7.0
2023-04-21 14:40:04.976147: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-04-21 14:40:05.651116: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


    120/Unknown - 100s 325ms/step - loss: 0.9275 - binary_accuracy: 0.5675

KeyboardInterrupt: 

In [29]:
import json

In [30]:
json.dump(str(history.history), open(f"{filepath_1}_{num_epochs}EPOCHS_HISTORY.json", 'w+'))

In [31]:
history.history

### Transforming with Model

In [24]:
MAX_TOKENS=32
MAX_LEN=32

In [23]:
temp_model = tf.keras.models.load_model("./models/20230418_8_6_64_128_15/model_epoch12ckpt/", 
                                                    compile=False)



In [24]:
enc_weights = temp_model.encoder.get_weights()
dec_weights = temp_model.decoder.get_weights()
final_layer_weights = temp_model.final_layer.get_weights()

In [25]:
transformer.encoder.set_weights(enc_weights)
transformer.decoder.set_weights(dec_weights)
transformer.final_layer.set_weights(final_layer_weights)
transformer.trainable = False

In [32]:
# embedding_model = tf.keras.Model(inputs=(x,y), 
#                            outputs=transformer.decoder.dec_layers[-1])

In [29]:
class Translator(tf.Module):
    def __init__(self, tokenizer, transformer):
        self.tokenizer = tokenizer
        self.transformer = transformer

    def __call__(self, sentence, max_length=MAX_TOKENS):
        # The input sentence is Portuguese, hence adding the `[START]` and `[END]` tokens.
        assert isinstance(sentence, tf.Tensor)
        if len(sentence.shape) == 0:
            sentence = sentence[tf.newaxis]

        sentence = tf.keras.utils.pad_sequences(self.tokenizer([x.decode('utf-8') 
                                                    for x in sentence.numpy().tolist()])['input_ids'], 
                                                maxlen=MAX_TOKENS, 
                                                dtype='int32',
                                                padding='post',
                                                truncating='post',
                                                value=0)

        encoder_input = sentence

        # As the output language is English, initialize the output with the
        # English `[START]` token.
        start_end = tf.constant(self.tokenizer(['[START][END]'])['input_ids'][0])
        start = start_end[0][tf.newaxis]
        end = start_end[1][tf.newaxis]

        # `tf.TensorArray` is required here (instead of a Python list), so that the
        # dynamic-loop can be traced by `tf.function`.
        output_array = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
        output_array = output_array.write(0, start)

        for i in tf.range(max_length):
            output = tf.transpose(output_array.stack())
            predictions = self.transformer([encoder_input, output], training=False)

            # Select the last token from the `seq_len` dimension.
            predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

            predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

            # Concatenate the `predicted_id` to the output which is given to the
            # decoder as its input.
            output_array = output_array.write(i+1, predicted_id[0])

            if predicted_id == end:
                break

        output = tf.transpose(output_array.stack())
        # The output shape is `(1, tokens)`.
        print(output)
        text = self.tokenizer.decode(output.numpy().tolist()[0], skip_special_tokens=True, 
                 clean_up_tokenization_spaces=True).replace(" ##", "").replace(" - ", "-")

        tokens = output.numpy().tolist()

        # `tf.function` prevents us from using the attention_weights that were
        # calculated on the last iteration of the loop.
        # So, recalculate them outside the loop.
        self.transformer([encoder_input, output[:,:-1]], training=False)
        attention_weights = self.transformer.decoder.last_attn_scores

        return text, tokens, attention_weights

NameError: name 'MAX_TOKENS' is not defined

In [57]:
name_tokenizer = PreTrainedTokenizerFast(tokenizer_file="name_transformer_wordpiece_tokenizer")

In [None]:
# name_tokenizer.decode(output_labels[4][:10], skip_special_tokens=True, 
#                  clean_up_tokenization_spaces=True).replace(" ##", "").replace(" - ", "-")

In [25]:
translator = Translator(name_tokenizer, trained_model)

In [26]:
def print_translation(sentence, tokens, ground_truth):
    print(f'{"Input:":15s}: {sentence}')
    print(f'{"Prediction":15s}: {tokens}')
    print(f'{"Ground truth":15s}: {ground_truth}')

In [None]:
sentence = '[START]Oliver, John[END]'
ground_truth = 'John Oliver'

translated_text, translated_tokens, attention_weights = translator(
    tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)

In [56]:
predictions

<tf.Tensor: shape=(1, 32, 15264), dtype=float32, numpy=
array([[[-10.503152  , -10.504201  ,   6.169404  , ...,  -8.845235  ,
          -8.071333  ,  -5.4130135 ],
        [ -5.9436817 ,  -5.9438343 ,  -1.4243493 , ...,   2.472556  ,
           4.703054  ,   0.6202928 ],
        [ -5.944227  ,  -5.9443827 ,  -1.4250772 , ...,   2.4673195 ,
           4.7044845 ,   0.62325895],
        ...,
        [ -5.96473   ,  -5.9648876 ,  -1.377768  , ...,   2.4583683 ,
           4.662452  ,   0.54505575],
        [ -5.965795  ,  -5.9659495 ,  -1.3787947 , ...,   2.4593058 ,
           4.6576986 ,   0.5326568 ],
        [ -5.965687  ,  -5.9658384 ,  -1.3768545 , ...,   2.4614625 ,
           4.658391  ,   0.53515875]]], dtype=float32)>

In [26]:
test_names = ["J.R. Tolkien",
              "Tolkien, J.R.",
              "Justin Earl Tolkien", 
              "Jarvis Richard Tolkien",
              "Tolkien, Justin", 
              "Tolkien JE", 
              "Max Trout", 
              "Maximus Trout", 
              "Maxe Trout", 
              "Mara Trout", 
              "M.R. Trout",
              "Trout MRF", 
              "Gooding, Emily", 
              "Gooding, E.Y. MD", 
              "Jarvis Richard James"]

In [27]:
embs = []
for test_name in test_names:
    print(test_name)
    start_end = tf.constant(name_tokenizer(['[START][END]'])['input_ids'][0])
    start = start_end[0].numpy()
    end = start_end[1].numpy()
    encoder_input = name_tokenizer([f'[START]{test_name}[END]'])['input_ids'][0]
    encoder_input = tf.convert_to_tensor([encoder_input + [0]*(MAX_LEN-len(encoder_input))])
    final_output = [start]
    for i in range(MAX_LEN):
        output = tf.convert_to_tensor([final_output + [0]*(MAX_LEN-len(final_output))])
        predictions = transformer((encoder_input, output), training=False)

        # Select the last token
        predicted_id = tf.argmax(predictions[0][i]).numpy()

        # Add to output
        final_output.append(predicted_id)

        if predicted_id == end:
            break
    print("----Decoded name: ", name_tokenizer.decode(final_output, skip_special_tokens=True, 
                 clean_up_tokenization_spaces=True).replace(" ##", "").replace(" - ", "-"))
    _ = transformer((encoder_input, tf.convert_to_tensor([final_output[:1] + [0]*(MAX_LEN-len(final_output[:1]))])), 
            training=False)
    
    embs.append(transformer.decoder.dec_layers[-1].context_data[0][0].numpy())

J.R. Tolkien
----Decoded name:  J. R. Tolkien
Tolkien, J.R.
----Decoded name:  J. R. Tolkien
Justin Earl Tolkien
----Decoded name:  Justin Earl Tolkien
Jarvis Richard Tolkien
----Decoded name:  Richard Tolkien Jarvis
Tolkien, Justin
----Decoded name:  Justin Tolkien
Tolkien JE
----Decoded name:  J. E. Tolkien
Max Trout
----Decoded name:  Max Trout
Maximus Trout
----Decoded name:  Maximus Trout
Maxe Trout
----Decoded name:  Maxe Trout
Mara Trout
----Decoded name:  Mara Trout
M.R. Trout
----Decoded name:  M. R. Trout
Trout MRF
----Decoded name:  M. R. F. Trout
Gooding, Emily
----Decoded name:  Emily Gooding
Gooding, E.Y. MD
----Decoded name:  E. Y. Gooding
Jarvis Richard James
----Decoded name:  Richard James Jarvis


In [62]:
from sklearn.metrics.pairwise import cosine_similarity

In [77]:
for ith, (emb, test_name) in enumerate(zip(embs, test_names)):
    print(ith, test_name)
    for emb_1, test_name_1 in zip(embs, test_names):
        print(f"-------{test_name_1}: {round(cosine_similarity(emb.reshape(1, -1), emb_1.reshape(1, -1))[0][0], 4)}")
    print("")
    print("")

0 J.R. Tolkien
-------J.R. Tolkien: 1.0
-------Tolkien, J.R.: 0.6725000143051147
-------Justin Earl Tolkien: 0.5468999743461609
-------Jarvis Richard Tolkien: 0.5748000144958496
-------Tolkien, Justin: 0.43149998784065247
-------Tolkien JE: 0.5461999773979187
-------Max Trout: 0.24279999732971191
-------Maximus Trout: 0.25119999051094055
-------Maxe Trout: 0.2468000054359436
-------Mara Trout: 0.1137000024318695
-------M.R. Trout: 0.41499999165534973
-------Trout MRF: 0.16910000145435333
-------Gooding, Emily: 0.29100000858306885
-------Gooding, E.Y. MD: 0.27379998564720154
-------Jarvis Richard James: 0.5273000001907349


1 Tolkien, J.R.
-------J.R. Tolkien: 0.6725000143051147
-------Tolkien, J.R.: 1.0
-------Justin Earl Tolkien: 0.6068999767303467
-------Jarvis Richard Tolkien: 0.47290000319480896
-------Tolkien, Justin: 0.43050000071525574
-------Tolkien JE: 0.5874999761581421
-------Max Trout: 0.40700000524520874
-------Maximus Trout: 0.44209998846054077
-------Maxe Trout: 0.404799

## Testing different embeddings

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
def test_embeddings(embs_to_test, names_to_test):
    for ith, (emb, test_name) in enumerate(zip(embs_to_test, names_to_test)):
        all_scores = []
        print(ith, test_name)
        for emb_1, test_name_1 in zip(embs, names_to_test):
            all_scores.append(round(cosine_similarity(emb.reshape(1, -1), emb_1.reshape(1, -1))[0][0], 4))
        
        ind = np.argpartition(np.array(all_scores), -5)[-5:]
        
        top_5 = ind[np.argsort(np.array(all_scores)[ind])].tolist()[::-1]
        
        for top_ind in top_5[1:]:
            print(f"---------- {names_to_test[top_ind]} - {all_scores[top_ind]}")
        print("")
        print("")

### [START] token decoder output

##### Last layer only

In [62]:
embs = []
for test_name in test_names:
    print(test_name)
    start_end = tf.constant(name_tokenizer(['[START][END]'])['input_ids'][0])
    start = start_end[0].numpy()
    end = start_end[1].numpy()
    encoder_input = name_tokenizer([f'[START]{test_name}[END]'])['input_ids'][0]
    encoder_input = tf.convert_to_tensor([encoder_input + [0]*(MAX_LEN-len(encoder_input))])
    final_output = [start]
    for i in range(MAX_LEN):
        output = tf.convert_to_tensor([final_output + [0]*(MAX_LEN-len(final_output))])
        predictions = transformer((encoder_input, output), training=False)

        # Select the last token
        predicted_id = tf.argmax(predictions[0][i]).numpy()

        # Add to output
        final_output.append(predicted_id)

        if predicted_id == end:
            break
    print("----Decoded name: ", name_tokenizer.decode(final_output, skip_special_tokens=True, 
                 clean_up_tokenization_spaces=True).replace(" ##", "").replace(" - ", "-"))
    _ = transformer((encoder_input, tf.convert_to_tensor([final_output[:1] + [0]*(MAX_LEN-len(final_output[:1]))])), 
            training=False)
    
    embs.append(transformer.decoder.dec_layers[-1].context_data[0][0].numpy())

J.R. Tolkien
----Decoded name:  J. R. Tolkien
Tolkien, J.R.
----Decoded name:  J. R. Tolkien
Justin Earl Tolkien
----Decoded name:  Justin Earl Tolkien
Jarvis Richard Tolkien
----Decoded name:  Jarvis Richard Tolkien
Tolkien, Justin
----Decoded name:  Justin Tolkien
Tolkien JE
----Decoded name:  J. E. Tolkien
Max Trout
----Decoded name:  Max Trout
Maximus Trout
----Decoded name:  Maximus Trout
Maxe Trout
----Decoded name:  Maxe Trout
Mara Trout
----Decoded name:  Mara Trout
M.R. Trout
----Decoded name:  M. R. Trout
Trout MRF
----Decoded name:  M. R. F. Trout
Gooding, Emily
----Decoded name:  Emily Gooding
Gooding, E.Y. MD
----Decoded name:  E. Y. Gooding
Jarvis Richard James
----Decoded name:  Richard James Jarvis


In [63]:
test_embeddings(embs, test_names)

0 J.R. Tolkien
---------- Tolkien, J.R. - 0.6912999749183655
---------- Justin Earl Tolkien - 0.6347000002861023
---------- Tolkien JE - 0.611299991607666
---------- M.R. Trout - 0.5792999863624573


1 Tolkien, J.R.
---------- J.R. Tolkien - 0.6912999749183655
---------- Tolkien JE - 0.525600016117096
---------- Gooding, E.Y. MD - 0.4986000061035156
---------- Mara Trout - 0.47290000319480896


2 Justin Earl Tolkien
---------- Tolkien, Justin - 0.7226999998092651
---------- J.R. Tolkien - 0.6347000002861023
---------- Gooding, E.Y. MD - 0.5054000020027161
---------- Maximus Trout - 0.5034000277519226


3 Jarvis Richard Tolkien
---------- Jarvis Richard James - 0.9329000115394592
---------- J.R. Tolkien - 0.5389999747276306
---------- Mara Trout - 0.5076000094413757
---------- Tolkien, J.R. - 0.44119998812675476


4 Tolkien, Justin
---------- Justin Earl Tolkien - 0.7226999998092651
---------- J.R. Tolkien - 0.37369999289512634
---------- Maximus Trout - 0.30410000681877136
---------- J

#### Concat last 4 layers

In [64]:
embs = []
for test_name in test_names:
    print(test_name)
    start_end = tf.constant(name_tokenizer(['[START][END]'])['input_ids'][0])
    start = start_end[0].numpy()
    end = start_end[1].numpy()
    encoder_input = name_tokenizer([f'[START]{test_name}[END]'])['input_ids'][0]
    encoder_input = tf.convert_to_tensor([encoder_input + [0]*(MAX_LEN-len(encoder_input))])
    final_output = [start]
    for i in range(MAX_LEN):
        output = tf.convert_to_tensor([final_output + [0]*(MAX_LEN-len(final_output))])
        predictions = transformer((encoder_input, output), training=False)

        # Select the last token
        predicted_id = tf.argmax(predictions[0][i]).numpy()

        # Add to output
        final_output.append(predicted_id)

        if predicted_id == end:
            break
    print("----Decoded name: ", name_tokenizer.decode(final_output, skip_special_tokens=True, 
                 clean_up_tokenization_spaces=True).replace(" ##", "").replace(" - ", "-"))
    _ = transformer((encoder_input, tf.convert_to_tensor([final_output[:1] + [0]*(MAX_LEN-len(final_output[:1]))])), 
            training=False)
    
    embs.append(np.concatenate((transformer.decoder.dec_layers[-4].context_data[0][0].numpy(),
                    transformer.decoder.dec_layers[-3].context_data[0][0].numpy(),
                    transformer.decoder.dec_layers[-2].context_data[0][0].numpy(),
                    transformer.decoder.dec_layers[-1].context_data[0][0].numpy())))

J.R. Tolkien
----Decoded name:  J. R. Tolkien
Tolkien, J.R.
----Decoded name:  J. R. Tolkien
Justin Earl Tolkien
----Decoded name:  Justin Earl Tolkien
Jarvis Richard Tolkien
----Decoded name:  Jarvis Richard Tolkien
Tolkien, Justin
----Decoded name:  Justin Tolkien
Tolkien JE
----Decoded name:  J. E. Tolkien
Max Trout
----Decoded name:  Max Trout
Maximus Trout
----Decoded name:  Maximus Trout
Maxe Trout
----Decoded name:  Maxe Trout
Mara Trout
----Decoded name:  Mara Trout
M.R. Trout
----Decoded name:  M. R. Trout
Trout MRF
----Decoded name:  M. R. F. Trout
Gooding, Emily
----Decoded name:  Emily Gooding
Gooding, E.Y. MD
----Decoded name:  E. Y. Gooding
Jarvis Richard James
----Decoded name:  Richard James Jarvis


In [65]:
test_embeddings(embs, test_names)

0 J.R. Tolkien
---------- Tolkien, J.R. - 0.6916999816894531
---------- Justin Earl Tolkien - 0.6344000101089478
---------- Tolkien JE - 0.6119999885559082
---------- M.R. Trout - 0.5781000256538391


1 Tolkien, J.R.
---------- J.R. Tolkien - 0.6916999816894531
---------- Tolkien JE - 0.5270000100135803
---------- Gooding, E.Y. MD - 0.49799999594688416
---------- Mara Trout - 0.47189998626708984


2 Justin Earl Tolkien
---------- Tolkien, Justin - 0.7232999801635742
---------- J.R. Tolkien - 0.6344000101089478
---------- Gooding, E.Y. MD - 0.5044999718666077
---------- Maximus Trout - 0.5030999779701233


3 Jarvis Richard Tolkien
---------- Jarvis Richard James - 0.9330999851226807
---------- J.R. Tolkien - 0.5388000011444092
---------- Mara Trout - 0.5069000124931335
---------- Tolkien, J.R. - 0.4415999948978424


4 Tolkien, Justin
---------- Justin Earl Tolkien - 0.7232999801635742
---------- J.R. Tolkien - 0.3741999864578247
---------- Maximus Trout - 0.3043000102043152
---------- J

### Pool full decoder output

#### Last layer only

In [66]:
embs = []
for test_name in test_names:
    print(test_name)
    start_end = tf.constant(name_tokenizer(['[START][END]'])['input_ids'][0])
    start = start_end[0].numpy()
    end = start_end[1].numpy()
    encoder_input = name_tokenizer([f'[START]{test_name}[END]'])['input_ids'][0]
    encoder_input = tf.convert_to_tensor([encoder_input + [0]*(MAX_LEN-len(encoder_input))])
    final_output = [start]
    for i in range(MAX_LEN):
        output = tf.convert_to_tensor([final_output + [0]*(MAX_LEN-len(final_output))])
        predictions = transformer((encoder_input, output), training=False)

        # Select the last token
        predicted_id = tf.argmax(predictions[0][i]).numpy()

        # Add to output
        final_output.append(predicted_id)

        if predicted_id == end:
            break
    final_output_len = len(final_output) - 1
    print("----Decoded name: ", name_tokenizer.decode(final_output, skip_special_tokens=True, 
                 clean_up_tokenization_spaces=True).replace(" ##", "").replace(" - ", "-"))
    _ = transformer((encoder_input, tf.convert_to_tensor([final_output[:-1] + 
                                                          [0]*(MAX_LEN-len(final_output[:-1]))])), 
            training=False)
    
    embs.append(np.mean(transformer.decoder.dec_layers[-1].context_data[0][:final_output_len].numpy(), axis=0))

J.R. Tolkien
----Decoded name:  J. R. Tolkien
Tolkien, J.R.
----Decoded name:  J. R. Tolkien
Justin Earl Tolkien
----Decoded name:  Justin Earl Tolkien
Jarvis Richard Tolkien
----Decoded name:  Jarvis Richard Tolkien
Tolkien, Justin
----Decoded name:  Justin Tolkien
Tolkien JE
----Decoded name:  J. E. Tolkien
Max Trout
----Decoded name:  Max Trout
Maximus Trout
----Decoded name:  Maximus Trout
Maxe Trout
----Decoded name:  Maxe Trout
Mara Trout
----Decoded name:  Mara Trout
M.R. Trout
----Decoded name:  M. R. Trout
Trout MRF
----Decoded name:  M. R. F. Trout
Gooding, Emily
----Decoded name:  Emily Gooding
Gooding, E.Y. MD
----Decoded name:  E. Y. Gooding
Jarvis Richard James
----Decoded name:  Richard James Jarvis


In [67]:
test_embeddings(embs, test_names)

0 J.R. Tolkien
---------- Justin Earl Tolkien - 0.8522999882698059
---------- Tolkien, J.R. - 0.7605999708175659
---------- Tolkien JE - 0.7462000250816345
---------- Jarvis Richard Tolkien - 0.7218000292778015


1 Tolkien, J.R.
---------- J.R. Tolkien - 0.7605999708175659
---------- Tolkien JE - 0.7236999869346619
---------- Tolkien, Justin - 0.6431000232696533
---------- Gooding, E.Y. MD - 0.5831000208854675


2 Justin Earl Tolkien
---------- J.R. Tolkien - 0.8522999882698059
---------- Jarvis Richard Tolkien - 0.7939000129699707
---------- Tolkien, Justin - 0.6883999705314636
---------- Tolkien JE - 0.6370999813079834


3 Jarvis Richard Tolkien
---------- Justin Earl Tolkien - 0.7939000129699707
---------- J.R. Tolkien - 0.7218000292778015
---------- Tolkien, Justin - 0.6274999976158142
---------- Jarvis Richard James - 0.5228000283241272


4 Tolkien, Justin
---------- Justin Earl Tolkien - 0.6883999705314636
---------- J.R. Tolkien - 0.6549999713897705
---------- Tolkien, J.R. - 0.

#### Concat last 4 layers

In [68]:
embs = []
for test_name in test_names:
    print(test_name)
    start_end = tf.constant(name_tokenizer(['[START][END]'])['input_ids'][0])
    start = start_end[0].numpy()
    end = start_end[1].numpy()
    encoder_input = name_tokenizer([f'[START]{test_name}[END]'])['input_ids'][0]
    encoder_input = tf.convert_to_tensor([encoder_input + [0]*(MAX_LEN-len(encoder_input))])
    final_output = [start]
    for i in range(MAX_LEN):
        output = tf.convert_to_tensor([final_output + [0]*(MAX_LEN-len(final_output))])
        predictions = transformer((encoder_input, output), training=False)

        # Select the last token
        predicted_id = tf.argmax(predictions[0][i]).numpy()

        # Add to output
        final_output.append(predicted_id)

        if predicted_id == end:
            break
    final_output_len = len(final_output) - 1
    print("----Decoded name: ", name_tokenizer.decode(final_output, skip_special_tokens=True, 
                 clean_up_tokenization_spaces=True).replace(" ##", "").replace(" - ", "-"))
    _ = transformer((encoder_input, tf.convert_to_tensor([final_output[:-1] + 
                                                          [0]*(MAX_LEN-len(final_output[:-1]))])), 
            training=False)
    
    embs.append(np.mean(transformer.decoder.dec_layers[-4].context_data[0][:final_output_len].numpy(), axis=0))
    embs.append(np.mean(transformer.decoder.dec_layers[-3].context_data[0][:final_output_len].numpy(), axis=0))
    embs.append(np.mean(transformer.decoder.dec_layers[-2].context_data[0][:final_output_len].numpy(), axis=0))
    embs.append(np.mean(transformer.decoder.dec_layers[-1].context_data[0][:final_output_len].numpy(), axis=0))

J.R. Tolkien
----Decoded name:  J. R. Tolkien
Tolkien, J.R.
----Decoded name:  J. R. Tolkien
Justin Earl Tolkien
----Decoded name:  Justin Earl Tolkien
Jarvis Richard Tolkien
----Decoded name:  Jarvis Richard Tolkien
Tolkien, Justin
----Decoded name:  Justin Tolkien
Tolkien JE
----Decoded name:  J. E. Tolkien
Max Trout
----Decoded name:  Max Trout
Maximus Trout
----Decoded name:  Maximus Trout
Maxe Trout
----Decoded name:  Maxe Trout
Mara Trout
----Decoded name:  Mara Trout
M.R. Trout
----Decoded name:  M. R. Trout
Trout MRF
----Decoded name:  M. R. F. Trout
Gooding, Emily
----Decoded name:  Emily Gooding
Gooding, E.Y. MD
----Decoded name:  E. Y. Gooding
Jarvis Richard James
----Decoded name:  Richard James Jarvis


In [69]:
test_embeddings(embs, test_names)

0 J.R. Tolkien
---------- Maxe Trout - 0.819599986076355
---------- Tolkien, Justin - 0.8172000050544739
---------- Gooding, Emily - 0.8073999881744385
---------- Tolkien, J.R. - 0.7820000052452087


1 Tolkien, J.R.
---------- Mara Trout - 0.8208000063896179
---------- Gooding, E.Y. MD - 0.8140000104904175
---------- Tolkien JE - 0.78329998254776
---------- J.R. Tolkien - 0.7820000052452087


2 Justin Earl Tolkien
---------- M.R. Trout - 0.8011000156402588
---------- Jarvis Richard James - 0.7623999714851379
---------- Max Trout - 0.7620999813079834
---------- Tolkien, J.R. - 0.6766999959945679


3 Jarvis Richard Tolkien
---------- Trout MRF - 0.8522999882698059
---------- Maximus Trout - 0.7605999708175659
---------- Justin Earl Tolkien - 0.446399986743927
---------- M.R. Trout - 0.27639999985694885


4 Tolkien, Justin
---------- J.R. Tolkien - 0.8172000050544739
---------- Maxe Trout - 0.8046000003814697
---------- Gooding, Emily - 0.7900999784469604
---------- Tolkien JE - 0.7624999

### Pool full encoder output

#### Last layer only

In [70]:
embs = []
for test_name in test_names:
    print(test_name)
    start_end = tf.constant(name_tokenizer(['[START][END]'])['input_ids'][0])
    start = start_end[0].numpy()
    end = start_end[1].numpy()
    encoder_input = name_tokenizer([f'[START]{test_name}[END]'])['input_ids'][0]
    final_input_len = len(encoder_input)
    encoder_input = tf.convert_to_tensor([encoder_input + [0]*(MAX_LEN-len(encoder_input))])
    final_output = [start]
    for i in range(MAX_LEN):
        output = tf.convert_to_tensor([final_output + [0]*(MAX_LEN-len(final_output))])
        predictions = transformer((encoder_input, output), training=False)

        # Select the last token
        predicted_id = tf.argmax(predictions[0][i]).numpy()

        # Add to output
        final_output.append(predicted_id)

        if predicted_id == end:
            break
    print("----Decoded name: ", name_tokenizer.decode(final_output, skip_special_tokens=True, 
                 clean_up_tokenization_spaces=True).replace(" ##", "").replace(" - ", "-"))
    _ = transformer((encoder_input, tf.convert_to_tensor([final_output[:-1] + 
                                                          [0]*(MAX_LEN-len(final_output[:-1]))])), 
            training=False)
    
    embs.append(np.mean(transformer.encoder.enc_layers[-1].context_data[0][:final_input_len].numpy(), axis=0))

J.R. Tolkien
----Decoded name:  J. R. Tolkien
Tolkien, J.R.
----Decoded name:  J. R. Tolkien
Justin Earl Tolkien
----Decoded name:  Justin Earl Tolkien
Jarvis Richard Tolkien
----Decoded name:  Jarvis Richard Tolkien
Tolkien, Justin
----Decoded name:  Justin Tolkien
Tolkien JE
----Decoded name:  J. E. Tolkien
Max Trout
----Decoded name:  Max Trout
Maximus Trout
----Decoded name:  Maximus Trout
Maxe Trout
----Decoded name:  Maxe Trout
Mara Trout
----Decoded name:  Mara Trout
M.R. Trout
----Decoded name:  M. R. Trout
Trout MRF
----Decoded name:  M. R. F. Trout
Gooding, Emily
----Decoded name:  Emily Gooding
Gooding, E.Y. MD
----Decoded name:  E. Y. Gooding
Jarvis Richard James
----Decoded name:  Richard James Jarvis


In [71]:
test_embeddings(embs, test_names)

0 J.R. Tolkien
---------- Tolkien, J.R. - 0.7846999764442444
---------- M.R. Trout - 0.7739999890327454
---------- Justin Earl Tolkien - 0.7416999936103821
---------- Jarvis Richard Tolkien - 0.728600025177002


1 Tolkien, J.R.
---------- Gooding, E.Y. MD - 0.8529000282287598
---------- Tolkien, Justin - 0.8291000127792358
---------- Gooding, Emily - 0.817300021648407
---------- Tolkien JE - 0.7897999882698059


2 Justin Earl Tolkien
---------- Jarvis Richard Tolkien - 0.9140999913215637
---------- Tolkien, Justin - 0.8712999820709229
---------- Tolkien JE - 0.795199990272522
---------- Maximus Trout - 0.7799000144004822


3 Jarvis Richard Tolkien
---------- Justin Earl Tolkien - 0.9140999913215637
---------- Jarvis Richard James - 0.8639000058174133
---------- Tolkien, Justin - 0.8409000039100647
---------- Gooding, Emily - 0.7947999835014343


4 Tolkien, Justin
---------- Gooding, Emily - 0.8968999981880188
---------- Justin Earl Tolkien - 0.8712999820709229
---------- Jarvis Richard

#### Concat last 4 layers

In [72]:
embs = []
for test_name in test_names:
    print(test_name)
    start_end = tf.constant(name_tokenizer(['[START][END]'])['input_ids'][0])
    start = start_end[0].numpy()
    end = start_end[1].numpy()
    encoder_input = name_tokenizer([f'[START]{test_name}[END]'])['input_ids'][0]
    final_input_len = len(encoder_input)
    encoder_input = tf.convert_to_tensor([encoder_input + [0]*(MAX_LEN-len(encoder_input))])
    final_output = [start]
    for i in range(MAX_LEN):
        output = tf.convert_to_tensor([final_output + [0]*(MAX_LEN-len(final_output))])
        predictions = transformer((encoder_input, output), training=False)

        # Select the last token
        predicted_id = tf.argmax(predictions[0][i]).numpy()

        # Add to output
        final_output.append(predicted_id)

        if predicted_id == end:
            break
    
    print("----Decoded name: ", name_tokenizer.decode(final_output, skip_special_tokens=True, 
                 clean_up_tokenization_spaces=True).replace(" ##", "").replace(" - ", "-"))
    _ = transformer((encoder_input, tf.convert_to_tensor([final_output[:-1] + 
                                                          [0]*(MAX_LEN-len(final_output[:-1]))])), 
            training=False)
    
    embs.append(np.mean(transformer.encoder.enc_layers[-4].context_data[0][1:final_input_len+1].numpy(), axis=0))
    embs.append(np.mean(transformer.encoder.enc_layers[-3].context_data[0][1:final_input_len+1].numpy(), axis=0))
    embs.append(np.mean(transformer.encoder.enc_layers[-2].context_data[0][1:final_input_len+1].numpy(), axis=0))
    embs.append(np.mean(transformer.encoder.enc_layers[-1].context_data[0][1:final_input_len+1].numpy(), axis=0))

J.R. Tolkien
----Decoded name:  J. R. Tolkien
Tolkien, J.R.
----Decoded name:  J. R. Tolkien
Justin Earl Tolkien
----Decoded name:  Justin Earl Tolkien
Jarvis Richard Tolkien
----Decoded name:  Jarvis Richard Tolkien
Tolkien, Justin
----Decoded name:  Justin Tolkien
Tolkien JE
----Decoded name:  J. E. Tolkien
Max Trout
----Decoded name:  Max Trout
Maximus Trout
----Decoded name:  Maximus Trout
Maxe Trout
----Decoded name:  Maxe Trout
Mara Trout
----Decoded name:  Mara Trout
M.R. Trout
----Decoded name:  M. R. Trout
Trout MRF
----Decoded name:  M. R. F. Trout
Gooding, Emily
----Decoded name:  Emily Gooding
Gooding, E.Y. MD
----Decoded name:  E. Y. Gooding
Jarvis Richard James
----Decoded name:  Richard James Jarvis


In [73]:
test_embeddings(embs, test_names)

0 J.R. Tolkien
---------- Tolkien, Justin - 0.9211000204086304
---------- Tolkien, J.R. - 0.8880000114440918
---------- Tolkien JE - 0.8255000114440918
---------- Maxe Trout - 0.8184999823570251


1 Tolkien, J.R.
---------- J.R. Tolkien - 0.8880000114440918
---------- Justin Earl Tolkien - 0.8725000023841858
---------- Tolkien JE - 0.8712000250816345
---------- Tolkien, Justin - 0.8111000061035156


2 Justin Earl Tolkien
---------- Tolkien, J.R. - 0.8725000023841858
---------- Max Trout - 0.8222000002861023
---------- Jarvis Richard Tolkien - 0.7678999900817871
---------- J.R. Tolkien - 0.7429999709129333


3 Jarvis Richard Tolkien
---------- Maximus Trout - 0.7853999733924866
---------- Justin Earl Tolkien - 0.7678999900817871
---------- Trout MRF - 0.7423999905586243
---------- Tolkien, J.R. - 0.66839998960495


4 Tolkien, Justin
---------- J.R. Tolkien - 0.9211000204086304
---------- Tolkien JE - 0.896399974822998
---------- Tolkien, J.R. - 0.8111000061035156
---------- Gooding, Emi