In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [2]:
class TransformerEmbedding(keras.layers.Layer):
    """Token embedding whose weight matrix doubles as the output projection.

    The layer owns a single ``(vocab_size, embedding_size)`` matrix ``w``.

    * Default call: integer token ids are one-hot encoded and multiplied by
      ``w * sqrt(embedding_size)``.
    * ``projection=True`` call: the input is multiplied by the transpose of
      ``w`` (unscaled), mapping embedding-sized vectors back to vocabulary-sized
      vectors at every position.

    Args:
        vocab_size: Number of rows of the embedding matrix.
        embedding_size: Number of columns of the embedding matrix.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, vocab_size, embedding_size, **kwargs):
        super(TransformerEmbedding, self).__init__(**kwargs)
        self.x_dim = vocab_size
        self.y_dim = embedding_size

    def build(self, input_shape):
        """Create the shared embedding/projection matrix."""
        self.w = self.add_weight(shape=(self.x_dim, self.y_dim), initializer="random_normal", trainable=True)

    def call(self, input_layer, **kwargs):
        """Embed token ids, or project back to the vocabulary when ``projection=True``."""
        if kwargs.get('projection', False):
            # Reuse the embedding matrix (transposed) as the final linear projection.
            return tf.matmul(input_layer, tf.transpose(tf.expand_dims(self.w, axis=0), perm=[0,2,1]))
        # Normal embedding path: one-hot lookup, scaled by sqrt(embedding_size).
        # The scale is derived from the layer's own size instead of a literal 512.
        one_hot = tf.keras.backend.one_hot(input_layer, self.x_dim)
        return tf.matmul(one_hot, tf.expand_dims(self.w * np.sqrt(self.y_dim), axis=0))

    def get_config(self):
        """Return the constructor arguments merged into the base layer config."""
        config = super(TransformerEmbedding, self).get_config()
        config.update({"vocab_size" : self.x_dim,
                       "embedding_size" : self.y_dim})
        return config

In [3]:
class MultiheadAttention(keras.layers.Layer):
    """Multi-head scaled dot-product attention followed by dropout, residual add and normalisation.

    The layer is called with a list ``[queries, keys, values]``. Each of the
    ``h`` heads projects the three inputs to ``projection_dim`` features,
    applies scaled dot-product attention (optionally with a causal mask), and
    the concatenated head outputs are projected back to the query feature size.

    Args:
        h: Number of attention heads.
        projection_dim: Feature size of each head's query/key/value projection.
        mask: When true, positions may not attend to later positions.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, h=8, projection_dim=64, mask=False, **kwargs):
        super(MultiheadAttention, self).__init__(**kwargs)
        self.h = h
        self.projection_dim = projection_dim
        self.mask = mask

        self.concat = keras.layers.Concatenate(name=self.name + "_concat")
        self.dropout = keras.layers.Dropout(.1, name=self.name + "_dropout")
        self.addition = keras.layers.Add(name = self.name + "_add")
        # NOTE(review): this is BatchNormalization, not layer normalisation; it is
        # left unchanged here because swapping it would alter the layer's weights.
        self.normalize = keras.layers.BatchNormalization(name = self.name + "_batchnorm")

    def generate_mask(self, input_tensor):
        """Add a causal mask to a ``(batch, rows, cols)`` score tensor.

        Entries where the column index exceeds the row index receive ``-inf``
        (so softmax assigns them zero weight); all other entries are unchanged.
        """
        rows, cols = input_tensor.shape[1], input_tensor.shape[2]
        # Boolean grid that is True strictly above the main diagonal.
        future = np.arange(cols)[np.newaxis, :] > np.arange(rows)[:, np.newaxis]
        mask = np.where(future, -np.inf, 0.0).astype(np.float32)
        mask = tf.expand_dims(tf.convert_to_tensor(mask), axis=0)
        return tf.math.add(input_tensor, mask)

    def _linearly_project(self, queries, keys, values):
        """Return one dict of projected queries/keys/values per head."""
        projections = []
        for i in range(self.h):
            proj_dict = {}
            proj_dict['queries'] = tf.matmul(queries, self.query_weights[i])
            proj_dict['keys'] = tf.matmul(keys, self.key_weights[i])
            proj_dict['values'] = tf.matmul(values, self.value_weights[i])
            projections.append(proj_dict)
        return projections

    def _linearly_project_concat(self, input_):
        """Project the concatenated head outputs with the output matrix ``w_o``."""
        return tf.matmul(input_, self.w_o)

    def _ScaledDotProduct_step(self, queries, keys, values):
        """Scaled dot-product attention for a single head."""
        # Raw scores: every query position against every key position.
        step_1 = tf.matmul(queries, tf.transpose(keys, perm=[0,2,1]))
        step_2 = tf.math.scalar_mul(1/np.sqrt(self.projection_dim), step_1)
        if self.mask:
            # Block attention to later positions before the softmax.
            step_3 = self.generate_mask(step_2)
            print(step_3.shape)
        else:
            step_3 = step_2
        step_4 = tf.nn.softmax(step_3)
        # Weighted sum of the values; last dimension is projection_dim again.
        step_5 = tf.matmul(step_4, values)
        return step_5

    def _ScaledDotProduct(self, projections):
        """Run scaled dot-product attention on every head's projections."""
        results = []
        for i in range(self.h):
            results.append(self._ScaledDotProduct_step(
                projections[i]['queries'],
                projections[i]['keys'],
                projections[i]['values'],
            ))
        return results

    def build(self, input_shape):
        """Create per-head projection weights and the output projection.

        ``input_shape`` is a list of three shapes (queries, keys, values); each
        projection matrix takes its input size from the tensor it multiplies.
        """
        print(input_shape)
        query_dim = input_shape[0][-1]
        key_dim = input_shape[1][-1]
        value_dim = input_shape[2][-1]
        self.query_weights = []
        self.key_weights = []
        self.value_weights = []
        for i in range(self.h):
            self.query_weights.append(self.add_weight(
                shape=(query_dim, self.projection_dim),
                initializer="random_normal",
                trainable=True,
                name='queries:' + str(i),
            ))
            self.key_weights.append(self.add_weight(
                shape=(key_dim, self.projection_dim),
                initializer="random_normal",
                trainable=True,
                name='keys:' + str(i),
            ))
            self.value_weights.append(self.add_weight(
                shape=(value_dim, self.projection_dim),
                initializer="random_normal",
                trainable=True,
                name='values:' + str(i),
            ))
        # Output size matches the queries so the residual addition in call() is valid.
        self.w_o = self.add_weight(
            shape=(self.h * self.projection_dim, query_dim),
            initializer="random_normal",
            trainable=True,
        )
        print('Build complete.')

    def call(self, input_set):
        """Apply attention to ``[queries, keys, values]`` and return the normalised residual output."""
        print('Calling MultiheadAttention.')
        queries, keys, values = input_set
        projections = self._linearly_project(queries, keys, values)
        scaled_attention = self._ScaledDotProduct(projections)
        concat = self.concat(scaled_attention)
        projected = self._linearly_project_concat(concat)
        dropout = self.dropout(projected)
        addition = self.addition([queries, dropout])
        normalize = self.normalize(addition)
        return normalize

    def get_config(self):
        """Return the constructor arguments merged into the base layer config."""
        config = super(MultiheadAttention, self).get_config()
        config["h"] = self.h
        config["projection_dim"] = self.projection_dim
        config["mask"] = self.mask
        return config

In [4]:
def _positional_encoding(input_tensor, output_dim=512):
    """Build a sinusoidal positional encoding for the sequence length of ``input_tensor``.

    Only ``input_tensor.shape[1]`` (the number of positions) is read. Even
    columns hold ``sin(pos / 10000**(2*i/output_dim))`` and odd columns hold the
    matching ``cos``; if ``output_dim`` is odd the last column stays zero.

    Args:
        input_tensor: Tensor whose second dimension is the sequence length.
        output_dim: Width of the encoding.

    Returns:
        A float32 tensor of shape ``(1, num_pos, output_dim)``.
    """
    print('Calling _positional_encoding.')
    num_pos = input_tensor.shape[1]
    half = output_dim // 2
    # One frequency per (sin, cos) column pair, computed once for all positions.
    denominators = 10000 ** (2 * np.arange(half, dtype=np.float64) / output_dim)
    angles = np.arange(num_pos, dtype=np.float64)[:, np.newaxis] / denominators
    encoding = np.zeros((num_pos, output_dim))
    encoding[:, 0:2 * half:2] = np.sin(angles)
    encoding[:, 1:2 * half:2] = np.cos(angles)
    encoding = tf.convert_to_tensor(encoding, dtype=tf.float32)
    encoding = tf.expand_dims(encoding, axis=0)
    return encoding

In [5]:
class FeedforwardUnit(keras.layers.Layer):
    """Position-wise feed-forward block with dropout, residual add and normalisation.

    Args:
        model_dim: Output width of the second dense layer; must match the
            input's feature size for the residual addition.
        hidden_dim: Width of the first (ReLU) dense layer.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, model_dim=512, hidden_dim=2048, **kwargs):
        super(FeedforwardUnit, self).__init__(**kwargs)
        self.model_dim = model_dim
        self.hidden_dim = hidden_dim

        self.ff1 = keras.layers.Dense(self.hidden_dim, activation='relu', name=self.name+"_ff1")
        self.ff2 = keras.layers.Dense(self.model_dim, activation=None, name=self.name+"_ff2")
        self.dropout = keras.layers.Dropout(0.1, name=self.name+"_dropout")
        self.addition = keras.layers.Add(name=self.name+"_add")
        self.normalize = keras.layers.BatchNormalization(name=self.name+"_batchnorm")

    def call(self, input_tensor):
        """Apply dense -> dense -> dropout, add the input back, then normalise."""
        print('Calling FeedforwardUnit.')
        ff1 = self.ff1(input_tensor)
        ff2 = self.ff2(ff1)
        dropout_ff = self.dropout(ff2)
        added_ff = self.addition([input_tensor, dropout_ff])
        normalize_ff = self.normalize(added_ff)
        return normalize_ff

    def get_config(self):
        """Return the constructor arguments merged into the base layer config."""
        config = super(FeedforwardUnit, self).get_config()
        config.update({"model_dim" : self.model_dim,
                       "hidden_dim" : self.hidden_dim})
        return config

In [6]:
# Keras requires layers with constructor arguments to override `get_config`; see below.
class EncoderUnit(keras.layers.Layer):
    """One encoder block: self-attention followed by a feed-forward unit.

    Args:
        h: Number of attention heads.
        projection_dim: Per-head projection size.
        model_dim: Feature size passed to the feed-forward unit.
        mask: Whether the self-attention applies a causal mask.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, h=8, projection_dim=64, model_dim=512, mask=False, **kwargs):
        super(EncoderUnit, self).__init__(**kwargs)
        self.h = h
        self.projection_dim = projection_dim
        self.model_dim = model_dim
        self.mask = mask

        self.ma = MultiheadAttention(h=self.h,
                                projection_dim=self.projection_dim,
                                mask=self.mask, name=self.name+"_MultiheadAttention")
        # Forward model_dim so a non-default value actually reaches the feed-forward unit.
        self.ff = FeedforwardUnit(model_dim=self.model_dim, name=self.name+"_FeedforwardUnit")

    def call(self, input_tensor):
        """Run self-attention (queries = keys = values = input) then the feed-forward unit."""
        print('Calling EncoderUnit.')
        ma = self.ma([input_tensor, input_tensor, input_tensor])
        ff = self.ff(ma)
        return ff

    def get_config(self):
        """Return the constructor arguments merged into the base layer config."""
        config = super(EncoderUnit, self).get_config()
        config["h"] = self.h
        config["projection_dim"] = self.projection_dim
        config["model_dim"] = self.model_dim
        config["mask"] = self.mask
        return config

In [7]:
class DecoderUnit(keras.layers.Layer):
    """One decoder block: self-attention, encoder-decoder attention, feed-forward.

    Called with ``[decoder_input, encoder_stack_output]``.

    Args:
        h: Number of attention heads.
        projection_dim: Per-head projection size.
        model_dim: Feature size passed to the feed-forward unit.
        mask: Whether the first (self-)attention applies a causal mask.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, h=8, projection_dim=64, model_dim=512, mask=True, **kwargs):
        super(DecoderUnit, self).__init__(**kwargs)
        self.h = h
        self.projection_dim = projection_dim
        self.model_dim = model_dim
        self.mask = mask

        # Self-attention over the decoder input (masked by default).
        self.mma = MultiheadAttention(h=self.h,
                               projection_dim=self.projection_dim,
                               mask=self.mask,
                               name=self.name+"_MultiheadAttention_1")
        # Attention from decoder positions to the encoder output; never masked.
        self.ma = MultiheadAttention(h=self.h,
                               projection_dim=self.projection_dim,
                               mask=False,
                               name=self.name+"_MultiheadAttention_2")
        # Forward model_dim so a non-default value actually reaches the feed-forward unit.
        self.ff = FeedforwardUnit(model_dim=self.model_dim, name=self.name+"_FeedforwardUnit")

    def call(self, input_set):
        """Apply both attention stages and the feed-forward unit."""
        print('Calling DecoderUnit.')
        encoding = input_set[0]
        encoder_stack_output = input_set[1]
        mma = self.mma([encoding, encoding, encoding])
        ma = self.ma([mma, encoder_stack_output, encoder_stack_output])
        ff = self.ff(ma)
        return ff

    def get_config(self):
        """Return the constructor arguments merged into the base layer config."""
        config = super(DecoderUnit, self).get_config()
        config["h"] = self.h
        config["projection_dim"] = self.projection_dim
        config["model_dim"] = self.model_dim
        config["mask"] = self.mask
        return config

In [8]:
class Linear(keras.layers.Layer):
    """Projection layer that multiplies its input by externally supplied weights.

    The weights are converted to a tensor at construction time and are not
    registered through ``add_weight``; the author's original note flags this
    design as unresolved.
    """

    def __init__(self, external_weights, **kwargs):
        super(Linear, self).__init__(**kwargs)
        self.external_weights = tf.convert_to_tensor(external_weights)

    def build(self, input_shape):
        # The externally supplied tensor is used directly as the projection matrix.
        self.w = self.external_weights

    def call(self, input_layer):
        """Multiply the input by the transposed weights scaled by 1/sqrt(512)."""
        print('Calling Linear.')
        scaled = tf.math.scalar_mul((1/np.sqrt(512)), self.w)
        batched = tf.expand_dims(scaled, axis=0)
        transposed = tf.transpose(batched, perm=[0,2,1])
        return tf.matmul(input_layer, transposed)

    def get_config(self):
        """Return the external weights as a NumPy array under ``external_weights``."""
        weights_array = self.external_weights.numpy()
        return {"external_weights": weights_array}

In [9]:
import os
import re

In [10]:
# Switch into the processed-data directory under the user's home folder and
# collect every file whose name contains both 'preprocess.txt' and 'text'.
data_dir = os.path.expanduser(os.path.join('~', 'python_workspace', 'processed_data'))
os.chdir(data_dir)
filenames = [
    name
    for name in os.listdir('.')
    if 'text' in name and 'preprocess.txt' in name
]

In [11]:
# Add the extra examples file once; re-running this cell must not duplicate it
# (a duplicate entry would load the same data twice).
if 'relative_examples.txt' not in filenames:
    filenames.append('relative_examples.txt')

In [12]:
# Slice every file into overlapping windows of `stride` consecutive lines; each
# line is split on tabs into its fields.
stride = 5
data = []
for file in filenames:
    # Read with an explicit encoding: the text contains non-ASCII characters and
    # must not depend on the machine's locale default.
    with open(file, 'r', encoding='utf-8') as f:
        in_data = f.readlines()
    # NOTE(review): the range stops at len(in_data) - stride, so the final full
    # window (starting at that index) is not produced — confirm this is intended.
    for i in range(0, len(in_data) - stride, 1):
        segment = in_data[i : i + stride]
        data.append([line.strip().split('\t') for line in segment])

In [13]:
# Turn every window of tab-split lines into one (input string, output string) pair.
dataset = []
input_data = []
output_data = []
for item in data:
    for line in item:
        # Skip rows without both fields *before* appending anything, so the
        # input and output lists can never get out of step.
        if len(line) < 2:
            continue
        input_data.append(line[0])
        output_data.append(line[1])
    # A window with no well-formed rows contributes no example.
    if not input_data:
        continue
    # Drop the final pair when its input ends with the '&' marker.
    if input_data[-1].endswith('&'):
        input_data = input_data[:-1]
        output_data = output_data[:-1]
    input_item = ' '.join(input_data)
    input_item = re.sub('& ', ' ', input_item)
    input_item = re.sub('% ', '', input_item)
    input_item = re.sub('%$', '', input_item)
    input_item = re.sub(' ,', '', input_item)
    output_item = ' '.join(output_data)
    output_item = re.sub(' ,', '', output_item)
    dataset.append((input_item, output_item))
    input_data = []
    output_data = []

# Shared vocabulary: every character seen in the inputs plus every
# whitespace-separated token seen in the outputs.
complete_set = set(''.join([item[0] for item in dataset])).union(' '.join([item[1] for item in dataset]).split())

# Sort before numbering so the token -> index mapping is the same on every run
# (set iteration order of strings varies between interpreter sessions).
total_vocab = {k: i for (i, k) in enumerate(sorted(complete_set))}

inv_total_vocab = {i: k for (k, i) in total_vocab.items()}

In [14]:
import pickle

# Register the special tokens at the next free indices. Guarded so that
# re-running the cell does not move a token to a new index and leave a stale
# entry behind in the inverse mapping.
for token in ('<pad>', '<start>'):
    if token not in total_vocab:
        total_vocab[token] = max(total_vocab.values(), default=-1) + 1
        inv_total_vocab[total_vocab[token]] = token
# '<end>' is not added here; preprocess_data adds it when it is missing.

In [15]:
# Reuse the vocabulary saved by an earlier session when both files exist, so
# token indices stay the same across sessions; otherwise save the vocabulary
# that was just built. Files are opened with `with` so handles are closed.
# NOTE: only unpickle files you created yourself — pickle can run arbitrary code.
vocab_path = 'total_vocab_2.pkl'
inv_vocab_path = 'inv_total_vocab_2.pkl'
if os.path.exists(vocab_path) and os.path.exists(inv_vocab_path):
    with open(vocab_path, 'rb') as handle:
        total_vocab = pickle.load(handle)
    with open(inv_vocab_path, 'rb') as handle:
        inv_total_vocab = pickle.load(handle)
else:
    with open(vocab_path, 'wb') as handle:
        pickle.dump(total_vocab, handle)
    with open(inv_vocab_path, 'wb') as handle:
        pickle.dump(inv_total_vocab, handle)

In [16]:
# Number of (input, output) example pairs in the dataset.
m = len(dataset)

In [17]:
# Group the examples by the character length of their input string.
data_dict = {}
for item in dataset:
    data_dict.setdefault(len(item[0]), []).append(item)

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
def string_to_int(sent, Tx, human_vocab):
    """Encode one string as a padded row of character indices.

    Args:
        sent: String to encode; every character must be a key of ``human_vocab``.
        Tx: Length of the returned row (padded at the end with the pad index).
        human_vocab: Mapping from character to index; must contain ``'<pad>'``.

    Returns:
        An int32 array of shape ``(1, Tx)``.
    """
    input_n = [[human_vocab[c] for c in sent]]
    # The pad index comes from the vocabulary that was passed in, not from a
    # global, so the function is consistent with its own argument.
    return pad_sequences(input_n, maxlen=Tx, dtype='int32',
                         padding='post', value=human_vocab['<pad>'])

In [20]:
def preprocess_data(dataset, total_vocab, inv_total_vocab):
    """Convert (input string, output string) pairs into padded training arrays.

    Inputs are encoded per character and outputs per whitespace-separated
    token, wrapped in ``<start>`` / ``<end>``. Every output sequence of length
    ``L`` yields ``L - 1`` training rows: the decoder input is the prefix
    ``line[:j]`` and the target is the prefix ``line[:j+1]``.

    Side effect: ``<pad>``, ``<start>`` and ``<end>`` are added to
    ``total_vocab`` / ``inv_total_vocab`` if they are missing.

    Args:
        dataset: Non-empty sequence of ``(input_string, output_string)`` pairs.
        total_vocab: Mapping token -> index (mutated as described above).
        inv_total_vocab: Mapping index -> token (mutated as described above).

    Returns:
        ``(input_pad, output_in_pad, output_oh)``: int32 arrays of shape
        ``(rows, max_input)`` and ``(rows, max_output)``, and a one-hot array of
        shape ``(rows, max_output, len(total_vocab))``.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    if not dataset:
        raise ValueError("dataset must contain at least one (input, output) pair")

    input_pre, output_pre = zip(*dataset)

    input_l = [list(l) for l in input_pre]
    output_w = [l.split() for l in output_pre]

    max_input = max(len(l) for l in input_l)
    max_output = max(len(l) for l in output_w) + 2 # room for <start> and <end>

    # Kept as plain nested lists: rows have different lengths, and building a
    # NumPy array from ragged rows raises on current NumPy versions.
    input_n = [[total_vocab[c] for c in line] for line in input_l]

    for token in ('<pad>', '<start>', '<end>'):
        if token not in total_vocab:
            total_vocab[token] = max(total_vocab.values()) + 1
            inv_total_vocab[total_vocab[token]] = token

    start_id = total_vocab['<start>']
    end_id = total_vocab['<end>']
    pad_id = total_vocab['<pad>']

    output_n = [[start_id] + [total_vocab[c] for c in line] + [end_id]
                for line in output_w]

    input_positionalized = []
    output_in_positionalized = []
    output_out_positionalized = []

    # One training row per prefix of each output sequence.
    for source, target in zip(input_n, output_n):
        for j in range(1, len(target)):
            input_positionalized.append(source)
            output_in_positionalized.append(target[:j])
            output_out_positionalized.append(target[:j+1])

    # pad_sequences already returns 2-D int32 arrays of the requested width.
    input_pad = pad_sequences(input_positionalized, maxlen=max_input, dtype='int32',
                  padding='post', value=pad_id)
    output_in_pad = pad_sequences(output_in_positionalized, maxlen=max_output, dtype='int32',
                  padding='post', value=pad_id)
    output_out_pad = pad_sequences(output_out_positionalized, maxlen=max_output, dtype='int32',
                  padding='post', value=pad_id)

    # One-hot encode the targets with a single vectorised assignment.
    output_oh = np.zeros((len(output_out_positionalized), max_output, len(total_vocab)))
    row_idx = np.arange(output_out_pad.shape[0])[:, np.newaxis]
    col_idx = np.arange(max_output)[np.newaxis, :]
    output_oh[row_idx, col_idx, output_out_pad] = 1

    return input_pad, output_in_pad, output_oh

from sklearn.model_selection import train_test_split

# Build the padded arrays and record their sequence lengths.
X, Y_in, Yoh = preprocess_data(dataset, total_vocab, inv_total_vocab)
Tx = X.shape[1]
Ty_in = Y_in.shape[1]
Ty = Yoh.shape[1]

print(X.shape)
print(Y_in.shape)
print(Yoh.shape)

# Unshuffled 80/20 split: the last fifth of the rows becomes the test set.
X_train, X_test, Y_in_train, Y_in_test, Yoh_train, Yoh_test = \
    train_test_split(X, Y_in, Yoh, test_size=0.2, shuffle=False)

# Persist every split; `with` guarantees each file is flushed and closed.
for path, array in (('X_train_2.pkl', X_train),
                    ('X_test_2.pkl', X_test),
                    ('Y_in_train_2.pkl', Y_in_train),
                    ('Y_in_test_2.pkl', Y_in_test),
                    ('Yoh_train_2.pkl', Yoh_train),
                    ('Yoh_test_2.pkl', Yoh_test)):
    with open(path, 'wb') as handle:
        pickle.dump(array, handle)

In [47]:
def _load_pickle(path):
    """Load one pickled object and close the file afterwards.

    Only use on files created by this notebook: pickle can run arbitrary code.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# Reload the train/test splits written by the preprocessing cell.
X_train = _load_pickle('X_train_2.pkl')
X_test = _load_pickle('X_test_2.pkl')
Y_in_train = _load_pickle('Y_in_train_2.pkl')
Y_in_test = _load_pickle('Y_in_test_2.pkl')
Yoh_train = _load_pickle('Yoh_train_2.pkl')
Yoh_test = _load_pickle('Yoh_test_2.pkl')

In [22]:
# Model dimensions. Input and output tokens share one vocabulary and therefore
# one embedding matrix; sequences are padded to a common length.
embedding_size = 512 # width of the embedding vectors; may need tuning
vocab_size = len(total_vocab) # number of distinct tokens, special tokens included
sequence_length_Y = Yoh_train.shape[1] # padded length of the decoder sequences
sequence_length_X = X_train.shape[1] # padded length of the encoder sequences

In [23]:
# Single embedding layer shared by the encoder input, the decoder input and
# the final projection (used in place of a stock keras Embedding layer).
embed_layer = TransformerEmbedding(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    name="embedding",
)

In [24]:
# Encoder input branch: integer token ids of fixed length `sequence_length_X`.
x_i = keras.layers.Input(shape=(sequence_length_X,), dtype="int32", name="encoder_input")
# Look the ids up in the shared embedding layer.
embed_i = embed_layer(x_i)
# The positional encoding only reads the input's sequence length; it is wrapped
# in a Lambda layer so it becomes part of the model graph.
pe_i_layer = keras.layers.Lambda(_positional_encoding, name="encoder_input_lambda")
pe_i = pe_i_layer(x_i)
# Sum embedding and positional encoding, then apply dropout with rate 0.1.
added_i = keras.layers.Add(name="encoder_input_add")([embed_i, pe_i])
dropout_intro_i = keras.layers.Dropout(0.1, name="encoder_input_dropout")(added_i)

Calling _positional_encoding.


In [25]:
# A single encoder block with default settings; `ee` is the encoder output.
ee_unit = EncoderUnit(name="EncoderUnit")
ee = ee_unit(dropout_intro_i)

Calling EncoderUnit.
[TensorShape([None, 57, 512]), TensorShape([None, 57, 512]), TensorShape([None, 57, 512])]
Build complete.
Calling MultiheadAttention.
Calling FeedforwardUnit.


In [26]:
# Decoder input branch: integer token ids of fixed length `sequence_length_Y`.
x_o = keras.layers.Input(shape=(sequence_length_Y,), dtype="int32", name="decoder_input")
# Same embedding layer instance as the encoder, so the weights are shared.
embed_o = embed_layer(x_o)
# Positional encoding for the decoder sequence length.
pe_o_layer = keras.layers.Lambda(_positional_encoding, name="decoder_input_lambda")
pe_o = pe_o_layer(x_o)
# Sum embedding and positional encoding, then apply dropout with rate 0.1.
added_o = keras.layers.Add(name="decoder_input_add")([embed_o, pe_o])
dropout_intro_o = keras.layers.Dropout(0.1, name="decoder_input_dropout")(added_o)

Calling _positional_encoding.


In [27]:
# A single decoder block fed with the decoder input branch and the encoder output.
du = DecoderUnit(name="DecoderUnit")([dropout_intro_o, ee])

Calling DecoderUnit.
[TensorShape([None, 7, 512]), TensorShape([None, 7, 512]), TensorShape([None, 7, 512])]
Build complete.
Calling MultiheadAttention.
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
[TensorShape([None, 7, 512]), TensorShape([None, 57, 512]), TensorShape([None, 57, 512])]
Build complete.
Calling MultiheadAttention.
Calling FeedforwardUnit.


In [28]:
# Final projection back to the vocabulary. Earlier attempts used a separate
# `Linear` layer fed with the embedding weights; the shared embedding layer is
# now called with projection=True instead.
linear = embed_layer(du, projection=True)
# Softmax over the last axis turns the projected scores into token probabilities.
softmax = keras.layers.Activation('softmax', name="FinalActivation")(linear)

In [29]:
# Full model: inputs are [encoder token ids, decoder token ids]; output is the softmax over tokens.
model = keras.Model(inputs=[x_i, x_o], outputs=softmax, name='transformer')

In [30]:
import math

def decay(epoch):
    """Learning rate for ``epoch`` using a warm-up then inverse-square-root schedule.

    The epoch is shifted by a fixed offset of 73 before the schedule is
    evaluated; the rate rises linearly for 4000 shifted steps and then decays
    as ``step ** -0.5``, all scaled by ``512 ** -0.5``.
    """
    step = epoch + 73
    warmup_steps = 4000
    rising = step * warmup_steps ** -1.5
    falling = step ** -0.5
    return 512**-0.5 * min(falling, rising)

# Callback that sets the learning rate from `decay` at the start of every epoch and logs it.
lrate = keras.callbacks.LearningRateScheduler(decay, verbose=1)

In [31]:
# Checkpoint callback that saves the full model (not just weights) whenever the
# monitored training accuracy reaches a new maximum.
# NOTE(review): `filepath` is the current working directory itself, and this
# callback is not included in the `callbacks=[lrate]` lists passed to
# `model.fit` in this notebook — confirm whether it is meant to be used.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.getcwd(),
    save_weights_only=False,
    monitor='accuracy',
    mode='max',
    save_best_only=True)

In [32]:
# Adam optimizer; `learning_rate` replaces the deprecated `lr` keyword. The
# initial value is overridden each epoch by the LearningRateScheduler callback.
adam = keras.optimizers.Adam(learning_rate=512**-0.5, beta_1=0.9, beta_2=0.98, epsilon=1e-09)
# Categorical cross-entropy with label smoothing of 0.1.
cc = keras.losses.CategoricalCrossentropy(label_smoothing=0.1)

In [33]:
# Compile with the optimizer and loss defined above, tracking accuracy.
model.compile(optimizer=adam, loss=cc, metrics=['accuracy'])
# `training` gates the weight save/load cells: weights are loaded only while it
# is False and saved only once it has been set to True.
training = False

In [34]:
# Print the layer-by-layer structure and parameter counts of the model.
model.summary()
# Author's note: it may be necessary to export the weights of each layer individually.

Model: "transformer"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input (InputLayer)      [(None, 7)]          0                                            
__________________________________________________________________________________________________
encoder_input (InputLayer)      [(None, 57)]         0                                            
__________________________________________________________________________________________________
embedding (TransformerEmbedding multiple             299008      encoder_input[0][0]              
                                                                 decoder_input[0][0]              
                                                                 DecoderUnit[0][0]                
________________________________________________________________________________________

In [36]:
# The learning-rate scheduler must be passed to fit() inside the `callbacks` list.
# Setting `training` to True enables the weight-saving cell and disables the loading cell.
training = True
model.fit([X_train, Y_in_train], Yoh_train, epochs=20, batch_size=32, callbacks=[lrate])


Epoch 00001: LearningRateScheduler reducing learning rate to 1.275257518417849e-05.
Epoch 1/20
Calling _positional_encoding.
Calling _positional_encoding.
Calling EncoderUnit.
Calling MultiheadAttention.
Calling FeedforwardUnit.
Calling DecoderUnit.
Calling MultiheadAttention.
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
Calling MultiheadAttention.
Calling FeedforwardUnit.
Calling _positional_encoding.
Calling _positional_encoding.
Calling EncoderUnit.
Calling MultiheadAttention.
Calling FeedforwardUnit.
Calling DecoderUnit.
Calling MultiheadAttention.
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
Calling MultiheadAttention.
Calling FeedforwardUnit.

Epoch 00002: LearningRateScheduler reducing learning rate to 1.2927267994920662e-05.
Epoch 2/20

Epoch 00003: LearningRateScheduler reducing learning rate to 1.3101960805662831e-05.
Epoch 3/20

Epoch 00004: LearningRateSche

<tensorflow.python.keras.callbacks.History at 0x7f458e7cc350>

In [37]:
# Train for a further run with the same schedule callback.
# NOTE(review): the recorded output shows "Epoch 1/5", so it appears to come
# from an earlier run with epochs=5 — re-run to refresh it.
model.fit([X_train, Y_in_train], Yoh_train, epochs=20, batch_size=32, callbacks=[lrate])


Epoch 00001: LearningRateScheduler reducing learning rate to 1.100564707675678e-05.
Epoch 1/5

Epoch 00002: LearningRateScheduler reducing learning rate to 1.118033988749895e-05.
Epoch 2/5

Epoch 00003: LearningRateScheduler reducing learning rate to 1.1355032698241122e-05.
Epoch 3/5

Epoch 00004: LearningRateScheduler reducing learning rate to 1.1529725508983293e-05.
Epoch 4/5

Epoch 00005: LearningRateScheduler reducing learning rate to 1.1704418319725463e-05.
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fef40217710>

In [None]:
# Render a diagram of the model graph.
keras.utils.plot_model(model)

In [None]:
# Interactive lookup of the update_state signature, used while writing the
# MaskedAccuracy subclass; not needed to run the notebook.
help(tf.keras.metrics.CategoricalAccuracy.update_state)

In [None]:
class MaskedAccuracy(tf.keras.metrics.CategoricalAccuracy):
    """Categorical accuracy that honours per-position weights.

    Positions can be excluded from the metric by giving them a
    ``sample_weight`` of 0 (for example, padding positions). No mask is derived
    automatically; the caller must supply the weights.
    """

    def __init__(self, *args, **kwargs):
        # `self` is bound by super() already; passing it again would shift every
        # argument by one and make the instance its own `name`.
        super(MaskedAccuracy, self).__init__(*args, **kwargs)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate accuracy, forwarding ``sample_weight`` instead of dropping it."""
        return super(MaskedAccuracy, self).update_state(
            y_true, y_pred, sample_weight=sample_weight)

    # State resetting is inherited unchanged from CategoricalAccuracy.

In [None]:
# NOTE(review): this instantiates the stock CategoricalAccuracy rather than the
# MaskedAccuracy subclass, and it is not passed to model.compile in this
# notebook — confirm what was intended.
masked_accuracy = tf.keras.metrics.CategoricalAccuracy()

In [94]:
# Evaluate on the held-out split; the result is [loss, accuracy] as configured in compile().
model.evaluate([X_test, Y_in_test], Yoh_test)

Calling _positional_encoding.
Calling _positional_encoding.
Calling EncoderUnit.
Calling MultiheadAttention.
Calling FeedforwardUnit.
Calling DecoderUnit.
Calling MultiheadAttention.
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
(None, 7, 7)
Calling MultiheadAttention.
Calling FeedforwardUnit.


[56.34616470336914, 0.8978829979896545]

In [None]:
# Decode encoder input row 100 of the training set back into text.
''.join([inv_total_vocab[w] for w in X_train[100]])

In [None]:
# Decode decoder input row 106 of the training set into its tokens.
[inv_total_vocab[w] for w in Y_in_train[106]]

In [57]:
# Decode the one-hot target row 106 of the training set (argmax at each position).
[inv_total_vocab[np.argmax(w)] for w in Yoh_train[106]]

['<start>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']

In [88]:
# Decode decoder input row 1197 of the test set into its tokens.
[inv_total_vocab[w] for w in Y_in_test[1197]]

['<start>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']

In [74]:
# Candidate example index for the decoding cells; whichever start_index cell ran last wins.
start_index = 1300

In [78]:
# Candidate example index for the decoding cells; whichever start_index cell ran last wins.
start_index = 1012

In [82]:
# Candidate example index for the decoding cells; whichever start_index cell ran last wins.
start_index = 500

In [66]:
# Candidate example index (author-marked "test"); whichever start_index cell ran last wins.
start_index = 505 # test

In [89]:
# Candidate example index (author-marked "test"); whichever start_index cell ran last wins.
start_index = 1301 # test

In [90]:
# Candidate example index (author-marked "test"); by execution count this is
# the value in effect for the prediction cells that follow.
start_index = 1197 # test

In [91]:
# First decoding step: predict from the test example's encoder input and its
# stored decoder input, then take the most likely token at every position.
encoder_row = X_test[start_index:start_index+1]
decoder_row = Y_in_test[start_index:start_index+1]
prediction = model.predict([encoder_row, decoder_row])
prediction.shape
output = np.argmax(prediction[0], axis=-1)
output_read = [inv_total_vocab[token_id] for token_id in output]

In [92]:
# Show the encoder input of the selected test example as text (padding included).
print(''.join([inv_total_vocab[w] for w in X_test[start_index]]))

tr`aw thāwitrhin 'ama' tr`ānit<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [93]:
# Greedy decoding: feed the model's own previous prediction back in as the
# decoder input and print one newly predicted token per step.
print(output_read[1])
# Stop at the sequence length even if '<end>' is never produced; otherwise the
# position index would run past the end of the prediction.
for i in range(2, len(output_read)):
    # reshape((1, -1)) adds the batch axis without hard-coding the sequence length.
    prediction = model.predict([X_test[start_index:start_index+1], output.reshape((1, -1))])
    output = np.argmax(prediction[0], axis=-1)
    output_read = [inv_total_vocab[token_id] for token_id in output]
    print(output_read[i])
    if '<end>' in output_read:
        break

traw
ta:na
'ama'
tra:nit
<end>


In [107]:
# Display the tokens decoded by the most recent prediction.
output_read

['<start>', ',', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']

In [108]:
# Predict for training row 95, feeding the previous `output` back as decoder input.
# The reshape assumes a decoder length of 7, matching the recorded decoder input shape.
prediction = model.predict([X_train[95:96], output.reshape((1,7))])

In [109]:
# Most likely token id at every position, and the matching token strings.
output = np.argmax(prediction[0], axis=-1)
output_read = [inv_total_vocab[token_id] for token_id in output]

In [110]:
# Display the tokens decoded by the most recent prediction.
output_read

['<start>', ',', "'ama'", '<pad>', '<pad>', '<pad>', '<pad>']

In [None]:
# Predict for training row 102, feeding the previous `output` back as decoder input.
# The reshape assumes a decoder length of 7, matching the recorded decoder input shape.
prediction = model.predict([X_train[102:103], output.reshape((1,7))])

# Run only when training is finished: save the weights

In [38]:
# Progress note: 93 epochs trained so far.
# Known problem (author's note): the original sequence is being used as the
# input to the output side, rather than the previous time points of the output.
#
# Save the weights only after a training run; `training` must have been set to
# True by the training cell.
if training != True:
    print("Training is false!")
else:
    model.save_weights('weights_2.keras')
    print("Saved.")

Saved.


# Run only when starting a session: load the saved weights

In [35]:
# Load previously saved weights only at the start of a session, before any
# training has happened (`training` is still False).
if training != False:
    print("Training is true!")
else:
    model.load_weights('weights_2.keras')
    print("Loaded weights.")

Loaded weights.
