In [1]:
import numpy as np
import os
import tensorflow as tf

In [5]:
# open .npy files from a folder and concatenate all the tokens into one numpy array
tokens = np.array(()) # empty numpy array
for filename in os.listdir('Dataset_tokenized_BPE'): # iterate through all files in the folder
    if filename.endswith('.npy'): # if the file is a .npy file
        for i in range(len(np.load(('Dataset_tokenized_BPE/' + filename), allow_pickle=True)[()].get('tokens'))): # iterate through all the tokens in the file
            tokens = np.append(tokens, np.load(('Dataset_tokenized_BPE/' + filename), allow_pickle=True)[()].get('tokens')[i]) # append the tokens to the numpy array
        continue 
    else: 
        continue 

In [111]:
# tokens = tokens.astype(int) # convert the numpy array to integers

data = tf.cast(tokens, tf.int32) # put tokens into a tensor and cast to int32
# print(data)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n] # first 90% of data
# print(train_data)
val_data = data[n:] # last 10% of data
# print(val_data)

BLOCK_SIZE = 8
BATCH_SIZE = 4
    
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = tf.random.uniform((BATCH_SIZE,), maxval=len(data)-BLOCK_SIZE, dtype=tf.int32)
    x = tf.stack([data[i:i+BLOCK_SIZE] for i in ix])
    y = tf.stack([data[i+1:i+BLOCK_SIZE+1] for i in ix])
    return x, y

inputs, targets = get_batch('train')
print(inputs, targets)

tf.keras.layers.MultiHeadAttention(1, 8) # 1 head, 8 units per head

tf.Tensor(
[[224 212  50 227 216  52 231 218]
 [ 50 106 251  43 244  46 261 202]
 [198  45 224 200  48 331  43 220]
 [122 204  59 264 205  47  94 122]], shape=(4, 8), dtype=int32) tf.Tensor(
[[212  50 227 216  52 231 218  45]
 [106 251  43 244  46 261 202  51]
 [ 45 224 200  48 331  43 220 204]
 [204  59 264 205  47  94 122 206]], shape=(4, 8), dtype=int32)


<keras.layers.attention.multi_head_attention.MultiHeadAttention at 0x2388bf03640>

In [115]:
targets

<tf.Tensor: shape=(4, 8), dtype=int32, numpy=
array([[212,  50, 227, 216,  52, 231, 218,  45],
       [106, 251,  43, 244,  46, 261, 202,  51],
       [ 45, 224, 200,  48, 331,  43, 220, 204],
       [204,  59, 264, 205,  47,  94, 122, 206]])>

In [91]:
for b in range(BATCH_SIZE): # batch dimension
    for t in range(BLOCK_SIZE): # time dimension
        context = inputs[b, :t+1]
        target = targets[b,t]
        print(f"when input is {context} the target is: {target}")

when input is [50] the target is: 229
when input is [ 50 229] the target is: 57
when input is [ 50 229  57] the target is: 289
when input is [ 50 229  57 289] the target is: 214
when input is [ 50 229  57 289 214] the target is: 50
when input is [ 50 229  57 289 214  50] the target is: 228
when input is [ 50 229  57 289 214  50 228] the target is: 57
when input is [ 50 229  57 289 214  50 228  57] the target is: 253
when input is [225] the target is: 65
when input is [225  65] the target is: 261
when input is [225  65 261] the target is: 216
when input is [225  65 261 216] the target is: 57
when input is [225  65 261 216  57] the target is: 321
when input is [225  65 261 216  57 321] the target is: 60
when input is [225  65 261 216  57 321  60] the target is: 225
when input is [225  65 261 216  57 321  60 225] the target is: 65
when input is [24] the target is: 337
when input is [ 24 337] the target is: 12
when input is [ 24 337  12] the target is: 224
when input is [ 24 337  12 224] t

In [67]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model, activation='relu'),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x) 
    return x

In [89]:
get_batch('train')[1].shape
inputs = tf.keras.layers.Input(shape=get_batch('train')[0].shape)
outputs= FeedForward(8, 32)(inputs)
model = tf.keras.Model(inputs, outputs)

model.compile(optimizer='adam', loss='categorical_crossentropy')
# model.fit(get_batch('train'),
#                 epochs=20,
#                 validation_data=get_batch('val'))

In [118]:
em = tf.keras.layers.Embedding(1000,8)
em(inputs)

<tf.Tensor: shape=(4, 8, 8), dtype=float32, numpy=
array([[[-0.01894864, -0.00758777, -0.02547537, -0.03709454,
          0.02172568, -0.02753102, -0.03409537, -0.02683078],
        [-0.00862032, -0.0002634 ,  0.01840265, -0.03801345,
         -0.02777342, -0.01620481, -0.03822521, -0.00355053],
        [-0.04214961, -0.02287751,  0.0376593 ,  0.04500557,
         -0.0372509 ,  0.01384381,  0.02985987,  0.025936  ],
        [-0.02272687,  0.01554424,  0.04035195,  0.00617802,
         -0.04731195,  0.00525496, -0.03953175, -0.01883389],
        [ 0.01639735, -0.04936351, -0.04055194,  0.00983968,
          0.04348755,  0.03789325,  0.04307434,  0.03306111],
        [-0.02681906,  0.03159245,  0.01588969,  0.01478067,
         -0.02102259,  0.01165945, -0.00497264,  0.02824814],
        [-0.04940389, -0.01953031, -0.01591098, -0.04055402,
          0.01773492, -0.04411858,  0.00464853,  0.03141815],
        [-0.01987718, -0.01801397, -0.03207477,  0.04858736,
         -0.02665482, -0.03

In [117]:
ffn = FeedForward(d_model= 8, dff =32)
x = ffn(tf.cast(inputs, tf.float32))
x

<tf.Tensor: shape=(4, 8), dtype=float32, numpy=
array([[ 0.4418457 ,  0.7790316 , -2.3305297 ,  0.9312438 ,  0.85393375,
        -0.28479034, -0.11583445, -0.27490097],
       [-0.9555565 , -0.74374133,  1.0320394 , -1.6921114 ,  1.2062762 ,
        -0.04276661,  1.0634617 ,  0.13239864],
       [-0.06966723, -0.73711395,  0.16599175,  0.73687583, -0.7739256 ,
         2.022661  , -1.4745574 ,  0.12973653],
       [-0.10219621,  1.1680634 , -0.9239531 ,  1.6237562 ,  0.766556  ,
        -0.38831973, -1.2936971 , -0.8502097 ]], dtype=float32)>

In [32]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()


In [33]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [106]:
ahead = BaseAttention(num_heads=1, key_dim=8)
ahead = CausalSelfAttention(num_heads=1, key_dim=8)

In [34]:
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

In [45]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context='gay'):
    x = self.causal_self_attention(x=x)
    # x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

In [52]:
dl = DecoderLayer(d_model=4, num_heads=8, dff=8)
dl(inputs)

InvalidArgumentError: Exception encountered when calling layer "query" "                 f"(type EinsumDense).

cannot compute Einsum as input #1(zero-based) was expected to be a int32 tensor but is a float tensor [Op:Einsum]

Call arguments received by layer "query" "                 f"(type EinsumDense):
  • inputs=tf.Tensor(shape=(4, 8), dtype=int32)