# Bad code below

In [2]:
from tensorflow import keras
import tensorflow as tf
import numpy as np
import csv
from math import sqrt
from copy import deepcopy

In [3]:
input_data = []
output_data = []
with open('eng_de.csv') as f:
    reader = csv.reader(f)
    for row in reader:
        input_data.append(row[0].strip().split())
        output_data.append(row[1].strip().split())

In [13]:
all_words = np.unique([word for row in input_data for word in row]+[word for row in output_data for word in row])
word_dict = {all_words[i]:i for i in range(len(all_words))}
word_dict

{'apfel': 0,
 'apple': 1,
 'beer': 2,
 'bier': 3,
 'book': 4,
 'bread': 5,
 'brot': 6,
 'buch': 7,
 'can': 8,
 'drink': 9,
 'eat': 10,
 'essen': 11,
 'i': 12,
 'ich': 13,
 'konnen': 14,
 'lesen': 15,
 'mochten': 16,
 'newspaper': 17,
 'read': 18,
 'to': 19,
 'trinken': 20,
 'want': 21,
 'wasser': 22,
 'water': 23,
 'we': 24,
 'wir': 25,
 'zeitung': 26}

In [14]:
one_hot_inputs = np.zeros((len(input_data),10,32))
for sentence_ind in range(len(input_data)):
    for word_ind in range(len(input_data[sentence_ind])):
        one_hot_inputs[sentence_ind][word_ind][word_dict[input_data[sentence_ind][word_ind]]] = 1


one_hot_outputs = np.zeros((len(output_data),10,32))
for sentence_ind in range(len(output_data)):
    for word_ind in range(len(output_data[sentence_ind])):
        one_hot_outputs[sentence_ind][word_ind][word_dict[output_data[sentence_ind][word_ind]]] = 1

In [15]:
class PositionalEncoding(keras.layers.Layer):
    def __init__(self,d_model,max_seq_len):
        super(PositionalEncoding,self).__init__()
        self.encodings = np.array([pos/np.power(10_000,(np.arange(d_model)//2)/d_model) for pos in range(max_seq_len)])
        self.encodings[:,0::2] = np.sin(self.encodings[:,0::2])
        self.encodings[:,1::2] = np.cos(self.encodings[:,1::2])
        
        
    def call(self,inputs):
        if inputs.shape[1]!=None:
            return inputs+self.encodings[:inputs.shape[1]]
        else:
            return inputs#this is stupid
                             

class AttentionHead(keras.layers.Layer):
    def __init__(self,d_model,d_k):
        super(AttentionHead,self).__init__()
        self.w_q = tf.Variable(np.random.uniform(-1.0,1.0,size=(d_model,d_k)),dtype=tf.float32)
        self.w_k = tf.Variable(np.random.uniform(-1.0,1.0,size=(d_model,d_k)),dtype=tf.float32)
        self.w_v = tf.Variable(np.random.uniform(-1.0,1.0,size=(d_model,d_k)),dtype=tf.float32)
        self.d_model = d_model
        self.d_k = d_k
    
    def call(self,Q,K,V):
        queries = tf.matmul(Q,self.w_q)
        keys = tf.matmul(K,self.w_k)
        values = tf.matmul(V,self.w_v)
        scores = tf.nn.softmax(tf.matmul(queries,tf.transpose(keys,perm=[0,2,1]))/sqrt(self.d_k))
        return tf.matmul(scores,values)

class MultiHeadAttention(keras.layers.Layer):
    def __init__(self,d_model,n_heads):
        super(MultiHeadAttention,self).__init__()
        assert d_model%n_heads==0
        
        d_k = d_model//n_heads
        self.heads = [AttentionHead(d_model,d_k) for _ in range(n_heads)]
        self.w_o = tf.Variable(np.random.uniform(-1.0,1.0,size=(d_model,d_model)),dtype=tf.float32)
    
    def call(self,Q,K,V):
        heads_outputs = tf.concat([head(Q,K,V) for head in self.heads],-1)
        return tf.matmul(heads_outputs,self.w_o)

class EncoderBlock(keras.layers.Layer):
    def __init__(self,d_model,n_heads):
        super(EncoderBlock,self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.attention = MultiHeadAttention(d_model,n_heads)
        self.norm_layer_1 = keras.layers.LayerNormalization()
        self.ff_layer = keras.layers.Dense(d_model,activation='relu')
        self.norm_layer_2 = keras.layers.LayerNormalization()
    
    def call(self,inputs):
        x = inputs + self.attention(inputs,inputs,inputs)
        x = self.norm_layer_1(x)
        x = x + self.ff_layer(x)
        return self.norm_layer_2(x)

class DecoderBlock(keras.layers.Layer):
    def __init__(self,d_model,n_heads):
        super(DecoderBlock,self).__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        
        self.attention_1 = MultiHeadAttention(d_model,n_heads)
        self.norm_layer_1 = keras.layers.LayerNormalization()
        self.attention_2 = MultiHeadAttention(d_model,n_heads)
        self.norm_layer_2 = keras.layers.LayerNormalization()
        self.ff_layer = keras.layers.Dense(d_model,activation='relu')
        self.norm_layer_3 = keras.layers.LayerNormalization()
    
    def call(self,prev_tokens,encoder_tokens):
        x = prev_tokens + self.attention_1(prev_tokens,prev_tokens,prev_tokens)
        x = self.norm_layer_1(x)
        x = x + self.attention_2(prev_tokens,prev_tokens,encoder_tokens)
        x = self.norm_layer_2(x)
        x = x + self.ff_layer(x)
        return self.norm_layer_3(x)


    
def make_transformer(n_encoders=3,n_decoders=3,n_heads=4,token_size=16,max_tokens=10):
    encoder_input = keras.Input(shape=(None,token_size),dtype='float32',name='encoder_input')
    encoder_output = PositionalEncoding(token_size,max_tokens)(encoder_input)
    for encoder in range(n_encoders):
        encoder_output = EncoderBlock(token_size,n_heads)(encoder_output)
    
    decoder_input = keras.Input(shape=(None,token_size),dtype='float32',name='decoder_input')
    decoder_output = PositionalEncoding(token_size,max_tokens)(decoder_input)
    for decoder in range(n_decoders):
        decoder_output = DecoderBlock(token_size,n_heads)(decoder_output,encoder_output)
    
    transformer_model = keras.Model(inputs = [encoder_input,decoder_input],outputs=decoder_output)
    transformer_model.compile(
        optimizer = 'adam',
        loss = 'binary_crossentropy',
        metrics = ['accuracy']
    )
    return transformer_model
        

In [8]:
def train_generator(inputs,outputs,mask_val=-1.0):
    while(True):
        batch = []
        for seq_ind in range(len(inputs)):
            batch_encoder_input = np.array([inputs[seq_ind] for _ in range(len(outputs[seq_ind]))])
            batch_decoder_input = np.array([outputs[seq_ind] for _ in range(len(outputs[seq_ind]))])
            batch_output = deepcopy(batch_decoder_input)
            for mask_ind in range(len(outputs[seq_ind])):
                batch_decoder_input[mask_ind][mask_ind+1:] = mask_val
            yield {'encoder_input':batch_encoder_input,'decoder_input':batch_decoder_input},batch_output

In [16]:
N_EPOCHS = 500
EPOCH_STEPS = 50

In [17]:
transformer = make_transformer(token_size=32)

#use this to plot model topology
# keras.utils.plot_model(transformer,show_shapes=False)

#use to test
# outie = transformer({'encoder_input':one_hot_inputs,'decoder_input':one_hot_outputs})

In [18]:
hist = transformer.fit(x=train_generator(one_hot_inputs,one_hot_outputs),verbose=1,epochs=N_EPOCHS,steps_per_epoch=EPOCH_STEPS)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155

Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 226/500
Epoch 227/500
Epoch 228/500
Epoch 229/500
Epoch 230/500
Epoch 231/500
Epoch 232/500
Epoch 233/500
Epoch 

Epoch 243/500
Epoch 244/500
Epoch 245/500
Epoch 246/500
Epoch 247/500
Epoch 248/500
Epoch 249/500
Epoch 250/500
Epoch 251/500
Epoch 252/500
Epoch 253/500
Epoch 254/500
Epoch 255/500
Epoch 256/500
Epoch 257/500
Epoch 258/500
Epoch 259/500
Epoch 260/500
Epoch 261/500
Epoch 262/500
Epoch 263/500
Epoch 264/500
Epoch 265/500
Epoch 266/500
Epoch 267/500
Epoch 268/500
Epoch 269/500
Epoch 270/500
Epoch 271/500
Epoch 272/500
Epoch 273/500
Epoch 274/500
Epoch 275/500
Epoch 276/500
Epoch 277/500
Epoch 278/500
Epoch 279/500
Epoch 280/500
Epoch 281/500
Epoch 282/500
Epoch 283/500
Epoch 284/500
Epoch 285/500
Epoch 286/500
Epoch 287/500
Epoch 288/500
Epoch 289/500
Epoch 290/500
Epoch 291/500
Epoch 292/500
Epoch 293/500
Epoch 294/500
Epoch 295/500
Epoch 296/500
Epoch 297/500
Epoch 298/500
Epoch 299/500
Epoch 300/500
Epoch 301/500
Epoch 302/500
Epoch 303/500
Epoch 304/500
Epoch 305/500
Epoch 306/500
Epoch 307/500
Epoch 308/500
Epoch 309/500
Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 

Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
Epoch 386/500
Epoch 387/500
Epoch 388/500
Epoch 389/500
Epoch 390/500
Epoch 391/500
Epoch 392/500
Epoch 393/500
Epoch 

Epoch 403/500
Epoch 404/500
Epoch 405/500
Epoch 406/500
Epoch 407/500
Epoch 408/500
Epoch 409/500
Epoch 410/500
Epoch 411/500
Epoch 412/500
Epoch 413/500
Epoch 414/500
Epoch 415/500
Epoch 416/500
Epoch 417/500
Epoch 418/500
Epoch 419/500
Epoch 420/500
Epoch 421/500
Epoch 422/500
Epoch 423/500
Epoch 424/500
Epoch 425/500
Epoch 426/500
Epoch 427/500
Epoch 428/500
Epoch 429/500
Epoch 430/500
Epoch 431/500
Epoch 432/500
Epoch 433/500
Epoch 434/500
Epoch 435/500
Epoch 436/500
Epoch 437/500
Epoch 438/500
Epoch 439/500
Epoch 440/500
Epoch 441/500
Epoch 442/500
Epoch 443/500
Epoch 444/500
Epoch 445/500
Epoch 446/500
Epoch 447/500
Epoch 448/500
Epoch 449/500
Epoch 450/500
Epoch 451/500
Epoch 452/500
Epoch 453/500
Epoch 454/500
Epoch 455/500
Epoch 456/500
Epoch 457/500
Epoch 458/500
Epoch 459/500
Epoch 460/500
Epoch 461/500
Epoch 462/500
Epoch 463/500
Epoch 464/500
Epoch 465/500
Epoch 466/500
Epoch 467/500
Epoch 468/500
Epoch 469/500
Epoch 470/500
Epoch 471/500
Epoch 472/500
Epoch 473/500
Epoch 

Epoch 483/500
Epoch 484/500
Epoch 485/500
Epoch 486/500
Epoch 487/500
Epoch 488/500
Epoch 489/500
Epoch 490/500
Epoch 491/500
Epoch 492/500
Epoch 493/500
Epoch 494/500
Epoch 495/500
Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500


In [276]:
gen = train_generator(one_hot_inputs,one_hot_outputs)
test_inputs, test_outputs = next(gen)
pred = transformer(test_inputs)
pred

<tf.Tensor: shape=(10, 10, 32), dtype=float32, numpy=
array([[[ 0.08211152, -0.20555604, -0.16429874, ..., -0.66962045,
         -0.5916419 , -0.5481715 ],
        [ 0.08211157, -0.20555602, -0.16429877, ..., -0.6696204 ,
         -0.59164184, -0.5481714 ],
        [ 0.08211158, -0.20555599, -0.16429877, ..., -0.66962045,
         -0.59164184, -0.5481714 ],
        ...,
        [ 0.08211157, -0.20555602, -0.16429877, ..., -0.6696204 ,
         -0.59164184, -0.5481714 ],
        [ 0.08211157, -0.20555602, -0.16429877, ..., -0.6696204 ,
         -0.59164184, -0.5481714 ],
        [ 0.08211158, -0.20555599, -0.16429877, ..., -0.66962034,
         -0.59164184, -0.54817134]],

       [[ 0.08211109, -0.20555617, -0.16429842, ..., -0.6696202 ,
         -0.5916417 , -0.5481709 ],
        [ 0.08211105, -0.20555621, -0.16429844, ..., -0.66962034,
         -0.5916417 , -0.54817086],
        [ 0.08211105, -0.20555627, -0.1642986 , ..., -0.6696203 ,
         -0.59164166, -0.5481708 ],
        ...,


In [277]:
pred[0]

<tf.Tensor: shape=(10, 32), dtype=float32, numpy=
array([[ 0.08211152, -0.20555604, -0.16429874, -1.5145501 , -0.37675244,
        -0.81116295,  3.4811935 , -0.08330107, -0.53607965, -0.7392224 ,
        -0.3941177 , -0.88468444, -0.55713075, -0.59213126, -0.7105564 ,
        -0.46688604, -0.80185425, -0.20627993, -0.3647418 ,  0.06609145,
        -0.08202042, -0.65933204, -0.81193537, -0.14005584, -0.36213696,
        -0.15784243, -0.82751   , -0.28333318, -0.8281327 , -0.66962045,
        -0.5916419 , -0.5481715 ],
       [ 0.08211157, -0.20555602, -0.16429877, -1.5145501 , -0.3767524 ,
        -0.81116307,  3.4811935 , -0.08330107, -0.5360796 , -0.7392223 ,
        -0.39411765, -0.88468456, -0.5571307 , -0.5921311 , -0.7105564 ,
        -0.46688598, -0.8018542 , -0.20627995, -0.3647417 ,  0.06609151,
        -0.08202031, -0.6593319 , -0.8119353 , -0.14005578, -0.36213687,
        -0.15784234, -0.82751   , -0.2833331 , -0.8281327 , -0.6696204 ,
        -0.59164184, -0.5481714 ],
    