# Encoder

## Dataset

In [None]:
text = """
The cat lazily stretched out on the sunny windowsill.
The aroma of freshly baked cookies filled the kitchen.
The stars twinkled brightly in the clear night sky.
"""

## Vocab Size

In [None]:
words = text.lower().split()

words = [''.join(e for e in word if e.isalnum()) for word in words]

vocab_size = len(set(words))

print(f"Words: {words}")
print(f"Vocab Size: {vocab_size}")

Words: ['the', 'cat', 'lazily', 'stretched', 'out', 'on', 'the', 'sunny', 'windowsill', 'the', 'aroma', 'of', 'freshly', 'baked', 'cookies', 'filled', 'the', 'kitchen', 'the', 'stars', 'twinkled', 'brightly', 'in', 'the', 'clear', 'night', 'sky']
Vocab Size: 22


In [None]:
word_to_num = {}
counter = 1

for word in words:
    if word not in word_to_num:
        word_to_num[word] = counter
        counter += 1

print(f"Encode words: {word_to_num}")

Encode words: {'the': 1, 'cat': 2, 'lazily': 3, 'stretched': 4, 'out': 5, 'on': 6, 'sunny': 7, 'windowsill': 8, 'aroma': 9, 'of': 10, 'freshly': 11, 'baked': 12, 'cookies': 13, 'filled': 14, 'kitchen': 15, 'stars': 16, 'twinkled': 17, 'brightly': 18, 'in': 19, 'clear': 20, 'night': 21, 'sky': 22}


## Encoding

In [None]:
encoded_words = {'the': 7, 'cat': 3,
                 'lazily': 2, 'stretched': 5,
                 'out': 4, 'on': 6,
                 'sunny': 1, 'windowsill': 11,
                 'aroma': 9, 'of': 10, 'freshly': 8,
                 'baked': 20, 'cookies': 13, 'filled': 17,
                 'kitchen': 15, 'stars': 16, 'twinkled': 14,
                 'brightly': 21, 'in': 19, 'clear': 9,
                 'night': 18, 'sky': 22}

In [None]:
encoded_words = {'i': 1,
                 'drink': 2,
                 'things': 3,
                 'know': 4,
                 'when': 5,
                 'wont': 6,
                 'play': 7,
                 'out': 8,
                 'true': 9,
                 'storm': 10,
                 'brings': 11,
                 'game': 12,
                 'the': 13,
                 'win': 14,
                 'of': 15,
                 'enemy': 16,
                 'you': 17,
                 'wait': 18,
                 'thrones': 19,
                 'and': 20,
                 'or': 21,
                 'die': 22,
                 'he': 23}

## Word Embedding

In [None]:
import numpy as np

word_embeddings = {}

for word in encoded_words.keys():
    word_embeddings[word] = np.random.rand(6)

print(word_embeddings)

{'i': array([0.83326976, 0.4116421 , 0.7059852 , 0.35628425, 0.494453  ,
       0.30329869]), 'drink': array([0.2104907 , 0.08943647, 0.39845702, 0.31514905, 0.17653371,
       0.09613043]), 'things': array([0.93623006, 0.29567487, 0.40411039, 0.77489315, 0.2405066 ,
       0.81255297]), 'know': array([0.93808735, 0.3741915 , 0.48425115, 0.36750703, 0.64167477,
       0.57419308]), 'when': array([0.57098576, 0.40369725, 0.63765342, 0.2554719 , 0.16109369,
       0.24444861]), 'wont': array([0.2806265 , 0.41638408, 0.18022047, 0.71173044, 0.64462371,
       0.10324108]), 'play': array([0.3387918 , 0.48694483, 0.40335745, 0.53233174, 0.23242059,
       0.20920436]), 'out': array([0.68280454, 0.22586614, 0.8038382 , 0.10114445, 0.25228328,
       0.05541636]), 'true': array([0.0986547 , 0.09433718, 0.63339645, 0.20169918, 0.39858992,
       0.32230135]), 'storm': array([0.38636369, 0.6114414 , 0.60333219, 0.81792873, 0.1691446 ,
       0.52786631]), 'brings': array([0.1464202 , 0.9976896 

## Positional Embedding

In [None]:
import numpy as np

given_embeddings = {
    'when': np.array([0.79, 0.6, 0.96, 0.64, 0.97, 0.2]),
    'you': np.array([0.38, 0.12, 0.06, 0.79, 0.9, 0.74]),
    'play': np.array([0.01, 0.51, 0.27, 0.31, 0.56, 0.59]),
    'game': np.array([0.12, 0.6, 0.65, 0.22, 0.07, 0.37]),
    'of': np.array([0.88, 0.41, 0.79, 0.62, 0.5, 0.7]),
    'thrones': np.array([0.6, 0.33, 0.75, 0.48, 0.94, 0.21]),
}

positional_embeddings = {}

for i, word in enumerate(given_embeddings.keys()):
    positional_embeddings[word] = np.zeros(6)
    for d in range(6):
        if d % 2 == 0:
            positional_embeddings[word][d] = np.sin(i / (10000 ** (2*d / 6)))
            print(f"Word: {word}, Position: {i}, Dimension: {d}, Formula: sin({i} / (10000 ** ((2*{d}) / 6))), Value: {positional_embeddings[word][d]}")
        else:
            positional_embeddings[word][d] = np.cos(i / (10000 ** (2*d / 6)))
            print(f"Word: {word}, Position: {i}, Dimension: {d}, Formula: cos({i} / (10000 ** ((2*{d}) / 6))), Value: {positional_embeddings[word][d]}")

for word, embedding in positional_embeddings.items():
    positional_embeddings[word] = np.round(embedding, 4)

print(positional_embeddings)

Word: when, Position: 0, Dimension: 0, Formula: sin(0 / (10000 ** ((2*0) / 6))), Value: 0.0
Word: when, Position: 0, Dimension: 1, Formula: cos(0 / (10000 ** ((2*1) / 6))), Value: 1.0
Word: when, Position: 0, Dimension: 2, Formula: sin(0 / (10000 ** ((2*2) / 6))), Value: 0.0
Word: when, Position: 0, Dimension: 3, Formula: cos(0 / (10000 ** ((2*3) / 6))), Value: 1.0
Word: when, Position: 0, Dimension: 4, Formula: sin(0 / (10000 ** ((2*4) / 6))), Value: 0.0
Word: when, Position: 0, Dimension: 5, Formula: cos(0 / (10000 ** ((2*5) / 6))), Value: 1.0
Word: you, Position: 1, Dimension: 0, Formula: sin(1 / (10000 ** ((2*0) / 6))), Value: 0.8414709848078965
Word: you, Position: 1, Dimension: 1, Formula: cos(1 / (10000 ** ((2*1) / 6))), Value: 0.9989229760406304
Word: you, Position: 1, Dimension: 2, Formula: sin(1 / (10000 ** ((2*2) / 6))), Value: 0.0021544330233656045
Word: you, Position: 1, Dimension: 3, Formula: cos(1 / (10000 ** ((2*3) / 6))), Value: 0.999999995
Word: you, Position: 1, Dime

## Concat Positional & Word Embedding

In [None]:
summed_embeddings = {}

for word in given_embeddings.keys():
    summed_embeddings[word] = given_embeddings[word] + positional_embeddings[word]

print(summed_embeddings)

{'when': array([0.79, 1.6 , 0.96, 1.64, 0.97, 1.2 ]), 'you': array([1.2215, 1.1189, 0.0622, 1.79  , 0.9   , 1.74  ]), 'play': array([0.9193, 1.5057, 0.2743, 1.31  , 0.56  , 1.59  ]), 'game': array([0.2611, 1.5903, 0.6565, 1.22  , 0.07  , 1.37  ]), 'of': array([0.1232, 1.3928, 0.7986, 1.62  , 0.5   , 1.7   ]), 'thrones': array([-0.3589,  1.3032,  0.7608,  1.48  ,  0.94  ,  1.21  ])}


## Multi Head Attention

### Calculate Q, K, V with Word + Positional Embedding

In [None]:
query_weights = np.array([
    [0.52, 0.45, 0.91, 0.69],
    [0.05, 0.85, 0.37, 0.83],
    [0.49, 0.1 , 0.56, 0.61],
    [0.71, 0.64, 0.4, 0.14],
    [0.76, 0.27, 0.92, 0.67],
    [0.85, 0.56, 0.57, 0.07]
])
key_weights = np.array([
    [0.74, 0.57, 0.21, 0.73],
    [0.55, 0.16, 0.9, 0.17],
    [0.25, 0.74, 0.8, 0.98],
    [0.8, 0.73, 0.2, 0.31],
    [0.37, 0.96, 0.42, 0.08],
    [0.28, 0.41, 0.87, 0.86]
])
value_weights = np.array([
    [0.62, 0.07, 0.7, 0.95],
    [0.2, 0.97, 0.61, 0.35],
    [0.57, 0.8, 0.61, 0.5],
    [0.67, 0.35, 0.98, 0.54],
    [0.47, 0.83, 0.34, 0.94],
    [0.6, 0.69, 0.13, 0.98]
])

In [None]:
query = {}

for word in summed_embeddings.keys():
    query [word] = np.zeros(4)
    for i in range(4):
        query [word][i] = np.nansum(summed_embeddings[word] * query_weights[:, i])

print(query)

{'when': array([3.8828, 3.795 , 4.0809, 3.4222]), 'you': array([4.155503, 3.86996 , 4.09619 , 2.784864]), 'play': array([3.394928, 3.60096 , 3.49278 , 2.721271]), 'game': array([2.620872, 3.1018  , 2.526952, 2.214173]), 'of': array([3.500218, 3.44298 , 3.151664, 2.408978]), 'thrones': array([3.045024, 2.900895, 2.728133, 2.219803])}


In [None]:
key = {}

for word in summed_embeddings.keys():
    key[word] = np.zeros(4)
    for i in range(4):
        key[word][i] = np.nansum(summed_embeddings[word] * key_weights[:, i])

print(key)

{'when': array([3.7115, 4.0371, 4.1533, 3.4075]), 'you': array([3.787055, 3.805407, 3.563085, 3.266164]), 'play': array([3.277392, 3.113695, 3.648123, 3.014172]), 'game': array([2.617504, 2.408585, 3.476601, 2.666324]), 'of': array([3.013858, 3.243636, 3.931272, 3.11354 ]), 'thrones': array([2.511974, 3.045831, 3.449651, 2.279731])}


In [None]:
value = {}

for word in summed_embeddings.keys():
    value[word] = np.zeros(4)
    for i in range(4):
        value[word][i] = np.nansum(summed_embeddings[word] * value_weights[:, i])

print(value)

{'when': array([3.6317, 4.5824, 4.2076, 4.7639]), 'you': array([3.682864, 3.794698, 3.861921, 5.10094 ]), 'play': array([3.122357, 3.76472 , 3.41021 , 4.32948 ]), 'game': array([2.526447, 3.516468, 2.950818, 3.2001  ]), 'of': array([3.150546, 4.15352 , 3.401594, 4.01462 ]), 'thrones': array([2.631178, 3.980721, 2.93511 , 3.364165])}


In [None]:
query_matrix = np.array(list(query.values()))
key_matrix = np.array(list(key.values()))

single_head = np.dot(query_matrix, key_matrix.T)

print(single_head)

[[58.34215517 54.86395674 49.74465475 42.61617962 50.71009096 43.19179749]
 [57.54869491 54.15478698 49.00655143 41.8643413  49.85086005 42.70492635]
 [50.91700499 47.89308685 43.28327092 36.95824848 44.11588873 37.74852434]
 [40.28962745 37.96459475 34.14021405 29.01999836 34.78809803 29.79594224]
 [48.18911229 45.45522221 40.95070805 34.83473867 41.6074317  35.64315452]
 [41.90754329 39.54156992 35.65567235 30.36075861 36.2231957  30.95631748]]


### Scaling resultant Matrix

In [None]:
embedding_vectors = np.array(list(given_embeddings.values()))
scale = np.sqrt(6/embedding_vectors.size)
scaled_matrix = single_head * scale
print(scaled_matrix)

[[23.81808511 22.39811655 20.30817026 17.39798248 20.70230794 17.63297749]
 [23.49415631 22.1085992  20.00684084 17.09104577 20.35152839 17.43421318]
 [20.78678024 19.55227083 17.67032136 15.08814176 18.01023616 15.41077053]
 [16.44817153 15.49898091 13.93768402 11.84736472 14.20218155 12.16414248]
 [19.67312271 18.55701676 16.71805655 14.22122251 16.98616286 14.5512569 ]
 [17.10868291 16.14277832 14.55636728 12.3947278  14.78805772 12.63786369]]


### Softmax

In [None]:
def softmax(matrix):

    exp_matrix = np.exp(matrix)
    row_sums = np.sum(exp_matrix, axis=1)
    softmax_matrix = exp_matrix / row_sums[:, np.newaxis]

    return softmax_matrix

softmax_matrix = softmax(scaled_matrix)

print(softmax_matrix)

[[0.75777469 0.18317052 0.02265705 0.00123403 0.03360278 0.00156092]
 [0.75305323 0.18840216 0.02303052 0.00124736 0.03250871 0.00175803]
 [0.71147977 0.20702459 0.03152838 0.00238383 0.04429195 0.00329148]
 [0.62580719 0.24222128 0.05083346 0.00628544 0.06622462 0.008628  ]
 [0.68588579 0.22466382 0.03571765 0.00294118 0.04670033 0.00409123]
 [0.63405143 0.24134453 0.04939328 0.00568696 0.06227156 0.00725224]]


### Single Head Attention

In [None]:
value_matrix = np.array(list(value.values()))
single_head_matrix = np.dot(softmax_matrix, value_matrix)

print(single_head_matrix)

[[3.61043776 4.40292387 4.09559408 4.78650057]
 [3.61082969 4.39883395 4.09410208 4.78862463]
 [3.59899426 4.37002885 4.06801163 4.7784569 ]
 [3.57075759 4.30974275 4.01107918 4.75192832]
 [3.595188   4.35060071 4.05491432 4.77878656]
 [3.57538626 4.31477208 4.01821963 4.75808201]]


### Normalizing Single Head Matrix

In [None]:
weights_matrix = np.array([[0.8, 0.34, 0.45, 0.54, 0.07, 0.53],
                           [0.85, 0.74, 0.78, 0.5, 0.75, 0.55],
                           [0.53, 0.81, 0.55, 0.59, 0.49, 0.14],
                           [0.7, 0.6, 0.12, 0.42, 0.29, 0.87]])
normalized_single_head = np.dot(single_head_matrix, weights_matrix)
print(normalized_single_head)

[[12.15205075 10.67504404  7.88593442  8.57782907  6.94984981  9.0727788 ]
 [12.14958395 10.67221668  7.88235494  8.57600758  6.9466947   9.07237613]
 [12.09468593 10.61964296  7.83899114  8.53555008  6.90852943  9.03776196]
 [11.9721092  10.50339834  7.74476521  8.44542708  6.8257481   8.95858876]
 [12.06841618 10.59356098  7.81496041  8.51619168  6.88736981  9.02351234]
 [11.98817909 10.51816977  7.75543668  8.45723865  6.8351275   8.97016146]]


## Adding & Normalizing

In [None]:
summed_matrix = np.array(list(summed_embeddings.values()))
added_matrix = normalized_single_head + summed_matrix
print(added_matrix)

[[12.94205075 12.27504404  8.84593442 10.21782907  7.91984981 10.2727788 ]
 [13.37108395 11.79111668  7.94455494 10.36600758  7.8466947  10.81237613]
 [13.01398593 12.12534296  8.11329114  9.84555008  7.46852943 10.62776196]
 [12.2332092  12.09369834  8.40126521  9.66542708  6.8957481  10.32858876]
 [12.19161618 11.98636098  8.61356041 10.13619168  7.38736981 10.72351234]
 [11.62927909 11.82136977  8.51623668  9.93723865  7.7751275  10.18016146]]


### Normalizing resultant Matrix

In [None]:
def normalize_rows(matrix, error=0.0001):

    row_means = np.mean(matrix, axis=1)
    row_stds = np.std(matrix, axis=1)
    normalized_added_matrix = (matrix - row_means[:, np.newaxis]) / (row_stds[:, np.newaxis] + error)

    return normalized_added_matrix

normalized_added_matrix = normalize_rows(added_matrix)
print(normalized_added_matrix)

[[ 1.43703778  1.05814904 -0.88973394 -0.11043828 -1.41579016 -0.07922445]
 [ 1.52473129  0.72592405 -1.21883863  0.00541072 -1.26831526  0.23108782]
 [ 1.41479079  0.9681533  -1.04832892 -0.17768479 -1.37239017  0.21545977]
 [ 1.20607904  1.13282286 -0.80604803 -0.14224544 -1.59658456  0.20597613]
 [ 1.16991027  1.05094645 -0.90389432 -0.02139279 -1.61458187  0.31901227]
 [ 1.11601572  1.24572764 -0.98610975 -0.02655827 -1.4865541   0.13747876]]


## Feed Forward

### Linear layer

In [None]:
w = np.array([[0.5, 0.05, 0.97, 0.22, 0.56, 0.02],
              [0.17, 0.52, 0.63, 0.48, 0.06, 0.6],
              [0.53, 0.87, 0.47, 0.1, 0.31, 0.79],
              [0.83, 0.58, 0.38, 0.09, 0.64, 0.25],
              [0.81, 0.85, 0.74, 0.35, 0.31, 0.53],
              [0.25, 0.31, 0.22, 0.77, 0.57, 0.85]])
x_dot_w = np.dot(normalized_added_matrix, w)
print(x_dot_w)

[[-0.83141467 -1.44401455  0.53530495  0.16861763  0.0376792  -0.88457876]
 [-0.72528424 -1.60996506  0.47780945  0.29651481  0.26157018 -0.97126322]
 [-0.88888233 -1.54066319  0.45388127  0.3407105   0.10904293 -0.80763922]
 [-0.99138923 -1.42763658  0.3145214   0.31548249  0.02492684 -0.63963411]
 [-0.96126223 -1.467309    0.23934174  0.35005556  0.10562453 -0.6500266 ]
 [-0.9446391  -1.39069269  0.32397523  0.32803622 -0.00544774 -0.6869261 ]]


In [None]:
b = np.array([[0.42, 0.18, 0.25, 0.42, 0.35, 0.45]])
linear_layer_matrix = x_dot_w + b
print(linear_layer_matrix)

[[-0.41141467 -1.26401455  0.78530495  0.58861763  0.3876792  -0.43457876]
 [-0.30528424 -1.42996506  0.72780945  0.71651481  0.61157018 -0.52126322]
 [-0.46888233 -1.36066319  0.70388127  0.7607105   0.45904293 -0.35763922]
 [-0.57138923 -1.24763658  0.5645214   0.73548249  0.37492684 -0.18963411]
 [-0.54126223 -1.287309    0.48934174  0.77005556  0.45562453 -0.2000266 ]
 [-0.5246391  -1.21069269  0.57397523  0.74803622  0.34455226 -0.2369261 ]]


### ReLU

In [None]:
def relu(matrix):

    relu_matrix = np.maximum(0, matrix)

    return relu_matrix

relu_matrix = relu(linear_layer_matrix)
print(relu_matrix)

[[0.         0.         0.78530495 0.58861763 0.3876792  0.        ]
 [0.         0.         0.72780945 0.71651481 0.61157018 0.        ]
 [0.         0.         0.70388127 0.7607105  0.45904293 0.        ]
 [0.         0.         0.5645214  0.73548249 0.37492684 0.        ]
 [0.         0.         0.48934174 0.77005556 0.45562453 0.        ]
 [0.         0.         0.57397523 0.74803622 0.34455226 0.        ]]


In [None]:
added_matrix_2 = normalized_added_matrix + relu_matrix
print(added_matrix_2)

[[ 1.43703778  1.05814904 -0.10442899  0.47817935 -1.02811096 -0.07922445]
 [ 1.52473129  0.72592405 -0.49102917  0.72192554 -0.65674508  0.23108782]
 [ 1.41479079  0.9681533  -0.34444765  0.58302572 -0.91334723  0.21545977]
 [ 1.20607904  1.13282286 -0.24152662  0.59323705 -1.22165772  0.20597613]
 [ 1.16991027  1.05094645 -0.41455258  0.74866277 -1.15895735  0.31901227]
 [ 1.11601572  1.24572764 -0.41213452  0.72147794 -1.14200183  0.13747876]]


In [None]:
normalized_added_matrix_2 = normalize_rows(added_matrix_2)
print(normalized_added_matrix_2)

[[ 1.40545695  0.93974561 -0.48923796  0.22687547 -1.62458229 -0.45825778]
 [ 1.57140817  0.50950891 -1.10825524  0.50419347 -1.32855068 -0.14830463]
 [ 1.4005104   0.82883336 -0.85124018  0.33588638 -1.57940752 -0.13458244]
 [ 1.10571699  1.01833053 -0.62111533  0.37466476 -1.79030256 -0.08729439]
 [ 1.06157669  0.91872743 -0.8410131   0.55575153 -1.73487875  0.0398362 ]
 [ 0.9846054   1.13696363 -0.81034338  0.52118558 -1.66763766 -0.16477356]]


# Decoder

In [None]:
# embeddings = {
#     '<start>': np.array([0.31, 0.21, 0.12, 0.64, 0.98, 0.2]),
# }

In [None]:
# positional_embeddings = {}

# for i, word in enumerate(embeddings.keys()):
#     positional_embeddings[word] = np.zeros(6)
#     for d in range(6):
#         if d % 2 == 0:
#             positional_embeddings[word][d] = np.sin(i / (10000 ** (2*d / 6)))
#             print(f"Word: {word}, Position: {i}, Dimension: {d}, Formula: sin({i} / (10000 ** ((2*{d}) / 6))), Value: {positional_embeddings[word][d]}")
#         else:
#             positional_embeddings[word][d] = np.cos(i / (10000 ** (2*d / 6)))
#             print(f"Word: {word}, Position: {i}, Dimension: {d}, Formula: cos({i} / (10000 ** ((2*{d}) / 6))), Value: {positional_embeddings[word][d]}")

# for word, embedding in positional_embeddings.items():
#     positional_embeddings[word] = np.round(embedding, 4)

# print(positional_embeddings)

Word: <start>, Position: 0, Dimension: 0, Formula: sin(0 / (10000 ** ((2*0) / 6))), Value: 0.0
Word: <start>, Position: 0, Dimension: 1, Formula: cos(0 / (10000 ** ((2*1) / 6))), Value: 1.0
Word: <start>, Position: 0, Dimension: 2, Formula: sin(0 / (10000 ** ((2*2) / 6))), Value: 0.0
Word: <start>, Position: 0, Dimension: 3, Formula: cos(0 / (10000 ** ((2*3) / 6))), Value: 1.0
Word: <start>, Position: 0, Dimension: 4, Formula: sin(0 / (10000 ** ((2*4) / 6))), Value: 0.0
Word: <start>, Position: 0, Dimension: 5, Formula: cos(0 / (10000 ** ((2*5) / 6))), Value: 1.0
{'<start>': array([0., 1., 0., 1., 0., 1.])}


In [None]:
# summed_embeddings = {}

# for word in embeddings.keys():
#     summed_embeddings[word] = embeddings[word] + positional_embeddings[word]

# print(summed_embeddings)

{'<start>': array([0.31, 1.21, 0.12, 1.64, 0.98, 1.2 ])}


In [None]:
# query_weights = np.array([
#     [0.52, 0.45, 0.91, 0.69],
# ])
# key_weights = np.array([
#     [0.74, 0.57, 0.21, 0.73],
# ])
# value_weights = np.array([
#     [0.62, 0.07, 0.7, 0.95],
# ])

In [None]:
# query = {}

# for word in summed_embeddings.keys():
#     query [word] = np.zeros(4)
#     for i in range(4):
#         query [word][i] = np.nansum(summed_embeddings[word] * query_weights[:, i])

# print(query)

{'<start>': array([2.8392, 2.457 , 4.9686, 3.7674])}
