In [1]:
import numpy as np
from scipy.special import softmax

### Step 1: Represent the input

In [2]:
# Toy sizes for the walkthrough: each input vector has d_model components,
# and num_inputs such vectors are fed through the attention steps.
d_model, num_inputs = 4, 3

In [3]:
print("Step: 1\nd_model: {}\nnum_inputs: {}".format(d_model, num_inputs))

# Toy input matrix: one row per input, one column per model dimension.
# dtype=float keeps the entries floating point, matching 1.0 / 2.0 literals.
x = np.array(
    [
        [1, 0, 1, 0],  # Input 1
        [0, 2, 0, 2],  # Input 2
        [1, 1, 1, 1],  # Input 3
    ],
    dtype=float,
)
x

Step: 1
d_model: 4
num_inputs: 3


array([[1., 0., 1., 0.],
       [0., 2., 0., 2.],
       [1., 1., 1., 1.]])

### Step 2: Initializing the weight matrices

In [4]:
# Banner for Step 2: the following cells define w_query, w_key and w_value,
# each written out by hand as an array with 4 rows and 3 columns.
print("Step 2, weights 3 dimnesions x d_model=4")

Step 2, weights 3 dimnesions x d_model=4


In [5]:
print("w_query")
# Query weights: 4 rows (one per input component) by 3 columns, fixed by hand
# rather than learned.
w_query = np.array([
    [1, 0, 1],
    [1, 0, 0],
    [0, 0, 1],
    [0, 1, 1],
])
w_query

w_query


array([[1, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 1]])

In [6]:
print("w_key")
# Key weights: 4 rows (one per input component) by 3 columns, fixed by hand
# rather than learned.
w_key = np.array([
    [0, 0, 1],
    [1, 1, 0],
    [0, 1, 0],
    [1, 1, 0],
])
w_key

w_key


array([[0, 0, 1],
       [1, 1, 0],
       [0, 1, 0],
       [1, 1, 0]])

In [7]:
print("w_value")
# Value weights: 4 rows (one per input component) by 3 columns, fixed by hand
# rather than learned.
w_value = np.array([
    [0, 2, 0],
    [0, 3, 0],
    [1, 0, 3],
    [1, 1, 0],
])
w_value

w_value


array([[0, 2, 0],
       [0, 3, 0],
       [1, 0, 3],
       [1, 1, 0]])

### Step 3: Matrix multiplication to obtain Q, K, and V

>> We will now multiply the input vectors by the weight matrices to obtain a query, key, and value
vector for each input.

In [8]:
print("Step 3: Matrix multiplication to obtain Q,K,V")
print("Query: x * w_query")
# Matrix product of the inputs with the query weights: one query row per input.
Q = x @ w_query
print(Q)

Step 3: Matrix multiplication to obtain Q,K,V
Query: x * w_query
[[1. 0. 2.]
 [2. 2. 2.]
 [2. 1. 3.]]


In [9]:
print("Key: x * w_key")
# Matrix product of the inputs with the key weights: one key row per input.
K = x @ w_key
print(K)

Key: x * w_key
[[0. 1. 1.]
 [4. 4. 0.]
 [2. 3. 1.]]


In [10]:
print("Value: x * w_value")
# Matrix product of the inputs with the value weights: one value row per input.
V = x @ w_value
print(V)

Value: x * w_value
[[1. 2. 3.]
 [2. 8. 0.]
 [2. 6. 3.]]


### Step 4: Scaled attention scores

In [11]:
print("Step 4: Scaled Attention Scores")
k_d = 3
# Raw scores are the dot products of every query row with every key row
# (Q @ K.T). They are divided by sqrt(k_d) = sqrt(3), about 1.732; the scale
# factor is used as-is, with no rounding.
attention_scores = np.matmul(Q, K.T) / np.sqrt(k_d)
print(attention_scores)

Step 4: Scaled Attention Scores
[[1.15470054 2.30940108 2.30940108]
 [2.30940108 9.23760431 6.92820323]
 [2.30940108 6.92820323 5.77350269]]


### Step 5: Scaled softmax attention scores for each vector

In [12]:
print("Step 5: Scaled softmax attention_scores for each vector")
# Replace each of the three score rows with its softmax, one row at a time.
# NOTE: this overwrites attention_scores in place, so re-running this cell
# without first re-running Step 4 applies softmax a second time.
for i in range(3):
    attention_scores[i] = softmax(attention_scores[i])
for i in range(3):
    print(attention_scores[i])

Step 5: Scaled softmax attention_scores for each vector
[0.1361258 0.4319371 0.4319371]
[8.90447391e-04 9.08842647e-01 9.02669054e-02]
[0.00744489 0.75470758 0.23784753]


### Step 6: The final attention representations

In [13]:
print("Step 6: attention value obtained by score1/k_d * V")
print(V[0])
print(V[1])
print(V[2])
# Weight each value vector by the matching entry of attention_scores[0], i.e.
# the first input's score towards inputs 1, 2 and 3. Only row 0 of the scores
# is used here, so these three products belong to input 1 alone.
# (A previous `attention1 = attention_scores[0].reshape(-1,1)` assignment was
# dropped: it was overwritten on the very next line and never read.)
print("Attention 1")
attention1 = attention_scores[0][0] * V[0]
print(attention1)
print("Attention 2")
attention2 = attention_scores[0][1] * V[1]
print(attention2)
print("Attention 3")
attention3 = attention_scores[0][2] * V[2]
print(attention3)

Step 6: attention value obtained by score1/k_d * V
[1. 2. 3.]
[2. 8. 0.]
[2. 6. 3.]
Attention 1
[0.1361258  0.2722516  0.40837739]
Attention 2
[0.8638742  3.45549681 0.        ]
Attention 3
[0.8638742  2.59162261 1.2958113 ]


### Step 7: Summing up the results

In [14]:
print("Step 7: summed the results to create the first line of the output matrix")
# Element-wise sum of the three weighted value vectors computed in Step 6,
# added in the same left-to-right order as attention1 + attention2 + attention3.
attention_input1 = np.add(attention1, attention2) + attention3
print(attention_input1)

Step 7: summed the results to create the first line of the output matrix
[1.8638742  6.31937101 1.7041887 ]


### Step 8: Steps 1 to 7 for all the inputs

In [15]:
print("Step 8: Step 1 to 7 for inputs 1 to 3")
# We assume we have 3 results with learned weights (they were not trained
# in this example).
# We assume we are implementing the original Transformer paper, so we stand in
# for the real result with 3 rows of 64 values each, drawn uniformly from [0, 1).
# A locally seeded generator makes the placeholder values identical on every
# Restart & Run All instead of changing with each execution.
rng = np.random.default_rng(42)
attention_head1 = rng.random((3, 64))
print(attention_head1)

Step 8: Step 1 to 7 for inputs 1 to 3
[[8.24859659e-01 7.60556023e-01 8.83689840e-01 1.15265212e-04
  7.10441296e-01 5.89614305e-01 1.53699583e-01 7.11568853e-02
  4.40883363e-01 3.59541317e-01 2.68940293e-01 5.56302084e-01
  1.37928957e-03 8.95447209e-01 3.22348158e-01 9.91331155e-01
  6.66787366e-01 9.52685046e-01 5.44194990e-02 9.34306880e-01
  3.66149979e-02 3.99657194e-01 9.76992955e-01 5.17889447e-01
  7.63209663e-01 3.24616026e-01 2.18124783e-01 5.03773780e-01
  7.45387077e-02 7.52548805e-01 5.43039412e-01 3.12124503e-01
  8.69622622e-01 3.07280766e-01 7.57450557e-01 7.94524705e-01
  3.21616920e-01 7.95210866e-02 9.97776744e-01 2.51340285e-01
  5.52160281e-01 6.30664248e-01 1.08351210e-01 9.10212989e-01
  8.88401912e-03 2.59325293e-01 7.25931095e-01 4.60036619e-01
  6.60118657e-01 9.37065022e-01 1.14424572e-01 8.67366361e-01
  2.38109930e-01 9.26472090e-01 4.95820807e-01 3.54165605e-01
  7.84194779e-02 4.33090374e-01 9.71595440e-01 7.08742329e-01
  5.87650881e-01 5.94664849e-01 

### Step 9: The output of the heads of the attention sublayer

In [16]:
# We assume that we have trained the 8 heads of the attention sublayer. Each
# head contributes 3 output vectors (one per input word or word piece) of 64
# dimensions; 64 is the per-head size, and the 8 heads together account for
# the 8 * 64 = 512 dimensions printed below.
print("Step 9: We assume we have trained the 8 heads of the attention sublayer")
# Placeholder head outputs drawn uniformly from [0, 1). A locally seeded
# generator keeps them identical on every Restart & Run All; each head takes
# the next (3, 64) draw from the same stream, so the heads differ.
rng = np.random.default_rng(42)
z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8 = (
    rng.random((3, 64)) for _ in range(8)
)
print("shape of one head",z0h1.shape,"\ndimension of 8 heads",64*8)

Step 9: We assume we have trained the 8 heads of the attention sublayer
shape of one head (3, 64) 
dimension of 8 heads 512


### Step 10: Concatenation of the output of the heads

In [17]:
print("Step 10: Concantenation of heads 1 to 8 to obtain the original 8x64=512 ouput dimension of the model")
# Stack the eight (3, 64) head outputs side by side, giving one 512-wide row
# per input.
output_attention = np.hstack((z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8))
print(output_attention)
# Keep the shape as the cell's last expression so it is displayed on its own.
# The former `print(...), output_attention.shape` built a tuple whose first
# element was print's return value, which rendered as (None, (3, 512)).
output_attention.shape

Step 10: Concantenation of heads 1 to 8 to obtain the original 8x64=512 ouput dimension of the model
[[0.11739286 0.68214805 0.07517389 ... 0.30970564 0.48738415 0.30776542]
 [0.12471924 0.52269836 0.59861807 ... 0.14236979 0.30968904 0.62350204]
 [0.30895642 0.6722681  0.38038981 ... 0.13022529 0.30561542 0.07303612]]


(None, (3, 512))

In [18]:
# Build a Hugging Face pipeline for the English -> French translation task and
# translate one sentence, capping the generated length at 40.
# No model is named here, so presumably the library falls back to its default
# checkpoint for this task -- confirm if a specific model is required.
# NOTE(review): the saved output of this cell shows the transformers import
# failing on an undefined CUDA symbol in torch's shared library. That looks
# like an environment problem; verify the torch/CUDA install before re-running.
from transformers import pipeline

translator = pipeline(task="translation_en_to_fr")
translator("Hello, my dog is cute", max_length=40)

2023-02-06 17:05:31.710445: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-06 17:05:32.339716: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-11.2/lib64
2023-02-06 17:05:32.339778: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/local/cuda-11.2/lib64


RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
/home/adminvbdi/anaconda3/lib/python3.9/site-packages/torch/lib/libtorch_cuda_cpp.so: undefined symbol: cudaGraphRetainUserObject, version libcudart.so.11.0