In [217]:
%load_ext autoreload
%autoreload 2

from kernel_lib import *
from matrix import Matrix
import math

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [218]:
vocab = ["This", "is", "a", "sentence"]
vocab_size = len(vocab)

pos_enc_sequence_len = 10
token_dims = vocab_size

In [219]:
pos_encodings = Matrix(pos_enc_sequence_len, token_dims, np.float32, gpu=True)
pos_encodings.alloc_on_gpu()
gen_pos_encodings(pos_encodings.a_gpu,
                  np.int32(pos_encodings.num_rows),
                  np.int32(pos_encodings.num_cols),
                  block=(pos_encodings.num_cols, pos_encodings.num_rows, 1))
cuda.Context.synchronize()

print(pos_encodings)

[[ 0.          1.          0.          1.        ]
 [ 0.841471    0.5403023   0.00999983  0.99995   ]
 [ 0.90929747 -0.4161468   0.01999867  0.9998    ]
 [ 0.14112    -0.9899925   0.0299955   0.99955004]
 [-0.7568025  -0.65364367  0.03998933  0.9992001 ]
 [-0.9589243   0.28366217  0.04997917  0.99875027]
 [-0.2794155   0.96017027  0.059964    0.99820054]
 [ 0.6569866   0.75390226  0.06994285  0.997551  ]
 [ 0.98935825 -0.14550003  0.07991469  0.99680173]
 [ 0.4121185  -0.91113025  0.08987855  0.9959527 ]]


In [220]:
rows_np = np.arange(pos_enc_sequence_len)
rows_np = rows_np[:, np.newaxis]

pos_cols_np = np.arange(token_dims)
pos_cols_np = pos_cols_np[np.newaxis, :]
pos_cols_np = np.power(10000, (2 * (pos_cols_np // 2)) / token_dims)

pos_enc_pre_sin = rows_np / pos_cols_np

pos_encoding_np = np.zeros(pos_enc_pre_sin.shape)
pos_encoding_np[:, 0::2] = np.sin(pos_enc_pre_sin[:, 0::2])
pos_encoding_np[:, 1::2] = np.cos(pos_enc_pre_sin[:, 1::2])

if (pos_encodings.compare(pos_encoding_np)):
  print("They are similar!")
else:
  print("They are not similar!")

They are similar!


In [221]:
sentence = "This is a sentence"
sentence_toks = [0, 1, 2, 3] # Straight forward
word2tok = {"This" : 0, "is" : 1, "a" : 2, "sentence" : 3}

In [222]:
def xavier_uniform(fan_in, fan_out):
  return math.sqrt(6 / (fan_in + fan_out))

In [223]:
embeddings = Matrix(vocab_size, token_dims, np.float32, gpu=True)
embeddings.alloc_on_gpu()
embeddings_scale = xavier_uniform(embeddings.num_rows, embeddings.num_cols)
embeddings.init_uniform_rand(embeddings_scale)

cuda.Context.synchronize()

print(f"{embeddings=}")


embeddings=[[ 0.41607213  0.72918266 -0.7984399   0.81226754]
 [ 0.736365   -0.09292436  0.28980532 -0.67561984]
 [-0.0515829   0.0228521   0.47834936 -0.35547632]
 [ 0.37067556 -0.24508198  0.31422633 -0.3602407 ]]


In [224]:
embeddings.copy_d_to_h()
embeddings_np = embeddings.a_host.copy()
print(embeddings_np)

[[ 0.41607213  0.72918266 -0.7984399   0.81226754]
 [ 0.736365   -0.09292436  0.28980532 -0.67561984]
 [-0.0515829   0.0228521   0.47834936 -0.35547632]
 [ 0.37067556 -0.24508198  0.31422633 -0.3602407 ]]


In [225]:
#TODO: Add a function that adds two matrices but has the ability to "scale" down matrices based on which one is bigger, etc
# Special "trimmed" matrix add...
kernel_code = """
// Assumes embedding matrix has been sized such that Dim(embedding_matrix) < Dim(pos_enc)
extern "C" __global__ void add_pos_enc_and_embed(float* embedding_matrix, float* pos_enc, float* output, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N) {
    output[idx] = embedding_matrix[idx] + pos_enc[idx];
  }
}
"""

mod = SourceModule(kernel_code,
                   no_extern_c=True,
                   options=["-std=c++11",
                           "-Xcompiler",
                           "-fPIC"])

add_pos_enc_and_embed = mod.get_function("add_pos_enc_and_embed")

In [226]:
embeddings_w_pos = Matrix(embeddings.num_rows, embeddings.num_cols, np.float32, gpu=True)
embeddings_w_pos.alloc_on_gpu()
add_pos_enc_and_embed(embeddings.a_gpu,
                      pos_encodings.a_gpu,
                      embeddings_w_pos.a_gpu,
                      np.int32(embeddings.num_elements()),
                      block=(embeddings.num_elements(), 1, 1))
cuda.Context.synchronize()

print(f"{embeddings_w_pos=}")

embeddings_w_pos=[[ 0.41607213  1.7291827  -0.7984399   1.8122675 ]
 [ 1.577836    0.44737792  0.29980516  0.32433015]
 [ 0.8577146  -0.39329472  0.49834803  0.6443237 ]
 [ 0.5117956  -1.2350745   0.34422183  0.63930935]]


In [227]:
trimmed_pos_enc_np = pos_encoding_np[:4, :]
print(f"{embeddings_np=}")
print(f"{trimmed_pos_enc_np=}")
embeddings_w_pos_np = embeddings_np + trimmed_pos_enc_np
print(f"{embeddings_w_pos_np=}")

if (embeddings_w_pos.compare(embeddings_w_pos_np)):
  print("Embeddings are similar!")
else:
  print("Embeddings not similar!")

embeddings_np=array([[ 0.41607213,  0.72918266, -0.7984399 ,  0.81226754],
       [ 0.736365  , -0.09292436,  0.28980532, -0.67561984],
       [-0.0515829 ,  0.0228521 ,  0.47834936, -0.35547632],
       [ 0.37067556, -0.24508198,  0.31422633, -0.3602407 ]],
      dtype=float32)
trimmed_pos_enc_np=array([[ 0.        ,  1.        ,  0.        ,  1.        ],
       [ 0.84147098,  0.54030231,  0.00999983,  0.99995   ],
       [ 0.90929743, -0.41614684,  0.01999867,  0.99980001],
       [ 0.14112001, -0.9899925 ,  0.0299955 ,  0.99955003]])
embeddings_w_pos_np=array([[ 0.41607213,  1.72918266, -0.79843992,  1.81226754],
       [ 1.57783601,  0.44737795,  0.29980516,  0.32433016],
       [ 0.85771453, -0.39329474,  0.49834802,  0.64432369],
       [ 0.51179557, -1.23507447,  0.34422183,  0.63930934]])
Embeddings are similar!


In [228]:
test_a = Matrix(4,4,np.float32,gpu=True)
test_a.alloc_on_gpu()
test_a.init_incremental()

test_b = Matrix(4,4,np.float32,gpu=True)
test_b.alloc_on_gpu()
test_b.init_incremental()

test_c = test_a * test_b
print(test_c)

[[ 56.  62.  68.  74.]
 [152. 174. 196. 218.]
 [248. 286. 324. 362.]
 [344. 398. 452. 506.]]


In [229]:
print(f"Initially: {test_c=}")
test_c = test_c / 2
print(f"After scalar divide: {test_c=}")

Initially: test_c=[[ 56.  62.  68.  74.]
 [152. 174. 196. 218.]
 [248. 286. 324. 362.]
 [344. 398. 452. 506.]]
After scalar divide: test_c=[[ 28.  31.  34.  37.]
 [ 76.  87.  98. 109.]
 [124. 143. 162. 181.]
 [172. 199. 226. 253.]]


In [230]:
# Embedding matrix = [vocab_size, token_dims]
# Technically you can make swap the dimensions of this and it will still work
# One way requires a transpose, the other doesn't
# Weights: [3 * token_dims, token_dims]

#TODO: Make a weight transpose instead for learnings... Even though a bit less efficient...
# weights = Matrix(3 * embeddings.num_cols, embeddings.num_cols, np.float32, gpu=True)
# weights.alloc_on_gpu()
weights_t = Matrix(embeddings_w_pos.num_cols, 3 * embeddings_w_pos.num_cols, np.float32, gpu=True)
weights_t.alloc_on_gpu()
weights_scale = xavier_uniform(weights_t.num_rows, weights_t.num_cols)
weights_t.init_uniform_rand(weights_scale)

# QKV matrix = [vocab_size, 3 * token_dims]
QKV = embeddings_w_pos * weights_t

bias_scale = xavier_uniform(QKV.num_cols, 1)

b = Matrix(QKV.num_cols, 1, np.float32, gpu=True)
b.alloc_on_gpu()
b.init_uniform_rand(bias_scale)

QKV_b = Matrix(QKV.num_rows, QKV.num_cols, np.float32, gpu=True)
QKV_b.alloc_on_gpu()

add_matrix_w_vector(QKV.a_gpu,
                    b.a_gpu,
                    np.int32(QKV.num_rows),
                    np.int32(QKV.num_cols),
                    QKV_b.a_gpu,
                    block=(QKV.num_cols, QKV.num_rows, 1))

Q = Matrix(QKV_b.num_rows, QKV_b.num_cols / 3, np.float32, gpu=True)
Q.set_gpu_matrix(QKV_b.a_gpu, stride=QKV_b.num_cols, start_idx=(0 * QKV_b.num_cols) / 3)

K = Matrix(QKV_b.num_rows, QKV_b.num_cols / 3, np.float32, gpu=True)
K.set_gpu_matrix(QKV_b.a_gpu, stride=QKV_b.num_cols, start_idx=(1 * QKV_b.num_cols) / 3)

V = Matrix(QKV_b.num_rows, QKV_b.num_cols / 3, np.float32, gpu=True)
V.set_gpu_matrix(QKV_b.a_gpu, stride=QKV_b.num_cols, start_idx=(2 * QKV_b.num_cols) / 3)

score_scaled = (Q * K.transpose()) / math.sqrt(Q.num_cols)

print(f"{Q=}")
print(f"{K=}")
print(f"{V=}")
# print(f"{score_scaled=}")

Q=[[ 0.5858841  -0.07326937 -1.5218644   0.3128903 ]
 [-0.14234573  1.1255311  -0.9151788  -1.2557507 ]
 [-1.4943337   0.22304492  0.5137239  -0.76285994]
 [ 0.7072086   1.4134607  -1.5254618   1.6387186 ]]
K=[[ 0.5858841  -0.07326937 -1.5218644   0.3128903 ]
 [-0.14234573  1.1255311  -0.9151788  -1.2557507 ]
 [-1.4943337   0.22304492  0.5137239  -0.76285994]
 [ 0.7072086   1.4134607  -1.5254618   1.6387186 ]]
V=[[ 0.5858841  -0.07326937 -1.5218644   0.3128903 ]
 [-0.14234573  1.1255311  -0.9151788  -1.2557507 ]
 [-1.4943337   0.22304492  0.5137239  -0.76285994]
 [ 0.7072086   1.4134607  -1.5254618   1.6387186 ]]


In [231]:
b.copy_d_to_h()
b_np = b.a_host.copy()
b_np = b_np.T

# weights_t_np = np.random.uniform(low=-weights_scale, high=weights_scale, size=(weights_t.num_rows, weights_t.num_cols))
weights_t.copy_d_to_h()
weights_t_np = weights_t.a_host.copy()

QKV_np = embeddings_w_pos_np @ weights_t_np
QKV_b_np = QKV_np + b_np

split_dim = int(QKV_b_np.shape[1] / 3)

Q_np = QKV_b_np[:, : split_dim]
K_np = QKV_b_np[:, split_dim : 2 * split_dim]
V_np = QKV_b_np[:, 2 * split_dim :]

print(f"{Q_np=}")
print(f"{K_np=}")
print(f"{V_np=}")

Q_np=array([[ 0.58588406, -0.07326933, -1.52186439,  0.3128903 ],
       [ 0.70720858,  1.4134607 , -1.52546181,  1.63871851],
       [ 0.10867436,  1.24667903, -1.4293614 ,  1.59489841],
       [-0.16197938,  1.14311773, -1.44620626,  1.52932941]])
K_np=array([[-0.14234572,  1.12553115, -0.91517874, -1.2557506 ],
       [ 1.24368024,  0.12450988,  0.19428861, -1.26649397],
       [ 0.98947497, -0.02572972,  0.25539853, -0.84585492],
       [ 0.98203293, -0.33542146,  0.57010891, -0.66561129]])
V_np=array([[-1.49433365,  0.22304491,  0.51372392, -0.7628599 ],
       [-0.52501754,  0.16229238,  0.9234576 , -0.82537725],
       [-0.43512996,  0.0794783 ,  0.98639025, -0.64482916],
       [-0.07674777, -0.06324649,  1.05018927, -0.44635892]])


In [232]:
test_a = Matrix(4,4,np.float32,gpu=True)
test_a.alloc_on_gpu()
test_a.init_incremental()

test_output = Matrix(4,1,np.float32,gpu=True)
test_output.alloc_on_gpu()
test_output.init_incremental()

print(f"{test_a=}")
print(f"Before: {test_output=}")

matrix_row_wise_add(test_a.a_gpu,
        np.int32(test_a.num_rows),
        np.int32(test_a.num_cols),
        test_output.a_gpu,
        block=(test_a.num_cols,test_a.num_rows,1),
        shared=test_a.num_elements() * test_a.dtype().nbytes)

print(f"After: {test_output=}")

test_a=[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]
 [12. 13. 14. 15.]]
Before: test_output=[[0.]
 [1.]
 [2.]
 [3.]]
After: test_output=[[ 6.]
 [22.]
 [38.]
 [54.]]


In [233]:
score_scaled_row_sum = Matrix(score_scaled.num_rows, 1, np.float32, gpu=True)
score_scaled_row_sum.alloc_on_gpu()

matrix_row_wise_add(score_scaled.a_gpu,
        np.int32(score_scaled.num_rows),
        np.int32(score_scaled.num_cols),
        score_scaled_row_sum.a_gpu,
        block=(score_scaled.num_cols, score_scaled.num_rows, 1),
        shared=score_scaled.num_elements() * score_scaled.dtype().nbytes)

In [234]:
score_scaled_row_max = Matrix(score_scaled.num_rows, 1, np.float32, gpu=True)
score_scaled_row_max.alloc_on_gpu()

matrix_row_wise_max(score_scaled.a_gpu,
        np.int32(score_scaled.num_rows),
        np.int32(score_scaled.num_cols),
        score_scaled_row_max.a_gpu,
        block=(score_scaled.num_cols, score_scaled.num_rows, 1),
        shared=score_scaled.num_elements() * score_scaled.dtype().nbytes)

In [235]:
def check_softmax(score_mat, score_row_max, score_row_sum):
  score_mat.copy_d_to_h()
  score_row_max.copy_d_to_h()
  score_row_sum.copy_d_to_h()

  print(f"{score_mat=}")
  print(f"{score_row_max=}")
  print(f"{score_row_sum=}")

  scores = score_mat.a_host / 1.0

  exp_score = np.exp(scores - score_row_max.a_host)
  softmax = exp_score / score_row_sum.a_host

  print(f"{softmax=}")

check_softmax(score_scaled, score_scaled_row_max, score_scaled_row_sum)

score_mat=[[ 1.3813001   0.41700038 -0.95617914  1.572532  ]
 [ 0.41700038  1.8507723   0.47578433  0.41423714]
 [-0.95617914  0.47578433  1.564325   -1.3876597 ]
 [ 1.572532    0.41423714 -1.3876597   3.7552238 ]]
score_row_max=[[1.572532 ]
 [1.8507723]
 [1.564325 ]
 [3.7552238]]
score_row_sum=[[ 2.4146533 ]
 [ 3.157794  ]
 [-0.30372953]
 [ 4.354333  ]]
softmax=array([[ 3.4205365e-01,  1.3040799e-01,  3.3032380e-02,  4.1413814e-01],
       [ 7.5498268e-02,  3.1667677e-01,  8.0069385e-02,  7.5289935e-02],
       [-2.6477197e-01, -1.1085768e+00, -3.2924030e+00, -1.7198174e-01],
       [ 2.5890920e-02,  8.1302943e-03,  1.3413822e-03,  2.2965631e-01]],
      dtype=float32)


In [236]:
score_scaled_softmaxed = Matrix(score_scaled.num_rows, score_scaled.num_cols, np.float32, gpu=True)
score_scaled_softmaxed.alloc_on_gpu()

print(f"{score_scaled=}")
print(f"{score_scaled_row_max=}")
print(f"{score_scaled_row_sum=}")

softmax(score_scaled.a_gpu,
        score_scaled_row_max.a_gpu,
        score_scaled_row_sum.a_gpu,
        np.int32(score_scaled.num_rows),
        np.int32(score_scaled.num_cols),
        score_scaled_softmaxed.a_gpu,
        block=(score_scaled_softmaxed.num_cols, score_scaled_softmaxed.num_rows, 1))

print(f"{score_scaled_softmaxed=}")

score_scaled=[[ 1.3813001   0.41700038 -0.95617914  1.572532  ]
 [ 0.41700038  1.8507723   0.47578433  0.41423714]
 [-0.95617914  0.47578433  1.564325   -1.3876597 ]
 [ 1.572532    0.41423714 -1.3876597   3.7552238 ]]
score_scaled_row_max=[[1.572532 ]
 [1.8507723]
 [1.564325 ]
 [3.7552238]]
score_scaled_row_sum=[[ 2.4146533 ]
 [ 3.157794  ]
 [-0.30372953]
 [ 4.354333  ]]
score_scaled_softmaxed=[[1.1846696e+06 3.8521662e+05 6.3682119e+05 4.5485572e+05]
 [4.0000000e+00 5.0000000e+00 6.0000000e+00 7.0000000e+00]
 [8.0000000e+00 9.0000000e+00 1.0000000e+01 1.1000000e+01]
 [1.2000000e+01 1.3000000e+01 1.4000000e+01 1.5000000e+01]]


In [237]:
# print(score_scaled_softmaxed.shape)
# print(V.shape)

attention_output = score_scaled_softmaxed * V
print(f"{attention_output=}")

attention_output=[[ 9.2996143e+03  1.1317338e+06 -2.5221632e+06  1.4651078e+05]
 [-2.3837347e+00  1.6567074e+01 -1.8259239e+01  1.8666782e+00]
 [-3.7580822e+00  2.7322142e+01 -3.2054363e+01  1.5986719e+00]
 [-5.1324291e+00  3.8077209e+01 -4.5849487e+01  1.3306646e+00]]


In [238]:
print(f"{embeddings_w_pos=}")
print(f"{attention_output=}")
add = embeddings_w_pos + attention_output
print(f"{add}")

embeddings_w_pos=[[ 0.41607213  1.7291827  -0.7984399   1.8122675 ]
 [ 1.577836    0.44737792  0.29980516  0.32433015]
 [ 0.8577146  -0.39329472  0.49834803  0.6443237 ]
 [ 0.5117956  -1.2350745   0.34422183  0.63930935]]
attention_output=[[ 9.2996143e+03  1.1317338e+06 -2.5221632e+06  1.4651078e+05]
 [-2.3837347e+00  1.6567074e+01 -1.8259239e+01  1.8666782e+00]
 [-3.7580822e+00  2.7322142e+01 -3.2054363e+01  1.5986719e+00]
 [-5.1324291e+00  3.8077209e+01 -4.5849487e+01  1.3306646e+00]]
[[ 9.3000303e+03  1.1317355e+06 -2.5221640e+06  1.4651259e+05]
 [-8.0589867e-01  1.7014452e+01 -1.7959435e+01  2.1910083e+00]
 [-2.9003675e+00  2.6928846e+01 -3.1556015e+01  2.2429957e+00]
 [-4.6206336e+00  3.6842136e+01 -4.5505264e+01  1.9699740e+00]]
