In [578]:
%load_ext autoreload
%autoreload 2

from kernel_lib import *
from matrix import Matrix
import math

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [579]:
vocab = ["This", "is", "a", "sentence"]
vocab_size = len(vocab)

pos_enc_sequence_len = 10
token_dims = vocab_size

In [580]:
pos_encodings = Matrix(pos_enc_sequence_len, token_dims, np.float32, gpu=True)
pos_encodings.alloc_on_gpu()
gen_pos_encodings(pos_encodings.a_gpu,
                  np.int32(pos_encodings.num_rows),
                  np.int32(pos_encodings.num_cols),
                  block=(pos_encodings.num_cols, pos_encodings.num_rows, 1))
cuda.Context.synchronize()

print(f"{pos_encodings=}")

pos_encodings=[[ 1.00000000e+00  0.00000000e+00  9.99999975e-05  0.00000000e+00]
 [ 5.40302277e-01  8.41471016e-01  5.40302281e-05  8.41471046e-05]
 [-4.16146815e-01  9.09297466e-01 -4.16146831e-05  9.09297451e-05]
 [-9.89992499e-01  1.41120002e-01 -9.89992477e-05  1.41119999e-05]
 [-6.53643668e-01 -7.56802499e-01 -6.53643656e-05 -7.56802474e-05]
 [ 2.83662170e-01 -9.58924294e-01  2.83662175e-05 -9.58924284e-05]
 [ 9.60170269e-01 -2.79415488e-01  9.60170291e-05 -2.79415490e-05]
 [ 7.53902256e-01  6.56986594e-01  7.53902277e-05  6.56986595e-05]
 [-1.45500034e-01  9.89358246e-01 -1.45500035e-05  9.89358232e-05]
 [-9.11130250e-01  4.12118495e-01 -9.11130264e-05  4.12118497e-05]]


In [581]:
sentence = "This is a sentence"
sentence_toks = [0, 1, 2, 3] # Straight forward
word2tok = {"This" : 0, "is" : 1, "a" : 2, "sentence" : 3}

In [582]:
embeddings = Matrix(vocab_size, token_dims, np.float32, gpu=True)
embeddings.alloc_on_gpu()
embeddings.init_uniform_rand(math.sqrt(6.0 / (embeddings.num_rows + embeddings.num_cols)))

cuda.Context.synchronize()

print(f"{embeddings=}")


embeddings=[[ 0.41607213  0.72918266 -0.7984399   0.81226754]
 [ 0.736365   -0.09292436  0.28980532 -0.67561984]
 [-0.0515829   0.0228521   0.47834936 -0.35547632]
 [ 0.37067556 -0.24508198  0.31422633 -0.3602407 ]]


In [583]:
#TODO: Add a function that adds two matrices but has the ability to "scale" down matrices based on which one is bigger, etc
# Special "trimmed" matrix add...
kernel_code = """
// Assumes embedding matrix has been sized such that Dim(embedding_matrix) < Dim(pos_enc)
extern "C" __global__ void add_pos_enc_and_embed(float* embedding_matrix, float* pos_enc, float* output, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N) {
    output[idx] = embedding_matrix[idx] + pos_enc[idx];
  }
}
"""

mod = SourceModule(kernel_code,
                   no_extern_c=True,
                   options=["-std=c++11",
                           "-Xcompiler",
                           "-fPIC"])

add_pos_enc_and_embed = mod.get_function("add_pos_enc_and_embed")

In [584]:
embeddings_w_pos = Matrix(embeddings.num_rows, embeddings.num_cols, np.float32, gpu=True)
embeddings_w_pos.alloc_on_gpu()
add_pos_enc_and_embed(embeddings.a_gpu,
                      pos_encodings.a_gpu,
                      embeddings_w_pos.a_gpu,
                      np.int32(embeddings.num_elements()),
                      block=(embeddings.num_elements(), 1, 1))
cuda.Context.synchronize()

print(f"{embeddings_w_pos=}")

embeddings_w_pos=[[ 1.4160721   0.72918266 -0.7983399   0.81226754]
 [ 1.2766674   0.74854666  0.28985935 -0.6755357 ]
 [-0.46772972  0.9321496   0.47830775 -0.3553854 ]
 [-0.61931694 -0.10396197  0.31412733 -0.36022657]]


In [585]:
test_a = Matrix(4,4,np.float32,gpu=True)
test_a.alloc_on_gpu()
test_a.init_incremental()

test_b = Matrix(4,4,np.float32,gpu=True)
test_b.alloc_on_gpu()
test_b.init_incremental()

test_c = test_a * test_b
print(test_c)

[[ 56.  62.  68.  74.]
 [152. 174. 196. 218.]
 [248. 286. 324. 362.]
 [344. 398. 452. 506.]]


In [586]:
print(f"Initially: {test_c=}")
test_c = test_c / 2
print(f"After scalar divide: {test_c=}")

Initially: test_c=[[ 56.  62.  68.  74.]
 [152. 174. 196. 218.]
 [248. 286. 324. 362.]
 [344. 398. 452. 506.]]
After scalar divide: test_c=[[ 28.  31.  34.  37.]
 [ 76.  87.  98. 109.]
 [124. 143. 162. 181.]
 [172. 199. 226. 253.]]


In [587]:
# Embedding matrix = [vocab_size, token_dims]
# Technically you can make swap the dimensions of this and it will still work
# One way requires a transpose, the other doesn't
# Weights: [3 * token_dims, token_dims]

#TODO: Make a weight transpose instead for learnings... Even though a bit less efficient...
# weights = Matrix(3 * embeddings.num_cols, embeddings.num_cols, np.float32, gpu=True)
# weights.alloc_on_gpu()
weights_t = Matrix(embeddings_w_pos.num_cols, 3 * embeddings_w_pos.num_cols, np.float32, gpu=True)
weights_t.alloc_on_gpu()

# QKV matrix = [vocab_size, 3 * token_dims]
QKV = Matrix(embeddings_w_pos.num_rows, weights_t.num_cols, np.float32, gpu=True)
QKV.alloc_on_gpu()

regular_matmul(embeddings_w_pos.a_gpu,
               weights_t.a_gpu,
               np.int32(QKV.num_rows),
               np.int32(QKV.num_cols),
               np.int32(embeddings_w_pos.num_cols),
               QKV.a_gpu,
               block=(QKV.num_cols, QKV.num_rows, 1))

b = Matrix(QKV.num_cols, 1, np.float32, gpu=True)
b.alloc_on_gpu()
# TODO: Fix the scale here..?
b.init_uniform_rand(math.sqrt(6.0 / (embeddings.num_rows + embeddings.num_cols)))

QKV_b = Matrix(QKV.num_rows, QKV.num_cols, np.float32, gpu=True)
QKV_b.alloc_on_gpu()

add_matrix_w_vector(QKV.a_gpu,
                    b.a_gpu,
                    np.int32(QKV.num_rows),
                    np.int32(QKV.num_cols),
                    QKV_b.a_gpu,
                    block=(QKV.num_cols, QKV.num_rows, 1))

Q = Matrix(QKV_b.num_rows, QKV_b.num_cols / 3, np.float32, gpu=True)
Q.set_gpu_matrix(QKV_b.a_gpu, stride=QKV_b.num_cols, start_idx=(0 * QKV_b.num_cols) / 3)

K = Matrix(QKV_b.num_rows, QKV_b.num_cols / 3, np.float32, gpu=True)
K.set_gpu_matrix(QKV_b.a_gpu, stride=QKV_b.num_cols, start_idx=(1 * QKV_b.num_cols) / 3)

V = Matrix(QKV_b.num_rows, QKV_b.num_cols / 3, np.float32, gpu=True)
V.set_gpu_matrix(QKV_b.a_gpu, stride=QKV_b.num_cols, start_idx=(2 * QKV_b.num_cols) / 3)

score_scaled = (Q * K.transpose()) / math.sqrt(Q.num_cols)

print(f"{score_scaled=}")

score_scaled=[[ 1.1225320e+11 -2.0338322e+07 -3.3078220e+07  1.0120327e+11]
 [-2.0338322e+07  3.8308286e+03  6.2350864e+03 -1.8336256e+07]
 [-3.3078220e+07  6.2350864e+03  1.0151939e+04 -2.9822062e+07]
 [ 1.0120327e+11 -1.8336256e+07 -2.9822062e+07  9.1241071e+10]]


In [588]:
test_a = Matrix(4,4,np.float32,gpu=True)
test_a.alloc_on_gpu()
test_a.init_incremental()

test_output = Matrix(4,1,np.float32,gpu=True)
test_output.alloc_on_gpu()
test_output.init_incremental()

print(f"{test_a=}")
print(f"Before: {test_output=}")

matrix_row_wise_add(test_a.a_gpu,
        np.int32(test_a.num_rows),
        np.int32(test_a.num_cols),
        test_output.a_gpu,
        block=(test_a.num_cols,test_a.num_rows,1),
        shared=test_a.num_elements() * test_a.dtype().nbytes)

print(f"After: {test_output=}")

test_a=[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]
 [12. 13. 14. 15.]]
Before: test_output=[[0.]
 [1.]
 [2.]
 [3.]]
After: test_output=[[ 6.]
 [22.]
 [38.]
 [54.]]


In [589]:
print(f"{score=}")
print(f"{score_scaled=}")

score_scaled_row_sum = Matrix(score_scaled.num_rows, 1, np.float32, gpu=True)
score_scaled_row_sum.alloc_on_gpu()

matrix_row_wise_add(score_scaled.a_gpu,
        np.int32(score_scaled.num_rows),
        np.int32(score_scaled.num_cols),
        score_scaled_row_sum.a_gpu,
        block=(score_scaled.num_cols, score_scaled.num_rows, 1),
        shared=score_scaled.num_elements() * score_scaled.dtype().nbytes)

score=[[329311.47 214482.02 353902.7  328667.47]
 [214482.02 139694.45 230497.36 214062.28]
 [353902.7  230497.36 380346.25 353212.4 ]
 [328667.47 214062.28 353212.4  328024.97]]
score_scaled=[[ 1.1225320e+11 -2.0338322e+07 -3.3078220e+07  1.0120327e+11]
 [-2.0338322e+07  3.8308286e+03  6.2350864e+03 -1.8336256e+07]
 [-3.3078220e+07  6.2350864e+03  1.0151939e+04 -2.9822062e+07]
 [ 1.0120327e+11 -1.8336256e+07 -2.9822062e+07  9.1241071e+10]]


In [590]:
score_scaled_row_max = Matrix(score_scaled.num_rows, 1, np.float32, gpu=True)
score_scaled_row_max.alloc_on_gpu()

matrix_row_wise_max(score_scaled.a_gpu,
        np.int32(score_scaled.num_rows),
        np.int32(score_scaled.num_cols),
        score_scaled_row_max.a_gpu,
        block=(score_scaled.num_cols, score_scaled.num_rows, 1),
        shared=score_scaled.num_elements() * score_scaled.dtype().nbytes)

In [591]:
def check_softmax(score_mat, score_row_max, score_row_sum):
  score_mat.copy_d_to_h()
  score_row_max.copy_d_to_h()
  score_row_sum.copy_d_to_h()

  print(f"{score_mat=}")
  print(f"{score_row_max=}")
  print(f"{score_row_sum=}")

  scores = score_mat.a_host / 1.0

  exp_score = np.exp(scores - score_row_max.a_host)
  softmax = exp_score / score_row_sum.a_host

  print(f"{softmax=}")

check_softmax(score_scaled, score_scaled_row_max, score_scaled_row_sum)

Allocating on host!
Allocating on host!
Allocating on host!
score_mat=[[ 1.1225320e+11 -2.0338322e+07 -3.3078220e+07  1.0120327e+11]
 [-2.0338322e+07  3.8308286e+03  6.2350864e+03 -1.8336256e+07]
 [-3.3078220e+07  6.2350864e+03  1.0151939e+04 -2.9822062e+07]
 [ 1.0120327e+11 -1.8336256e+07 -2.9822062e+07  9.1241071e+10]]
score_row_max=[[1.1225320e+11]
 [6.2350864e+03]
 [1.0151939e+04]
 [1.0120327e+11]]
score_row_sum=[[ 2.1340304e+11]
 [-3.8664512e+07]
 [-6.2883896e+07]
 [ 1.9239620e+11]]
softmax=array([[ 4.6859686e-12,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
       [-0.0000000e+00, -0.0000000e+00, -2.5863510e-08, -0.0000000e+00],
       [-0.0000000e+00, -0.0000000e+00, -1.5902323e-08, -0.0000000e+00],
       [ 5.1976080e-12,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00]],
      dtype=float32)


In [592]:
score_scaled_softmaxed = Matrix(score_scaled.num_rows, score_scaled.num_cols, np.float32, gpu=True)
score_scaled_softmaxed.alloc_on_gpu()

print(f"{score_scaled=}")
print(f"{score_scaled_row_max=}")
print(f"{score_scaled_row_sum=}")

softmax(score_scaled.a_gpu,
        score_scaled_row_max.a_gpu,
        score_scaled_row_sum.a_gpu,
        np.int32(score_scaled.num_rows),
        np.int32(score_scaled.num_cols),
        score_scaled_softmaxed.a_gpu,
        block=(score_scaled_softmaxed.num_cols, score_scaled_softmaxed.num_rows, 1))

print(f"{score_scaled_softmaxed=}")

score_scaled=[[ 1.1225320e+11 -2.0338322e+07 -3.3078220e+07  1.0120327e+11]
 [-2.0338322e+07  3.8308286e+03  6.2350864e+03 -1.8336256e+07]
 [-3.3078220e+07  6.2350864e+03  1.0151939e+04 -2.9822062e+07]
 [ 1.0120327e+11 -1.8336256e+07 -2.9822062e+07  9.1241071e+10]]
score_scaled_row_max=[[1.1225320e+11]
 [6.2350864e+03]
 [1.0151939e+04]
 [1.0120327e+11]]
score_scaled_row_sum=[[ 2.1340304e+11]
 [-3.8664512e+07]
 [-6.2883896e+07]
 [ 1.9239620e+11]]
score_scaled_softmaxed=[[1.4168753e+05 2.1128656e+05 3.4863278e+05 4.0321256e+05]
 [4.0000000e+00 5.0000000e+00 6.0000000e+00 7.0000000e+00]
 [8.0000000e+00 9.0000000e+00 1.0000000e+01 1.1000000e+01]
 [1.2000000e+01 1.3000000e+01 1.4000000e+01 1.5000000e+01]]
