In [55]:
%load_ext autoreload
%autoreload 2

from kernel_lib import *

# import importlib
# importlib.reload(kernel_lib)

In [56]:
kernel_code = """
#include <curand_kernel.h>
#include <math.h>

extern "C" __global__ void generate_random_numbers(float* numbers, int seed, int N) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if (idx < N) {
    curandState state;
    curand_init(seed, idx, 0, &state);
    numbers[idx] = curand_uniform(&state);
  }
}

extern "C" __global__ void debug_func(void) {
  printf("Debug print %f\\n", powf(2, 1));
}

// Init array with some provided value
extern "C" __global__ void init_array_w_val(float* arr, int val, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N) {
    arr[idx] = val;
  }
}

extern "C" __global__ void calc_positional_encoding(float* pos_enc, int num_rows, int num_cols) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  if (row < num_rows && col < num_cols) {
    int idx = row * num_cols + col;
    
    int token_idx = row;
    int current_dim = col;
    int token_dims = num_cols;

    pos_enc[idx] = (current_dim & 1) ?
                    sinf(token_idx) / powf(10000, (2 * current_dim) / token_dims) :
                    cosf(token_idx) / powf(10000, (2 * current_dim) / token_dims);
  }
}
"""

mod = SourceModule(kernel_code,
                   no_extern_c=True,  # This is important!
                   options=["-std=c++11",
                           "-Xcompiler",
                           "-fPIC"])

debug_func = mod.get_function("debug_func")
init_array_w_val = mod.get_function("init_array_w_val")
gen_pos_encodings = mod.get_function("calc_positional_encoding")
generate_random_numbers = mod.get_function("generate_random_numbers")

In [57]:
vocab = ["This", "is", "a", "sentence"]

pos_enc_seq_len = 10
token_dims = 10

pos_encodings_num_elements = pos_enc_seq_len * token_dims
pos_encodings_size_bytes = pos_encodings_num_elements * np.float32().nbytes

In [58]:
pos_encodings_gpu = cuda.mem_alloc(pos_encodings_size_bytes)
init_array_w_val(pos_encodings_gpu, np.int32(123), np.int32(pos_encodings_num_elements), block=(pos_encodings_num_elements,1,1))
gen_pos_encodings(pos_encodings_gpu, np.int32(pos_enc_seq_len), np.int32(token_dims), block=(token_dims, pos_enc_seq_len, 1))
cuda.Context.synchronize()

print_gpu_array(pos_encodings_gpu,
                "pos_encoding",
                pos_encodings_num_elements,
                shape=[pos_enc_seq_len, token_dims],
                verbose=True)

pos_encoding=[[ 1.0000e+00  0.0000e+00  1.0000e+00  0.0000e+00  1.0000e+00  0.0000e+00  1.0000e-04  0.0000e+00  1.0000e-04
   0.0000e+00]
 [ 5.4030e-01  8.4147e-01  5.4030e-01  8.4147e-01  5.4030e-01  8.4147e-05  5.4030e-05  8.4147e-05  5.4030e-05
   8.4147e-05]
 [-4.1615e-01  9.0930e-01 -4.1615e-01  9.0930e-01 -4.1615e-01  9.0930e-05 -4.1615e-05  9.0930e-05 -4.1615e-05
   9.0930e-05]
 [-9.8999e-01  1.4112e-01 -9.8999e-01  1.4112e-01 -9.8999e-01  1.4112e-05 -9.8999e-05  1.4112e-05 -9.8999e-05
   1.4112e-05]
 [-6.5364e-01 -7.5680e-01 -6.5364e-01 -7.5680e-01 -6.5364e-01 -7.5680e-05 -6.5364e-05 -7.5680e-05 -6.5364e-05
  -7.5680e-05]
 [ 2.8366e-01 -9.5892e-01  2.8366e-01 -9.5892e-01  2.8366e-01 -9.5892e-05  2.8366e-05 -9.5892e-05  2.8366e-05
  -9.5892e-05]
 [ 9.6017e-01 -2.7942e-01  9.6017e-01 -2.7942e-01  9.6017e-01 -2.7942e-05  9.6017e-05 -2.7942e-05  9.6017e-05
  -2.7942e-05]
 [ 7.5390e-01  6.5699e-01  7.5390e-01  6.5699e-01  7.5390e-01  6.5699e-05  7.5390e-05  6.5699e-05  7.5390e-05
  

In [59]:
sentence = "This is a sentence"
sentence_toks = [0, 1, 2, 3] # Straight forward
word2tok = {"This" : 0, "is" : 1, "a" : 2, "sentence" : 3}

In [60]:
# Embedding matrix shape : (token, vector dimensions)
vocab_size = len(vocab)
embedding_num_elements = vocab_size * token_dims
embedding_size_bytes = embedding_num_elements * np.float32().nbytes

In [61]:
embedding_matrix_gpu = cuda.mem_alloc(embedding_size_bytes)
# init_array(embedding_matrix_gpu, np.int32(embedding_num_elements), block=(embedding_num_elements, 1, 1))
generate_random_numbers(embedding_matrix_gpu, np.int32(0), np.int32(embedding_num_elements), block=(embedding_num_elements, 1, 1))
cuda.Context.synchronize()

print_gpu_array(embedding_matrix_gpu,
                "embedding matrix",
                embedding_num_elements,
                shape=[vocab_size, token_dims])


embedding matrix=[[0.7402 0.921  0.039  0.969  0.9251 0.4464 0.6673 0.1099 0.4702 0.5132]
 [0.7762 0.2948 0.714  0.3585 0.6814 0.292  0.3194 0.8109 0.1541 0.4452]
 [0.208  0.611  0.3073 0.4156 0.2343 0.8793 0.6462 0.9264 0.5786 0.5538]
 [0.3557 0.7229 0.2783 0.6192 0.5876 0.375  0.2405 0.4148 0.0937 0.6326]]


In [62]:
kernel_code = """
// Assumes embedding matrix has been sized such that Dim(embedding_matrix) < Dim(pos_enc)
extern "C" __global__ void add_pos_enc_and_embed(float* embedding_matrix, float* pos_enc, float* output, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N) {
    output[idx] = embedding_matrix[idx] + pos_enc[idx];
  }
}
"""

mod = SourceModule(kernel_code,
                   no_extern_c=True,
                   options=["-std=c++11",
                           "-Xcompiler",
                           "-fPIC"])

add_pos_enc_and_embed = mod.get_function("add_pos_enc_and_embed")

In [63]:
pos_encoded_emb_gpu = cuda.mem_alloc(embedding_size_bytes)
add_pos_enc_and_embed(embedding_matrix_gpu,
                      pos_encodings_gpu,
                      pos_encoded_emb_gpu,
                      np.int32(embedding_num_elements),
                      block=(embedding_num_elements, 1, 1))
cuda.Context.synchronize()

print_gpu_array(pos_encoded_emb_gpu,
                "pos_encoded_emb",
                embedding_num_elements,
                shape=[vocab_size, token_dims])

pos_encoded_emb=[[ 1.7402  0.921   1.039   0.969   1.9251  0.4464  0.6674  0.1099  0.4703  0.5132]
 [ 1.3165  1.1362  1.2543  1.2     1.2217  0.2921  0.3195  0.811   0.1542  0.4452]
 [-0.2082  1.5203 -0.1089  1.3249 -0.1819  0.8794  0.6462  0.9265  0.5785  0.5539]
 [-0.6343  0.864  -0.7117  0.7603 -0.4024  0.3751  0.2404  0.4148  0.0936  0.6326]]


In [64]:
# Take the input sentence
# convert to tokens (idices)
# TODO(MASAAD): Do this later, assume done for now
# Use sentence_toks

# use tokens as lookup into embedding matrix
# Add embedding element + positional encoding


In [65]:
linear_layer_code = """
// extern "C" __device__ void 

// Inputs x is a matrix and w is a vector
// Dereference the vector in x and vector multiply by w
extern "C" __global__ void linear_layer(float* x, float* w, int num_rows, int num_cols) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N) {

  }
}
"""

In [None]:
a_rows = 4
a_cols = 3

b_rows = 3
b_cols = 4

c_rows = 4
c_cols = 4

dummy_a = cuda.mem_alloc(a_rows * a_cols * 4)
dummy_b = cuda.mem_alloc(b_rows * b_cols * 4)
dummy_c = cuda.mem_alloc(c_rows * c_cols * 4)

init_array(dummy_a, np.int32(a_rows * a_cols), block=(a_rows * a_cols,1,1))
init_array(dummy_b, np.int32(b_rows * b_cols), block=(b_rows * b_cols,1,1))
init_array(dummy_c, np.int32(c_rows * c_cols), block=(c_rows * c_cols,1,1))

cuda.Context.synchronize()

host_a = np.empty(a_rows * a_cols, np.float32)
host_b = np.empty(b_rows * b_cols, np.float32)
host_c = np.empty(c_rows * c_cols, np.float32)

cuda.memcpy_dtoh(host_a, dummy_a)
cuda.memcpy_dtoh(host_b, dummy_b)

regular_matmul(dummy_a, dummy_b, np.int32(c_rows), np.int32(c_cols), np.int32(a_cols), dummy_c, block=(c_rows,c_cols,1))
cuda.Context.synchronize()
cuda.memcpy_dtoh(host_c, dummy_c)

# print_gpu_array(dummy_c, "dummy_c", c_rows * c_cols, shape=[c_rows,c_cols])

a_h = host_a.reshape(a_rows,a_cols)
b_h = host_b.reshape(b_rows,b_cols)
c_h = host_c.reshape(c_rows,c_cols)

print(f"{c_h=}")
print(f"{c_h=}")

expected_result = a_h @ b_h

print(f"{c_h=}")
print(f"{expected_result=}")

if (np.allclose(c_h, expected_result)):
  print("Matmul successful")
else:
  print("Matmul did not match GPU...")

c_h=array([[  20.,   23.,   26.,   29.],
       [  68.,   83.,   98.,  113.],
       [ 116.,  143.,  170.,  197.],
       [3784., 4630., 5476., 6322.]], dtype=float32)
expected_result=array([[ 20.,  23.,  26.,  29.],
       [ 56.,  68.,  80.,  92.],
       [ 92., 113., 134., 155.],
       [128., 158., 188., 218.]], dtype=float32)
Matmul did not match GPU...


In [13]:
# Embedding matrix = [vocab_len, token_dims]
# Technically you can make swap the dimensions of this and it will still work
# One way requires a transpose, the other doesn't
# Weights: [3 * token_dims, token_dims]
weights_dim = token_dims
weights_num_elements = 3 * token_dims * weights_dim
weights_size_bytes = weights_num_elements * np.float32().nbytes
weights_matrix_gpu = cuda.mem_alloc(weights_size_bytes)

# QKV matrix = [vocab_len, 3 * token_dims]
qkv_matrix_dim = vocab_size
qkv_matrix_num_elements = qkv_matrix_dim * 3 * token_dims
qkv_matrix_bytes = qkv_matrix_num_elements * np.float32().nbytes
qkv_matrix_gpu = cuda.mem_alloc(qkv_matrix_bytes)

regular_matmul(embedding_matrix_gpu, weights_matrix_gpu, np.int32(), np.int32())




ValueError: must specify block size