In [96]:
%load_ext autoreload
%autoreload 2

from kernel_lib import *
from matrix import Matrix

# import importlib
# importlib.reload(kernel_lib)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
kernel_code = """
#include <curand_kernel.h>
#include <math.h>

extern "C" __global__ void generate_random_numbers(float* numbers, int seed, int N) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if (idx < N) {
    curandState state;
    curand_init(seed, idx, 0, &state);
    numbers[idx] = curand_uniform(&state);
  }
}

extern "C" __global__ void debug_func(void) {
  printf("Debug print %f\\n", powf(2, 1));
}

extern "C" __global__ void calc_positional_encoding(float* pos_enc, int num_rows, int num_cols) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;

  if (row < num_rows && col < num_cols) {
    int idx = row * num_cols + col;
    
    int token_idx = row;
    int current_dim = col;
    int token_dims = num_cols;

    pos_enc[idx] = (current_dim & 1) ?
                    sinf(token_idx) / powf(10000, (2 * current_dim) / token_dims) :
                    cosf(token_idx) / powf(10000, (2 * current_dim) / token_dims);
  }
}
"""

mod = SourceModule(kernel_code,
                   no_extern_c=True,  # This is important!
                   options=["-std=c++11",
                           "-Xcompiler",
                           "-fPIC"])

debug_func = mod.get_function("debug_func")
gen_pos_encodings = mod.get_function("calc_positional_encoding")
generate_random_numbers = mod.get_function("generate_random_numbers")

In [98]:
vocab = ["This", "is", "a", "sentence"]
vocab_size = len(vocab)

pos_enc_seq_len = 10
token_dims = vocab_size

pos_encodings_num_elements = pos_enc_seq_len * token_dims
pos_encodings_size_bytes = pos_encodings_num_elements * np.float32().nbytes

In [99]:
pos_encodings_gpu = cuda.mem_alloc(pos_encodings_size_bytes)
init_array_w_val(pos_encodings_gpu, np.int32(123), np.int32(pos_encodings_num_elements), block=(pos_encodings_num_elements,1,1))
gen_pos_encodings(pos_encodings_gpu, np.int32(pos_enc_seq_len), np.int32(token_dims), block=(token_dims, pos_enc_seq_len, 1))
cuda.Context.synchronize()

print_gpu_array(pos_encodings_gpu,
                "pos_encoding",
                pos_encodings_num_elements,
                shape=[pos_enc_seq_len, token_dims],
                verbose=True)

pos_encoding=[[ 1.0000e+00  0.0000e+00  1.0000e-04  0.0000e+00]
 [ 5.4030e-01  8.4147e-01  5.4030e-05  8.4147e-05]
 [-4.1615e-01  9.0930e-01 -4.1615e-05  9.0930e-05]
 [-9.8999e-01  1.4112e-01 -9.8999e-05  1.4112e-05]
 [-6.5364e-01 -7.5680e-01 -6.5364e-05 -7.5680e-05]
 [ 2.8366e-01 -9.5892e-01  2.8366e-05 -9.5892e-05]
 [ 9.6017e-01 -2.7942e-01  9.6017e-05 -2.7942e-05]
 [ 7.5390e-01  6.5699e-01  7.5390e-05  6.5699e-05]
 [-1.4550e-01  9.8936e-01 -1.4550e-05  9.8936e-05]
 [-9.1113e-01  4.1212e-01 -9.1113e-05  4.1212e-05]]


In [100]:
sentence = "This is a sentence"
sentence_toks = [0, 1, 2, 3] # Straight forward
word2tok = {"This" : 0, "is" : 1, "a" : 2, "sentence" : 3}

In [101]:
# Embedding matrix shape : (token, vector dimensions)
embedding_num_elements = vocab_size * token_dims
embedding_size_bytes = embedding_num_elements * np.float32().nbytes

In [102]:
embedding_matrix_gpu = cuda.mem_alloc(embedding_size_bytes)
# init_array(embedding_matrix_gpu, np.int32(embedding_num_elements), block=(embedding_num_elements, 1, 1))
generate_random_numbers(embedding_matrix_gpu, np.int32(0), np.int32(embedding_num_elements), block=(embedding_num_elements, 1, 1))
cuda.Context.synchronize()

print_gpu_array(embedding_matrix_gpu,
                "embedding matrix",
                embedding_num_elements,
                shape=[vocab_size, token_dims])


embedding matrix=[[0.7402 0.921  0.039  0.969 ]
 [0.9251 0.4464 0.6673 0.1099]
 [0.4702 0.5132 0.7762 0.2948]
 [0.714  0.3585 0.6814 0.292 ]]


In [103]:
kernel_code = """
// Assumes embedding matrix has been sized such that Dim(embedding_matrix) < Dim(pos_enc)
extern "C" __global__ void add_pos_enc_and_embed(float* embedding_matrix, float* pos_enc, float* output, int N) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N) {
    output[idx] = embedding_matrix[idx] + pos_enc[idx];
  }
}
"""

mod = SourceModule(kernel_code,
                   no_extern_c=True,
                   options=["-std=c++11",
                           "-Xcompiler",
                           "-fPIC"])

add_pos_enc_and_embed = mod.get_function("add_pos_enc_and_embed")

In [104]:
pos_encoded_emb_gpu = cuda.mem_alloc(embedding_size_bytes)
add_pos_enc_and_embed(embedding_matrix_gpu,
                      pos_encodings_gpu,
                      pos_encoded_emb_gpu,
                      np.int32(embedding_num_elements),
                      block=(embedding_num_elements, 1, 1))
cuda.Context.synchronize()

print_gpu_array(pos_encoded_emb_gpu,
                "pos_encoded_emb",
                embedding_num_elements,
                shape=[vocab_size, token_dims])

pos_encoded_emb=[[ 1.7402  0.921   0.0391  0.969 ]
 [ 1.4654  1.2878  0.6674  0.11  ]
 [ 0.0541  1.4225  0.7761  0.2949]
 [-0.276   0.4996  0.6813  0.292 ]]


In [105]:
# Take the input sentence
# convert to tokens (idices)
# TODO(MASAAD): Do this later, assume done for now
# Use sentence_toks

# use tokens as lookup into embedding matrix
# Add embedding element + positional encoding


In [106]:
linear_layer_code = """
// extern "C" __device__ void 

// Inputs x is a matrix and w is a vector
// Dereference the vector in x and vector multiply by w
extern "C" __global__ void linear_layer(float* x, float* w, int num_rows, int num_cols) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N) {

  }
}
"""

In [114]:
test_a = Matrix(4,4,np.float32,gpu=True)
test_a.alloc_on_gpu()
test_a.init_incremental()

test_b = Matrix(4,4,np.float32,gpu=True)
test_b.alloc_on_gpu()
test_b.init_incremental()

test_c = test_a * test_b
print(test_c)

[[ 56.  62.  68.  74.]
 [152. 174. 196. 218.]
 [248. 286. 324. 362.]
 [344. 398. 452. 506.]]


In [115]:
print(f"Initially: {test_c=}")
test_c = test_c / 2
print(f"After scalar divide: {test_c=}")

Initially: test_c=[[ 56.  62.  68.  74.]
 [152. 174. 196. 218.]
 [248. 286. 324. 362.]
 [344. 398. 452. 506.]]
After scalar divide: test_c=[[ 28.  31.  34.  37.]
 [ 76.  87.  98. 109.]
 [124. 143. 162. 181.]
 [172. 199. 226. 253.]]


In [116]:
import math

# Embedding matrix = [vocab_size, token_dims]
# Technically you can make swap the dimensions of this and it will still work
# One way requires a transpose, the other doesn't
# Weights: [3 * token_dims, token_dims]

weights_dim = token_dims
weights_num_elements = 3 * token_dims * weights_dim
weights_size_bytes = weights_num_elements * np.float32().nbytes
weights_matrix_gpu = cuda.mem_alloc(weights_size_bytes)

# QKV matrix = [vocab_size, 3 * token_dims]
qkv_matrix_dim = vocab_size
qkv_matrix_num_elements = qkv_matrix_dim * 3 * token_dims
qkv_matrix_bytes = qkv_matrix_num_elements * np.float32().nbytes
qkv_matrix_gpu = cuda.mem_alloc(qkv_matrix_bytes)

regular_matmul(embedding_matrix_gpu, weights_matrix_gpu, np.int32(vocab_size), np.int32(3 * token_dims), np.int32(token_dims), qkv_matrix_gpu, block=(3 * token_dims, vocab_size, 1))

bias_vector_dim = 3 * token_dims
bias_size_bytes = bias_vector_dim * np.float32().nbytes
bias_vector_gpu = cuda.mem_alloc(bias_size_bytes)

generate_random_numbers(bias_vector_gpu, np.int32(0), np.int32(bias_vector_dim), block=(bias_vector_dim, 1, 1))

qkv_matrix_w_bias_gpu = cuda.mem_alloc(qkv_matrix_bytes)
add_matrix_w_vector(qkv_matrix_gpu, bias_vector_gpu, np.int32(qkv_matrix_dim), np.int32(3 * token_dims), qkv_matrix_w_bias_gpu, block=(3 * token_dims, qkv_matrix_dim, 1))

qkv_matrix = Matrix(qkv_matrix_dim, 3 * token_dims, np.float32, gpu=True)
qkv_matrix.set_gpu_matrix(qkv_matrix_w_bias_gpu)

Q = Matrix(qkv_matrix_dim, token_dims, np.float32, gpu=True)
Q.set_gpu_matrix(qkv_matrix_w_bias_gpu, stride=3*token_dims, start_idx=0)

K = Matrix(qkv_matrix_dim, token_dims, np.float32, gpu=True)
K.set_gpu_matrix(qkv_matrix_w_bias_gpu, stride=3*token_dims, start_idx=token_dims)

V = Matrix(qkv_matrix_dim, token_dims, np.float32, gpu=True)
V.set_gpu_matrix(qkv_matrix_w_bias_gpu, stride=3*token_dims, start_idx=2*token_dims)

# mul = Q * K
score = (Q * K) / math.sqrt(Q.num_cols)
print(score)

# print_gpu_array(qkv_matrix_gpu, "qkv_matrix_gpu", qkv_matrix_num_elements, shape=[qkv_matrix_dim, 3 * token_dims])

[[185758.16  213079.12  240097.53  267631.03 ]
 [ 58527.395  67138.09   75653.87   84330.98 ]
 [ 95719.59  109798.766 123723.234 137910.02 ]
 [105880.4   121453.625 136854.58  152549.8  ]]
