In [1]:
!nvcc --version
!pip3 install pycuda

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Collecting pycuda
  Downloading pycuda-2025.1.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2025.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting mako (from pycuda)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading pytools-2025.1.1-py3-none-any.whl (92 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.8/92.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90

In [2]:
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
import numpy as np

In [17]:
# CUDA C++ source for every kernel used in this notebook. Each kernel is marked
# `extern "C"` individually so that `mod.get_function(<name>)` can find it by its
# unmangled name.
kernel_code = """
#include <curand_kernel.h>

// Writes one uniformly distributed float into numbers[idx] for every idx < n.
// Every element uses the same seed with its own cuRAND subsequence (idx), so the
// output is reproducible for a given (seed, n).
extern "C" __global__ void generate_random_numbers(float* numbers, int n, int seed) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if (idx < n) {
    curandState state;
    curand_init(seed, idx, 0, &state);
    numbers[idx] = curand_uniform(&state);
  }
}

// Adds the sinusoidal positional encoding in place to a row-major
// (num_positions x embedding_dim) float buffer, one thread per element:
//   even column c: += sin(pos / 10000^(c / embedding_dim))
//   odd  column c: += cos(pos / 10000^((c - 1) / embedding_dim))
// where pos is the row index.
// NOTE(review): the original declaration stopped after `float* embedding_matrix,`
// and had no body; the two size parameters and the encoding formula were chosen
// to complete it -- confirm they match the intended design.
extern "C" __global__ void add_positional_encoding(float* embedding_matrix, int num_positions, int embedding_dim) {
  int idx = threadIdx.x + blockIdx.x * blockDim.x;

  if (idx < num_positions * embedding_dim) {
    int pos = idx / embedding_dim;
    int col = idx % embedding_dim;

    // Columns 2i and 2i+1 share the same frequency.
    float exponent = (float)(2 * (col / 2)) / (float)embedding_dim;
    float angle = (float)pos / powf(10000.0f, exponent);

    embedding_matrix[idx] += (col % 2 == 0) ? sinf(angle) : cosf(angle);
  }
}
"""

# Compile the kernels. By default PyCUDA wraps the whole source in `extern "C" { }`;
# `no_extern_c=True` turns that off so the C++ header <curand_kernel.h> is not
# included inside a C-linkage block.
mod = SourceModule(kernel_code,
                   no_extern_c=True,  # This is important!
                   options=["-std=c++11",
                           "-Xcompiler",
                           "-fPIC"])

In [22]:
# Toy corpus: one sentence and the vocabulary it is drawn from.
sentence = "This is a sentence"
vocab = ["This", "is", "a", "sentence"]

# Derive the word -> token-id mapping and the token ids of `sentence` from
# `vocab` instead of hand-writing them, so the three can never drift apart.
# A word that is missing from `vocab` raises KeyError here.
word2tok = {word: tok for tok, word in enumerate(vocab)}  # {"This": 0, "is": 1, "a": 2, "sentence": 3}
sentence_toks = [word2tok[word] for word in sentence.split()]  # [0, 1, 2, 3]

# Create the embedding matrix
vocab_size = len(vocab)
embedding_dimension = 10 # num of dimensions in each vector in the embedding matrix
embedding_num_elements = vocab_size * embedding_dimension
embedding_size_bytes = embedding_num_elements * np.dtype(np.float32).itemsize
embedding_matrix_gpu = cuda.mem_alloc(embedding_size_bytes)

# Launch configuration: one thread per matrix element, rounded up to whole blocks
# with integer ceiling division (no float round-trip through np.ceil).
THREADS_PER_BLOCK = 256
EMBEDDING_SEED = 0  # fixed seed -> the same random embeddings on every run
num_blocks = (embedding_num_elements + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK

# Fill the device buffer with random initial embedding values.
generate_random_numbers = mod.get_function("generate_random_numbers")
generate_random_numbers(embedding_matrix_gpu,
                        np.int32(embedding_num_elements),
                        np.int32(EMBEDDING_SEED),
                        block=(THREADS_PER_BLOCK, 1, 1),
                        grid=(num_blocks, 1))

# Copy the result back to the host as a flat float32 array of
# vocab_size * embedding_dimension values.
embedding_matrix_host = np.empty(embedding_num_elements, dtype=np.float32)
cuda.memcpy_dtoh(embedding_matrix_host, embedding_matrix_gpu)

# print(embedding_matrix_host)

# TODO: We need to build a method to lookup tokens in the embedding matrix
# Use word2tok

# TODO: Create Positional Encoding


[0.74021935 0.9209938  0.03902049 0.9689629  0.92514056 0.4463501
 0.6673192  0.10993068 0.4702186  0.51319367 0.77617514 0.29476565
 0.71400964 0.35850185 0.68141866 0.29201493 0.319409   0.81091344
 0.15411183 0.44516575 0.20799614 0.6109805  0.3072757  0.4155753
 0.23426796 0.87933475 0.64623165 0.92644703 0.5785622  0.55384874
 0.3557058  0.722922   0.27829847 0.6191796  0.5875587  0.37504062
 0.24048613 0.41475514 0.09369452 0.6325524 ]
