**CUDA**

Евгений Борисов <esborisov@sevsu.ru>

---

In [3]:
# One-time setup: uncomment the line below to install PyCUDA before running the notebook.
# !pip install pycuda

In [4]:
import pycuda

# Uncomment to browse the package-level help text interactively:
# help(pycuda)
# Show the PyCUDA version tuple as the cell result.
pycuda.VERSION

(2024, 1)

In [5]:
import pycuda.autoinit
import pycuda.driver as cuda

# Ask the driver how much device memory is free and how much exists in total.
free, total = cuda.mem_get_info()

# Print both figures divided by 1024**2, plus the free share as a percentage.
print(f'total {total // 1024**2}MB')
print(f'free {free // 1024**2}MB - {free * 100 / total:.2f}% ')

total 24445MB
free 24293MB - 99.38% 


In [6]:
# Number of CUDA devices the driver reports on this machine.
cuda.Device.count()

1

In [7]:
# Take a handle to the device with ordinal 0 and show its name.
device = cuda.Device(0)

print(f'device: {device.name()}')

device: Tesla P40


In [8]:
from tabulate import tabulate 

# Fetch every attribute the driver exposes for `device` (a mapping of
# attribute enum -> value) and render it as an HTML table.
attrs = device.get_attributes()
tabulate(
    [[str(name), int(value)] for name, value in attrs.items()],
    headers=['attribute', 'value'],  # label the columns so the table stands on its own
    tablefmt='html',
)

0,1
ASYNC_ENGINE_COUNT,2
CAN_MAP_HOST_MEMORY,1
CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM,1
CLOCK_RATE,1531000
COMPUTE_CAPABILITY_MAJOR,6
COMPUTE_CAPABILITY_MINOR,1
COMPUTE_MODE,0
COMPUTE_PREEMPTION_SUPPORTED,1
CONCURRENT_KERNELS,1
CONCURRENT_MANAGED_ACCESS,1


In [6]:
# Report the device name together with its CUDA compute capability.
# Device.compute_capability() is the public PyCUDA call returning the
# (major, minor) pair, so there is no need to reach into the private
# `pycuda._driver` module or to rely on `attrs` built in another cell.
print(
    '%s\n  CUDA COMPUTE CAPABILITY: %i.%i' % (
        device.name(),
        *device.compute_capability(),
    )
)

NVIDIA GeForce GTX 1060 6GB
  CUDA COMPUTE CAPABILITY: 6.1


----

In [7]:
import numpy as np
from pycuda.compiler import SourceModule

In [8]:
# Compile the CUDA C source at runtime and keep a callable handle to the kernel.
mod = SourceModule('''
// Element-wise product: dest[i] = a[i] * b[i], one thread per element.
// The kernel takes no length argument and therefore performs no bounds check:
// the launch must supply exactly as many threads as there are elements.
__global__ void multiply_them(float *dest, float *a, float *b)
{
  // Global index over the whole 1-D grid. With a single block (blockIdx.x == 0)
  // this equals threadIdx.x; with several blocks each one covers its own slice
  // instead of every block recomputing the first blockDim.x elements.
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  dest[i] = a[i] * b[i];
}
''')

multiply_them = mod.get_function('multiply_them')

In [9]:
n = 1024 # MAX_THREADS_PER_BLOCK: 1024

# Seeded generator: the inputs, and therefore the displayed result, are
# identical on every Restart & Run All.
rng = np.random.default_rng(0)
a = rng.standard_normal(n).astype(np.float32)
b = rng.standard_normal(n).astype(np.float32)
z = np.zeros(n, dtype=np.float32)

# cuda.In / cuda.Out wrap the host arrays so PyCUDA copies the inputs to the
# device before the launch and copies the result back into `z` afterwards.
# A single block of n threads is launched: one thread per element.
multiply_them(cuda.Out(z), cuda.In(a), cuda.In(b), block=(n, 1, 1), grid=(1, 1))

In [10]:
# Verify the GPU result against the same product computed by NumPy on the host.
# assert_array_equal still demands exact element-wise equality, but on failure
# it reports how many elements differ and by how much instead of a bare
# AssertionError.
np.testing.assert_array_equal(z, a * b)

In [11]:
# Display the product computed on the GPU (NumPy abbreviates long arrays in the repr).
z

array([ 2.1113584 , -0.40109885,  0.05108616, ..., -0.12756826,
        3.0181897 , -0.92047447], dtype=float32)