# Intro to CUDA (using Python!)

In [26]:
import numpy as np
import cupy as cp
import matplotlib.pyplot as plt

# A quick set of polls

Go to the link below and fill out the poll

# CuPy Basics

In [12]:
x1 = cp.arange(25,dtype=cp.float32).reshape(5,5)
x1

array([[ 0.,  1.,  2.,  3.,  4.],
       [ 5.,  6.,  7.,  8.,  9.],
       [10., 11., 12., 13., 14.],
       [15., 16., 17., 18., 19.],
       [20., 21., 22., 23., 24.]], dtype=float32)

In [13]:
x2 = cp.arange(25,50,dtype=cp.float32).reshape(5,5)
x2

array([[25., 26., 27., 28., 29.],
       [30., 31., 32., 33., 34.],
       [35., 36., 37., 38., 39.],
       [40., 41., 42., 43., 44.],
       [45., 46., 47., 48., 49.]], dtype=float32)

In [14]:
y = x1+x2
y

array([[25., 27., 29., 31., 33.],
       [35., 37., 39., 41., 43.],
       [45., 47., 49., 51., 53.],
       [55., 57., 59., 61., 63.],
       [65., 67., 69., 71., 73.]], dtype=float32)

In [None]:
x3 = cp.arange(10000,dtype=cp.float32).reshape(100,100)
y = x3@x3
y

Alternatives to cupy
- pycuda
- tensorflow
- pytorch
- numba

# Data Types

## Static vs Dynamic Typing 

https://twitter.com/01k/status/1067788059989684224
<div>
<img src="./img/static_vs_dynamic.png" width="500"/>
</div>

## Duck Typing
https://stackoverflow.com/questions/4205130/what-is-duck-typing

<div>
<img src="./img/duck_typing.png" width="500"/>
</div>

pseudo-code example (don't run the next cell!)

In [None]:
A = car()
A.drive() #works!

B = semi_truck()
B.drive() #works!

C = golf_club()
C.drive() #works!

D = coffee()
D.drive() #fails!!

## Data Type Demonstration in Python

In [23]:
%load_ext nb_mypy

Version 1.0.5


In [33]:
def add_vectors(x1,x2):
    assert x1.shape[0]==x2.shape[0]
    assert x1.shape[1]==x2.shape[1]
    y = np.zeros_like(x1)
    for i in range(x1.shape[0]):
        for j in range(x1.shape[1]):
            y[i,j] = x1[i,j] + x2[i,j]
    return y

In [49]:
x1 = np.arange(25).reshape(5,5)
x1

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [50]:
x2 = np.arange(25,50).reshape(5,5)
x2

array([[25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49]])

In [51]:
add_vectors(x1,x2)

array([[25, 27, 29, 31, 33],
       [35, 37, 39, 41, 43],
       [45, 47, 49, 51, 53],
       [55, 57, 59, 61, 63],
       [65, 67, 69, 71, 73]])

In [52]:
x3 = list(x2)
add_vectors(x1,x3)

AttributeError: 'list' object has no attribute 'shape'

In [39]:
def add_vectors_mypy(x1:np.typing.NDArray,x2:np.typing.NDArray)->np.typing.NDArray:
    assert x1.shape[0]==x2.shape[0]
    assert x1.shape[1]==x2.shape[1]
    y = np.zeros_like(x1)
    for i in range(x1.shape[0]):
        for j in range(x1.shape[1]):
            y[i,j] = x1[i,j] + x2[i,j]
    return y

In [53]:
add_vectors_mypy(x1,x2)

array([[25, 27, 29, 31, 33],
       [35, 37, 39, 41, 43],
       [45, 47, 49, 51, 53],
       [55, 57, 59, 61, 63],
       [65, 67, 69, 71, 73]])

In [58]:
add_vectors_mypy(x1,x3)

<cell>1: [1m[31merror:[m Argument 2 to [m[1m"add_vectors_mypy"[m has incompatible type [m[1m"List[Any]"[m; expected [m[1m"ndarray[Any, dtype[Any]]"[m  [m[33m[arg-type][m


AttributeError: 'list' object has no attribute 'shape'

In [60]:
%nb_mypy Off

# CUDA Basics: Grids, Blocks, and Threads

# Writing CUDA Kernels

In [82]:
add_kernel = cp.RawKernel(r'''
extern "C" __global__
void cuda_add(const float* x1, const float* x2, float* y) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    y[tid] = x1[tid] + x2[tid];
}
''', 'cuda_add')

In [69]:
x1 = cp.arange(25,dtype=cp.float32).reshape(5,5)
x2 = cp.arange(25,50,dtype=cp.float32).reshape(5,5)
y = cp.zeros((5,5),dtype=cp.float32)

In [76]:
x1 = cp.arange(10000,dtype=cp.float32).reshape(100,100)
x2 = cp.arange(10000,dtype=cp.float32).reshape(100,100)
y = cp.zeros((100,100),dtype=cp.float32)

In [77]:
%%timeit
add_kernel(
    (5,), # grid shape (number of blocks in each dimension)
    (5,), # block shape (number of threads in each dimension)
    (x1,x2,y)
)
y

3.28 µs ± 34 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [78]:
%%timeit
add_kernel(
    (25,), # grid shape (number of blocks in each dimension)
    (1,), # block shape (number of threads in each dimension)
    (x1,x2,y)
)
y

3.29 µs ± 23.7 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [85]:
add_kernel_2D = cp.RawKernel(r'''
extern "C" __global__
void cuda_add(const float* x1, const float* x2, float* y) {
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    
    y[i][j] = x1[i][j] + x2[i][j];
    
}
''', 'cuda_add_2D')

In [87]:
add_kernel_2D(
    (1,), # grid shape (number of blocks in each dimension)
    (5,5), # block shape (number of threads in each dimension)
    (x1,x2,y)
)
y

CompileException: /tmp/tmp7uq2zm4p/d5b44167ce8793bd8c98eac3c07e6949ebb3a58b.cubin.cu(7): error: expression must have pointer-to-object type but it has type "float"

/tmp/tmp7uq2zm4p/d5b44167ce8793bd8c98eac3c07e6949ebb3a58b.cubin.cu(7): error: expression must have pointer-to-object type but it has type "float"

/tmp/tmp7uq2zm4p/d5b44167ce8793bd8c98eac3c07e6949ebb3a58b.cubin.cu(7): error: expression must have pointer-to-object type but it has type "float"

3 errors detected in the compilation of "/tmp/tmp7uq2zm4p/d5b44167ce8793bd8c98eac3c07e6949ebb3a58b.cubin.cu".


# Exercise: Matrix Multiplication

In [None]:
add_kernel = cp.RawKernel(r'''
extern "C" __global__
void cuda_add(const float* x1, const float* x2, float* y) {
    int row = blockDim.y * blockIdx.y + threadIdx.x;
    y[tid] = x1[tid] + x2[tid];
}
''', 'cuda_add')