# Intro to CUDA (using Python!)

In [None]:
import numpy as np
import cupy as cp

# A quick poll...

Go to the link and fill out the poll!


# Data Types

## Static vs Dynamic Typing 

https://twitter.com/01k/status/1067788059989684224
<div>
<img src="./img/static_vs_dynamic.png" width="400"/>
</div>

## Duck Typing

https://stackoverflow.com/questions/4205130/what-is-duck-typing

<div>
<img src="./img/duck_typing.png" width="400"/>
</div>

Pseudo-code example (don't run the next cell!):

In [None]:
A = car()
A.drive() #works!

B = semi_truck()
B.drive() #works!

C = golf_club()
C.drive() #works!

D = coffee()
D.drive() #fails!!

## Data Type Demonstration in Python

In [None]:
%load_ext nb_mypy

In [None]:
%nb_mypy On

In [None]:
def add_vectors(x1, x2):
    """Untyped vector-addition stub used in the data-type demonstration.

    The body is not implemented yet: both arguments are accepted without
    any type annotation or check, and the call returns ``None``.
    """
    return None

In [None]:
x1 = 
x1

In [None]:
x2 = 
x2

In [None]:
add_vectors(x1,x2)

In [None]:
x3 = list(x2)
add_vectors(x1,x3)

In [None]:
def add_vectors_mypy(x1: np.ndarray, x2: np.ndarray) -> np.ndarray:
    """Add two vectors element-wise, with annotations a type checker can verify.

    The original definition took no parameters, so the two-argument calls
    later in the notebook raised ``TypeError`` before anything could be
    demonstrated.

    Parameters
    ----------
    x1, x2 : np.ndarray
        Vectors to add. The annotations are only checked statically;
        nothing is enforced at runtime.

    Returns
    -------
    np.ndarray
        The element-wise sum ``x1 + x2``.
    """
    return x1 + x2


In [None]:
add_vectors_mypy(x1,x2)

In [None]:
add_vectors_mypy(x1,x3)

In [None]:
%nb_mypy Off

# CUDA Basics

## Grids, Blocks, and Threads

https://cs.calvin.edu/courses/cs/374/CUDA/CUDA-Thread-Indexing-Cheatsheet.pdf

https://en.wikipedia.org/wiki/Thread_block_%28CUDA_programming%29

<div>
<img src="./img/threads_blocks2.png" width="500"/>
</div>

## Intro to CuPy: CUDA/Python Interface

In [None]:
x1 = 
x1

In [None]:
x2 = 
x2

In [None]:
y = x1+x2
y

Alternatives to CuPy:

- PyCUDA
- TensorFlow
- PyTorch
- Numba

# CUDA Kernels: 1D Thread Grids

In [None]:
add_kernel = cp.RawKernel(r'''
extern "C" __global__
void cuda_add() {

...


}
''', 'cuda_add')

In [None]:
x1 = cp.arange(25,dtype=cp.float32).reshape(5,5)
x2 = cp.arange(25,50,dtype=cp.float32).reshape(5,5)

In [None]:
#%%timeit
y = cp.zeros((5,5),dtype=cp.float32)
add_kernel(
    , # grid shape (number of blocks in each dimension)
    , # block shape (number of threads in each dimension)
    , # kernel arguments
    
)
y

# CUDA Kernels: 2D Thread Grids

## A Quick Diversion: Flattened Arrays

In [None]:
z1 = np.arange(25).reshape(5,5)
z1

In [None]:
z1_flat = 
z1_flat

In [None]:
i = 2
j = 3
z1[i,j]

In [None]:
N = z1.shape[0]
k = 
z1_flat[k]

## Back to CUDA

In [None]:
add_kernel_2D = cp.RawKernel(r'''
extern "C" __global__
void cuda_add_2D(const float* x1, const float* x2, float* y, int N) {

...
    
}
''', 'cuda_add_2D')

In [None]:
# Launch with a single block containing a single thread (1 thread in total).
y = cp.zeros((5,5),dtype=cp.float32)
add_kernel_2D(
    (1,1), # grid shape (number of blocks in each dimension)
    (1,1), # block shape (number of threads in each dimension)
    # CuPy passes a plain Python int as a 64-bit value, but the kernel
    # declares `int N`; np.int32 makes the argument width match.
    (x1,x2,y,np.int32(x1.shape[0]))
)
y

In [None]:
# Launch with a single block of 5 x 5 threads (25 threads in total).
y = cp.zeros((5,5),dtype=cp.float32)
add_kernel_2D(
    (1,1), # grid shape (number of blocks in each dimension)
    (5,5), # block shape (number of threads in each dimension)
    # CuPy passes a plain Python int as a 64-bit value, but the kernel
    # declares `int N`; np.int32 makes the argument width match.
    (x1,x2,y,np.int32(x1.shape[0]))
)
y

In [None]:
# Launch with 5 x 5 blocks of one thread each (25 threads in total).
y = cp.zeros((5,5),dtype=cp.float32)
add_kernel_2D(
    (5,5), # grid shape (number of blocks in each dimension)
    (1,1), # block shape (number of threads in each dimension)
    # CuPy passes a plain Python int as a 64-bit value, but the kernel
    # declares `int N`; np.int32 makes the argument width match.
    (x1,x2,y,np.int32(x1.shape[0]))
)
y

# CUDA Kernels: Matrix Multiplication

In [None]:
# Naive matrix multiply y = x1 @ x2 for row-major N x N float32 matrices,
# with one thread per output element.
mult_kernel = cp.RawKernel(r'''
extern "C" __global__
void cuda_mult(const float* x1, const float* x2, float* y, int N) {
    // Global row / column of the output element owned by this thread.
    int ROW = blockDim.y * blockIdx.y + threadIdx.y;
    int COL = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads that fall outside the N x N matrix must do nothing at all.
    // The store has to live inside this guard: otherwise a thread with
    // COL >= N overwrites an element of a later row, and a thread with
    // ROW >= N writes past the end of y.
    if (ROW < N && COL < N) {
        float tmpSum = 0;

        // Dot product of row ROW of x1 with column COL of x2.
        for (int i = 0; i < N; i++) {
            tmpSum += x1[ROW * N + i] * x2[i * N + COL];
        }
        y[ROW * N + COL] = tmpSum;
    }
}
''', 'cuda_mult')

In [None]:
x1 = cp.arange(25,dtype=cp.float32).reshape(5,5)
x2 = cp.arange(25,dtype=cp.float32).reshape(5,5)
y = cp.zeros((5,5),dtype=cp.float32)

In [None]:
#%%timeit
# One block of 5 x 5 threads: one thread per element of the 5 x 5 result.
mult_kernel(
    (1,1), # grid shape (number of blocks in each dimension)
    (5,5), # block shape (number of threads in each dimension)
    # CuPy passes a plain Python int as a 64-bit value, but the kernel
    # declares `int N`; np.int32 makes the argument width match.
    (x1,x2,y,np.int32(x1.shape[0]))
)
y

## Let's try to push this...

In [None]:
x1 = cp.arange(1000000,dtype=cp.float32).reshape(1000,1000)
x2 = cp.arange(1000000,dtype=cp.float32).reshape(1000,1000)
y = cp.zeros((1000,1000),dtype=cp.float32)

In [None]:
%%timeit
mult_kernel(
    (100,100), # grid shape (number of blocks in each dimension)
    (100,100), # block shape (number of threads in each dimension)
    (x1,x2,y,x1.shape[0])
)
y

In [None]:
# Element-wise add over flat float32 buffers: y[i] = x1[i] + x2[i].
add_kernel = cp.RawKernel(r'''
extern "C" __global__
void cuda_add(const float* x1, const float* x2, float* y) {
    // Flat 1D global thread index. The original stored the index in `row`
    // (mixing y-dimension block terms with threadIdx.x) and then used an
    // undeclared `tid`, so the kernel could not compile.
    int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // No length argument is available for a bounds check, so the launch
    // must create exactly one thread per array element.
    y[tid] = x1[tid] + x2[tid];
}
''', 'cuda_add')