## Setup

In [None]:
!pip install wurlitzer
!pip install Ninja
!pip install dill
import os,math,sys,torch,re,numpy as np
from types import SimpleNamespace as ns
from collections import namedtuple
# from utils import show_img,load_cuda,cuda_begin,cdiv



# Utils

In [None]:
import torch
import matplotlib.pyplot as plt
from torch.utils.cpp_extension import load_inline

import os,math,sys,torch,re,numpy as np
from types import SimpleNamespace as ns
from collections import namedtuple

np.set_printoptions(precision=2, linewidth=140)
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)

def show_img(x, figsize=(4,3), **kwargs):
    "Display HW or CHW format image `x`"
    plt.figure(figsize=figsize)
    plt.axis('off')
    if len(x.shape)==3: x = x.permute(1,2,0)  # CHW -> HWC
    plt.imshow(x.cpu(), **kwargs)

cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CUDA_ERR(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}
__host__ __device__ inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a+b-1)/b;}
'''

def load_cuda(cuda_src, cpp_src, funcs, opt=True, verbose=False, name=None):
    "Simple wrapper for torch.utils.cpp_extension.load_inline"
    if name is None: name = funcs[0]
    # flags = "-O3 -Xptxas -O3 -Xcompiler -O3" if opt else "-O0 -Xptxas -O0 -Xcompiler -O0"
    return load_inline(cuda_sources=[cuda_src], cpp_sources=[cpp_src], functions=funcs, verbose=verbose, name=name)

def cdiv(a,b):
    "Int ceiling division of `a` over `b`"
    return (a+b-1)//b

def get_sig(fname, src):
    res = re.findall(rf'^(.+\s+{fname}\(.*?\))\s*{{?\s*$', src, re.MULTILINE)
    return res[0]+';' if res else None

dim3 = namedtuple('dim3', ['x','y','z'], defaults=(1,1))



In [None]:
%load_ext wurlitzer

# Basic Parallel Approach
## Numba Version in CUDA format

The general idea of the Brent-Kung approach to scanning is as follows

First, the **reduction phase**
- Assign one thread to every 2 elements
- Have each element perform a scan with the index its assigned and with the element that is 2^t steps prior to it (t=timestep)
- Repeat as long as there are indices which can perform the operation

You'll now have an array where each of the elements that belong to an index that is a power of 2 have succesfully completed the scan. But hte others will be in various states of disarray. Thus, you should perform the **post reduction phase**.

This impelmentation, however, is the Kogge-Stone Algorithm, which essentially
- Assigns each thread to a specific index
- Then, loops through and assigns to each index the sum of itself + the element 2**t prior
Each index = sum of self + previous element
- Once done- store the final index’s value as partial sum (global memory)
- Then, distributes the partial sums




In [34]:
from numba import cuda
from numba.cuda import as_cuda_array as ca

# Let's assume addition


@cuda.jit
def kogge_kernel(m, out, partials):
    cbi,cbd,tid = cuda.blockIdx,cuda.blockDim,cuda.threadIdx
    w = m.shape[0]

    # thread's index
    r = cbi.x * cbd.x + tid.x
    # Initialize shared memory
    shared_mem = cuda.shared.array(shape=(16,), dtype=np.float32)
    cuda.syncthreads()

    shared_mem[r] = m[r]

    for i in range(12):
        temp = 0
        if (2**i<=r):
          temp = shared_mem[r-2**i] + shared_mem[r]
        cuda.syncthreads()

        if (2**i<=r):
          shared_mem[r] = temp
        cuda.syncthreads()

    out[r]=shared_mem[r]
    if (cuda.threadIdx.x==cuda.blockDim.x-1):
      partials[cbi.x]=shared_mem[r]



@cuda.jit
def kogge_add(out, partials):
    cbi, cbd, tid = cuda.blockIdx, cuda.blockDim, cuda.threadIdx
    w = out.shape[0]
    r = cbi.x * cbd.x + tid.x

    # Each thread in a block adds the partial sum from the previous block
    if cbi.x > 0 and r < w:
        out[r] += partials[cbi.x - 1]

def run_kogge(m, tw=16):

    w  = m.shape[0]
    out = torch.zeros(w, dtype=m.dtype, device=m.device)

    block_no = int(w/tw)
    partials = torch.zeros(block_no, dtype=m.dtype, device=m.device)

    # Get partial sums + do scans over individaul blocks
    kogge_kernel[block_no, tw](ca(m), ca(out), ca(partials))

    # tally up partial sums
    for i in range(1,len(partials)):
      partials[i]+=partials[i-1]

    # Distribute on GPU
    if block_no>1:
      kogge_add[block_no, tw](ca(out), ca(partials))

    return out



In [35]:
input = torch.ones([64])
run_kogge(input.to("cuda"))



tensor([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
        27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52.,
        53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64.], device='cuda:0')

In [None]:
run_kogge(d=)

In [None]:
from numba import cuda
from numba.cuda import as_cuda_array as ca

# Let's assume addition

@cuda.jit
def numba_kernel_brent(m, out):
    cbi,cbd,tid = cuda.blockIdx,cuda.blockDim,cuda.threadIdx
    tc,tr = tid.x,tid.y
    r = tid.x * 2
    w = m.shape[0]

    # Loading step
    out[r+1] = m[r+1]
    out[r] = m[r]

    for i in range(w):
        temp = 0
        if (2^i<=r):
          temp = out[r-2^i] + out[r]
        cuda.syncthreads()
    #     # Need 2 conditionals- can't sync threads within a divergent passway
        if (2^i<=r):
          out[r] = temp
        cuda.syncthreads()
    # return


# cuda.syncthreads()



def run_brent(m, tw=16):
    w  = m.shape[0]
    out = torch.zeros(w, dtype=m.dtype, device=m.device)

    numba_kernel_brent[1, int(w/2), 0](ca(m), ca(out))

    return out


In [None]:

input = torch.ones(12)
input.unsqueeze(0)
# input = torch.array(input)
input.shape


torch.Size([12])

In [None]:
input.shape[0]

12

In [None]:

run_brent(input.to("cuda"))



tensor([ 4.00,  1.00,  5.00,  1.00, 10.00,  1.00, 17.00,  1.00, 18.00,  1.00, 20.75,  1.00], device='cuda:0')

In [None]:
out = convolution_22(input)
out

NameError: name 'convolution_22' is not defined

## Numba Version

In [None]:
from numba import cuda
from numba.cuda import as_cuda_array as ca


@cuda.jit
def cuda_numba(m, out):
    cbi,cbd,tid = cuda.blockIdx,cuda.blockDim,cuda.threadIdx
    tc,tr = tid.x,tid.y
    r, c = cbi.y * cbd.y + tr, cbi.x * cbd.x + tc
    h, w = m.shape[0], m.shape[1]
    index = r*w + c
    c_out = 0
    for x in range(max(0, r - 1), min(r + 2, h)):
      for y in range(max(0, c - 1), min(c + 2, w)):
          c_out += m[x, y]
    out[r, c] = c_out


def conv_cuda(m):

    w,h = m.shape
    output = torch.zeros(w,h, dtype=m.dtype, device=m.device)
    thread_block = dim3(3, 3)
    log = np.zeros(w * h, dtype=np.int32)  # Create a log array

    # Calculate the number of blocks needed
    blocks_x = math.ceil(w / thread_block.x)
    blocks_y = math.ceil(h / thread_block.y)
    blocks = blocks_x, blocks_y

    # Create the output tensor

    cuda_numba[blocks, (3,3)](ca(m), ca(output))


    return output

conv_cuda(torch.ones(12,12).to("cuda"))

tensor([[4., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 4.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [4., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 4.]], device='cuda:0')

## CUDA Version

In [None]:
cuda_src = cuda_begin + r'''
__constant__ float c_M[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1};

__device__ int two_to_one(int r, int c, int c_size) {
    return r * c_size + c;  // Adjusted for correct row-major indexing
}

__global__ void conv_kernel(const float* m, float* out, int w, int h) {
    int r = blockIdx.y * blockDim.y + threadIdx.y;
    int c = blockIdx.x * blockDim.x + threadIdx.x;

    if (r < w && c < h) {
        int index = two_to_one(r, c, h);

        float c_out = 0.0f;
        int maskIndex = 0;

        for (int x = r - 1; x <= r + 1; x++) {
            for (int y = c - 1; y <= c + 1; y++) {
                if (x >= 0 && x < w && y >= 0 && y < h) {
                    c_out += m[two_to_one(x, y, h)] * c_M[maskIndex];
                }
                maskIndex++;
            }
        }

        out[index] = c_out;
    }
}
'''

cuda_src += r'''
torch::Tensor convolution_22(torch::Tensor m) {
    int w = m.size(0);
    int h = m.size(1);

    auto options = torch::TensorOptions().dtype(m.dtype()).device(m.device());
    torch::Tensor output = torch::zeros({w, h}, options);

    dim3 thread_block(3, 3);
    int blocks_x = std::ceil(static_cast<float>(w) / thread_block.x);
    int blocks_y = std::ceil(static_cast<float>(h) / thread_block.y);
    dim3 blocks(blocks_x, blocks_y);

    conv_kernel<<<blocks, thread_block>>>(m.data_ptr<float>(), output.data_ptr<float>(), w, h);

    // Wait for the GPU to finish and check for any errors
    cudaDeviceSynchronize();
    C10_CUDA_KERNEL_LAUNCH_CHECK();

    return output;
}
'''

In [None]:
fname = "convolution_22"
cpp_src = get_sig(fname, cuda_src)
cpp_src

'torch::Tensor convolution_22(torch::Tensor m);'