## Setup

In [1]:
!pip install wurlitzer
!pip install Ninja
!pip install dill
import os,math,sys,torch,re,numpy as np
from types import SimpleNamespace as ns
from collections import namedtuple
# from utils import show_img,load_cuda,cuda_begin,cdiv



# Utils

In [2]:
import torch
import matplotlib.pyplot as plt
from torch.utils.cpp_extension import load_inline

import os,math,sys,torch,re,numpy as np
from types import SimpleNamespace as ns
from collections import namedtuple

# Compact display: two decimal places and 140-character lines for both NumPy and PyTorch;
# PyTorch additionally has scientific notation turned off.
np.set_printoptions(precision=2, linewidth=140)
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)

def show_img(x, figsize=(4,3), **kwargs):
    """Draw image tensor `x` (HW or CHW layout) in a new matplotlib figure with axes hidden.

    `figsize` is passed to `plt.figure`; any other keyword arguments go to `plt.imshow`.
    """
    plt.figure(figsize=figsize)
    plt.axis('off')
    has_channel_dim = len(x.shape) == 3
    # A 3D input is permuted so that its first axis (channels) becomes the last one
    img = x.permute(1, 2, 0) if has_channel_dim else x
    plt.imshow(img.cpu(), **kwargs)

# Shared C++/CUDA preamble that the CUDA cells prepend to their own source. It provides:
#   - the torch extension, stdio and c10 CUDA exception headers;
#   - CHECK_CUDA / CHECK_CONTIGUOUS / CHECK_INPUT tensor-validation macros (CHECK_INPUT
#     expands to two statements, so do not use it as the body of an unbraced `if`);
#   - CUDA_ERR / gpuAssert, which prints the CUDA error string with file and line to stderr
#     and, by default, exits the process with the error code;
#   - a host+device ceiling division `cdiv` on unsigned ints.
cuda_begin = r'''
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
#define CUDA_ERR(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}
__host__ __device__ inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a+b-1)/b;}
'''

def load_cuda(cuda_src, cpp_src, funcs, opt=True, verbose=False, name=None):
    """Compile `cuda_src`/`cpp_src` with `load_inline` and return the resulting module.

    `funcs` lists the function names to expose; `name` is the extension name and defaults
    to `funcs[0]`. `opt` selects the optimisation level passed to nvcc for device code,
    PTX assembly and the host compiler: `-O3` when true, `-O0` otherwise. `verbose` is
    forwarded to `load_inline`.
    """
    if name is None: name = funcs[0]
    level = "-O3" if opt else "-O0"
    # One list element per token so that every flag reaches nvcc as its own argument.
    flags = [level, "-Xptxas", level, "-Xcompiler", level]
    return load_inline(cuda_sources=[cuda_src], cpp_sources=[cpp_src], functions=funcs,
                       extra_cuda_cflags=flags, verbose=verbose, name=name)

def cdiv(a,b):
    "Ceiling of `a / b` computed with integer arithmetic only"
    padded = a + b - 1  # bump the numerator so that floor division rounds up
    return padded // b

def get_sig(fname, src):
    "Return the first definition header of `fname` found in `src` with `;` appended, or None"
    header_re = re.compile(rf'^(.+\s+{fname}\(.*?\))\s*{{?\s*$', re.MULTILINE)
    found = header_re.search(src)
    if found is None: return None
    return found.group(1) + ';'

# Python stand-in for CUDA's `dim3`: only `x` is required, `y` and `z` default to 1.
dim3 = namedtuple('dim3', ['x','y','z'], defaults=(1,1))



In [3]:
%load_ext wurlitzer

# Basic Parallel Approach
## Python Version in CUDA format

In [5]:
# 3x3 all-ones filter, displayed for reference.
# NOTE(review): the name shadows Python's builtin `filter`, and none of the kernels
# visible in this notebook read this variable.
filter = torch.ones(3,3)
filter

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])

In [4]:
# Functions
def iterate_kernel(f, blocks, threads, *args):
    """Sequentially emulate a 2D CUDA launch of `f`.

    Calls `f(blockIdx, threadIdx, threads, *args)` once for every block of the
    `blocks.x` x `blocks.y` grid and every thread of the `threads.x` x `threads.y` block,
    with x varying fastest at both levels.
    """
    block_ids = [dim3(bx, by) for by in range(blocks.y) for bx in range(blocks.x)]
    thread_ids = [dim3(tx, ty) for ty in range(threads.y) for tx in range(threads.x)]
    for block_id in block_ids:
        for thread_id in thread_ids:
            f(block_id, thread_id, threads, *args)

def two_to_one(r,c,c_size):
    "Flatten the pair (`r`, `c`) into one offset: `c` advances by `c_size`, `r` by 1"
    stride_offset = c * c_size
    return r + stride_offset

def conv_kernel(blockIdx, threadIdx, blockDim, m, out, w, h):
    """Simulated CUDA thread: sum the 3x3 neighbourhood of one element of `m` into `out`.

    `m` and `out` are flat, indexable buffers addressed through `two_to_one(..., h)`;
    `w` bounds the first coordinate and `h` the second. Neighbours that fall outside the
    grid are skipped. A thread whose own coordinate is outside the grid returns without
    writing, which lets the launch grid overshoot sizes that are not a multiple of the
    block size.
    """
    r = blockIdx.y*blockDim.y + threadIdx.y
    c = blockIdx.x*blockDim.x + threadIdx.x

    # Surplus threads in the last block row/column must not touch `out`: their flat index
    # would alias another element or run past the end of the buffer.
    if r >= w or c >= h: return

    c_out = 0
    # Clip the window to the grid instead of testing every neighbour
    for x in range(max(r-1, 0), min(r+2, w)):
        for y in range(max(c-1, 0), min(c+2, h)):
            c_out = c_out + m[two_to_one(x,y,h)]

    out[two_to_one(r,c,h)] = c_out


def convolution_22(m):
    """Run the simulated 3x3 neighbourhood-sum kernel over the square 2D tensor `m`.

    Returns a new tensor with the same shape, dtype and device as `m`.
    Raises AssertionError when `m` is not square.
    """
    w,h  = m.shape
    assert w==h, "Size mismatch!"

    # Allocated once, next to the input; the kernel stores through an explicit flat view,
    # so every write lands in `output`.
    output = torch.zeros((w, h), dtype=m.dtype, device=m.device)

    # 3x3 threads per block, and enough blocks to cover the whole tensor
    thread_block = dim3(3, 3)
    blocks = dim3(math.ceil(w / thread_block.x), math.ceil(h / thread_block.y))

    iterate_kernel(conv_kernel, blocks, thread_block,
                   m.flatten(), output.view(-1), w, h)
    return output

In [17]:
input = torch.ones(6,6)
input

tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.]])

In [18]:
out = convolution_22(input)
out

tensor([[4., 6., 6., 6., 6., 4.],
        [6., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 6.],
        [4., 6., 6., 6., 6., 4.]])

## Numba Version

In [56]:
from numba import cuda
from numba.cuda import as_cuda_array as ca


@cuda.jit
def cuda_numba(m, out):
    """Numba CUDA kernel: each thread writes the 3x3 neighbourhood sum of one element of `m`.

    `m` is a 2D device array; `out` is expected to have `m`'s shape. The neighbourhood is
    clipped at the array edges, so missing neighbours contribute nothing.
    """
    cbi,cbd,tid = cuda.blockIdx,cuda.blockDim,cuda.threadIdx
    r = cbi.y * cbd.y + tid.y
    c = cbi.x * cbd.x + tid.x
    h, w = m.shape[0], m.shape[1]

    # The launch grid is rounded up to whole blocks; surplus threads must not index
    # outside the arrays.
    if r >= h or c >= w: return

    c_out = 0
    for x in range(max(0, r - 1), min(r + 2, h)):
        for y in range(max(0, c - 1), min(c + 2, w)):
            c_out += m[x, y]
    out[r, c] = c_out


def conv_cuda(m):
    """Launch `cuda_numba` on the 2D CUDA tensor `m` and return the result.

    The returned tensor has `m`'s shape, dtype and device.
    """
    h, w = m.shape  # rows, columns
    output = torch.zeros(h, w, dtype=m.dtype, device=m.device)
    thread_block = dim3(3, 3)

    # Grid x spans the columns and grid y the rows, matching how the kernel derives `c`
    # from blockIdx.x and `r` from blockIdx.y.
    blocks = math.ceil(w / thread_block.x), math.ceil(h / thread_block.y)

    cuda_numba[blocks, (thread_block.x, thread_block.y)](ca(m), ca(output))

    return output

conv_cuda(torch.ones(12,12).to("cuda"))

tensor([[4., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 4.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 9., 9., 9., 6.],
        [4., 6., 6., 6., 6., 6., 6., 6., 6., 6., 6., 4.]], device='cuda:0')

## CUDA Version

In [57]:
cuda_src = cuda_begin + r'''
// 3x3 filter weights in constant memory, stored row by row (all ones -> plain neighbourhood sum).
__constant__ float c_M[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1};

// Map a (row, column) pair to its offset in a row-major buffer whose rows hold c_size elements.
__device__ int two_to_one(int r, int c, int c_size) {
    const int row_start = r * c_size;
    return row_start + c;
}

// One thread per output element: weighted sum of the 3x3 neighbourhood of (r, c).
// w bounds the row index and h the column index; h is also the row stride of both
// buffers. Neighbours outside the matrix are skipped.
__global__ void conv_kernel(const float* m, float* out, int w, int h) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the matrix edge have nothing to compute.
    if (r >= w || c >= h) return;

    float acc = 0.0f;
    for (int dr = -1; dr <= 1; ++dr) {
        const int x = r + dr;
        if (x < 0 || x >= w) continue;
        for (int dc = -1; dc <= 1; ++dc) {
            const int y = c + dc;
            if (y < 0 || y >= h) continue;
            // (dr + 1, dc + 1) is the filter cell; c_M stores the filter row by row
            acc += m[two_to_one(x, y, h)] * c_M[(dr + 1) * 3 + (dc + 1)];
        }
    }

    out[two_to_one(r, c, h)] = acc;
}
'''

cuda_src += r'''
// Host entry point: filter a contiguous 2D float32 CUDA tensor with conv_kernel and
// return a new tensor of the same shape, dtype and device.
torch::Tensor convolution_22(torch::Tensor m) {
    // The kernel dereferences raw device floats, so reject anything else up front.
    CHECK_INPUT(m);
    TORCH_CHECK(m.dim() == 2, "m must be a 2D tensor");
    TORCH_CHECK(m.scalar_type() == torch::kFloat32, "m must be float32");

    // conv_kernel's convention: w bounds the row index, h the column index.
    int w = m.size(0);
    int h = m.size(1);

    torch::Tensor output = torch::zeros({w, h}, m.options());
    if (w == 0 || h == 0) return output;  // a zero-sized grid is an invalid launch

    // Grid x walks the columns (extent h) and grid y the rows (extent w), mirroring how
    // the kernel derives c from blockIdx.x and r from blockIdx.y.
    dim3 thread_block(3, 3);
    dim3 blocks(cdiv(h, thread_block.x), cdiv(w, thread_block.y));

    conv_kernel<<<blocks, thread_block>>>(m.data_ptr<float>(), output.data_ptr<float>(), w, h);

    // Launch-configuration errors surface immediately; execution errors after the sync.
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    C10_CUDA_CHECK(cudaDeviceSynchronize());

    return output;
}
'''

In [60]:
fname = "convolution_22"
cpp_src = get_sig(fname, cuda_src)
cpp_src

'torch::Tensor convolution_22(torch::Tensor m);'

In [61]:
module = load_cuda(cuda_src, cpp_src, [fname], opt=True)

In [62]:
dir(module)

['__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'convolution_22']

In [63]:
py_out = convolution_22(input)
cuda_out = module.convolution_22(input.to("cuda"))
# The pure-Python simulation is the reference; the compiled kernel must agree with it.
assert torch.allclose(py_out, cuda_out.cpu()), "CUDA kernel disagrees with the Python reference"

In [64]:
cuda_out

tensor([[4., 6., 6., 6., 6., 4.],
        [6., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 6.],
        [4., 6., 6., 6., 6., 4.]], device='cuda:0')

# Memory tiling

## Numba version

In [66]:
import numpy as np
import torch
from numba import cuda

# Constants
IN_TILE_DIM = 32  # threads per block side; also the side of the shared-memory input tile
FILTER_RADIUS = 1  # filter reach on each side of its centre
OUT_TILE_DIM = IN_TILE_DIM - 2 * FILTER_RADIUS  # input tile side minus FILTER_RADIUS on both edges
FILTER_DIM = 2 * FILTER_RADIUS + 1  # full filter side length

# All-ones float32 filter of FILTER_DIM x FILTER_DIM; `cuda_tiled` reads it as a global
F_host = np.ones((FILTER_DIM, FILTER_DIM), dtype=np.float32)

@cuda.jit
def cuda_tiled(N, O, width, height):
    """Tiled 2D convolution of `N` with `F_host`, writing the result into `O`.

    Must be launched with IN_TILE_DIM x IN_TILE_DIM threads per block. Every thread loads
    one element of the input tile (zero outside the matrix) into shared memory; only the
    inner OUT_TILE_DIM x OUT_TILE_DIM threads then produce an output element.
    """
    tx, ty = cuda.threadIdx.x, cuda.threadIdx.y
    col = cuda.blockIdx.x * OUT_TILE_DIM + tx - FILTER_RADIUS
    row = cuda.blockIdx.y * OUT_TILE_DIM + ty - FILTER_RADIUS

    # Shared memory
    N_s = cuda.shared.array(shape=(IN_TILE_DIM, IN_TILE_DIM), dtype=np.float32)

    # Load input tile into shared memory; positions outside the matrix act as zero padding
    if 0 <= row < height and 0 <= col < width:
        N_s[ty, tx] = N[row, col]
    else:
        N_s[ty, tx] = 0.0

    cuda.syncthreads()

    # Position inside the output tile. Halo threads (outside [0, OUT_TILE_DIM)) only load:
    # their filter window would reach past the shared tile, and their output element is
    # produced by the neighbouring block.
    tile_row = ty - FILTER_RADIUS
    tile_col = tx - FILTER_RADIUS
    in_out_tile = 0 <= tile_row < OUT_TILE_DIM and 0 <= tile_col < OUT_TILE_DIM

    if in_out_tile and 0 <= row < height and 0 <= col < width:
        acc = 0.0
        for i in range(FILTER_DIM):
            for j in range(FILTER_DIM):
                # Out-of-matrix neighbours were stored as 0, so no extra bounds test is needed
                acc += N_s[tile_row + i, tile_col + j] * F_host[i, j]
        # Write the sum to the current output element
        O[row, col] = acc



# Prepare data: an all-ones 9x9 input and a zeroed 9x9 output, both moved to the GPU
N = torch.ones(9, 9).to("cuda")
O = torch.zeros(9,9).to("cuda")

height, width = N.shape[0], N.shape[1]


# Define grid and block dimensions: IN_TILE_DIM x IN_TILE_DIM threads per block, and a grid
# that is the ceiling of the matrix size over OUT_TILE_DIM in each direction
block_dim = (IN_TILE_DIM, IN_TILE_DIM)
grid_dim_x = (width + OUT_TILE_DIM - 1) // OUT_TILE_DIM
grid_dim_y = (height + OUT_TILE_DIM - 1) // OUT_TILE_DIM
grid_dim = (grid_dim_x, grid_dim_y)


# Copy the filter to the device.
# NOTE(review): `F_global_mem` is not passed to the launch below; `cuda_tiled` reads `F_host`.
F_global_mem = cuda.to_device(F_host)

# Launch kernel
cuda_tiled[grid_dim, block_dim](N, O, width, height)

# Copy the result back to the host if needed
# O_host = O.copy_to_host()





In [67]:
# Answer
O

tensor([[4., 6., 6., 6., 6., 6., 6., 6., 4.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [4., 6., 6., 6., 6., 6., 6., 6., 4.]], device='cuda:0')

## CUDA Version

In [8]:
cuda_src = cuda_begin + r'''

// Tile geometry: IN_TILE_DIM is the side of the thread block and of the input tile;
// OUT_TILE_DIM trims FILTER_RADIUS from both edges of it.
#define IN_TILE_DIM 32
#define FILTER_RADIUS 1
#define OUT_TILE_DIM (IN_TILE_DIM - 2 * FILTER_RADIUS)
// Defined once, derived from the radius (evaluates to 3 here). Defining it a second
// time as a bare "3" is a macro redefinition with a different token sequence.
#define FILTER_DIM (2 * FILTER_RADIUS + 1)

// FILTER_DIM x FILTER_DIM filter weights in constant memory; all ones, so the
// convolution reduces to a neighbourhood sum.
__constant__ float F[FILTER_DIM][FILTER_DIM] = {
    {1.0f, 1.0f, 1.0f},
    {1.0f, 1.0f, 1.0f},
    {1.0f, 1.0f, 1.0f}
};


// Tiled 2D convolution of N (height x width, row-major) with the constant filter F.
// Launch with IN_TILE_DIM x IN_TILE_DIM threads per block and at least
// IN_TILE_DIM * IN_TILE_DIM floats of dynamic shared memory. Every thread loads one
// element of the input tile; only the inner OUT_TILE_DIM x OUT_TILE_DIM threads write
// an output element.
__global__ void cuda_tiled(const float *N, float *O, int width, int height) {
    // Signed copies: threadIdx/blockIdx are unsigned, so subtracting FILTER_RADIUS from
    // them directly wraps around for index 0 instead of going negative.
    const int tx = threadIdx.x;
    const int ty = threadIdx.y;
    const int col = static_cast<int>(blockIdx.x) * OUT_TILE_DIM + tx - FILTER_RADIUS;
    const int row = static_cast<int>(blockIdx.y) * OUT_TILE_DIM + ty - FILTER_RADIUS;

    extern __shared__ float N_s[];

    // Stage the input tile; positions outside the matrix act as zero padding.
    const bool in_matrix = 0 <= row && row < height && 0 <= col && col < width;
    N_s[ty * IN_TILE_DIM + tx] = in_matrix ? N[row * width + col] : 0.0f;

    __syncthreads();

    // Halo threads only load: their filter window would index outside the shared tile,
    // and their output element is produced by the neighbouring block.
    const int tile_row = ty - FILTER_RADIUS;
    const int tile_col = tx - FILTER_RADIUS;
    const bool in_out_tile = 0 <= tile_row && tile_row < OUT_TILE_DIM &&
                             0 <= tile_col && tile_col < OUT_TILE_DIM;

    if (in_matrix && in_out_tile) {
        float sum = 0.0f;
        for (int i = 0; i < FILTER_DIM; i++) {
            for (int j = 0; j < FILTER_DIM; j++) {
                // Out-of-matrix neighbours were stored as 0, so they need no bounds test.
                sum += N_s[(tile_row + i) * IN_TILE_DIM + (tile_col + j)] * F[i][j];
            }
        }
        O[row * width + col] = sum;
    }
}

'''

In [9]:
cuda_src +=  r'''
// Host entry point: convolve a 2D tensor with the constant filter F via cuda_tiled.
// The input is moved to the GPU and converted to contiguous float32 when needed; the
// result is a new CUDA float32 tensor of the same shape.
torch::Tensor convolution_tiled(torch::Tensor N) {
    TORCH_CHECK(N.dim() == 2, "N must be a 2D tensor");

    // The kernel indexes raw float memory as row * width + col, so it needs a dense
    // float32 CUDA buffer.
    N = N.to(at::kCUDA).to(at::kFloat).contiguous();

    // Get dimensions
    int height = N.size(0);
    int width = N.size(1);

    // Allocate output tensor
    auto O = torch::zeros({height, width}, N.options());
    if (height == 0 || width == 0) return O;  // a zero-sized grid is an invalid launch

    // One thread per input-tile element; the grid advances by OUT_TILE_DIM per block.
    dim3 block_dim(IN_TILE_DIM, IN_TILE_DIM);
    dim3 grid_dim((width + OUT_TILE_DIM - 1) / OUT_TILE_DIM, (height + OUT_TILE_DIM - 1) / OUT_TILE_DIM);

    // Dynamic shared memory for the IN_TILE_DIM x IN_TILE_DIM input tile
    size_t shared_mem_size = IN_TILE_DIM * IN_TILE_DIM * sizeof(float);

    // Launch kernel
    cuda_tiled<<<grid_dim, block_dim, shared_mem_size>>>(N.data_ptr<float>(), O.data_ptr<float>(), width, height);

    // Launch-configuration errors surface immediately; execution errors after the sync.
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    C10_CUDA_CHECK(cudaDeviceSynchronize());

    return O;
}
'''



# N = torch.ones(9, 9).to("cuda")
# O = torch.zeros(9,9).to("cuda")

# height, width = N.shape[0], N.shape[1]


# # Define grid and block dimensions
# block_dim = (IN_TILE_DIM, IN_TILE_DIM)
# grid_dim_x = (width + OUT_TILE_DIM - 1) // OUT_TILE_DIM
# grid_dim_y = (height + OUT_TILE_DIM - 1) // OUT_TILE_DIM
# grid_dim = (grid_dim_x, grid_dim_y)


# # Transfer filter to constant memory (you may need to adjust this based on your filter)
# F_global_mem = cuda.to_device(F_host)

# # Launch kernel
# cuda_tiled[grid_dim, block_dim](N, O, width, height)


In [10]:
fname = "convolution_tiled"


In [11]:
cpp_src = get_sig(fname, cuda_src)
cpp_src

'torch::Tensor convolution_tiled(torch::Tensor N);'

In [12]:
module = load_cuda(cuda_src, cpp_src, [fname], opt=True)

In [13]:
dir(module)

['__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'convolution_tiled']

In [7]:
N = torch.ones(9, 9)
type(N[0][0].item())


float

In [14]:
N = torch.ones(9, 9).to("cuda")
out = module.convolution_tiled(N)

In [15]:
out

tensor([[4., 6., 6., 6., 6., 6., 6., 6., 4.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [6., 9., 9., 9., 9., 9., 9., 9., 6.],
        [4., 6., 6., 6., 6., 6., 6., 6., 4.]], device='cuda:0')