In [None]:
!pip install pycuda

In [2]:
import numpy as np
from timeit import default_timer as timer
import pycuda.autoinit
from pycuda.driver import In, Out, Context
from pycuda.compiler import SourceModule
from PIL import Image

# Edge length, in threads, of the square CUDA thread block; the kernel also uses it
# as the side of its shared-memory tile.
BLOCK_SIZE = 32
# Side length, in pixels, of the square filter window.
FILTER_SIZE = 3
# Sigma of the spatial (pixel-distance) Gaussian weight.
SIGMA_D = 3
# Sigma of the range (intensity-difference) Gaussian weight.
SIGMA_R = 128
# Thread-block shape handed to the kernel launch.
BLOCK = (BLOCK_SIZE, BLOCK_SIZE, 1)
# Distance from the window centre to its edge (window spans -OFFSET..+OFFSET).
OFFSET = FILTER_SIZE // 2
# Input images processed by the benchmark loop.
FILE_NAMES = ["256.bmp", "512.bmp", "1024.bmp"]

# CUDA implementation of the bilateral filter. The Python constants are baked into
# the kernel source with %-formatting, so the CUDA text below must not contain a
# literal percent sign.
kernel = SourceModule(
    """
    // Bilateral filter over an 8-bit single-channel image stored row-major.
    //   pixels   - input image
    //   filtered - output image, same layout as the input
    //   size     - size[0] is the image width in pixels (no other element is read)
    // One thread produces one output pixel. The launch is expected to tile the
    // image exactly: there is no bounds check on the thread's own pixel.
    __global__ void bilateral_filter(unsigned char* pixels, unsigned char* filtered, int* size){
        const int blockSize = %(BLOCK_SIZE)s;
        const int filterSize = %(FILTER_SIZE)s;
        const int offset = %(OFFSET)s;
        const float sigma_d = %(SIGMA_D)s;
        const float sigma_r = %(SIGMA_R)s;
        const int width = size[0];
        // Number of rows covered by this launch; used as the row clamp limit so
        // that only size[0] has to be trusted from the host-side array.
        const int height = (int)(gridDim.y * blockDim.y);

        const int tx = threadIdx.x,
            ty = threadIdx.y;
        const int col0 = blockIdx.x * blockDim.x; // first image column of this tile
        const int row0 = blockIdx.y * blockDim.y; // first image row of this tile
        const int j = col0 + tx; // column
        const int i = row0 + ty; // row

        // Each block stages its own tile in shared memory.
        __shared__ unsigned char local[blockSize][blockSize];

        local[ty][tx] = pixels[i * width + j];
        __syncthreads ();

        const int center = local[ty][tx];
        float sum = 0.0f, Wp = 0.0f;

        for (int k = 0; k < filterSize; k++){
            // Clamp to the image, not to the tile, so block borders see their
            // real neighbours instead of a replicated tile edge.
            const int row = max(0, min(i + k - offset, height - 1));
            const int ly = row - row0;
            for (int l = 0; l < filterSize; l++){
                const int col = max(0, min(j + l - offset, width - 1));
                const int lx = col - col0;

                // Neighbours inside the tile come from shared memory; the halo
                // (pixels owned by an adjacent block) is read from global memory.
                const bool inTile = ly >= 0 && ly < blockSize && lx >= 0 && lx < blockSize;
                const int current = inTile ? local[ly][lx] : pixels[row * width + col];

                const int dy = row - i;
                const int dx = col - j;
                // Cast before dividing: integer division by 2 would turn a
                // squared distance of 1 into 0.
                const float g = expf(-(float)(dx * dx + dy * dy) / 2 / sigma_d / sigma_d);

                const int dr = abs(center - current);
                const float f = expf(-(float)(dr * dr) / 2 / sigma_r / sigma_r);

                Wp += f * g;
                sum += f * g * current;
            }
        }

        // Wp is never zero: the centre pixel always contributes weight 1.
        filtered[i * width + j] = (unsigned char)(sum / Wp);
    }
    """ % {
        'BLOCK_SIZE': BLOCK_SIZE,
        'OFFSET': OFFSET,
        'FILTER_SIZE': FILTER_SIZE,
        'SIGMA_D': SIGMA_D,
        'SIGMA_R': SIGMA_R
    }
)

bilateral_filter = kernel.get_function("bilateral_filter")


def open_image(filename: str):
    """Load an image as an 8-bit grayscale array with its contrast stretched to 0..255.

    Args:
        filename: Path of the image file to read.

    Returns:
        A tuple ``(pixels, width, height)`` where ``pixels`` is a ``uint8`` array
        indexed as ``pixels[row, column]`` (shape ``(height, width)``).
    """
    # The context manager closes the file handle once the pixel data is copied out.
    with Image.open(filename) as image:
        # convert("L") yields one 8-bit intensity per pixel regardless of the
        # file's mode (palette, RGB, ...); np.array gives shape (height, width).
        pixels = np.array(image.convert("L"), dtype=np.uint8)

    height, width = pixels.shape

    # Plain Python ints so the arithmetic below cannot wrap around in uint8.
    min_p = int(pixels.min())
    max_p = int(pixels.max())

    # Stretch whenever the image does not already span the full range (either end
    # may be short of it). A constant image (max == min) is left untouched, which
    # also avoids dividing by zero.
    if max_p > min_p and (min_p != 0 or max_p != 255):
        # Integer arithmetic keeps the mapping exact: min -> 0 and max -> 255.
        stretched = (pixels.astype(np.int32) - min_p) * 255 // (max_p - min_p)
        pixels = stretched.astype(np.uint8)

    return pixels, width, height


def cpu_filter(pixels, width, height):
    """Reference (pure Python) bilateral filter.

    Every output pixel is the weighted mean of its FILTER_SIZE x FILTER_SIZE
    neighbourhood. A neighbour's weight is the product of a spatial Gaussian
    (SIGMA_D, on pixel distance) and a range Gaussian (SIGMA_R, on intensity
    difference). Neighbour coordinates are clamped to the image, and the spatial
    distance is measured to the clamped position.

    Args:
        pixels: 2-D array of intensities indexed as ``pixels[row][column]``.
        width: Number of columns to process.
        height: Number of rows to process.

    Returns:
        Array shaped and typed like ``pixels`` holding the filtered image; the
        weighted mean is truncated to ``uint8``.
    """
    filtered = np.zeros_like(pixels)

    for i in range(height):
        for j in range(width):
            weighted_sum = 0.0
            Wp = 0.0
            # Work in Python ints: products of np.uint8 scalars can wrap modulo
            # 256 (NumPy 2 keeps uint8 when multiplying by a Python int).
            center = int(pixels[i][j])
            for k in range(FILTER_SIZE):
                x = max(0, min(i + k - OFFSET, height - 1))
                for l in range(FILTER_SIZE):
                    y = max(0, min(j + l - OFFSET, width - 1))
                    current = int(pixels[x][y])

                    dx = x - i
                    dy = y - j
                    g = np.exp(- (dx * dx + dy * dy) / 2 / SIGMA_D / SIGMA_D)

                    dr = abs(center - current)
                    f = np.exp(- (dr * dr) / 2 / SIGMA_R / SIGMA_R)

                    Wp += f * g
                    weighted_sum += f * g * current

            # Wp >= 1 because the centre pixel always contributes weight 1.
            filtered[i, j] = np.uint8(weighted_sum / Wp)
    return filtered


def gpu_filter(pixels, width, height):
    """Run the CUDA bilateral filter on ``pixels`` and return the filtered image.

    Args:
        pixels: 2-D array of 8-bit intensities indexed as ``pixels[row, column]``.
        width: Image width in pixels; must be a positive multiple of BLOCK_SIZE.
        height: Image height in pixels; must be a positive multiple of BLOCK_SIZE.

    Returns:
        ``uint8`` array with the same shape as ``pixels``.

    Raises:
        ValueError: If ``width`` or ``height`` is not a positive multiple of
            BLOCK_SIZE. The launch grid would not cover the whole image and the
            uncovered border would silently come back as zeros.
    """
    if width <= 0 or height <= 0 or width % BLOCK_SIZE or height % BLOCK_SIZE:
        raise ValueError(
            "gpu_filter needs width and height to be positive multiples of "
            "BLOCK_SIZE={}, got {}x{}".format(BLOCK_SIZE, width, height)
        )

    # The kernel indexes a flat, row-major unsigned char buffer.
    pixels = np.ascontiguousarray(pixels, dtype=np.uint8)
    # The kernel declares `int* size`, so send 32-bit ints explicitly instead of
    # NumPy's platform-dependent default integer type.
    size = np.array([width, height], dtype=np.int32)
    filtered = np.zeros_like(pixels)

    grid_dim = (width // BLOCK_SIZE, height // BLOCK_SIZE)
    bilateral_filter(In(pixels), Out(filtered), In(size), block=BLOCK, grid=grid_dim)
    Context.synchronize()
    return filtered


def save_image(filtered, filename):
    """Write ``filtered`` to ``filename`` as an 8-bit grayscale BMP file."""
    as_bytes = filtered.astype('uint8')
    Image.fromarray(as_bytes, mode='L').save(filename, format="BMP")


def test_cpu(pixels, width, height, save):
    start = timer()
    filtered = cpu_filter(pixels, width, height)
    cpu_time = timer() - start
    if save:
        save_image(filtered, "cpu" + filename)

    return cpu_time * 1000


def test_gpu(pixels, width, height, save):
    start = timer()
    filtered = gpu_filter(pixels, width, height)
    gpu_time = timer() - start
    if save:
        save_image(filtered, "gpu" + filename)

    return gpu_time * 1000



In [5]:
# Number of timed repetitions averaged for each implementation.
CPU_TEST_ROUND = 5
GPU_TEST_ROUND = 50

print("|   File   | CPU time, ms | GPU time, ms |   Speedup   |")
print("|:--------:|:------------:|:------------:|:-----------:|")
# NOTE: the loop variable must stay named `filename`; test_cpu and test_gpu read
# that module-level name when they build the output file name.
for filename in FILE_NAMES:
    pixels, width, height = open_image(filename)

    # One run of each filter with save=True writes the filtered images to disk;
    # these two runs are not part of the averages below.
    test_cpu(pixels, width, height, True)
    test_gpu(pixels, width, height, True)

    # All CPU rounds run first, then all GPU rounds; each average is in ms.
    cpu_time = sum(
        test_cpu(pixels, width, height, False) for _ in range(CPU_TEST_ROUND)
    ) / CPU_TEST_ROUND
    gpu_time = sum(
        test_gpu(pixels, width, height, False) for _ in range(GPU_TEST_ROUND)
    ) / GPU_TEST_ROUND

    speedup = cpu_time / gpu_time
    print("| {:8s} | {:12.3f} | {:12.3f} |  {:9.2f}  |".format(filename, cpu_time, gpu_time, speedup))

|   File   | CPU time, ms | GPU time, ms |   Speedup   |
|:--------:|:------------:|:------------:|:-----------:|
| 256.bmp  |     4953.899 |        0.405 |   12227.66  |
| 512.bmp  |    20043.113 |        0.720 |   27834.47  |
| 1024.bmp |    79705.054 |        2.150 |   37069.97  |
