In [1]:
!nvidia-smi

Fri Dec  5 17:40:17 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
%%writefile gaussian_blur_cuda.cu


#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Image dimensions in pixels; main() sizes the host/device buffers and the launch grid from these.
#define WIDTH 1000
#define HEIGHT 1000
// Largest accepted block edge: main() rejects larger values, and the shared-memory kernel
// sizes its static tile as (MAX_BLOCK_SIZE + 2) x (MAX_BLOCK_SIZE + 2).
#define MAX_BLOCK_SIZE 32
// Edge length of the convolution stencil. The kernels iterate offsets -1..1 explicitly,
// so they assume this stays 3.
#define KERNEL_SIZE 3

// Convolution weights in constant memory; main() fills them with cudaMemcpyToSymbol before launching.
__constant__ float d_kernel[KERNEL_SIZE][KERNEL_SIZE];

/*
 * 3x3 convolution of a row-major float image with the weights in d_kernel,
 * reading every tap straight from global memory.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel
 * (x -> column, y -> row). Threads that land on the one-pixel image border
 * or outside the image write nothing, so those output cells keep whatever
 * value the caller stored there.
 *
 *   input   source image, width * height floats
 *   output  destination image, same layout as input
 *   width   image width in pixels
 *   height  image height in pixels
 */
__global__ void gaussianBlurKernel(float *input, float *output, int width, int height) {
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Only strictly interior pixels have a full 3x3 neighbourhood.
    const bool interior = (x > 0) && (x < width - 1) && (y > 0) && (y < height - 1);
    if (!interior) {
        return;
    }

    float acc = 0.0f;
    for (int dy = 0; dy < 3; ++dy) {
        // Row of the neighbourhood that lines up with stencil row dy.
        const float *src_row = input + (y + dy - 1) * width;
        for (int dx = 0; dx < 3; ++dx) {
            acc += src_row[x + dx - 1] * d_kernel[dy][dx];
        }
    }

    output[y * width + x] = acc;
}

/*
 * 3x3 convolution of a row-major float image with the weights in d_kernel,
 * staging each block's pixels plus a one-pixel halo in shared memory.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel
 * (x -> column, y -> row).
 * Precondition: blockDim.x <= MAX_BLOCK_SIZE and blockDim.y <= MAX_BLOCK_SIZE,
 * otherwise the static tile below is too small.
 * Shared memory: (MAX_BLOCK_SIZE + 2)^2 floats, statically allocated.
 *
 * The whole (blockDim.y + 2) x (blockDim.x + 2) tile is loaded cooperatively,
 * including the four halo corners. Loading only the edge halos leaves those
 * corners uninitialized even though the threads at the block's corners read
 * them through the diagonal stencil taps.
 *
 * Threads on the one-pixel image border or outside the image write nothing.
 *
 *   input   source image, width * height floats
 *   output  destination image, same layout as input
 *   width   image width in pixels
 *   height  image height in pixels
 */
__global__ void gaussianBlurKernelShared(float *input, float *output, int width, int height) {
    __shared__ float shared_input[MAX_BLOCK_SIZE + 2][MAX_BLOCK_SIZE + 2];

    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    int local_col = threadIdx.x + 1;
    int local_row = threadIdx.y + 1;

    // Tile geometry: the block's own pixels plus a one-pixel halo on every side.
    int tile_width = (int)blockDim.x + 2;
    int tile_height = (int)blockDim.y + 2;
    int tile_cells = tile_width * tile_height;

    // Image coordinates of tile cell [0][0]; -1 for blocks on the top/left image edge.
    int tile_origin_col = (int)(blockIdx.x * blockDim.x) - 1;
    int tile_origin_row = (int)(blockIdx.y * blockDim.y) - 1;

    int threads_in_block = (int)(blockDim.x * blockDim.y);
    int thread_rank = (int)(threadIdx.y * blockDim.x + threadIdx.x);

    // Block-stride loop: every thread (including those outside the image in
    // partial edge blocks) helps fill the tile, so every cell is written
    // exactly once before the barrier.
    for (int cell = thread_rank; cell < tile_cells; cell += threads_in_block) {
        int tile_row = cell / tile_width;
        int tile_col = cell % tile_width;
        int src_row = tile_origin_row + tile_row;
        int src_col = tile_origin_col + tile_col;

        // Cells outside the image get a defined value. No interior pixel's
        // 3x3 neighbourhood reaches them, so the value never affects output.
        float value = 0.0f;
        if (src_row >= 0 && src_row < height && src_col >= 0 && src_col < width) {
            value = input[src_row * width + src_col];
        }
        shared_input[tile_row][tile_col] = value;
    }

    // Reached unconditionally by every thread in the block.
    __syncthreads();

    if (row >= 1 && row < height - 1 && col >= 1 && col < width - 1) {
        float sum = 0.0f;

        for (int ki = -1; ki <= 1; ki++) {
            for (int kj = -1; kj <= 1; kj++) {
                sum += shared_input[local_row + ki][local_col + kj] * d_kernel[ki + 1][kj + 1];
            }
        }

        output[row * width + col] = sum;
    }
}

/*
 * Evaluates a CUDA runtime call once; if it does not return cudaSuccess,
 * prints the file, line and cudaGetErrorString() text to stderr and exits
 * with EXIT_FAILURE.
 *
 * Wrapped in do { } while (0) so that `CHECK_CUDA_ERROR(x);` behaves as a
 * single statement, including inside an unbraced if/else. The argument is
 * parenthesized and the local uses an unusual name so it cannot shadow a
 * variable referenced by the call expression.
 */
#define CHECK_CUDA_ERROR(call) do { \
    cudaError_t check_cuda_err_ = (call); \
    if (check_cuda_err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, \
                cudaGetErrorString(check_cuda_err_)); \
        exit(EXIT_FAILURE); \
    } \
} while (0)

/*
 * Fills a WIDTH x HEIGHT row-major image with a test pattern: 100.0f inside
 * the central rectangle (rows strictly between HEIGHT/4 and 3*HEIGHT/4,
 * columns strictly between WIDTH/4 and 3*WIDTH/4) and 10.0f elsewhere.
 *
 *   image  buffer of at least WIDTH * HEIGHT floats, overwritten in full
 */
void initializeImage(float *image) {
    const int row_lo = HEIGHT / 4;
    const int row_hi = 3 * HEIGHT / 4;
    const int col_lo = WIDTH / 4;
    const int col_hi = 3 * WIDTH / 4;

    for (int r = 0; r < HEIGHT; ++r) {
        const bool row_inside = (r > row_lo) && (r < row_hi);
        float *line = image + r * WIDTH;

        for (int c = 0; c < WIDTH; ++c) {
            const bool inside = row_inside && (c > col_lo) && (c < col_hi);
            line[c] = inside ? 100.0f : 10.0f;
        }
    }
}

/*
 * Prints a sample_size x sample_size window of a WIDTH x HEIGHT row-major
 * image to stdout, one image row per line. The window's first row/column is
 * HEIGHT/2 - sample_size/2 and WIDTH/2 - sample_size/2 respectively.
 *
 *   image        image buffer to read from
 *   sample_size  edge length of the printed window, in pixels
 */
void printImageSample(float *image, int sample_size) {
    printf("Sample output (%dx%d from center):\n", sample_size, sample_size);

    const int half = sample_size / 2;
    const int first_row = HEIGHT / 2 - half;
    const int first_col = WIDTH / 2 - half;

    for (int dr = 0; dr < sample_size; ++dr) {
        // Flat index of the first printed pixel on this row.
        const int base = (first_row + dr) * WIDTH + first_col;
        for (int dc = 0; dc < sample_size; ++dc) {
            printf("%6.2f ", image[base + dc]);
        }
        printf("\n");
    }
}

/*
 * Runs one blur pass over a synthetic WIDTH x HEIGHT image on the GPU,
 * times the kernel with CUDA events and prints a sample plus throughput
 * figures.
 *
 * Command line:
 *   argv[1]  block edge length (1..MAX_BLOCK_SIZE); anything else falls back to 16
 *   argv[2]  non-zero selects gaussianBlurKernelShared, zero/absent the basic kernel
 *
 * Returns 0 on success and 1 if host allocation fails. Any failing CUDA call
 * terminates the process through CHECK_CUDA_ERROR.
 */
int main(int argc, char *argv[]) {
    int block_size = 16;
    int use_shared_memory = 0;

    if (argc > 1) {
        block_size = atoi(argv[1]);
        if (block_size < 1 || block_size > MAX_BLOCK_SIZE) {
            printf("Invalid block size. Using default: 16\n");
            block_size = 16;
        }
    }
    if (argc > 2) {
        use_shared_memory = atoi(argv[2]);
    }

    printf("=== Gaussian Blur - CUDA Implementation ===\n");
    printf("Image Size: %dx%d\n", WIDTH, HEIGHT);
    printf("Block Size: %dx%d\n", block_size, block_size);
    printf("Kernel Version: %s\n\n", use_shared_memory ? "Shared Memory" : "Basic");

    cudaDeviceProp prop;
    CHECK_CUDA_ERROR(cudaGetDeviceProperties(&prop, 0));
    printf("GPU Device: %s\n", prop.name);
    printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
    printf("Max Threads per Block: %d\n\n", prop.maxThreadsPerBlock);

    // Widen before multiplying so the byte count is computed in size_t, not int.
    size_t image_size = (size_t)WIDTH * HEIGHT * sizeof(float);
    float *h_input = (float *)malloc(image_size);
    float *h_output = (float *)malloc(image_size);

    if (h_input == NULL || h_output == NULL) {
        printf("Host memory allocation failed!\n");
        // free(NULL) is a no-op, so this releases whichever allocation succeeded.
        free(h_input);
        free(h_output);
        return 1;
    }

    initializeImage(h_input);

    float *d_input, *d_output;
    CHECK_CUDA_ERROR(cudaMalloc(&d_input, image_size));
    CHECK_CUDA_ERROR(cudaMalloc(&d_output, image_size));

    float h_kernel[KERNEL_SIZE][KERNEL_SIZE] = {
        {1/16.0f, 2/16.0f, 1/16.0f},
        {2/16.0f, 4/16.0f, 2/16.0f},
        {1/16.0f, 2/16.0f, 1/16.0f}
    };
    CHECK_CUDA_ERROR(cudaMemcpyToSymbol(d_kernel, h_kernel,
                     KERNEL_SIZE * KERNEL_SIZE * sizeof(float)));

    CHECK_CUDA_ERROR(cudaMemcpy(d_input, h_input, image_size, cudaMemcpyHostToDevice));
    // The kernels never write the one-pixel border, so the output must start
    // zeroed. All-zero bytes are 0.0f, so cudaMemset replaces a host-side fill
    // plus a full host-to-device copy.
    CHECK_CUDA_ERROR(cudaMemset(d_output, 0, image_size));

    dim3 block_dim(block_size, block_size);
    dim3 grid_dim((WIDTH + block_size - 1) / block_size,
                  (HEIGHT + block_size - 1) / block_size);

    // dim3 members are unsigned, hence %u.
    printf("Grid Dimensions: %ux%u blocks\n", grid_dim.x, grid_dim.y);
    printf("Total Threads: %u\n\n", grid_dim.x * grid_dim.y * block_size * block_size);

    cudaEvent_t start, stop;
    CHECK_CUDA_ERROR(cudaEventCreate(&start));
    CHECK_CUDA_ERROR(cudaEventCreate(&stop));

    CHECK_CUDA_ERROR(cudaEventRecord(start));

    if (use_shared_memory) {
        gaussianBlurKernelShared<<<grid_dim, block_dim>>>(d_input, d_output, WIDTH, HEIGHT);
    } else {
        gaussianBlurKernel<<<grid_dim, block_dim>>>(d_input, d_output, WIDTH, HEIGHT);
    }
    // A launch returns no status itself; query it right away so a bad launch
    // configuration is reported at the launch rather than after later calls.
    CHECK_CUDA_ERROR(cudaGetLastError());

    CHECK_CUDA_ERROR(cudaEventRecord(stop));
    // Blocks until the kernel has finished; also surfaces asynchronous execution errors.
    CHECK_CUDA_ERROR(cudaEventSynchronize(stop));

    float milliseconds = 0;
    CHECK_CUDA_ERROR(cudaEventElapsedTime(&milliseconds, start, stop));
    float execution_time = milliseconds / 1000.0f;

    CHECK_CUDA_ERROR(cudaMemcpy(h_output, d_output, image_size, cudaMemcpyDeviceToHost));

    printf("Execution Time: %.6f seconds (%.3f ms)\n\n", execution_time, milliseconds);

    printImageSample(h_output, 5);

    printf("\n=== Performance Metrics ===\n");
    printf("Total pixels processed: %d\n", WIDTH * HEIGHT);
    printf("Pixels per second: %.2f million\n", (WIDTH * HEIGHT) / (execution_time * 1e6));
    // 9 taps per pixel, one multiply and one add per tap.
    printf("GPU Throughput: %.2f GFLOPS\n",
           (WIDTH * HEIGHT * 9 * 2) / (execution_time * 1e9));

    CHECK_CUDA_ERROR(cudaEventDestroy(start));
    CHECK_CUDA_ERROR(cudaEventDestroy(stop));
    CHECK_CUDA_ERROR(cudaFree(d_input));
    CHECK_CUDA_ERROR(cudaFree(d_output));
    free(h_input);
    free(h_output);

    return 0;
}

Writing gaussian_blur_cuda.cu


In [3]:
!nvcc -arch=sm_75 -O3 gaussian_blur_cuda.cu -o gaussian_blur_cuda

In [4]:
# 8x8 Basic
!./gaussian_blur_cuda 8 0

=== Gaussian Blur - CUDA Implementation ===
Image Size: 1000x1000
Block Size: 8x8
Kernel Version: Basic

GPU Device: Tesla T4
Compute Capability: 7.5
Max Threads per Block: 1024

Grid Dimensions: 125x125 blocks
Total Threads: 1000000

Execution Time: 0.000149 seconds (0.149 ms)

Sample output (5x5 from center):
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 

=== Performance Metrics ===
Total pixels processed: 1000000
Pixels per second: 6732.01 million
GPU Throughput: 121.18 GFLOPS


In [5]:
# 16x16 Basic
!./gaussian_blur_cuda 16 0



=== Gaussian Blur - CUDA Implementation ===
Image Size: 1000x1000
Block Size: 16x16
Kernel Version: Basic

GPU Device: Tesla T4
Compute Capability: 7.5
Max Threads per Block: 1024

Grid Dimensions: 63x63 blocks
Total Threads: 1016064

Execution Time: 0.000096 seconds (0.096 ms)

Sample output (5x5 from center):
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 

=== Performance Metrics ===
Total pixels processed: 1000000
Pixels per second: 10430.57 million
GPU Throughput: 187.75 GFLOPS


In [6]:
# 32x32 Basic
!./gaussian_blur_cuda 32 0

=== Gaussian Blur - CUDA Implementation ===
Image Size: 1000x1000
Block Size: 32x32
Kernel Version: Basic

GPU Device: Tesla T4
Compute Capability: 7.5
Max Threads per Block: 1024

Grid Dimensions: 32x32 blocks
Total Threads: 1048576

Execution Time: 0.000075 seconds (0.075 ms)

Sample output (5x5 from center):
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 

=== Performance Metrics ===
Total pixels processed: 1000000
Pixels per second: 13269.64 million
GPU Throughput: 238.85 GFLOPS


In [7]:
# 8x8 Shared Memory
!./gaussian_blur_cuda 8 1

=== Gaussian Blur - CUDA Implementation ===
Image Size: 1000x1000
Block Size: 8x8
Kernel Version: Shared Memory

GPU Device: Tesla T4
Compute Capability: 7.5
Max Threads per Block: 1024

Grid Dimensions: 125x125 blocks
Total Threads: 1000000

Execution Time: 0.000179 seconds (0.179 ms)

Sample output (5x5 from center):
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 

=== Performance Metrics ===
Total pixels processed: 1000000
Pixels per second: 5592.34 million
GPU Throughput: 100.66 GFLOPS


In [8]:
# 16x16 Shared Memory
!./gaussian_blur_cuda 16 1


=== Gaussian Blur - CUDA Implementation ===
Image Size: 1000x1000
Block Size: 16x16
Kernel Version: Shared Memory

GPU Device: Tesla T4
Compute Capability: 7.5
Max Threads per Block: 1024

Grid Dimensions: 63x63 blocks
Total Threads: 1016064

Execution Time: 0.000114 seconds (0.114 ms)

Sample output (5x5 from center):
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 

=== Performance Metrics ===
Total pixels processed: 1000000
Pixels per second: 8810.26 million
GPU Throughput: 158.58 GFLOPS


In [9]:
# 32x32 Shared Memory
!./gaussian_blur_cuda 32 1


=== Gaussian Blur - CUDA Implementation ===
Image Size: 1000x1000
Block Size: 32x32
Kernel Version: Shared Memory

GPU Device: Tesla T4
Compute Capability: 7.5
Max Threads per Block: 1024

Grid Dimensions: 32x32 blocks
Total Threads: 1048576

Execution Time: 0.000108 seconds (0.108 ms)

Sample output (5x5 from center):
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 
100.00 100.00 100.00 100.00 100.00 

=== Performance Metrics ===
Total pixels processed: 1000000
Pixels per second: 9221.01 million
GPU Throughput: 165.98 GFLOPS
