In [1]:
!nvidia-smi

Fri Feb  7 04:37:42 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   56C    P8             13W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
%%writefile rgb2gray.cpp

#include <opencv2/opencv.hpp>
#include<bits/stdc++.h>
#include <time.h>

void grayscaleCPU(unsigned char* Pout, unsigned char* Pin, int width, int height) {
    for (int row = 0; row < height; row++) {
        for (int col = 0; col < width; col++) {

            int index = row * width + col;

            int rgb_index = index * 3;

            unsigned char r = Pin[rgb_index];
            unsigned char g = Pin[rgb_index + 1];
            unsigned char b = Pin[rgb_index + 2];


            unsigned char gray = static_cast<unsigned char>(0.299 * r + 0.587 * g + 0.114 * b);


            Pout[index] = gray;
        }
    }
}



int main() {

    cv::Mat img = cv::imread("image.jpg");
    if (img.empty()) {
        std::cerr << "Error: Could not load image!" << std::endl;
        return -1;
    }

    clock_t start, end;
    start = clock();

    int width = img.cols;
    int height = img.rows;
    int channels = img.channels();


    cv::Mat gray_img(height, width, CV_8UC1);


    grayscaleCPU(gray_img.data, img.data, width, height);

    end = clock();
    float time2 = ((float)(end - start)) / CLOCKS_PER_SEC;

    printf("CPU: %f seconds\n", time2);

    cv::imwrite("gray_image.jpg", gray_img);

    printf("%d, %d, %d\n", width, height, channels);
    return 0;
}


Writing rgb2gray.cpp


In [3]:
!g++ rgb2gray.cpp -I /usr/include/opencv4/opencv2/  -I/usr/include/opencv4 -L/usr/lib -lopencv_core -lopencv_highgui -lopencv_imgcodecs -o cpu_code

In [5]:
!./cpu_code

CPU: 1.979741 seconds
15792, 10240, 3


In [6]:
%%writefile rgb2gray.cu

#include <opencv2/opencv.hpp>
#include<bits/stdc++.h>
#include <time.h>

__global__ void rgb2gray_kernel(unsigned char* Pout, unsigned char* Pin, int width, int height)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;


    if(col < width && row<height)
    {
      // get index given row major format
      int index = row*width + col;
      //intuition : Row 0:   R0  G0  B0   R1  G1  B1   R2  G2  B2   R3  G3  B3
      int rgb_index = index * 3;
      unsigned char r = Pin[rgb_index];
      unsigned char g = Pin[rgb_index + 1];
      unsigned char b = Pin[rgb_index + 2];

      unsigned char gray = static_cast<unsigned char>(0.299 * r + 0.587 * g + 0.114 * b);

      Pout[index] =  gray;

    }




}

int main() {
    cv::Mat img = cv::imread("image.jpg");

    clock_t start, end;
    start = clock();

    if (img.empty()) {
        std::cerr << "Error: Could not load image!" << std::endl;
        return -1;
    }


    int width = img.cols;
    int height = img.rows;
    int channels = img.channels();


    unsigned char *d_input, *d_output;
    size_t imgSize = width * height * 3 * sizeof(unsigned char);
    size_t graySize = width * height * sizeof(unsigned char);

    cudaMalloc((void**)&d_input, imgSize);
    cudaMalloc((void**)&d_output, graySize);


    cudaMemcpy(d_input,img.data,imgSize,cudaMemcpyHostToDevice);

    cv::Mat gray_img(img.rows, img.cols, CV_8UC1);


    dim3 dimGrid(ceil(width/16.0), ceil(height/16.0), 1);
    dim3 dimBlock(16, 16, 1);

    rgb2gray_kernel<<<dimGrid,dimBlock>>>(d_output,d_input,width,height);
    cudaDeviceSynchronize();
    end = clock();
    float time2 = ((float)(end - start)) / CLOCKS_PER_SEC;
    cudaMemcpy(gray_img.data, d_output, graySize, cudaMemcpyDeviceToHost);



    printf("GPU: %f seconds\n", time2);

    cv::imwrite("gray_image_gpu.jpg", gray_img);

    cudaFree(d_input);
    cudaFree(d_output);

    printf("%d, %d, %d\n",width,height,channels);
    return 0;
}


Writing rgb2gray.cu


In [7]:
!nvcc rgb2gray.cu -I /usr/include/opencv4/opencv2/  -I/usr/include/opencv4 -L/usr/lib -lopencv_core -lopencv_highgui -lopencv_imgcodecs -o gray -arch=sm_75

  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^

  class AffineWarper : public PlaneWarper
        ^


  class AffineWarper : public PlaneWarper
        ^

  class FeatherBlender : public Blender
        ^

  class MultiBandBlender : public Blender
        ^



In [8]:
!./gray

GPU: 0.252803 seconds
15792, 10240, 3


In [9]:
!nvprof ./gray

==680== NVPROF is profiling process 680, command: ./gray
GPU: 0.262489 seconds
15792, 10240, 3
==680== Profiling application: ./gray
==680== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   45.31%  108.09ms         1  108.09ms  108.09ms  108.09ms  [CUDA memcpy DtoH]
                   43.12%  102.87ms         1  102.87ms  102.87ms  102.87ms  [CUDA memcpy HtoD]
                   11.56%  27.577ms         1  27.577ms  27.577ms  27.577ms  rgb2gray_kernel(unsigned char*, unsigned char*, int, int)
      API calls:   62.85%  212.04ms         2  106.02ms  103.06ms  108.98ms  cudaMemcpy
                   28.02%  94.518ms         2  47.259ms  109.60us  94.409ms  cudaMalloc
                    8.18%  27.583ms         1  27.583ms  27.583ms  27.583ms  cudaDeviceSynchronize
                    0.86%  2.8870ms         2  1.4435ms  621.79us  2.2653ms  cudaFree
                    0.05%  167.68us         1  167.68us  167.68us  167.