In [51]:
%%writefile rectify.cu
// C libraries
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
// CUDA libraries
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// image processing library
#include "lodepng.h"

// Device Code
/*
 * Rectifies a byte buffer: every value below 127 is raised to 127, all other
 * values are copied unchanged from input_image to output_image.
 *
 * input_image  - device buffer holding array_size bytes to read
 * output_image - device buffer with room for array_size bytes to write
 * array_size   - number of bytes to process; values <= 0 process nothing
 *
 * Launch layout: 1D grid of 1D blocks. The loop below strides by the total
 * number of launched threads, so any grid/block size produces a complete
 * result, and every byte is read and written by exactly one thread.
 */
__global__ void rectification(unsigned char *input_image, unsigned char *output_image, int array_size) {
    // index math is done in size_t so blockIdx.x * blockDim.x cannot overflow
    const size_t total = array_size > 0 ? (size_t) array_size : 0;
    const size_t stride = (size_t) gridDim.x * blockDim.x;

    for (size_t i = (size_t) blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride) {
        const unsigned char value = input_image[i];
        output_image[i] = value < 127 ? (unsigned char) 127 : value;
    }
}

// Host Code

// Reports a failed CUDA runtime call on stderr. Returns 1 when `status` is an error, 0 otherwise.
static int cuda_failed(cudaError_t status, const char *what) {
    if (status == cudaSuccess) return 0;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return 1;
}

/*
 * Usage: rectify <input file name> <output file name> <number of threads>
 *
 * Decodes the input PNG to 32-bit RGBA, runs the rectification kernel on the
 * GPU with <number of threads> threads per block, writes the result as a PNG
 * and prints the kernel runtime measured with CUDA events.
 *
 * Returns EXIT_SUCCESS when the output file was written, EXIT_FAILURE on bad
 * arguments or on any decoding, allocation, CUDA or encoding error.
 */
int main(int argc, char *argv[]) {

    if (argc <= 1) {
        fprintf(stderr, "No arguments provided! Please add input file name, output file name and thread number to the program call!\n");
        return EXIT_FAILURE;
    }
    if (argc < 4) {
        fprintf(stderr, "Missing arguments! Please check that you have provided the input file name, output file name and the number of threads!\n");
        return EXIT_FAILURE;
    }

    // get inputs from the command line
    char *input_filename = argv[1];
    char *output_filename = argv[2];
    int threads_no = atoi(argv[3]);

    // atoi yields 0 for non-numeric text; 0 or a negative count would break the grid size computation
    if (threads_no <= 0) {
        fprintf(stderr, "Invalid thread number '%s'! It must be a positive integer.\n", argv[3]);
        return EXIT_FAILURE;
    }

    // everything is declared up front so each error path below can jump to the shared cleanup
    unsigned error;
    unsigned width = 0, height = 0;
    unsigned char *input_image = NULL;
    unsigned char *output_image = NULL;
    unsigned char *d_input = NULL, *d_output = NULL;
    cudaEvent_t start = NULL, stop = NULL;
    size_t size = 0;
    int array_size = 0;
    int blocks = 0;
    float milliseconds = 0;
    int exit_code = EXIT_FAILURE;

    // load input image from file to buffer array
    error = lodepng_decode32_file(&input_image, &width, &height, input_filename);

    // if there is an error while loading the file, report it; nothing else has been allocated yet
    if (error) {
        fprintf(stderr, "Error: %u: %s\n", error, lodepng_error_text(error));
        return EXIT_FAILURE;
    }

    // the kernel takes the byte count as an int, so width * height * 4 must be non-zero and fit in one
    if (width == 0 || height == 0 || height > (unsigned) INT_MAX / 4 / width) {
        fprintf(stderr, "Error: unsupported image size %u x %u\n", width, height);
        goto cleanup;
    }
    size = (size_t) width * height * 4 * sizeof(unsigned char);
    array_size = (int) size;

    // ceil-division in size_t so the rounding term cannot overflow an int
    blocks = (int) ((size + (size_t) threads_no - 1) / (size_t) threads_no);

    // host buffer that receives the rectified image
    output_image = (unsigned char*) malloc(size);
    if (output_image == NULL) {
        fprintf(stderr, "Error: could not allocate %zu bytes for the output image\n", size);
        goto cleanup;
    }

    // device buffers for the input and output image
    if (cuda_failed(cudaMalloc(&d_input, size), "cudaMalloc (input)")) goto cleanup;
    if (cuda_failed(cudaMalloc(&d_output, size), "cudaMalloc (output)")) goto cleanup;

    // create CUDA events to time the kernel runtime
    if (cuda_failed(cudaEventCreate(&start), "cudaEventCreate (start)")) goto cleanup;
    if (cuda_failed(cudaEventCreate(&stop), "cudaEventCreate (stop)")) goto cleanup;

    // copy image from host memory to device memory
    if (cuda_failed(cudaMemcpy(d_input, input_image, size, cudaMemcpyHostToDevice), "cudaMemcpy (host to device)")) goto cleanup;

    // record start time
    if (cuda_failed(cudaEventRecord(start), "cudaEventRecord (start)")) goto cleanup;

    // run device kernel
    rectification<<<blocks, threads_no>>>(d_input, d_output, array_size);

    // a launch does not return a status; an invalid configuration (e.g. too many threads per block) shows up here
    if (cuda_failed(cudaGetLastError(), "kernel launch")) goto cleanup;

    // record stop time
    if (cuda_failed(cudaEventRecord(stop), "cudaEventRecord (stop)")) goto cleanup;

    // wait for the kernel to finish; errors raised while it executed are reported by this call
    if (cuda_failed(cudaEventSynchronize(stop), "kernel execution")) goto cleanup;
    if (cuda_failed(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime")) goto cleanup;

    // copy output image from device to host
    if (cuda_failed(cudaMemcpy(output_image, d_output, size, cudaMemcpyDeviceToHost), "cudaMemcpy (device to host)")) goto cleanup;

    // save output image
    error = lodepng_encode32_file(output_filename, output_image, width, height);
    if (error) {
        fprintf(stderr, "Error: %u: %s\n", error, lodepng_error_text(error));
        goto cleanup;
    }

    //print elapsed time
    printf("Time Elapsed: %f ms\n", milliseconds);
    exit_code = EXIT_SUCCESS;

cleanup:
    // release the timing events if they were created
    if (start != NULL) cudaEventDestroy(start);
    if (stop != NULL) cudaEventDestroy(stop);

    // free up device memory (cudaFree accepts a null pointer)
    cudaFree(d_input);
    cudaFree(d_output);

    // free up host memory (free accepts a null pointer)
    free(input_image);
    free(output_image);

    return exit_code;
}

Overwriting rectify.cu


In [52]:
!nvcc rectify.cu lodepng.cpp -o rectify

In [80]:
!./rectify Test_3.png testfile3.png 256

Time Elapsed: 0.431872 ms


# Runtime for Test_1.png

Thread No: 1
- Time Elapsed: 119.203743 ms

Thread No: 2
- Time Elapsed: 59.446465 ms

Thread No: 4
- Time Elapsed: 30.947231 ms

Thread No: 8
- Time Elapsed: 15.234368 ms

Thread No: 16
- Time Elapsed: 6.365888 ms

Thread No: 32
- Time Elapsed: 4.554912 ms

Thread No: 64
- Time Elapsed: 2.037152 ms

Thread No: 128
- Time Elapsed: 1.601824 ms

Thread No: 256
- Time Elapsed: 1.667808 ms

# Runtime for Test_2.png

Thread No: 1
- Time Elapsed: 26.798977 ms

Thread No: 2
- Time Elapsed: 13.382880 ms 

Thread No: 4
- Time Elapsed: 7.722272 ms

Thread No: 8
- Time Elapsed: 3.448832 ms

Thread No: 16
- Time Elapsed: 1.502048 ms

Thread No: 32
- Time Elapsed: 1.032224 ms

Thread No: 64
- Time Elapsed: 0.463648 ms

Thread No: 128
- Time Elapsed: 0.370944 ms

Thread No: 256
- Time Elapsed: 0.385792 ms

# Runtime for Test_3.png

Thread No: 1
- Time Elapsed: 29.772320 ms

Thread No: 2
- Time Elapsed: 15.001536 ms

Thread No: 4
- Time Elapsed: 7.722208 ms

Thread No: 8
- Time Elapsed: 3.832704 ms

Thread No: 16
- Time Elapsed: 1.602848 ms

Thread No: 32
- Time Elapsed: 1.143136 ms

Thread No: 64
- Time Elapsed: 0.516128 ms

Thread No: 128
- Time Elapsed: 0.415456 ms

Thread No: 256
- Time Elapsed: 0.431872 ms
