<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring2022_GoogleColabs/blob/main/Week9/Histogram_Strategy_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%cd /usr/local/
!rm -rf cuda
!ln -s /usr/local/cuda-10.1 /usr/local/cuda
!stat cuda
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

/usr/local
  File: cuda -> /usr/local/cuda-10.1
  Size: 20        	Blocks: 0          IO Block: 4096   symbolic link
Device: 23h/35d	Inode: 4325386     Links: 1
Access: (0777/lrwxrwxrwx)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2022-03-16 15:47:55.931014558 +0000
Modify: 2022-03-16 15:47:55.820015057 +0000
Change: 2022-03-16 15:47:55.820015057 +0000
 Birth: -
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-saw7pc09
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-saw7pc09
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=c654ff7059fc404be9a4926a196ce93bb4b157596d7f5e2c2f550762c4fdcb7b
  Stored in directory: /tmp/pip-ephem-wheel-cache-xpapdprd/wheels/ca/33/8d/3c86eb85e97d

In [25]:
%%cu
#include <stdio.h>
#include <cstdlib> //rand() function


// Reports a failed CUDA runtime call on stderr and passes the status through.
//   err : status code returned by the CUDA runtime call being checked
//   msg : caller-supplied label that is printed alongside the error text
// Returns err unchanged. The function never aborts; callers decide how to react.
inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
	// Nothing to report on success.
	if (err == cudaSuccess) {
		return err;
	}

	fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
	return err;
}

// Sequential reference histogram computed on the host.
//   data      : N input samples; each value is used as a bin index
//   N         : number of samples in data
//   histogram : Nbins counters that are incremented in place. They are NOT
//               cleared here, so the caller must zero them beforehand.
//   Nbins     : number of counters in histogram
// Samples outside [0, Nbins) are skipped instead of writing outside histogram.
void histogram_CPU(int *data, const int N, int* histogram, const int Nbins)
{
	for(int i=0; i<N; ++i)
	{
		const int bin = data[i];
		// Guard against out-of-range samples corrupting neighbouring memory.
		if(bin >= 0 && bin < Nbins)
		{
			histogram[bin]++;
		}
	}
}

// Histogram kernel, strategy 1: the input is split into equally sized
// contiguous sections and every thread processes exactly one section.
//   data  : N input samples; each value is used directly as an index into histo
//           (values are not range-checked here, the kernel is not told the
//           number of bins)
//   N     : number of samples in data
//   histo : bin counters, updated with atomicAdd; must be zeroed by the caller
// Works for any 1D grid/block configuration. With this partitioning the
// threads of a warp read addresses that are sectionSize elements apart.
__global__ void histogram_GPU_1( int *data, const int N, int *histo) {
	// With N <= 0 the section computation below would divide a negative value;
	// there is nothing to count, so leave early.
	if (N <= 0) {
		return;
	}

	// 64-bit index math: threadId * sectionSize may exceed INT_MAX when N is
	// close to the int limit.
	const long long totalThreads = (long long)blockDim.x * gridDim.x;
	const long long threadId     = threadIdx.x + (long long)blockIdx.x * blockDim.x;

	// ceil(N / totalThreads): number of consecutive elements owned by each thread.
	const long long sectionSize = (N - 1) / totalThreads + 1;

	// Half-open range [first, last) of this thread, clamped to the input size.
	const long long first = threadId * sectionSize;
	long long last = first + sectionSize;
	if (last > N) {
		last = N;
	}

	for (long long idx = first; idx < last; ++idx)
	{
		// Several threads may hit the same bin concurrently, hence the atomic.
		atomicAdd(&(histo[data[idx]]), 1);
	}
}


// Driver: fills an array with random bin indices, builds its histogram on the
// CPU and on the GPU (histogram_GPU_1), times both with CUDA events, prints
// the two histograms side by side and reports whether they match.
int main() 
{ 
	int *data = NULL;
	int *histogram = NULL;
	int *histogramGPU = NULL; 
	const int Nbins = 256;
	const int N = 33 * 1024;
	const int threadsPerBlock = 256;

	// Arguments are parenthesised so that expression arguments expand safely.
	#define imin(a,b) ((a)<(b)?(a):(b))

	// At most 32 blocks; each thread then processes a contiguous section.
	const int blocksPerGrid =  imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
 
	// Allocate Unified Memory -- accessible from CPU or GPU
	cudaError_t allocErr = checkCudaErr(cudaMallocManaged(&data, N*sizeof(int)), "cudaMallocManaged1 data");
	if (allocErr == cudaSuccess)
		allocErr = checkCudaErr(cudaMallocManaged(&histogram, Nbins*sizeof(int)), "cudaMallocManaged2 histogram"); 
	if (allocErr == cudaSuccess)
		allocErr = checkCudaErr(cudaMallocManaged(&histogramGPU, Nbins*sizeof(int)), "cudaMallocManaged2 histogramGPU"); 

	// Without the buffers nothing below can run; release what was obtained
	// (cudaFree accepts null pointers) and stop.
	if (allocErr != cudaSuccess) {
		cudaFree(data);
		cudaFree(histogram);
		cudaFree(histogramGPU);
		return EXIT_FAILURE;
	}

	// cudaMallocManaged does not clear the allocation and both histogram
	// routines only increment, so the counters must start at zero.
	for (int i=0; i<Nbins; i++) {
		histogram[i] = 0;
		histogramGPU[i] = 0;
	}

	// fill in the data with random values between 0-255
	for (int i=0; i<N; i++) {
		data[i] = (rand() % Nbins); 
	} 

	// some events to count the execution time
	cudaEvent_t start, stop;
	float cpu_elapsed_time_ms = 0.0f, gpu_1_elapsed_time_ms = 0.0f; 

	checkCudaErr(cudaEventCreate(&start), "cudaEventCreate start");
	checkCudaErr(cudaEventCreate(&stop), "cudaEventCreate stop");
	
	// start to count execution time of CPU version
	checkCudaErr(cudaEventRecord(start, 0), "cudaEventRecord start (CPU)");
 
	//calculate histogram on the CPU
	histogram_CPU(data, N, histogram, Nbins);

	// time counting terminate
	checkCudaErr(cudaEventRecord(stop, 0), "cudaEventRecord stop (CPU)");
	checkCudaErr(cudaEventSynchronize(stop), "cudaEventSynchronize (CPU)");

	//compute time elapsed on CPU 
	checkCudaErr(cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop), "cudaEventElapsedTime (CPU)");
	printf("Histogram(CPU) - Time: %f ms.\n", cpu_elapsed_time_ms);

	//start to count execution time of GPU version
	checkCudaErr(cudaEventRecord(start, 0), "cudaEventRecord start (GPU)");

	//launch kernel
	histogram_GPU_1<<<blocksPerGrid,threadsPerBlock>>>( data, N, histogramGPU );
	// A launch does not return a status; configuration errors surface here.
	checkCudaErr(cudaGetLastError(), "histogram_GPU_1 launch");

	// time counting terminate
	checkCudaErr(cudaEventRecord(stop, 0), "cudaEventRecord stop (GPU)");
	// Waiting on the stop event also guarantees the kernel has finished before
	// the host reads histogramGPU below; execution errors are reported here.
	checkCudaErr(cudaEventSynchronize(stop), "cudaEventSynchronize (GPU)");

	// compute time elapse on GPU computing
	checkCudaErr(cudaEventElapsedTime(&gpu_1_elapsed_time_ms, start, stop), "cudaEventElapsedTime (GPU)");
	printf("Histogram_1(GPU) - Time: %f ms.\n", gpu_1_elapsed_time_ms);
	
	int allOK = 1;

	printf("+------------------+-------------------+\n");
	printf("| CPU Histogram[i] |  GPU Histogram[i] |\n");
	printf("+------------------+-------------------+\n");
	//compare results on CPU and GPU
	for(int i=0 ; i<Nbins; ++i)
	{
		printf("|      %5d       |      %5d        |\n", histogram[i], histogramGPU[i]);

		// Stop at the first differing bin.
		if(histogram[i]!=histogramGPU[i])
		{
			printf("Histogram mismatch!!!");
			allOK=0;
			break;
		}
	}
	printf("+------------------+-------------------+\n");
	
	if(allOK==1)
		printf("Results correct on both CPU and GPU");

	// release the timing events
	checkCudaErr( cudaEventDestroy( start ), "cudaEventDestroy start");
	checkCudaErr( cudaEventDestroy( stop ), "cudaEventDestroy stop");

	// free memory on the gpu side
	checkCudaErr( cudaFree( data ) , "cudaFree1");
	checkCudaErr( cudaFree( histogram ) , "cudaFree2"); 
	checkCudaErr( cudaFree( histogramGPU ) , "cudaFree3"); 
	checkCudaErr( cudaDeviceReset(), "cudaDeviceReset");

	return 0;
}

Histogram(CPU) - Time: 0.100192 ms.
Histogram_1(GPU) - Time: 0.336192 ms.
+------------------+-------------------+
| CPU Histogram[i] |  GPU Histogram[i] |
+------------------+-------------------+
|        127       |        127        |
|        137       |        137        |
|        148       |        148        |
|        100       |        100        |
|        119       |        119        |
|        144       |        144        |
|        139       |        139        |
|        116       |        116        |
|        136       |        136        |
|        146       |        146        |
|        133       |        133        |
|        127       |        127        |
|        137       |        137        |
|        124       |        124        |
|        141       |        141        |
|        158       |        158        |
|        126       |        126        |
|        126       |        126        |
|        125       |        125        |
|         97       |    