<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring2022_GoogleColabs/blob/main/Week3/SumGPU_Timed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%cd /usr/local/
!rm -rf cuda
!ln -s /usr/local/cuda-10.1 /usr/local/cuda
!stat cuda
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

/usr/local
  File: cuda -> /usr/local/cuda-10.1
  Size: 20        	Blocks: 0          IO Block: 4096   symbolic link
Device: 24h/36d	Inode: 3801100     Links: 1
Access: (0777/lrwxrwxrwx)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2022-01-26 07:37:09.443366657 +0000
Modify: 2022-01-26 07:37:09.334359213 +0000
Change: 2022-01-26 07:37:09.334359213 +0000
 Birth: -
Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-sfuxsb_j
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-sfuxsb_j
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=2117e9719af346c172781e167f6470ad852fb6bc6e389d95074d50a33406db14
  Stored in directory: /tmp/pip-ephem-wheel-cache-pw04pm7n/wheels/c5/2b/c0/87008e795a14bbcdfc

In [5]:
%%cu
#include <stdio.h>
#include <time.h>

__global__ void sum(int* a, int* b, int* c, const int N) {
	int i = threadIdx.x + blockDim.x * blockIdx.x;
	if (i<N)
		c[i] = a[i] + b[i];	
	//else 
	//	printf("i > N -> %3d in block: %d\n", i, blockIdx.x);
}

void sum_host(int* a, int* b, int* c, const int N) {
  clock_t clk;

	clk = clock();
	for(int i=0; i<N; ++i)
		c[i] = a[i] + b[i];

	clk = clock() - clk;
	double dt = (((double)clk) / CLOCKS_PER_SEC) * 1000;
	printf("Total time on CPU: %f msecs\n", dt);

}


int main() { 
	int* h_a = 0;
	int* h_b = 0;
	int* h_c = 0;
	
	int* d_a = 0;
	int* d_b = 0;
	int* d_c = 0;

	const int N = 50000; //2048;
	int numThreadsPerBlock =  128;
	int numBlocksPerGrid = ceilf(N/numThreadsPerBlock + 1);
	printf("Num threads per block: %3d\n", numThreadsPerBlock);
	printf("Num blocks per grid: %3d\n", numBlocksPerGrid);

	size_t size = N * sizeof(int);

	//allocate host memory
	h_a = (int*)malloc(size);	 
	h_b = (int*)malloc(size);	 
	h_c = (int*)malloc(size);	 

	//initialize a, b and c
	for(int i=0;i<N;++i) {
		h_a[i] = i+1;
		h_b[i] = h_a[i]*2;
		h_c[i] = 0;
	}

  cudaEvent_t start, stop;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);

	//allocate device memory 
	cudaMalloc((void**)&d_a, size);
	cudaMalloc((void**)&d_b, size);
	cudaMalloc((void**)&d_c, size);
	
	//copy host data to device memory
	cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice); 
	cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

	//calculate on host 
	sum_host(h_a, h_b, h_c, N); 
	printf("Sum (host): %d\n", h_c); 
  
	//output result
	/*
	printf("Host calculation result: \n");
	for(int i=0;i<N;++i) {
		printf("%3d + %3d = %3d\n", h_a[i], h_b[i], h_c[i]);
		//clear host result to ensure that the result of device is actually from the kernel
		h_c[i] = 0;
	}
	*/

	cudaEventRecord(start);
	//calculate on device
	sum<<<numBlocksPerGrid, numThreadsPerBlock>>>(d_a, d_b, d_c, N);
	cudaEventRecord(stop);

  //cudaDeviceSynchronize();

	//copy result from device to host
	cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

  cudaEventSynchronize(stop);

  printf("Sum (device): %d\n", h_c);

	//output result
	/*
	printf("--------------------------------------\n");
	printf("Device calculation result: \n");
	for(int i=0;i<N;++i) {
		printf("%3d + %3d = %3d\n", h_a[i], h_b[i], h_c[i]);
	}
	printf("--------------------------------------\n");
	*/
 
 	float dt = 0;
	cudaEventElapsedTime(&dt, start, stop);
	printf("Total time on GPU: %f msecs\n", dt);

	//delete data allocated on device
	cudaFree(d_a);
	cudaFree(d_b);
	cudaFree(d_c);

	//delete host memory
	free(h_a);
	free(h_b);
	free(h_c);

	cudaDeviceReset();
	return 0;
}

Num threads per block: 128
Num blocks per grid: 391
Total time on CPU: 0.178000
Sum (host): -1295736832
Sum (device): -1295736832
Total time on GPU: 0.020800

