<a href="https://colab.research.google.com/github/mmmovania/CUDA_Spring2022_GoogleColabs/blob/main/Week8/PrefixSum_Wrong.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%cd /usr/local/
!rm -rf cuda
!ln -s /usr/local/cuda-10.1 /usr/local/cuda
!stat cuda
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

/usr/local
  File: cuda -> /usr/local/cuda-10.1
  Size: 20        	Blocks: 0          IO Block: 4096   symbolic link
Device: 24h/36d	Inode: 3538954     Links: 1
Access: (0777/lrwxrwxrwx)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2022-03-04 05:54:38.908958543 +0000
Modify: 2022-03-04 05:54:38.805951244 +0000
Change: 2022-03-04 05:54:38.805951244 +0000
 Birth: -
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-4pmkbqzr
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-4pmkbqzr
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=590b93ca945fcc6aee2f089d13ed8d0bd8473e28919de7c0bcea088cd17e26d5
  Stored in directory: /tmp/pip-ephem-wheel-cache-tqeyark0/wheels/c5/2b/c0/87008e795a14

In [24]:
%%cu
#include <stdio.h>

#define SECTION_SIZE 4 

inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
  }
  return err;
}

//CPU version
void sequential_scan(float* x, float* y, int N) 
{
  y[0]=x[0]; 
  for (int i=1; i < N; i++)
  {
    y[i]= y [i-1] + x[i]; 
  }
}

__global__ void work_inefficient_inc_scan_kernel(float *X, float *Y, int N) {
  __shared__ float XY[4]; 
  int i = blockIdx.x*blockDim.x + threadIdx.x; 
  if (i < N) { 
    XY[threadIdx.x] = X[i]; 
  }

  // the code below performs iterative scan on XY 
  for (unsigned int stride=1; stride<=threadIdx.x; stride*= 2) 
  { 
    __syncthreads(); 
    XY[threadIdx.x] += XY[threadIdx.x-stride]; 
  }
  Y[i] = XY[threadIdx.x];
}

   
int main() { 
		float   *X, *Y; 
    const int N = 16; 
    const int threadsPerBlock = SECTION_SIZE;
    const int blocksPerGrid =  (N / threadsPerBlock);

    // Allocate Unified Memory -- accessible from CPU or GPU
    checkCudaErr(cudaMallocManaged(&X, N*sizeof(float)), "cudaMallocManaged1");
    checkCudaErr(cudaMallocManaged(&Y, N*sizeof(float)), "cudaMallocManaged2"); 
     
    // fill in the memory with data
    for (int i=0; i<N; i++) 
    {
        X[i] = i+1;
        Y[i] = 0; 
    }  
  
    // Prefetch the data to the GPU
    int device = -1;
    cudaGetDevice(&device);
    cudaMemPrefetchAsync(X, N*sizeof(float), device, NULL);
    cudaMemPrefetchAsync(Y, N*sizeof(float), device, NULL);
    
    cudaEvent_t start, stop;
    float gpu_elapsed_time_ms=0, cpu_elapsed_time_ms=0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
 
    ////lets time the CPU code
    cudaEventRecord(start, 0);
    sequential_scan(X, Y, N);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&cpu_elapsed_time_ms, start, stop);

    //reset Y for GPU
    for (int i=0; i<N; i++) 
    {         
        Y[i] = 0; 
    }  

    //lets time the GPU code  
    cudaEventRecord(start, 0);
    work_inefficient_inc_scan_kernel<<<blocksPerGrid,threadsPerBlock>>>(X, Y, N);
    cudaDeviceSynchronize();
  
    // time counting terminate
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    // compute time elapse on GPU computing
    cudaEventElapsedTime(&gpu_elapsed_time_ms, start, stop);

    //output the result
    puts("Prefix Scan Results:");
    printf("CPU Time: %3.3f msecs, GPU Time: %3.3f\n",cpu_elapsed_time_ms, gpu_elapsed_time_ms);
    puts("X[i]\t| Y[i]");
    puts("--------+-------");

    for(int i=0; i<N; ++i)
      printf("%3.0f\t| %3.0f\n", X[i], Y[i]);

    // free memory on the gpu side
    checkCudaErr( cudaFree( X ) , "cudaFree1");
    checkCudaErr( cudaFree( Y ) , "cudaFree2"); 
		checkCudaErr( cudaDeviceReset(), "cudaDeviceReset");

		return 0;
}

Prefix Scan Results:
CPU Time: 0.016 msecs, GPU Time: 0.207
X[i]	| Y[i]
--------+-------
  1	|   1
  2	|   3
  3	|   6
  4	|  10
  5	|   6
  6	|  12
  7	|  19
  8	|  32
  9	|  15
 10	|  25
 11	|  36
 12	|  57
 13	|  28
 14	|  42
 15	|  57
 16	|  86

