<a href="https://colab.research.google.com/github/mu06905/GPU-Accelerated-Programming-in-Cuda-2023/blob/main/Week3/Assignment1_Random_Array_Initialization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-iz6ni8ju
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-iz6ni8ju
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


In [2]:
!nvidia-smi

Tue Jan 24 06:35:21 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0    30W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [52]:
%%cuda --name my_curand.cu 
#include <stdio.h>
#include <cuda_runtime.h>
#include <curand.h>

#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    return EXIT_FAILURE;}} while(0)
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__);\
    return EXIT_FAILURE;}} while(0)

inline cudaError_t checkCudaErr(cudaError_t err, const char* msg) {
  if (err != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime error at %s: %s\n", msg, cudaGetErrorString(err));
  }
  return err;
}
 
int main()
{
  const int N = 1000000;  
  
  float* h_data = (float*)malloc(N * sizeof(float));
  int start_h = clock();
  for(int i=0;i<N;i++)
      h_data[i]=rand()%N;
  int end = clock();
  int time_taken = end - start_h;
  printf("Time taken to populate data on host is %d ms \n",time_taken);
  printf("fist 10 elements from host\n");
  for(int i = 0; i<10;i++){
      printf("d_data[%d] -> %f \n",i,h_data[i]);
  }
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  float* d_data;
  curandGenerator_t gen;
  CUDA_CALL(cudaMalloc((void **)&d_data, N*sizeof(float)));
  CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
  cudaEventRecord(start);
  CURAND_CALL(curandGenerateUniform(gen, d_data, N));
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);
  float milliseconds = 0;
  cudaEventElapsedTime(&milliseconds, start, stop);
  printf("Time taken to populate data on device is %f ms \n",milliseconds);
  start_h = clock();
  CUDA_CALL(cudaMemcpy(h_data, d_data, N * sizeof(float), cudaMemcpyDeviceToHost));
  end = clock();
  time_taken = end - start_h;
  printf("time taken to copy data from gpu to cpu is: %i ms \n",time_taken);
  printf("fist 10 elements from device\n");
  for(int i = 0; i<10;i++){
      printf("d_data[%d] -> %f \n",i,h_data[i]);
  }

  
  free(h_data);  //deallocate memory on device
  CURAND_CALL(curandDestroyGenerator(gen));
  CUDA_CALL(cudaFree(d_data));


  return 0;
}

'File written in /content/src/my_curand.cu'

In [53]:
!nvcc -o /content/src/my_curand /content/src/my_curand.cu -lcurand

In [54]:
!/content/src/my_curand

Time taken to populate data on host is 9745 ms 
fist 10 elements from host
d_data[0] -> 289383.000000 
d_data[1] -> 930886.000000 
d_data[2] -> 692777.000000 
d_data[3] -> 636915.000000 
d_data[4] -> 747793.000000 
d_data[5] -> 238335.000000 
d_data[6] -> 885386.000000 
d_data[7] -> 760492.000000 
d_data[8] -> 516649.000000 
d_data[9] -> 641421.000000 
Time taken to populate data on device is 0.495904 ms 
time taken to copy data from gpu to cpu is: 960 ms 
fist 10 elements from device
d_data[0] -> 0.145468 
d_data[1] -> 0.820181 
d_data[2] -> 0.550399 
d_data[3] -> 0.294830 
d_data[4] -> 0.914733 
d_data[5] -> 0.868979 
d_data[6] -> 0.321921 
d_data[7] -> 0.782857 
d_data[8] -> 0.011302 
d_data[9] -> 0.285450 
