<a href="https://colab.research.google.com/github/lucas-t-reis/CUDA/blob/master/notebooks/Professional_CUDA_programming_exercises.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- Checking for Nvidia **CUDA toolkit**
- Checking for Nvidia Devices (enabled in Runtime type)

In [38]:
# Print the location of the nvcc compiler driver found on PATH
!which nvcc
# List the device nodes under /dev whose names start with "nv" (NVIDIA devices)
!ls -l /dev/nv*

/usr/local/cuda/bin/nvcc
crw-rw-rw- 1 root root 195,   0 Jan 16 11:33 /dev/nvidia0
crw-rw-rw- 1 root root 195, 255 Jan 16 11:33 /dev/nvidiactl
crw-rw-rw- 1 root root 247,   0 Jan 16 11:33 /dev/nvidia-uvm
crw-rw-rw- 1 root root 247,   1 Jan 16 11:33 /dev/nvidia-uvm-tools


### Installing plugin to allow running CUDA C/C++ - by [andreinechaev](https://github.com/andreinechaev)

In [39]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-emop4itj
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-emop4itj
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=b6e51f4bf8bfa35abe52cfd5057f28426a8768db1bf218ee2e6375db3519a069
  Stored in directory: /tmp/pip-ephem-wheel-cache-3ndq4lwh/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
The nvcc_plugin extension is already loaded. To reload it, use:
  %reload_ext nvcc_plugin


# Hello World - Example 1

In [40]:
%%cu
#include <stdio.h>

// Kernel: the __global__ qualifier marks a function that is launched from the
// CPU and executed on the GPU. Each thread prints a greeting tagged with its
// own x index inside the block.
__global__ void helloWorldGPU() {
    const int tid = static_cast<int>(threadIdx.x);

    printf("Hello World from GPU thread %d!\n", tid);
}

// Host entry point for the Hello World example.
// Prints a greeting from the CPU, launches helloWorldGPU on a single block of
// 10 threads, then waits for the device so the kernel's printf output is
// flushed before the device is reset.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main() {
    printf("Hello World from CPU!\n");

    // Kernel: 1 block, 10 threads per block
    helloWorldGPU <<< 1,10 >>> ();

    // A kernel launch has no return value; an invalid launch configuration is
    // only reported through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Block until the kernel has finished; this also reports errors that
        // happened while the kernel was running.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "helloWorldGPU failed: %s\n", cudaGetErrorString(status));
    }

    cudaDeviceReset();
    return status == cudaSuccess ? 0 : 1;
}

Hello World from CPU!
Hello World from GPU thread 0!
Hello World from GPU thread 1!
Hello World from GPU thread 2!
Hello World from GPU thread 3!
Hello World from GPU thread 4!
Hello World from GPU thread 5!
Hello World from GPU thread 6!
Hello World from GPU thread 7!
Hello World from GPU thread 8!
Hello World from GPU thread 9!



## Understanding *grid* and *block*
(pg.33)

In [41]:
%%cu
#include <stdio.h>

// Kernel: every thread prints its own coordinates (threadIdx, blockIdx)
// together with the block and grid extents (blockDim, gridDim) of the launch
// it belongs to, one line per thread.
__global__ void d_workersData() {
    printf("threadIdx(%d, %d, %d) blockIdx(%d, %d, %d) blockDim(%d, %d, %d) gridDim(%d, %d, %d)\n",
           threadIdx.x, threadIdx.y, threadIdx.z,
           blockIdx.x,  blockIdx.y,  blockIdx.z,
           blockDim.x,  blockDim.y,  blockDim.z,
           gridDim.x,   gridDim.y,   gridDim.z);
}

// Checks the most recent kernel launch and waits for it to finish.
// label: name used in the error message.
// Returns true when both the launch and the execution succeeded; otherwise
// prints the CUDA error string to stderr and returns false.
static bool kernelFinishedOk(const char *label) {
    // Launch-configuration errors are only visible through cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Sync CPU and GPU; also surfaces errors raised while the kernel ran.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", label, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Host entry point for the grid/block example.
// Launches d_workersData twice over n = 6 elements: first with 3 threads per
// block, then with 2 threads per block, printing the host-side grid and block
// dimensions before each launch.
// Returns 0 on success and 1 if either launch or execution failed.
int main () {

    const int n = 6; // Number of data elements to cover
    dim3 block(3);   // Setting each block to have 3 threads
    // Ceiling division: the smallest number of blocks whose threads cover all
    // n elements (2 blocks here).
    dim3 grid( (n + block.x - 1)/block.x);

    // Host check of block and grid dimensions (dim3 members are unsigned,
    // hence %u)
    printf("grid.x %u grid.y %u grid.z %u\n", grid.x, grid.y, grid.z);
    printf("block.x %u block.y %u block.z %u\n", block.x, block.y, block.z);

    d_workersData <<< grid , block >>> ();

    // Wait for the first launch so its output precedes the host printf below
    if (!kernelFinishedOk("d_workersData (first launch)")) {
        cudaDeviceReset();
        return 1;
    }

    // Resetting block and grid sizes
    block.x = 2;
    grid.x = (n + block.x - 1)/(block.x);
    printf("\nblock %u grid %u\n", block.x, grid.x);

    d_workersData <<< grid, block >>> ();

    // Wait explicitly instead of relying on cudaDeviceReset() to flush the
    // kernel's output.
    const bool ok = kernelFinishedOk("d_workersData (second launch)");

    cudaDeviceReset();
    return ok ? 0 : 1;
}

grid.x 2 grid.y 1 grid.z 1
block.x 3 block.y 1 block.z 1
threadIdx(0, 0, 0) blockIdx(0, 0, 0) blockDim(3, 1, 1) gridDim(2, 1, 1)
threadIdx(1, 0, 0) blockIdx(0, 0, 0) blockDim(3, 1, 1) gridDim(2, 1, 1)
threadIdx(2, 0, 0) blockIdx(0, 0, 0) blockDim(3, 1, 1) gridDim(2, 1, 1)
threadIdx(0, 0, 0) blockIdx(1, 0, 0) blockDim(3, 1, 1) gridDim(2, 1, 1)
threadIdx(1, 0, 0) blockIdx(1, 0, 0) blockDim(3, 1, 1) gridDim(2, 1, 1)
threadIdx(2, 0, 0) blockIdx(1, 0, 0) blockDim(3, 1, 1) gridDim(2, 1, 1)

block 2 grid 3
threadIdx(0, 0, 0) blockIdx(0, 0, 0) blockDim(2, 1, 1) gridDim(3, 1, 1)
threadIdx(1, 0, 0) blockIdx(0, 0, 0) blockDim(2, 1, 1) gridDim(3, 1, 1)
threadIdx(0, 0, 0) blockIdx(1, 0, 0) blockDim(2, 1, 1) gridDim(3, 1, 1)
threadIdx(1, 0, 0) blockIdx(1, 0, 0) blockDim(2, 1, 1) gridDim(3, 1, 1)
threadIdx(0, 0, 0) blockIdx(2, 0, 0) blockDim(2, 1, 1) gridDim(3, 1, 1)
threadIdx(1, 0, 0) blockIdx(2, 0, 0) blockDim(2, 1, 1) gridDim(3, 1, 1)



### Observations

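 * Grid dimensions are determined by a ceiling division, so that the blocks cover every element even when the element count is not a multiple of the block size:
    ``` 
    n = # of elements
    grid.x = (n + threads per block on x - 1)/(threads per block on x) 
    ```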
 * Each thread can be uniquely identified by **threadIdx** and **blockIdx**
 * Host sees block and grid, Device uses pre-initialized **blockDim** and **gridDim**
 * \# of kernel executions (threads) as a function of block capacity and grid capacity, respectively:
   * ``` work = block (x*y*z)``` (threads per block)
   * ``` grid (x*y*z) * work``` (total threads launched)
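 * Kernel launches do not block the CPU, so sync CPU and GPU (`cudaDeviceSynchronize()`) before changing block/grid properties and launching again; otherwise the host output may not appear in order with the kernel's output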
