<a href="https://colab.research.google.com/github/lucas-t-reis/CUDA/blob/master/notebooks/CUDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Runtime type set to use the GPU hardware accelerator.
- Checking that the setup was done properly.

In [0]:
!nvidia-smi

Wed Jan  1 19:00:14 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

Installing the nvcc4jupyter plugin (by andreinechaev) to run **CUDA C/C++** code.

In [3]:
# Install the plugin over HTTPS: GitHub no longer serves the unauthenticated
# git:// protocol, so the original git+git:// URL fails to clone.
# NOTE(review): newer revisions of the plugin may expose a different extension
# name than `nvcc_plugin`; pin a revision if the load below fails - confirm.
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-8q8xbmo5
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-8q8xbmo5
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp36-none-any.whl size=4307 sha256=6434fd02e195f4364b824dbed1ee4787c9c9767d517762f0e9664f9e60a5d44d
  Stored in directory: /tmp/pip-ephem-wheel-cache-63oq5k9u/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory at /content/src
Out bin /content/result.out


# Example 1 - Simple sum

In [2]:
%%cu

#include <stdio.h>
#include <stdlib.h>

// Kernel that adds the integers pointed to by `a` and `b` and writes the
// result through `c`. No thread indexing is used, so every launched thread
// performs the same single addition.
__global__ void add(int *a, int *b, int *c) {
    const int lhs = a[0];
    const int rhs = b[0];
    c[0] = lhs + rhs;
}

// Example 1 driver: adds two integers on the GPU and prints the result.
// Every CUDA runtime call and the kernel launch are checked. On the first
// failure the error is printed, the remaining steps are skipped, the device
// buffers are released and a non-zero exit status is returned.
int main() {

    int a = 4, b = 8, c = 0;
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    const size_t size = sizeof(int);

    // Prints a diagnostic for a failed CUDA call; returns true on success.
    auto check = [](cudaError_t err, const char *what) -> bool {
        if (err == cudaSuccess) return true;
        printf("CUDA error %s: %s\n", what, cudaGetErrorString(err));
        return false;
    };

    // Short-circuit chain: the first failing step stops the ones after it.
    bool ok = check(cudaMalloc((void **)&d_a, size), "allocating d_a")
           && check(cudaMalloc((void **)&d_b, size), "allocating d_b")
           && check(cudaMalloc((void **)&d_c, size), "allocating d_c")
           && check(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice), "copying a to Device")
           && check(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice), "copying b to Device");

    if (ok) {
        add<<<1,1>>>(d_a, d_b, d_c); // Kernel execution

        // A launch does not return a status; query it explicitly. The blocking
        // copy that follows also waits for the kernel to finish.
        ok = check(cudaGetLastError(), "launching add kernel")
          && check(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost), "copying to Host");
    }

    if (ok) {
        printf("Result is: %d\n", c);
    }

    // Pointers still NULL (allocation never happened) are safe to pass here.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

UsageError: Cell magic `%%cu` not found.


# Example 2 - Large array addition
First, for comparison purposes, we time it on the CPU.

In [62]:
%%writefile arrayAdd.cpp

//%%cu
#include <iostream>
#include <math.h>

// Element-wise sum on the host: for each of the first n elements,
// y[i] is replaced by x[i] + y[i]. x is only read.
void add(int n, float *x, float *y) {

    const float *src = x;
    float *dst = y;
    for (int remaining = n; remaining > 0; --remaining, ++src, ++dst)
      *dst = *src + *dst;

}

// CPU baseline: fills two arrays of 2^25 floats, adds them with add(),
// and prints the largest deviation from the expected value 3.0f.
int main () {

    const int N = 1<<25;

    // Host buffers: x is filled with ones, y with twos.
    float *x = new float[N];
    float *y = new float[N];

    int i = 0;
    while (i < N) {
        x[i] = 1.0f;
        y[i] = 2.0f;
        ++i;
    }

    // y <- x + y, computed sequentially on the CPU
    add(N, x, y);

    // Every element of y should now be 3.0f; keep the worst deviation seen.
    float maxError = 0.0f;
    for (const float *p = y; p != y + N; ++p) {
        const float deviation = fabs(*p - 3.0f);
        maxError = fmax(maxError, deviation);
    }

    std::cout << "Max error: " << maxError << std::endl;

    delete [] y;
    delete [] x;

}


Overwriting arrayAdd.cpp


In [54]:
!g++ arrayAdd.cpp -o ex2
!time ./ex2

Max error: 0

real	0m0.651s
user	0m0.532s
sys	0m0.111s


### Now the same code, modified to run on a GPU. The best results are obtained with even larger arrays and more complex operations.

In [64]:
%%writefile arrayAdd.cu
#include <iostream>
#include <math.h>

// Element-wise sum on the device: y[i] = x[i] + y[i] for every i in [0, n).
//
// Launch layout: 1-D grid of 1-D blocks. Any grid/block size is valid because
// each thread starts at its global index and advances by the total number of
// launched threads (grid-stride loop). No shared memory is used.
// x is only read; y is read and written.
__global__ void add(int n, const float *x, float *y) {

    // Nothing to do for an empty range; this also keeps the conversion of n
    // to an unsigned type below from wrapping around.
    if (n <= 0) return;

    // Index math is done in size_t: blockDim.x * gridDim.x can reach 2^31 or
    // more for large launches, which does not fit in an int and would turn
    // the stride negative and send the loop out of bounds.
    const size_t count  = static_cast<size_t>(n);
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    const size_t first  = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for(size_t i=first; i<count; i+= stride)
      y[i] = x[i] + y[i];

}

// GPU version of the array-add benchmark: allocates two managed arrays of
// 2^25 floats, adds them with the add kernel and prints the largest deviation
// from the expected value 3.0f.
// Allocation, launch and synchronisation failures are reported on stderr and
// make the program exit with status 1.
int main () {

    const int N = 1<<25;
    const size_t bytes = static_cast<size_t>(N) * sizeof(float);

    // Reports a failed CUDA call on stderr; returns true on success.
    auto check = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess) return true;
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(status) << std::endl;
        return false;
    };

    float *x = NULL, *y = NULL;

    bool ok = check(cudaMallocManaged(&x, bytes), "cudaMallocManaged x")
           && check(cudaMallocManaged(&y, bytes), "cudaMallocManaged y");

    if (ok) {
        for(int i=0; i<N; i++) {
            x[i] = 1.0f;
            y[i] = 2.0f;
        }

        // Prefetch data to avoid transfer overhead. This is only a performance
        // hint: the kernel is still correct without it, so a failure is
        // reported but does not abort the run.
        int device = -1;
        const bool prefetched =
               check(cudaGetDevice(&device), "cudaGetDevice")
            && check(cudaMemPrefetchAsync(x, bytes, device, NULL), "prefetch x")
            && check(cudaMemPrefetchAsync(y, bytes, device, NULL), "prefetch y");
        if (!prefetched) {
            // Reset the recorded error so the launch check below only sees
            // errors produced by the launch itself.
            cudaGetLastError();
        }

        // Kernel
        int blockSize = 256;
        int numBlocks = (N + blockSize - 1) / blockSize;
        add<<<numBlocks, blockSize>>>(N, x, y);

        // A launch does not return a status: query it explicitly, then wait
        // for the kernel to finish before proceeding on the CPU.
        ok = check(cudaGetLastError(), "add kernel launch")
          && check(cudaDeviceSynchronize(), "add kernel execution");
    }

    if (ok) {
        float maxError = 0.0f;
        for(int i = 0; i<N; i++)
          maxError = fmax(maxError, fabs(y[i]-3.0f));

        std::cout << "Max error: " << maxError << std::endl;
    }

    // Pointers still NULL (allocation failed) are safe to pass here.
    cudaFree(x);
    cudaFree(y);

    return ok ? 0 : 1;

}


Overwriting arrayAdd.cu


In [65]:
!nvcc arrayAdd.cu -o ex2_GPU
!nvprof ./ex2_GPU

Max error: 0

real	0m0.693s
user	0m0.473s
sys	0m0.222s
==3091== NVPROF is profiling process 3091, command: ./ex2_GPU
Max error: 0
==3091== Profiling application: ./ex2_GPU
==3091== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  736.86us         1  736.86us  736.86us  736.86us  add(int, float*, float*)
      API calls:   83.50%  203.79ms         2  101.89ms  38.719us  203.75ms  cudaMallocManaged
                    8.36%  20.396ms         1  20.396ms  20.396ms  20.396ms  cudaDeviceSynchronize
                    5.69%  13.891ms         2  6.9456ms  5.9010ms  7.9902ms  cudaFree
                    2.17%  5.3005ms         2  2.6502ms  201.64us  5.0989ms  cudaMemPrefetchAsync
                    0.19%  467.32us         1  467.32us  467.32us  467.32us  cuDeviceTotalMem
                    0.06%  147.31us        96  1.5340us     146ns  56.413us  cuDeviceGetAttribute
                    0.02%  52.660us         1  