# Hands-On 6: Portable Parallel Programming with CUDA

This hands-on comprises $2$ sessions. The following table lists the codes and files needed for each exercise.

|  Sessions | Codes                              | files                        | 
|:----------|:-----------------------------------|:-----------------------------|
| Session 1 | Portable Sequential Code           | saxpy.c, and saxpy.cu    |
| Session 2 | Unified Memory (cudaMallocManaged) | saxpy-cudaMallocManaged.cu |


## `Add Vectors Benchmark`

This subprogram performs the following computation, using the scalar $\alpha$ and the vectors $x$ and $y$:

$$z = \alpha x + y,$$

where $x$, $y$, and $z$ are vectors and $\alpha$ is a scalar. SAXPY stands for "Single-Precision A·X Plus Y"; it is a function in the standard Basic Linear Algebra Subprograms (BLAS) library. SAXPY is a combination of scalar multiplication and vector addition, and it is very simple: it takes as
input two vectors of 32-bit floats $x$ and $y$ with $n$ elements each, and a scalar value $\alpha$. It multiplies each element $x[i]$ by $\alpha$ and adds the result to $y[i]$. A simple C implementation looks like this (the listing below fixes $\alpha = 1$, so it reduces to a plain vector addition whose result is stored back in $y$).

In [None]:
%%writefile saxpy.c
#include <stdio.h>
#include <stdlib.h>

/*
 * Accumulates x into y element by element: y[k] becomes x[k] + y[k]
 * for every k in [0, n).
 *
 * Note: no scalar multiplier is applied to x, i.e. this is the
 * alpha = 1 case of SAXPY.
 *
 *   n - number of elements to process (nothing happens when n <= 0)
 *   x - input vector, only read
 *   y - input/output vector, updated in place
 */
void saxpy(int n,  float *x, float *y)
{
  int k = 0;

  while (k < n) {
    y[k] = x[k] + y[k];
    ++k;
  }
}

/*
 * Writes the first n entries of vector to stdout on a single line,
 * each rounded to zero decimals and followed by a tab, then emits
 * two newlines.
 */
void printVector(float *vector, int n)
{
  int k = 0;

  while (k < n) {
    printf("%1.0f\t", vector[k]);
    ++k;
  }

  printf("\n\n");
}

/*
 * Fills vector with the sequence 1, 2, ..., n, so that
 * vector[k] holds k + 1. Nothing is written when n <= 0.
 */
void generateVector(float *vector, int n)
{
  /* Walk from the back: slot (value - 1) receives value. */
  for (int value = n; value > 0; --value) {
    vector[value - 1] = value;
  }
}

/*
 * Program entry point. Usage: ./saxpy <n>
 *
 * Builds two vectors x and y holding 1..n, prints both, adds x into y
 * with saxpy(), and prints the result.
 *
 * Returns 0 on success and EXIT_FAILURE when the argument is missing,
 * is not a positive integer, or the vectors cannot be allocated.
 */
int main(int argc, char *argv[])
{
  /* argv[1] must exist before it is parsed. */
  if (argc < 2) {
    fprintf(stderr, "Usage: %s <n>\n", argc > 0 ? argv[0] : "saxpy");
    return EXIT_FAILURE;
  }

  int n = atoi(argv[1]);

  /* atoi() yields 0 for non-numeric text; negative sizes are meaningless. */
  if (n <= 0) {
    fprintf(stderr, "Error: n must be a positive integer (got '%s')\n", argv[1]);
    return EXIT_FAILURE;
  }

  size_t bytes = sizeof(float) * (size_t) n;
  float *x = (float*) malloc(bytes);
  float *y = (float*) malloc(bytes);

  if (x == NULL || y == NULL) {
    fprintf(stderr, "Error: could not allocate two vectors of %d floats\n", n);
    free(x);   /* free(NULL) is a no-op, so both calls are always safe */
    free(y);
    return EXIT_FAILURE;
  }

  generateVector(x, n);
  printVector(x, n);

  generateVector(y, n);
  printVector(y, n);

  saxpy(n, x, y);
  printVector(y, n);

  free(x);
  free(y);

  return 0;

}

## Run the Code 

In [None]:
!gcc saxpy.c -o saxpy 

In [None]:
!./saxpy 8

## `Unified Memory (cudaMallocManaged)`

The program in `saxpy-cudaMallocManaged.cu` uses `cudaMallocManaged` to allocate two arrays of $n$ single-precision floats, initializes them on the host, and then adds them element-wise in parallel using a CUDA kernel.

In [None]:
%%writefile saxpy-cudaMallocManaged.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/*
 * Aborts with a readable message when a CUDA runtime call fails.
 * Every runtime call returns a cudaError_t; ignoring it hides failures
 * until some later, unrelated call.
 */
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,        \
              cudaGetErrorString(err_));                                   \
      exit(EXIT_FAILURE);                                                  \
    }                                                                      \
  } while (0)

/*
 * Kernel: y[i] = x[i] + y[i] for every i in [0, n).
 *
 * Launch layout: 1D grid of 1D blocks; any grid/block size is valid.
 * Each thread starts at its global index and advances by the total
 * number of launched threads, so all n elements are covered even when
 * fewer than n threads are launched. No shared memory is used.
 * x and y must be device-accessible pointers to at least n floats.
 */
__global__ void saxpy(int n,  float *x, float *y){
  /* 64-bit index arithmetic so blockIdx.x * blockDim.x cannot overflow. */
  long long stride = (long long) gridDim.x * blockDim.x;
  long long first  = (long long) blockIdx.x * blockDim.x + threadIdx.x;

  for (long long i = first; i < n; i += stride)
    y[i] = x[i] + y[i];
}

/*
 * Host helper: prints the first n entries of vector on one line
 * (zero decimals, tab separated) followed by two newlines.
 */
void printVector(float *vector, int n){
  for (int i = 0; i < n; ++i)
    printf("%1.0f\t", vector[i]);
  printf("\n\n");
}

/* Host helper: fills vector with 1, 2, ..., n. */
void generateVector(float *vector, int n){
  for (int i = 0; i < n; ++i)
    vector[i] = i + 1;
}

/*
 * Program entry point. Usage: ./saxpy-cudaMallocManaged <n>
 *
 * Allocates x and y with cudaMallocManaged, fills both with 1..n on the
 * host, adds x into y on the GPU, and prints the vectors before and
 * after. Exits with EXIT_FAILURE on a bad argument or any CUDA error.
 */
int main(int argc, char *argv[]){
  /* argv[1] must exist before it is parsed. */
  if (argc < 2) {
    fprintf(stderr, "Usage: %s <n>\n", argc > 0 ? argv[0] : "saxpy-cudaMallocManaged");
    return EXIT_FAILURE;
  }

  int n = atoi(argv[1]);

  /* atoi() yields 0 for non-numeric text; a launch with 0 blocks is invalid. */
  if (n <= 0) {
    fprintf(stderr, "Error: n must be a positive integer (got '%s')\n", argv[1]);
    return EXIT_FAILURE;
  }

  size_t bytes = sizeof(float) * (size_t) n;
  float *x = NULL, *y = NULL;

  /* Managed allocations are written by the host below and read by the kernel. */
  CUDA_CHECK(cudaMallocManaged(&x, bytes));
  CUDA_CHECK(cudaMallocManaged(&y, bytes));

  generateVector(x, n);
  printVector(x, n);
  generateVector(y, n);
  printVector(y, n);

  /*
   * Fixed block size with enough blocks to cover n (ceiling division,
   * written as (n - 1) / T + 1 so the sum cannot overflow int).
   * A single block of n threads fails to launch once n exceeds the
   * per-block thread limit.
   */
  const int NUMBER_OF_THREADS_PER_BLOCK = 256;
  int NUMBER_OF_BLOCKS = (n - 1) / NUMBER_OF_THREADS_PER_BLOCK + 1;

  saxpy <<< NUMBER_OF_BLOCKS, NUMBER_OF_THREADS_PER_BLOCK >>> (n, x, y);

  /* A launch returns no status itself: fetch configuration errors here... */
  CUDA_CHECK(cudaGetLastError());
  /* ...and wait for completion (surfacing execution errors) before the host reads y. */
  CUDA_CHECK(cudaDeviceSynchronize());

  printVector(y, n);

  CUDA_CHECK(cudaFree(x));
  CUDA_CHECK(cudaFree(y));

  return 0;
}

## Run the Code 

In [None]:
!nvcc saxpy-cudaMallocManaged.cu -o saxpy-cudaMallocManaged 

In [None]:
!./saxpy-cudaMallocManaged 8

## References

M. Boratto. Hands-On Supercomputing with Parallel Computing. Available: https://github.com/muriloboratto/Hands-On-Supercomputing-with-Parallel-Computing. 2022.