# Fundamentals of Accelerated Computing with CUDA C/C++

 Murilo Boratto$^1$

$^1$ SENAI CIMATEC <br />
     &nbsp;&nbsp;&nbsp; Supercomputing Center<br />

## Enabled GPU in Colab

**Go to Menu > Runtime > Change runtime > V100 GPU**

## Instalation the APIs OpenMP and CUDA

In [None]:
!sudo apt-get install libomp-dev

## Check if GPU is running or not, run the following command

In [1]:
!nvidia-smi

Thu Feb 13 10:55:12 2025       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.60.13    Driver Version: 525.60.13    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:60:00.0 Off |                    0 |
| N/A   42C    P0    45W / 300W |      0MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:61:00.0 Off |                    0 |
| N/A   42C    P0    42W / 300W |      0MiB / 32768MiB |      0%      Default |
|       

## Check if nvcc compiler is capable of using GPU

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


## Optimizations Matrix Multiply Benchmark

### Sequential Code

In [3]:
%%writefile mm.c
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

void kernel(int *A, int *B, int *C, int size)
{
 int i, j, k;

 for(i = 0; i < size; i++)
  for(j = 0; j < size; j++)
    for(k = 0; k < size; k++)
       C[i * size + j] += A[i * size + k] * B[k * size + j];
}

void initializeMatrix(int *matrix, int size)
{
  int i, j;

  for(i = 0; i < size; i++)
    for(j = 0; j < size; j++)
      matrix[i * size + j] = rand() % (10 - 1) * 1;
}

void printMatrix(int *matrix, int size)
{
  for(int i = 0; i < size; i++)
  {
    for(int j = 0; j < size; j++)
      printf("%d\t", matrix[i * size + j]);
    printf("\n");
  }
  printf("\n");
}

int main (int argc, char **argv)
{
 int size = atoi(argv[1]);
 int i, j, k;
 double t1, t2;

 int  *A = (int *) malloc (sizeof(int)*size*size);
 int  *B = (int *) malloc (sizeof(int)*size*size);
 int  *C = (int *) malloc (sizeof(int)*size*size);

 initializeMatrix(A, size);
 initializeMatrix(B, size);

 t1 = omp_get_wtime();
   kernel(A, B, C, size);
 t2 = omp_get_wtime();

 printf("%d\t%f\n", size, t2-t1);

 //printMatrix(A,size);
 //printMatrix(B,size);
 //printMatrix(C,size);

 free(A);
 free(B);
 free(C);

 return 0;

}

Writing mm.c


In [4]:
!gcc mm.c -o mm -fopenmp -O3

In [5]:
!./mm 10000

10000	1301.358851


### `CUDA Thread Hierarchy`

In [43]:
%%HTML

<div align="center">
<iframe src="https://docs.google.com/presentation/d/1J_GF6XACL0-Dk1BtJCeWiHwJCFcM_Hkx/edit?usp=share_link&ouid=117965215426975519312&rtpof=true&sd=true" frameborder="0" width="900" height="550" allowfullscreen="true" mozallowfullscreen="true" webkitallowfullscreen="true">

</iframe></div>

In [6]:
%%writefile mm.cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

__global__ void kernel(int *A, int *B, int *C, int size)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;
  int k;

  if((i < size) && (j < size))
    for(k = 0; k < size; k++)
       C[i * size + j] += A[i * size + k] * B[k * size + j];

}

void initializeMatrix(int *A, int size)
{
  int i, j;

  for(i = 0; i < size; i++)
    for(j = 0; j < size; j++)
       A[i * size + j] = rand() % (10 - 1) * 1;
}

void printMatrix(int *A, int size)
{
  int i, j;

  for(i = 0; i < size; i++){
    for(j = 0; j < size; j++)
       printf("%d\t", A[i * size + j]);
    printf("\n");
  }
  printf("\n");
}

int main(int argc, char **argv)
{
  if (argc < 3)
  {
    printf("%s [SIZE] [BLOCKSIZE]\n", argv[0]);
    exit(-1);
  }

  int size = atoi(argv[1]);
  int blockSize = atoi(argv[2]);
  double t1, t2;

 //Memory Allocation in the Host
  int  *A = (int *) malloc (sizeof(int) * size * size);
  int  *B = (int *) malloc (sizeof(int) * size * size);
  int  *C = (int *) malloc (sizeof(int) * size * size);

  initializeMatrix(A, size);
  initializeMatrix(B, size);

 // Memory Allocation in the Device
  int *d_A, *d_B, *d_C;
  cudaMalloc((void **) &d_A, size * size * sizeof(int) ) ;
  cudaMalloc((void **) &d_B, size * size * sizeof(int) ) ;
  cudaMalloc((void **) &d_C, size * size * sizeof(int) ) ;

  t1 = omp_get_wtime();

 // Copy of data from host to device
  cudaMemcpy( d_A, A, size * size * sizeof(int), cudaMemcpyHostToDevice ) ;
  cudaMemcpy( d_B, B, size * size * sizeof(int), cudaMemcpyHostToDevice ) ;
  cudaMemcpy( d_C, C, size * size * sizeof(int), cudaMemcpyHostToDevice ) ;

 // 2D Computational Grid
  dim3 dimGrid( (int) ceil( (int) size / (float) blockSize ), (int) ceil( (int) size / (int) blockSize ) );
  dim3 dimBlock( blockSize, blockSize);

       kernel<<<dimGrid, dimBlock>>>(A, B, C, size);

 // Copy of data from device to host
  cudaMemcpy( C, d_C, size * size * sizeof(int), cudaMemcpyDeviceToHost ) ;

  t2 = omp_get_wtime();

  printf("%d\t%f\n", size, t2-t1);

 //printMatrix(A, size);
 //printMatrix(B, size);
 //printMatrix(C, size);

// Memory Allocation in the Device
 cudaFree(d_A);
 cudaFree(d_B);
 cudaFree(d_C);

// Memory Allocation in the Host
 free(A);
 free(B);
 free(C);

 return 0;

}

Writing mm.cu


In [7]:
!nvcc -arch=sm_75 mm.cu -o mm -Xcompiler -fopenmp

In [8]:
!./mm 10000 64

10000	0.383470


### `Grid Stride Loop`

In [9]:
%%HTML

<div align="center">
<iframe src="https://docs.google.com/presentation/d/1tRO-HwqCfv8imhDO4S_8yAv8wEcJVttZ/edit?usp=sharing&ouid=117965215426975519312&rtpof=true&sd=true" frameborder="0" width="900" height="550" allowfullscreen="true" mozallowfullscreen="true" webkitallowfullscreen="true">

</iframe></div>

In [10]:
%%writefile mm-gridStrideLoop.cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

__global__ void kernel(int *A, int *B, int *C, int size)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;
  int k;

  if((i < size) && (j < size))
    for(k = 0; k < size; k++)
       C[i * size + j] += A[i * size + k] * B[k * size + j];

}

__global__ void kernelGridStrideLoop(int *A, int *B, int *C, int size)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int idy = blockIdx.y * blockDim.y + threadIdx.y;
  int stride = gridDim.x * blockDim.x;
  int i, j, k;

  for(i = idx; i < size; i += stride)
    for(j = idy; j < size; j += stride)
    {
       for(k = 0; k < size; k++)
            C[i * size + j] += A[i * size + k] * B[k * size + j];
    }

}

void initializeMatrix(int *A, int size)
{
  int i, j;

  for(i = 0; i < size; i++)
    for(j = 0; j < size; j++)
      A[i * size + j] = rand() % (10 - 1) * 1;
}

void printMatrix(int *A, int size)
{
  int i, j;

  for(i = 0; i < size; i++){
    for(j = 0; j < size; j++)
      printf("%d\t", A[i * size + j]);
    printf("\n");
  }
  printf("\n");
}

int main(int argc, char **argv)
{
  if (argc < 3)
  {
    printf("%s [SIZE] [BLOCKSIZE]\n", argv[0]);
    exit(-1);
  }

  int size = atoi(argv[1]);
  int blockSize = atoi(argv[2]);
  double t1, t2;

 //Memory Allocation in the Host
  int  *A = (int *) malloc (sizeof(int) * size * size);
  int  *B = (int *) malloc (sizeof(int) * size * size);
  int  *C = (int *) malloc (sizeof(int) * size * size);

  initializeMatrix(A, size);
  initializeMatrix(B, size);

 // Memory Allocation in the Device
  int *d_A, *d_B, *d_C;
  cudaMalloc((void **) &d_A, size * size * sizeof(int) ) ;
  cudaMalloc((void **) &d_B, size * size * sizeof(int) ) ;
  cudaMalloc((void **) &d_C, size * size * sizeof(int) ) ;

  t1 = omp_get_wtime();

 // Copy of data from host to device
  cudaMemcpy( d_A, A, size * size * sizeof(int), cudaMemcpyHostToDevice ) ;
  cudaMemcpy( d_B, B, size * size * sizeof(int), cudaMemcpyHostToDevice ) ;
  cudaMemcpy( d_C, C, size * size * sizeof(int), cudaMemcpyHostToDevice ) ;

 // 2D Computational Grid
  dim3 dimGrid( (int) ceil( (int) size / (int) blockSize ), (int) ceil( (int) size / (int) blockSize ) );
  dim3 dimBlock( blockSize, blockSize);

            kernelGridStrideLoop<<<dimGrid, dimBlock>>>(A, B, C, size);

 // Copy of data from device to host
  cudaMemcpy( C, d_C, size * size * sizeof(int), cudaMemcpyDeviceToHost ) ;

  t2 = omp_get_wtime();

  printf("%d\t%f\n", size, t2-t1);

 //printMatrix(A, size);
 //printMatrix(B, size);
 //printMatrix(C, size);

// Memory Allocation in the Device
 cudaFree(d_A);
 cudaFree(d_B);
 cudaFree(d_C);

// Memory Allocation in the Host
 free(A);
 free(B);
 free(C);

 return 0;
}

Writing mm-gridStrideLoop.cu


In [11]:
!nvcc -arch=sm_75 mm-gridStrideLoop.cu -o mm-gridStrideLoop -Xcompiler -fopenmp

In [25]:
!./mm-gridStrideLoop 10000 64

10000	0.376829


### `Unified Memory (cudaMallocManaged)`

In [13]:
%%HTML

<div align="center">
<iframe src="https://docs.google.com/presentation/d/1ui1b_fEY8NsG8fffNmUJ2HbWyu_r1xOz/edit?usp=sharing&ouid=117965215426975519312&rtpof=true&sd=true" frameborder="0" width="900" height="550" allowfullscreen="true" mozallowfullscreen="true" webkitallowfullscreen="true">

</iframe></div>

In [40]:
%%writefile mm-cudaMallocManaged.cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

__global__ void kernel(int *A, int *B, int *C, int size)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;
  int k;

  if((i < size) && (j < size))
     for(k = 0; k < size; k++)
        C[i * size + j] += A[i * size + k] * B[k * size + j];

}

__global__ void kernelGridStrideLoop(int *A, int *B, int *C, int size)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int idy = blockIdx.y * blockDim.y + threadIdx.y;
  int stride = gridDim.x * blockDim.x;
  int i, j, k;

  for(i = idx; i < size; i += stride)
    for(j = idy; j < size; j += stride)
    {
       for(k = 0; k < size; k++)
            C[i * size + j] += A[i * size + k] * B[k * size + j];
    }

}

void initializeMatrix(int *A, int size)
{
  int i, j;

  for(i = 0; i < size; i++)
    for(j = 0; j < size; j++)
      A[i * size + j] = rand() % (10 - 1) * 1;
}

void printMatrix(int *A, int size)
{
  int i, j;

  for(i = 0; i < size; i++){
    for(j = 0; j < size; j++)
      printf("%d\t", A[i * size + j]);
    printf("\n");
  }
  printf("\n");
}

int main(int argc, char **argv)
{
 /*Usage*/
 if (argc < 3) {
   printf("%s [SIZE] [BLOCKSIZE]\n", argv[0]);
   exit(-1);
 }

 int size = atoi(argv[1]);
 int blockSize = atoi(argv[2]); ;
 double t1, t2, t3, t4;
 int *A,  *B,  *C;

 t1 = omp_get_wtime();

 cudaMallocManaged(&A, sizeof(int) * size * size);
 cudaMallocManaged(&B, sizeof(int) * size * size);
 cudaMallocManaged(&C, sizeof(int) * size * size);

     t3 = omp_get_wtime();
    
     initializeMatrix(A, size);
     initializeMatrix(B, size);

     t4 = omp_get_wtime();
   
 //printMatrix(A, size);
 //printMatrix(B, size);
 //printMatrix(C, size);

 dim3 dimGrid( (int) ceil( (int) size / (int) blockSize ), (int) ceil( (int) size / (int) blockSize ) );
 dim3 dimBlock( blockSize, blockSize);

      kernelGridStrideLoop<<<dimGrid, dimBlock>>>(A, B, C, size);
      cudaDeviceSynchronize();

t2 = omp_get_wtime();

printf("%d\t%f\n", size, (t2-t1)-(t4-t3));

//printMatrix(A, size);
//printMatrix(B, size);
//printMatrix(C, size);

cudaFree(A);
cudaFree(B);
cudaFree(C);

return 0;

}


Overwriting mm-cudaMallocManaged.cu


In [41]:
!nvcc mm-cudaMallocManaged.cu -o mm-cudaMallocManaged -Xcompiler -fopenmp

In [42]:
!./mm-cudaMallocManaged 10000 64

10000	0.195462


#### `Stream Multiprocessador (SMs)`

In [29]:
%%HTML

<div align="center">
<iframe src="https://docs.google.com/presentation/d/18z3x55kxCCjGZ3LVKOtSN5q8qXe4swFL/edit?usp=sharing&ouid=117965215426975519312&rtpof=true&sd=true" frameborder="0" width="900" height="550" allowfullscreen="true" mozallowfullscreen="true" webkitallowfullscreen="true">

</iframe></div>

In [37]:
%%writefile mm-streamMultiprocessador.cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

__global__ void kernelGridStrideLoop(int *A, int *B,  int *C, int size)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int idy = blockIdx.y * blockDim.y + threadIdx.y;
  int stride = gridDim.x * blockDim.x;
  int i, j, k;

  for(i = idx; i < size; i += stride)
    for(j = idy; j < size; j += stride)
    {
       for(k = 0; k < size; k++)
         C[i * size + j] += A[i * size + k] * B[k * size + j];
    }

}

void initializeMatrix(int *A, int size)
{
  int i, j;

  for(i = 0; i < size; i++)
    for(j = 0; j < size; j++)
      A[i * size + j] = rand() % (10 - 1) * 1;
}

void printMatrix(int *A, int size)
{
  int i, j;

  for(i = 0; i < size; i++){
    for(j = 0; j < size; j++)
      printf("%d\t", A[i * size + j]);
    printf("\n");
  }
  printf("\n");
}

int main (int argc, char **argv)
{
 /*Usage*/
 if (argc < 3) {
   printf("%s [SIZE] [BLOCKSIZE]\n", argv[0]);
   exit(-1);
 }

 int size = atoi(argv[1]);
 int sizeblock = atoi(argv[2]); ;
 double t1, t2, t3, t4;
 int *A,  *B, *C;

 t1 = omp_get_wtime();
   
 cudaMallocManaged(&A, sizeof(int) * size * size);
 cudaMallocManaged(&B, sizeof(int) * size * size);
 cudaMallocManaged(&C, sizeof(int) * size * size);

     t3 = omp_get_wtime();
     
     initializeMatrix(A, size);
     initializeMatrix(B, size);
     
     t4 = omp_get_wtime();

 //printMatrix(A, size);
 //printMatrix(B, size);
 //printMatrix(C, size);

 int deviceId, numberOfSMs;
 cudaGetDevice(&deviceId);
 cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);

 int NUMBER_OF_BLOCKS = numberOfSMs * 32;
 int NUMBER_OF_THREADS = 1024;

      kernelGridStrideLoop<<< NUMBER_OF_BLOCKS, NUMBER_OF_THREADS>>>(A, B, C, size);
      cudaDeviceSynchronize();

 t2 = omp_get_wtime();

 printf("%d\t%f\n", size, (t2-t1)-(t4-t3));

//printMatrix(A, size);
//printMatrix(B, size);
//printMatrix(C, size);

cudaFree(A);
cudaFree(B);
cudaFree(C);

return 0;
}


Overwriting mm-streamMultiprocessador.cu


In [38]:
!nvcc mm-streamMultiprocessador.cu -o mm-streamMultiprocessador -Xcompiler -fopenmp

In [39]:
!./mm-streamMultiprocessador 10000 64

10000	0.374113
