# OPTIMIZATIONS CUDA

 Murilo Boratto$^1$

$^1$ SENAI CIMATEC <br />
     &nbsp;&nbsp;&nbsp; Supercomputing Center<br />

## Enabled GPU in Colab

**Go to Menu > Runtime > Change runtime > V100 GPU**

## Check if GPU is running or not, run the following command

In [1]:
!nvidia-smi

Mon Jan 15 09:26:03 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              23W / 300W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Check if nvcc compiler is capable of using GPU

In [32]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


## Optimizations Matrix Multiply Benchmark

### The workload is divided in threads manually using CUDA

In [7]:
%%writefile mm.cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

__global__ void kernel(int *A, int *B, int *C, int size)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;

  if((i < size) && (j < size))
    for(int k = 0; k < size; k++)
       C[i * size + j] += A[i * size + k] * B[k * size + j];

}

void initializeMatrix(int *A, int size)
{
  for(int i = 0; i < size; i++)
    for(int j = 0; j < size; j++)
      A[i * size + j] = rand() % (10 - 1) * 1;
}

void printMatrix(int *A, int size)
{
  for(int i = 0; i < size; i++){
    for(int j = 0; j < size; j++)
      printf("%d\t", A[i * size + j]);
    printf("\n");
  }
  printf("\n");
}

int main(int argc, char **argv)
{
  if (argc < 3)
  {
    printf("%s [SIZE] [BLOCKSIZE]\n", argv[0]);
    exit(-1);
  }

  int n = atoi(argv[1]);
  int blockSize = atoi(argv[2]);
  double t1, t2;

 //Memory Allocation in the Host
  int  *A = (int *) malloc (sizeof(int)*n*n);
  int  *B = (int *) malloc (sizeof(int)*n*n);
  int  *C = (int *) malloc (sizeof(int)*n*n);

  initializeMatrix(A, n);
  initializeMatrix(B, n);

 //printMatrix(A, n);
 //printMatrix(B, n);
 //printMatrix(C, n);

 // Memory Allocation in the Device
  int *d_A, *d_B, *d_C;
  cudaMalloc((void **) &d_A, n * n * sizeof(int) ) ;
  cudaMalloc((void **) &d_B, n * n * sizeof(int) ) ;
  cudaMalloc((void **) &d_C, n * n * sizeof(int) ) ;

  t1 = omp_get_wtime();

 // Copy of data from host to device
  cudaMemcpy( d_A, A, n * n * sizeof(int), cudaMemcpyHostToDevice ) ;
  cudaMemcpy( d_B, B, n * n * sizeof(int), cudaMemcpyHostToDevice ) ;
  cudaMemcpy( d_C, C, n * n * sizeof(int), cudaMemcpyHostToDevice ) ;

 // 2D Computational Grid
  dim3 dimGrid( (int) ceil( (float) n / (float) blockSize ), (int) ceil( (float) n / (float) blockSize ) );
  dim3 dimBlock( blockSize, blockSize);

            kernel<<<dimGrid, dimBlock>>>(A, B, C, n);

 // Copy of data from device to host
  cudaMemcpy( C, d_C, n * n * sizeof(float), cudaMemcpyDeviceToHost ) ;

  t2 = omp_get_wtime();

  printf("%d\t%f\n", n, t2-t1);

 //printMatrix(A, n);
 //printMatrix(B, n);
 //printMatrix(C, n);

// Memory Allocation in the Device
 cudaFree(d_A) ;
 cudaFree(d_B) ;
 cudaFree(d_C) ;

// Memory Allocation in the Host
 free(A);
 free(B);
 free(C);

 return 0;
}

Writing mm.cu


In [8]:
!nvcc mm.cu -o mm -Xcompiler -fopenmp -O3

In [9]:
!./mm 10000 64

10000	0.870519


### Grid Strider Loop

In [4]:
%%HTML

<div align="center">
<iframe src="https://docs.google.com/presentation/d/1tRO-HwqCfv8imhDO4S_8yAv8wEcJVttZ/edit?usp=sharing&ouid=117965215426975519312&rtpof=true&sd=true" frameborder="0" width="900" height="550" allowfullscreen="true" mozallowfullscreen="true" webkitallowfullscreen="true">
 
</iframe></div>

In [10]:
%%writefile mm-gridStriderLoop.cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

__global__ void kernel(int *A, int *B, int *C, int size)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;

  if((i < size) && (j < size))
    for(int k = 0; k < size; k++)
       C[i * size + j] += A[i * size + k] * B[k * size + j];

}

__global__ void kernelGridStriderLoop(int *A, int *B, int *C, int n)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int idy = blockIdx.y * blockDim.y + threadIdx.y;
  int stride = gridDim.x * blockDim.x;
  int i, j, k;

  for(i = idx; i < n; i += stride)
    for(j = idy; j < n; j += stride)
    {
       for(k = 0; k < n; k++)
            C[i*n+j] += A[i*n+k] * B[k*n+j];
    }

}

void initializeMatrix(int *A, int size)
{
  for(int i = 0; i < size; i++)
    for(int j = 0; j < size; j++)
      A[i * size + j] = rand() % (10 - 1) * 1;
}

void printMatrix(int *A, int size)
{
  for(int i = 0; i < size; i++){
    for(int j = 0; j < size; j++)
      printf("%d\t", A[i * size + j]);
    printf("\n");
  }
  printf("\n");
}

int main(int argc, char **argv)
{
  if (argc < 3)
  {
    printf("%s [SIZE] [BLOCKSIZE]\n", argv[0]);
    exit(-1);
  }

  int n = atoi(argv[1]);
  int blockSize = atoi(argv[2]);
  double t1, t2;

 //Memory Allocation in the Host
  int  *A = (int *) malloc (sizeof(int)*n*n);
  int  *B = (int *) malloc (sizeof(int)*n*n);
  int  *C = (int *) malloc (sizeof(int)*n*n);

  initializeMatrix(A, n);
  initializeMatrix(B, n);

 //printMatrix(A, n);
 //printMatrix(B, n);
 //printMatrix(C, n);

 // Memory Allocation in the Device
  int *d_A, *d_B, *d_C;
  cudaMalloc((void **) &d_A, n * n * sizeof(int) ) ;
  cudaMalloc((void **) &d_B, n * n * sizeof(int) ) ;
  cudaMalloc((void **) &d_C, n * n * sizeof(int) ) ;

  t1 = omp_get_wtime();

 // Copy of data from host to device
  cudaMemcpy( d_A, A, n * n * sizeof(int), cudaMemcpyHostToDevice ) ;
  cudaMemcpy( d_B, B, n * n * sizeof(int), cudaMemcpyHostToDevice ) ;
  cudaMemcpy( d_C, C, n * n * sizeof(int), cudaMemcpyHostToDevice ) ;

 // 2D Computational Grid
  dim3 dimGrid( (int) ceil( (float) n / (float) blockSize ), (int) ceil( (float) n / (float) blockSize ) );
  dim3 dimBlock( blockSize, blockSize);

            kernelGridStriderLoop<<<dimGrid, dimBlock>>>(A, B, C, n);

 // Copy of data from device to host
  cudaMemcpy( C, d_C, n * n * sizeof(float), cudaMemcpyDeviceToHost ) ;

  t2 = omp_get_wtime();

  printf("%d\t%f\n", n, t2-t1);

 //printMatrix(A, n);
 //printMatrix(B, n);
 //printMatrix(C, n);

// Memory Allocation in the Device
 cudaFree(d_A) ;
 cudaFree(d_B) ;
 cudaFree(d_C) ;

// Memory Allocation in the Host
 free(A);
 free(B);
 free(C);

 return 0;
}

Writing mm-gridStriderLoop.cu


In [11]:
!nvcc mm-gridStriderLoop.cu -o mm-gridStriderLoop -Xcompiler -fopenmp -O3

In [12]:
!./mm-gridStriderLoop 10000 64

10000	0.779759


### cudaMallocManaged

In [5]:
%%HTML

<div align="center">
<iframe src="https://docs.google.com/presentation/d/1ui1b_fEY8NsG8fffNmUJ2HbWyu_r1xOz/edit?usp=sharing&ouid=117965215426975519312&rtpof=true&sd=true" frameborder="0" width="900" height="550" allowfullscreen="true" mozallowfullscreen="true" webkitallowfullscreen="true">
 
</iframe></div>

In [31]:
%%writefile mm-cudaMallocManaged.cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

__global__ void kernel(int *A, int *B, int *C, int size)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  int j = blockIdx.y * blockDim.y + threadIdx.y;

  if((i < size) && (j < size))
     for(int k = 0; k < size; k++)
        C[i * size + j] += A[i * size + k] * B[k * size + j];

}

__global__ void kernelGridStriderLoop(int *A, int *B, int *C, int n)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int idy = blockIdx.y * blockDim.y + threadIdx.y;
  int stride = gridDim.x * blockDim.x;
  int i, j, k;

  for(i = idx; i < n; i += stride)
    for(j = idy; j < n; j += stride)
    {
       for(k = 0; k < n; k++)
            C[i*n+j] += A[i*n+k] * B[k*n+j];
    }

}

void initializeMatrix(int *A, int size)
{
  for(int i = 0; i < size; i++)
    for(int j = 0; j < size; j++)
      A[i * size + j] = rand() % (10 - 1) * 1;
}

void printMatrix(int *A, int size)
{
  for(int i = 0; i < size; i++){
    for(int j = 0; j < size; j++)
      printf("%d\t", A[i * size + j]);
    printf("\n");
  }
  printf("\n");
}

int main(int argc, char **argv)
{
 /*Usage*/
 if (argc < 3) {
   printf("%s [SIZE] [BLOCKSIZE]\n", argv[0]);
   exit(-1);
 }

 int n = atoi(argv[1]);
 int blockSize = atoi(argv[2]); ;
 double t1, t2;
 int *A,  *B,  *C;

 cudaMallocManaged(&A, sizeof(int) * n * n);
 cudaMallocManaged(&B, sizeof(int) * n * n);
 cudaMallocManaged(&C, sizeof(int) * n * n);

 initializeMatrix(A, n);
 initializeMatrix(B, n);

 //printMatrix(A, n);
 //printMatrix(B, n);
 //printMatrix(C, n);

t1 = omp_get_wtime();

 dim3 dimGrid( (int) ceil( (float) n / (float) blockSize ), (int) ceil( (float) n / (float) blockSize ) );
 dim3 dimBlock( blockSize, blockSize);

      kernelGridStriderLoop<<<dimGrid, dimBlock>>>(A, B, C, n);
      cudaDeviceSynchronize();

t2 = omp_get_wtime();

printf("%d\t%f\n", n, t2-t1);

//printMatrix(A, n);
//printMatrix(B, n);
//printMatrix(C, n);

cudaFree(A);
cudaFree(B);
cudaFree(C);

return 0;
}


Overwriting mm-cudaMallocManaged.cu


In [24]:
!nvcc mm-cudaMallocManaged.cu -o mm-cudaMallocManaged -Xcompiler -fopenmp -O3

In [25]:
!./mm-cudaMallocManaged 10000 64

10000	0.081976


#### Stream Multiprocessador

In [6]:
%%HTML

<div align="center">
<iframe src="https://docs.google.com/presentation/d/18z3x55kxCCjGZ3LVKOtSN5q8qXe4swFL/edit?usp=sharing&ouid=117965215426975519312&rtpof=true&sd=true" frameborder="0" width="900" height="550" allowfullscreen="true" mozallowfullscreen="true" webkitallowfullscreen="true">
 
</iframe></div>

In [28]:
%%writefile mm-streamMultiprocessador.cu
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

__global__ void kernelGridStriderLoop(int *A, int *B,  int *C, int n)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int idy = blockIdx.y * blockDim.y + threadIdx.y;
  int stride = gridDim.x * blockDim.x;
  int i, j, k;

  for(i = idx; i < n; i += stride)
    for(j = idy; j < n; j += stride)
    {
       for(k = 0; k < n; k++)
         C[i*n+j] += A[i*n+k] * B[k*n+j];
    }

}

void initializeMatrix(int *A, int size)
{
  for(int i = 0; i < size; i++)
    for(int j = 0; j < size; j++)
      A[i * size + j] = rand() % (10 - 1) * 1;
}

void printMatrix(int *A, int size)
{
  for(int i = 0; i < size; i++){
    for(int j = 0; j < size; j++)
      printf("%d\t", A[i * size + j]);
    printf("\n");
  }
  printf("\n");
}

int main (int argc, char **argv)
{
 /*Usage*/
 if (argc < 3) {
   printf("%s [SIZE] [BLOCKSIZE]\n", argv[0]);
   exit(-1);
 }

 int n = atoi(argv[1]);
 int sizeblock = atoi(argv[2]); ;
 double t1, t2;
 int *A,  *B, *C;

 cudaMallocManaged(&A, sizeof(int) * n * n);
 cudaMallocManaged(&B, sizeof(int) * n * n);
 cudaMallocManaged(&C, sizeof(int) * n * n);

 initializeMatrix(A, n);
 initializeMatrix(B, n);

 //printMatrix(A, n);
 //printMatrix(B, n);
 //printMatrix(C, n);

t1 = omp_get_wtime();

 int deviceId, numberOfSMs;
 cudaGetDevice(&deviceId);
 cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);

 int NUMBER_OF_BLOCKS = numberOfSMs * 32;
 int NUMBER_OF_THREADS = 1024;

      kernelGridStriderLoop<<< NUMBER_OF_BLOCKS, NUMBER_OF_THREADS>>>(A, B, C, n);
      cudaDeviceSynchronize();

 t2 = omp_get_wtime();

 printf("%d\t%f\n", n, t2-t1);

//printMatrix(A, n);
//printMatrix(B, n);
//printMatrix(C, n);

cudaFree(A);
cudaFree(B);
cudaFree(C);

return 0;
}


Overwriting mm-streamMultiprocessador.cu


In [29]:
!nvcc mm-streamMultiprocessador.cu -o mm-streamMultiprocessador -Xcompiler -fopenmp -O3

In [30]:
!./mm-streamMultiprocessador 10000 64

10000	0.238641
