# Comparison of Fortran Parallel Programming APIs in a GPU Environment

In [None]:
!pgfortran --version

In [None]:
!nvfortran --version

## Environment Modules on AIRIS

These modules must be loaded before starting the Jupyter notebook:
```text
Currently Loaded Modulefiles:
    1) anaconda3/2023.07     
    2) ucx/1.15.0
    3) openmpi/4.1.5  
    4) nvhpc/23.11
    5) llvm/12.0.0
```

In [None]:
#!module load anaconda3/2023.07 ucx/1.15.0 openmpi/4.1.5 nvhpc/23.11 llvm/12.0.0

## Matrix Multiplication Benchmarks

### ⊗ Sequential

In [None]:
%%writefile mm-sequential.f90
program matrix_multiply_sequential
  ! Sequential baseline for the matrix-multiplication benchmark.
  !
  ! Reads the matrix order n from the first command-line argument, fills two
  ! n x n default-real matrices with random numbers, accumulates their product
  ! into C with a plain triple loop, and prints "n  <seconds>".
  !
  ! Usage: ./mm-sequential <n>
  ! Stops with an error message when <n> is missing, is not an integer, or is
  ! smaller than 1.
  use, intrinsic :: iso_fortran_env, only: int64, real64
  implicit none

  integer :: n, i, j, k
  integer :: arg_status, read_status
  real, allocatable :: A(:,:), B(:,:), C(:,:)
  character(len=100) :: arg

  ! 64-bit tick counters: compilers typically report a finer clock rate for
  ! wider integer kinds, and the counter is far less likely to wrap mid-run.
  integer(int64) :: start_count, end_count, rate
  real :: elapsed_time

  ! Get the command-line argument for the matrix size and validate it, so a
  ! missing or malformed argument produces a clear message instead of a
  ! runtime I/O error from the internal read.
  call get_command_argument(1, arg, status=arg_status)
  if (arg_status /= 0 .or. len_trim(arg) == 0) then
    error stop "usage: mm-sequential <n>  (n = matrix order, positive integer)"
  end if
  read(arg, *, iostat=read_status) n
  if (read_status /= 0) error stop "mm-sequential: <n> must be an integer"
  if (n < 1) error stop "mm-sequential: <n> must be at least 1"

  ! Allocate matrices
  allocate(A(n, n), B(n, n), C(n, n))

  ! Initialize matrices A and B; C is the accumulator and starts at zero
  call random_number(A)
  call random_number(B)
  C = 0.0

  ! Get the clock rate (ticks per second)
  call system_clock(count_rate=rate)
  if (rate <= 0) error stop "mm-sequential: no system clock available"

  ! Get the start time
  call system_clock(start_count)

  ! NOTE(review): the i-j-k nesting walks A and C with a stride of n in
  ! column-major storage. It is deliberately left as in the original so the
  ! timing stays comparable with the other benchmark variants.
  do i = 1, n
    do j = 1, n
      do k = 1, n
        C(i, j) = C(i, j) + A(i, k) * B(k, j)
      end do
    end do
  end do

  ! Get the end time
  call system_clock(end_count)

  ! Calculate the elapsed time in seconds (divide in double precision so large
  ! tick counts are not rounded before the division)
  elapsed_time = real(real(end_count - start_count, real64) / real(rate, real64))

  ! Print the elapsed time
  print '(I0, "  ", F0.2)', n, elapsed_time

  ! Deallocate matrices
  deallocate(A, B, C)

end program matrix_multiply_sequential

In [None]:
# The output name must match the binary executed in the next cell (./mm-sequential)
!nvfortran mm-sequential.f90 -o mm-sequential -O3

In [None]:
!./mm-sequential 1000

### ⊗ OpenMP

In [None]:
%%writefile mm-omp.f90
program matrix_multiply_openmp
  ! OpenMP (CPU multithreaded) variant of the matrix-multiplication benchmark.
  !
  ! Reads the matrix order n from the first command-line argument, fills two
  ! n x n default-real matrices with random numbers, accumulates their product
  ! into C with the outer loop shared among OpenMP threads, and prints
  ! "n x n  <seconds> seconds".
  !
  ! Usage: OMP_NUM_THREADS=<t> ./mm-omp <n>
  ! Stops with an error message when <n> is missing, is not an integer, or is
  ! smaller than 1.
  use omp_lib
  use, intrinsic :: iso_fortran_env, only: int64, real64
  implicit none

  integer :: n, i, j, k
  integer :: arg_status, read_status
  real, allocatable :: A(:,:), B(:,:), C(:,:)
  character(len=100) :: arg

  ! 64-bit tick counters: compilers typically report a finer clock rate for
  ! wider integer kinds, and the counter is far less likely to wrap mid-run.
  integer(int64) :: start_count, end_count, rate
  real :: elapsed_time

  ! Get the command-line argument for the matrix size and validate it, so a
  ! missing or malformed argument produces a clear message instead of a
  ! runtime I/O error from the internal read.
  call get_command_argument(1, arg, status=arg_status)
  if (arg_status /= 0 .or. len_trim(arg) == 0) then
    error stop "usage: mm-omp <n>  (n = matrix order, positive integer)"
  end if
  read(arg, *, iostat=read_status) n
  if (read_status /= 0) error stop "mm-omp: <n> must be an integer"
  if (n < 1) error stop "mm-omp: <n> must be at least 1"

  ! Allocate matrices
  allocate(A(n, n), B(n, n), C(n, n))

  ! Initialize matrices A and B; C is the accumulator and starts at zero
  call random_number(A)
  call random_number(B)
  C = 0.0

  ! Get the clock rate (ticks per second)
  call system_clock(count_rate=rate)
  if (rate <= 0) error stop "mm-omp: no system clock available"

  ! Get the start time
  call system_clock(start_count)

  ! Rows of C are distributed over the threads; each C(i, j) is written by
  ! exactly one thread, so no reduction or synchronization is required.
  ! default(none) forces every variable's sharing to be stated explicitly.
  !$omp parallel do default(none) private(i, j, k) shared(A, B, C, n)
  do i = 1, n
    do j = 1, n
      do k = 1, n
        C(i, j) = C(i, j) + A(i, k) * B(k, j)
      end do
    end do
  end do
  !$omp end parallel do

  ! Get the end time
  call system_clock(end_count)

  ! Calculate the elapsed time in seconds (divide in double precision so large
  ! tick counts are not rounded before the division)
  elapsed_time = real(real(end_count - start_count, real64) / real(rate, real64))

  ! Print the elapsed time
  print '(I0, " x ", I0, "  ", F0.2, " seconds")', n, n, elapsed_time

  ! Deallocate matrices
  deallocate(A, B, C)

end program matrix_multiply_openmp

In [None]:
!nvfortran mm-omp.f90 -o mm-omp -fopenmp -O3

In [None]:
!OMP_NUM_THREADS=32 ./mm-omp 1000

### ⊗ OpenACC

In [None]:
%%writefile mm-openacc.f90
program matrix_multiply
  ! OpenACC (GPU) variant of the matrix-multiplication benchmark.
  !
  ! Reads the matrix order n from the first command-line argument, fills two
  ! n x n default-real matrices with random numbers, accumulates their product
  ! into C inside an OpenACC parallel region, and prints
  ! "n x n  <seconds> seconds". The measured interval includes the
  ! host<->device transfers performed by the enclosing data region.
  !
  ! Usage: ./mm-openacc <n>
  ! Stops with an error message when <n> is missing, is not an integer, or is
  ! smaller than 1.
  use openacc
  use, intrinsic :: iso_fortran_env, only: int64, real64
  implicit none

  integer :: n, i, j, k
  integer :: arg_status, read_status
  real, allocatable :: A(:,:), B(:,:), C(:,:)
  character(len=100) :: arg

  ! 64-bit tick counters: compilers typically report a finer clock rate for
  ! wider integer kinds, and the counter is far less likely to wrap mid-run.
  integer(int64) :: start_count, end_count, rate
  real :: elapsed_time

  ! Get the command-line argument for the matrix size and validate it, so a
  ! missing or malformed argument produces a clear message instead of a
  ! runtime I/O error from the internal read.
  call get_command_argument(1, arg, status=arg_status)
  if (arg_status /= 0 .or. len_trim(arg) == 0) then
    error stop "usage: mm-openacc <n>  (n = matrix order, positive integer)"
  end if
  read(arg, *, iostat=read_status) n
  if (read_status /= 0) error stop "mm-openacc: <n> must be an integer"
  if (n < 1) error stop "mm-openacc: <n> must be at least 1"

  ! Allocate matrices
  allocate(A(n, n), B(n, n), C(n, n))

  ! Initialize matrices A and B; C is the accumulator and starts at zero
  call random_number(A)
  call random_number(B)
  C = 0.0

  ! Get the clock rate (ticks per second)
  call system_clock(count_rate=rate)
  if (rate <= 0) error stop "mm-openacc: no system clock available"

  ! Get the start time
  call system_clock(start_count)

  ! Matrix multiplication using OpenACC.
  ! C must use copy (not copyout): the kernel READS C(i, j) before updating
  ! it, and copyout only allocates device storage without transferring the
  ! zero-initialized host values, so the sum would start from uninitialized
  ! device memory.
  !$acc data copyin(A, B) copy(C)
  !$acc parallel loop collapse(2)
  do i = 1, n
    do j = 1, n
      do k = 1, n
        C(i, j) = C(i, j) + A(i, k) * B(k, j)
      end do
    end do
  end do
  !$acc end parallel loop
  !$acc end data

  ! Get the end time
  call system_clock(end_count)

  ! Calculate the elapsed time in seconds (divide in double precision so large
  ! tick counts are not rounded before the division)
  elapsed_time = real(real(end_count - start_count, real64) / real(rate, real64))

  ! Print the elapsed time
  print '(I0, " x ", I0, "  ", F0.2, " seconds")', n, n, elapsed_time

  ! Deallocate matrices
  deallocate(A, B, C)

end program matrix_multiply

In [None]:
!pgfortran mm-openacc.f90 -o mm-openacc -acc  

In [None]:
!./mm-openacc 1000

### ⊗ OpenMP5

In [None]:
%%writefile mm-omp5.f90
program matrix_multiply_openmp_offloading
  ! OpenMP target-offload (GPU) variant of the matrix-multiplication benchmark.
  !
  ! Reads the matrix order n from the first command-line argument, fills two
  ! n x n default-real matrices with random numbers, accumulates their product
  ! into C inside an OpenMP target region, and prints
  ! "n x n  <seconds> seconds". The measured interval includes the
  ! host<->device transfers performed by the enclosing target data region.
  !
  ! Usage: ./mm-omp5 <n>
  ! Stops with an error message when <n> is missing, is not an integer, or is
  ! smaller than 1.
  use omp_lib
  use, intrinsic :: iso_fortran_env, only: int64, real64
  implicit none

  integer :: n, i, j, k
  integer :: arg_status, read_status
  real, allocatable :: A(:,:), B(:,:), C(:,:)
  character(len=100) :: arg

  ! 64-bit tick counters: compilers typically report a finer clock rate for
  ! wider integer kinds, and the counter is far less likely to wrap mid-run.
  integer(int64) :: start_count, end_count, rate
  real :: elapsed_time

  ! Get the command-line argument for the matrix size and validate it, so a
  ! missing or malformed argument produces a clear message instead of a
  ! runtime I/O error from the internal read.
  call get_command_argument(1, arg, status=arg_status)
  if (arg_status /= 0 .or. len_trim(arg) == 0) then
    error stop "usage: mm-omp5 <n>  (n = matrix order, positive integer)"
  end if
  read(arg, *, iostat=read_status) n
  if (read_status /= 0) error stop "mm-omp5: <n> must be an integer"
  if (n < 1) error stop "mm-omp5: <n> must be at least 1"

  ! Allocate matrices
  allocate(A(n, n), B(n, n), C(n, n))

  ! Initialize matrices A and B; C is the accumulator and starts at zero
  call random_number(A)
  call random_number(B)
  C = 0.0

  ! Get the clock rate (ticks per second)
  call system_clock(count_rate=rate)
  if (rate <= 0) error stop "mm-omp5: no system clock available"

  ! Get the start time
  call system_clock(start_count)

  ! Perform matrix multiplication using OpenMP target offload on the GPU.
  ! Whole arrays are mapped: A and B are inputs only, C is read and written
  ! (tofrom) because the kernel accumulates into the zero-initialized C.
  ! The Fortran spelling of the combined construct ends in "parallel do"
  ! ("parallel for" is the C/C++ form). The collapsed loop indices i and j
  ! are private by rule; k is listed explicitly.
  !$omp target data map(to: A, B) map(tofrom: C)
  !$omp target teams distribute parallel do collapse(2) private(k)
  do i = 1, n
    do j = 1, n
      do k = 1, n
        C(i, j) = C(i, j) + A(i, k) * B(k, j)
      end do
    end do
  end do
  !$omp end target teams distribute parallel do
  !$omp end target data

  ! Get the end time
  call system_clock(end_count)

  ! Calculate the elapsed time in seconds (divide in double precision so large
  ! tick counts are not rounded before the division)
  elapsed_time = real(real(end_count - start_count, real64) / real(rate, real64))

  ! Print the elapsed time
  print '(I0, " x ", I0, "  ", F0.2, " seconds")', n, n, elapsed_time

  ! Deallocate matrices
  deallocate(A, B, C)

end program matrix_multiply_openmp_offloading

In [None]:
# -mp=gpu enables OpenMP target offload; -stdpar=gpu only offloads `do concurrent`
# and leaves the !$omp directives unprocessed.
!nvfortran mm-omp5.f90 -o mm-omp5 -mp=gpu

In [None]:
!./mm-omp5 1000

### Performance Comparison Using One GPU

| Program Version      | Execution Time (sec.)  | Speedup      |
| :---                 |    :----:              |        ---:  |
| Serial               | 1.50                   | 1X           |
| OpenMP T=36          | 0.06                   | 25X          |
| OpenACC              | 0.57                   | 2.6X         | 
| OpenMP5              | 2.24                   | -            | 

## Cleaning Up Leftover Files

In [None]:
!rm -rf mm*