# Hands-On 1: Portable Parallel Programming with OpenMP

Welcome to the Hands-On _Portable Parallel Programming with OpenMP_. This notebook comprises 2 sessions. The following table shows the codes and files needed to develop each of the exercises.


|  Sessions     | Codes             | files           | 
| --------------| ----------------- | --------------- |
| Session 1     | Matrix Multiply   |  mm.c           |  
| Session 2     | Asynchronous Task |  asyncTaskOpenMP.c |   


## `Matrix Multiply Benchmark`

The product $C = AB$ of two $n \times n$ matrices $A$ and $B$ is defined element-wise as:

$$ c_{ij} = \sum\limits_{k=1}^{n } a_{ik} b_{kj} = a_{i1}b_{1j} + a_{i2}b_{2j} + ... + a_{in}b_{nj} $$

where the index $k$ is summed over for every pair of indices $i$ and $j$. The sequential code of the program is available in the file `mm.c`. The following code shows an extract of that code. In particular, we can see that the innermost loop over `k` implements the summation in the definition above.

In [57]:
%%writefile mm.c
#include <stdio.h>
#include <stdlib.h>

/*
 * Fills a size x size matrix, stored row by row in a flat array, with
 * pseudo-random values in the range 0..8 drawn from rand().
 *
 * matrix: destination buffer holding at least size * size ints.
 * size:   number of rows and columns; nothing is written when size <= 0.
 */
void initializeMatrix(int *matrix, int size)
{
  for(int row = 0; row < size; ++row)
  {
    int *cell = matrix + row * size;
    int *const row_end = cell + size;

    /* Walk the row left to right so rand() is consumed in row-major order. */
    while(cell < row_end)
      *cell++ = rand() % 9;
  }
}

/*
 * Writes a size x size row-major matrix to stdout: each value is followed
 * by a tab, each row ends with a newline, and one blank line follows the
 * whole matrix.
 */
void printMatrix(int *matrix, int size)
{
  int row = 0;

  while(row < size)
  {
    const int *line = matrix + row * size;

    for(int col = 0; col < size; ++col)
      printf("%d\t", line[col]);
    printf("\n");
    ++row;
  }
  printf("\n");
}

/*
 * Multiplies two random size x size integer matrices and prints A, B and
 * the product C.
 *
 * Usage: ./mm <size>
 * Returns 0 on success, 1 on a bad argument or an allocation failure.
 */
int main(int argc, char **argv)
{
  /* All indexing is done with int, so size * size must fit in a 32-bit int. */
  enum { MAX_SIZE = 46340 };

  if(argc < 2)
  {
    fprintf(stderr, "usage: %s <size>\n", argc > 0 ? argv[0] : "mm");
    return 1;
  }

  char *end;
  long parsed = strtol(argv[1], &end, 10);
  if(end == argv[1] || *end != '\0' || parsed < 1 || parsed > MAX_SIZE)
  {
    fprintf(stderr, "size must be an integer in [1, %d]\n", (int) MAX_SIZE);
    return 1;
  }

  int size = (int) parsed;
  size_t count = (size_t) size * (size_t) size;
  int i, j, k;

  int *A = malloc(count * sizeof *A);
  int *B = malloc(count * sizeof *B);
  /* C is accumulated with +=, so it must start zeroed: use calloc. */
  int *C = calloc(count, sizeof *C);

  if(A == NULL || B == NULL || C == NULL)
  {
    fprintf(stderr, "out of memory for %d x %d matrices\n", size, size);
    free(A);
    free(B);
    free(C);
    return 1;
  }

  initializeMatrix(A, size);
  initializeMatrix(B, size);

  /* c_ij = sum over k of a_ik * b_kj */
  for(i = 0; i < size; i++)
    for(j = 0; j < size; j++)
      for(k = 0; k < size; k++)
        C[i * size + j] += A[i * size + k] * B[k * size + j];

  printMatrix(A, size);
  printMatrix(B, size);
  printMatrix(C, size);

  free(A);
  free(B);
  free(C);

  return 0;
}

Overwriting mm.c


### Run the Code

In [58]:
!gcc mm.c -o mm

In [59]:
!./mm 10

1	7	0	7	5	7	1	3	6	1	
5	4	5	7	5	4	6	0	7	1	
8	8	6	6	8	8	8	4	1	1	
5	0	0	3	5	3	1	7	4	7	
6	0	0	2	5	4	5	2	2	3	
2	1	1	8	8	0	5	5	4	4	
6	0	5	6	2	8	7	3	4	2	
0	0	0	0	2	6	2	5	6	5	
7	6	6	8	5	3	6	2	8	1	
6	6	8	0	1	1	7	0	3	2	

0	1	2	1	8	3	5	2	6	0	
7	2	7	2	8	1	6	5	1	5	
4	6	0	4	6	2	3	2	0	4	
3	7	5	3	6	5	4	2	5	2	
1	3	2	8	3	2	0	0	7	2	
4	3	6	2	5	1	2	6	4	2	
2	7	8	5	1	5	1	4	8	4	
6	7	5	8	6	0	8	4	0	7	
4	2	8	1	5	2	3	7	8	7	
8	1	3	7	5	2	3	6	6	0	

155	141	212	132	210	81	135	157	173	140	
138	176	214	146	225	124	130	157	230	145	
190	237	261	226	306	142	192	189	252	175	
142	121	149	174	186	70	133	136	174	103	
81	103	131	124	137	77	82	100	171	70	
131	172	176	193	177	106	120	119	205	124	
136	188	203	153	215	117	134	162	212	129	
124	90	144	119	123	42	87	136	132	101	
171	208	249	174	281	140	176	184	253	180	
121	129	148	112	184	88	114	125	145	115	



### Entering time measurement metrics

The next step is to modify the code in the file `mm.c` to measure the execution time of the matrix multiplication. A first approach is to call the function `omp_get_wtime()` to get the initial and final times. You will need to include the header `omp.h` and compile with the `-fopenmp` flag so that the program is linked against the OpenMP runtime library.

In [60]:
%%writefile mm.c
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

/*
 * Populates a square matrix (flat, row-major, size * size ints) with
 * pseudo-random values between 0 and 8 inclusive, taken from rand().
 *
 * matrix: buffer to fill.
 * size:   side length of the matrix; a non-positive size writes nothing.
 */
void initializeMatrix(int *matrix, int size)
{
  for(int row = 0; row < size; ++row)
  {
    const int base = row * size;   /* offset of the first cell of this row */

    for(int col = 0; col < size; ++col)
      matrix[base + col] = rand() % 9;
  }
}

/*
 * Prints a square row-major matrix to stdout, one row per line with a tab
 * after every value, followed by an empty line.
 */
void printMatrix(int *matrix, int size)
{
  for(int row = 0; row < size; ++row)
  {
    const int base = row * size;
    int col = 0;

    while(col < size)
    {
      printf("%d\t", matrix[base + col]);
      ++col;
    }
    printf("\n");
  }
  printf("\n");
}

/*
 * Multiplies two random size x size integer matrices sequentially and
 * prints "<size>\t<seconds>" for the multiplication loop, timed with
 * omp_get_wtime().
 *
 * Usage: ./mm <size>
 * Returns 0 on success, 1 on a bad argument or an allocation failure.
 */
int main(int argc, char **argv)
{
  /* All indexing is done with int, so size * size must fit in a 32-bit int. */
  enum { MAX_SIZE = 46340 };

  if(argc < 2)
  {
    fprintf(stderr, "usage: %s <size>\n", argc > 0 ? argv[0] : "mm");
    return 1;
  }

  char *end;
  long parsed = strtol(argv[1], &end, 10);
  if(end == argv[1] || *end != '\0' || parsed < 1 || parsed > MAX_SIZE)
  {
    fprintf(stderr, "size must be an integer in [1, %d]\n", (int) MAX_SIZE);
    return 1;
  }

  int size = (int) parsed;
  size_t count = (size_t) size * (size_t) size;
  int i, j, k;
  double t1, t2;

  int *A = malloc(count * sizeof *A);
  int *B = malloc(count * sizeof *B);
  /* C is accumulated with +=, so it must start zeroed: use calloc. */
  int *C = calloc(count, sizeof *C);

  if(A == NULL || B == NULL || C == NULL)
  {
    fprintf(stderr, "out of memory for %d x %d matrices\n", size, size);
    free(A);
    free(B);
    free(C);
    return 1;
  }

  initializeMatrix(A, size);
  initializeMatrix(B, size);

  /* Only the multiplication itself is timed. */
  t1 = omp_get_wtime();
  for(i = 0; i < size; i++)
    for(j = 0; j < size; j++)
      for(k = 0; k < size; k++)
        C[i * size + j] += A[i * size + k] * B[k * size + j];
  t2 = omp_get_wtime();

  printf("%d\t%f\n",size, t2-t1);

  /* Uncomment to inspect the matrices (only practical for small sizes). */
  //printMatrix(A,size);
  //printMatrix(B,size);
  //printMatrix(C,size);

  free(A);
  free(B);
  free(C);

  return 0;
}

Overwriting mm.c


### Run the Code

In [61]:
!gcc mm.c -o mm -fopenmp

In [62]:
!./mm 1000

1000	3.891373


### Inserting the OpenMP directive

The next step is to modify the code in the file `mm.c` to perform the matrix multiplication in parallel using OpenMP. A first approach is to apply the `parallel for` directive to the outermost loop, declaring the variables $i$, $j$ and $k$ as private. After this change, you can compile and execute the program.

In [63]:
%%writefile mm.c
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

/*
 * Assigns every element of a size x size matrix (flat row-major array) a
 * pseudo-random value from rand() reduced to the range 0..8.
 *
 * matrix: array with room for size * size ints.
 * size:   matrix dimension; if it is not positive the array is left as is.
 */
void initializeMatrix(int *matrix, int size)
{
  for(int row = 0; row < size; ++row)
  {
    int *dest = &matrix[row * size];

    for(int col = 0; col < size; ++col)
      dest[col] = rand() % 9;
  }
}

/*
 * Dumps a size x size row-major matrix to stdout. Values are tab-terminated,
 * rows are newline-terminated, and a trailing blank line separates
 * consecutive matrices.
 */
void printMatrix(int *matrix, int size)
{
  const int *cursor;

  for(int row = 0; row < size; ++row)
  {
    cursor = matrix + row * size;
    for(int remaining = size; remaining > 0; --remaining)
      printf("%d\t", *cursor++);
    printf("\n");
  }
  printf("\n");
}

/*
 * Multiplies two random size x size integer matrices with an OpenMP
 * "parallel for" over the rows of C and prints "<size>\t<seconds>" for the
 * multiplication loop, timed with omp_get_wtime().
 *
 * Usage: [OMP_NUM_THREADS=<t>] ./mm <size>
 * Returns 0 on success, 1 on a bad argument or an allocation failure.
 */
int main (int argc, char **argv)
{
  /* All indexing is done with int, so size * size must fit in a 32-bit int. */
  enum { MAX_SIZE = 46340 };

  if(argc < 2)
  {
    fprintf(stderr, "usage: %s <size>\n", argc > 0 ? argv[0] : "mm");
    return 1;
  }

  char *end;
  long parsed = strtol(argv[1], &end, 10);
  if(end == argv[1] || *end != '\0' || parsed < 1 || parsed > MAX_SIZE)
  {
    fprintf(stderr, "size must be an integer in [1, %d]\n", (int) MAX_SIZE);
    return 1;
  }

  int size = (int) parsed;
  size_t count = (size_t) size * (size_t) size;
  int i, j, k;
  double t1, t2;

  int *A = malloc(count * sizeof *A);
  int *B = malloc(count * sizeof *B);
  /* C is accumulated with +=, so it must start zeroed: use calloc. */
  int *C = calloc(count, sizeof *C);

  if(A == NULL || B == NULL || C == NULL)
  {
    fprintf(stderr, "out of memory for %d x %d matrices\n", size, size);
    free(A);
    free(B);
    free(C);
    return 1;
  }

  initializeMatrix(A, size);
  initializeMatrix(B, size);

  t1 = omp_get_wtime();
  /* Iterations of the i loop are split among threads; each thread writes
     only the rows of C for its own i values, so no synchronization is
     needed. j and k must be private because they are declared outside
     the parallel region. */
  #pragma omp parallel for private(i, j, k)
  for(i = 0; i < size; i++)
    for(j = 0; j < size; j++)
      for(k = 0; k < size; k++)
        C[i * size + j] += A[i * size + k] * B[k * size + j];
  t2 = omp_get_wtime();

  printf("%d\t%f\n",size, t2-t1);

  /* Uncomment to inspect the matrices (only practical for small sizes). */
  //printMatrix(A,size);
  //printMatrix(B,size);
  //printMatrix(C,size);

  free(A);
  free(B);
  free(C);

  return 0;
}

Overwriting mm.c


### Run the Code

In [64]:
!gcc mm.c -o mm -fopenmp

In [65]:
!OMP_NUM_THREADS=16 ./mm 1000

1000	1.234644


### Performance Analysis

The last step is to create a shell script that measures the execution time of the parallel matrix multiplication for several matrix sizes. The shell script is available in the file `script.sh`.
Compile `mm.c` and then execute the script. At run time, the script's argument selects the number of threads. For example, `bash script.sh 16` runs the benchmark with $16$ threads:

In [66]:
%%writefile script.sh
#!/bin/sh
# Runs ./mm for matrix sizes 100..1000 with a fixed OpenMP thread count.
# Usage: sh script.sh <num_threads>

# Abort with a usage message instead of running with an empty OMP_NUM_THREADS.
threads=${1:?usage: script.sh <num_threads>}

for size in 100 200 300 400 500 600 700 800 900 1000
do
  OMP_NUM_THREADS="$threads" ./mm "$size"
done

Overwriting script.sh


In [80]:
print("NUM_THREADS = 2")
!bash script.sh 2

print("NUM_THREADS = 3")
!bash script.sh 3

print("NUM_THREADS = 4")
!bash script.sh 4

print("NUM_THREADS = 5")
!bash script.sh 5

print("NUM_THREADS = 6")
!bash script.sh 6

print("NUM_THREADS = 7")
!bash script.sh 7

print("NUM_THREADS = 8")
!bash script.sh 8

print("NUM_THREADS = 9")
!bash script.sh 9

print("NUM_THREADS = 10")
!bash script.sh 10

print("NUM_THREADS = 12")
!bash script.sh 12

print("NUM_THREADS = 14")
!bash script.sh 14

print("NUM_THREADS = 16")
!bash script.sh 16


NUM_THREADS = 2
100	0.004261
200	0.013501
300	0.055428
400	0.122101
500	0.214986
600	0.417214
700	0.645438
800	0.982717
900	1.403954
1000	2.346066
NUM_THREADS = 3
100	0.002849
200	0.018245
300	0.035055
400	0.083604
500	0.242629
600	0.268093
700	0.438117
800	0.764525
900	0.973835
1000	1.401968
NUM_THREADS = 4
100	0.002927
200	0.020782
300	0.063260
400	0.117177
500	0.247807
600	0.242725
700	0.357502
800	0.754872
900	0.861965
1000	1.237573
NUM_THREADS = 5
100	0.001664
200	0.028141
300	0.054439
400	0.081348
500	0.174856
600	0.275571
700	0.448064
800	0.705911
900	1.089714
1000	1.893489
NUM_THREADS = 6
100	0.003216
200	0.023702
300	0.055292
400	0.084507
500	0.167499
600	0.301726
700	0.535816
800	0.865338
900	1.058991
1000	1.639300
NUM_THREADS = 7
100	0.002324
200	0.009532
300	0.024553
400	0.073543
500	0.160882
600	0.286409
700	0.437558
800	0.708929
900	1.035497


In [79]:
print("NUM_THREADS = 3")
!bash script.sh 3


NUM_THREADS = 3
100	0.002033
200	0.015908
300	0.038455
400	0.142857
500	0.296704
600	0.296805
700	0.478674
800	0.680968
900	0.998845
1000	1.468385


In [77]:
print("NUM_THREADS = 5")
!bash script.sh 8


NUM_THREADS = 5
100	0.022148
200	0.029841
300	0.050022
400	0.103293
500	0.153110
600	0.254504
700	0.454049
800	0.594063
900	1.063179
1000	1.307018


In [73]:
print("NUM_THREADS = 8")
!bash script.sh 8


NUM_THREADS = 8
100	0.014688
200	0.024234
300	0.042618
400	0.127805
500	0.162168
600	0.280910
700	0.400587
800	0.601958
900	0.895546
1000	1.285744


In [74]:
print("NUM_THREADS = 16")
!bash script.sh 16


NUM_THREADS = 16
100	0.001501
200	0.010420
300	0.032323
400	0.082839
500	0.160684
600	0.258168
700	0.380678
800	0.597057
900	0.902712
1000	1.519269


## `Asynchronous Task`

Asynchronous programming is a set of techniques for implementing expensive operations that run concurrently with the rest of the program. One domain where asynchronous programming is often used is in programs with a graphical user interface: it is often unacceptable when the user interface freezes while performing a costly operation. Also, asynchronous operations are essential for parallel applications that need to run multiple tasks simultaneously. The following is a code `asyncTaskOpenMP.c` that represents a task being done asynchronously. Before understanding the code, compile and run it as follows:

In [None]:
%%writefile asyncTaskOpenMP.c
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define SIZE_MATRIX 10

/*
 * Parses text as a base-10 integer within [lo, hi].
 * Returns 0 and stores the value in *out on success, -1 otherwise.
 */
static int parse_int_arg(const char *text, int lo, int hi, int *out)
{
  char *end;
  long value = strtol(text, &end, 10);

  if(end == text || *end != '\0' || value < lo || value > hi)
    return -1;

  *out = (int) value;
  return 0;
}

/*
 * Fills the top-left n x n part of a matrix with 5, then, inside one OpenMP
 * parallel region, thread 0 multiplies columns [0, block_size) by k1 while
 * thread 1 multiplies columns [block_size, 2 * block_size) by k2. The matrix
 * is printed before and after the parallel region.
 *
 * Usage: ./asyncTaskOpenMP <n> <block_size>
 *   n          - 1..SIZE_MATRIX, side of the sub-matrix that is used
 *   block_size - 0..n/2, so both column blocks stay inside initialized cells
 * Returns 0 on success, 1 on missing or out-of-range arguments.
 */
int main(int argc, char **argv)
{
  const int k1 = 10, k2 = 20;
  int matrix[SIZE_MATRIX][SIZE_MATRIX];
  int n = 0, block_size = 0;
  int i, j;

  /* Short-circuit evaluation guarantees n is valid before it bounds block_size. */
  if(argc < 3 ||
     parse_int_arg(argv[1], 1, SIZE_MATRIX, &n) != 0 ||
     parse_int_arg(argv[2], 0, n / 2, &block_size) != 0)
  {
    fprintf(stderr, "usage: %s <n: 1..%d> <block_size: 0..n/2>\n",
            argc > 0 ? argv[0] : "asyncTaskOpenMP", SIZE_MATRIX);
    return 1;
  }

  for(i = 0; i < n; i++)
  {
    for(j = 0; j < n; j++)
    {
      matrix[i][j] = 5;
      printf("%d\t", matrix[i][j]);
    }
    printf("\n");
  }

  printf("\n\n");

  /* NOTE(review): the work split below relies on the runtime providing
     threads with ids 0 and 1; with a single-thread team the second block
     would not be processed. */
  omp_set_num_threads(5);

  #pragma omp parallel
  {
    const int id = omp_get_thread_num();

    /* Loop counters are declared inside the region, so each thread has its
       own copies; the two threads touch disjoint columns of matrix. */
    if(id == 0)
    {
      for(int row = 0; row < n; row++)
        for(int column = 0; column < block_size; column++)
          matrix[row][column] *= k1;
    }

    if(id == 1)
    {
      for(int row = 0; row < n; row++)
        for(int column = block_size; column < 2 * block_size; column++)
          matrix[row][column] *= k2;
    }
  }

  for(i = 0; i < n; i++)
  {
    for(j = 0; j < n; j++)
      printf("%d\t", matrix[i][j]);
    printf("\n");
  }

  return 0;
}

Overwriting asyncTaskOpenMP.c


### Run the Code

In [None]:
!gcc asyncTaskOpenMP.c -o asyncTaskOpenMP -fopenmp

In [None]:
!./asyncTaskOpenMP 10 2

5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	


50	50	100	100	5	5	5	5	5	5	
50	50	100	100	5	5	5	5	5	5	
50	50	100	100	5	5	5	5	5	5	
50	50	100	100	5	5	5	5	5	5	
50	50	100	100	5	5	5	5	5	5	
50	50	100	100	5	5	5	5	5	5	
50	50	100	100	5	5	5	5	5	5	
50	50	100	100	5	5	5	5	5	5	
50	50	100	100	5	5	5	5	5	5	
50	50	100	100	5	5	5	5	5	5	


## References

M. Boratto. Hands-On Supercomputing with Parallel Computing. Available: https://github.com/muriloboratto/Hands-On-Supercomputing-with-Parallel-Computing. 2022.

B. Chapman, G. Jost and R. Pas. Using OpenMP: Portable Shared Memory Parallel Programming. The MIT Press, 2007, USA.