In [98]:
%%writefile mm.c
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

/* Shared state of the benchmark, visible to every multiplication kernel. */
int *A, *B, *C;   /* size x size matrices in flat row-major buffers; C receives A * B */
int size;         /* matrix dimension, read from the command line in main() */
int num_threads;  /* team size recorded inside the parallel kernels */

/*
 * Fill a square matrix with pseudo-random values.
 *
 * matrix: flat buffer of at least size * size ints, written row by row.
 * size:   number of rows and of columns.
 *
 * Each entry is rand() % 9, i.e. an integer in [0, 8]. The generator is not
 * seeded here, so the values follow whatever srand() state is active.
 */
void initializeMatrix(int *matrix, int size)
{
  for (int row = 0; row < size; ++row) {
    int *cells = matrix + row * size;  /* start of the current row */
    for (int col = 0; col < size; ++col) {
      cells[col] = rand() % 9;
    }
  }
}

/*
 * Sequential reference kernel: C = A * B for the global size x size matrices
 * (flat, row-major).
 *
 * Each dot product is accumulated in a local variable and stored once, so the
 * result does not depend on the previous contents of C (the buffer does not
 * have to be zeroed by the caller).
 */
void multiply_sequential(){
  for (int i = 0; i < size; i++) {
    for (int j = 0; j < size; j++) {
      int sum = 0;
      for (int k = 0; k < size; k++)
        sum += A[i * size + k] * B[k * size + j];
      C[i * size + j] = sum;
    }
  }
}

/*
 * Row-parallel kernel: C = A * B for the global size x size matrices, with the
 * rows of C distributed over the OpenMP team.
 *
 * The loop indices are declared in the loops themselves, which makes them
 * private to each thread without a private() clause. Each dot product is
 * accumulated in a thread-local variable and written to C once; the previous
 * version assigned every partial product to C, leaving only the last term.
 *
 * Side effect: the thread that executes row 0 records the team size in the
 * global num_threads.
 */
void multiply_parallel(){
  #pragma omp parallel for
  for (int i = 0; i < size; i++) {
    /* Only one thread owns row 0, so this is a single unshared write. */
    if (i == 0) num_threads = omp_get_num_threads();
    for (int j = 0; j < size; j++) {
      int sum = 0;
      for (int k = 0; k < size; k++)
        sum += A[i * size + k] * B[k * size + j];
      C[i * size + j] = sum;
    }
  }
}

/*
 * Parallel kernel with collapsed loops: C = A * B for the global size x size
 * matrices. The row and column loops are merged into one iteration space by
 * collapse(2), and the inner dot product is marked as a SIMD sum reduction.
 *
 * Side effect: the thread that computes element (0, 0) stores the team size
 * in the global num_threads.
 */
void multiply_parallel_collapse_reduction(){
  #pragma omp parallel for collapse(2)
  for (int row = 0; row < size; row++) {
    for (int col = 0; col < size; col++) {
      if (row == 0 && col == 0) num_threads = omp_get_num_threads();
      int dot = 0;
      #pragma omp simd reduction(+ : dot)
      for (int idx = 0; idx < size; idx++) {
        dot += A[row * size + idx] * B[idx * size + col];
      }
      C[row * size + col] = dot;
    }
  }
}

/*
 * Benchmark driver.
 *
 * Usage: mm <matrix-size>
 *
 * Allocates the three global size x size matrices, fills A and B with random
 * values, times the sequential, parallel and parallel collapse/reduction
 * kernels with omp_get_wtime(), and prints one summary line. The reported
 * speedup is sequential time divided by collapse/reduction time.
 *
 * Returns 0 on success and EXIT_FAILURE on a missing or non-positive size
 * argument or on allocation failure.
 */
int main (int argc, char **argv)
{
  if (argc < 2) {
    fprintf(stderr, "Usage: %s <matrix-size>\n", argc > 0 ? argv[0] : "mm");
    return EXIT_FAILURE;
  }

  size = atoi(argv[1]);
  if (size <= 0) {
    fprintf(stderr, "Matrix size must be a positive integer, got '%s'\n", argv[1]);
    return EXIT_FAILURE;
  }

  /* Element count in size_t so size * size is not computed in int. */
  size_t elements = (size_t) size * (size_t) size;

  A = malloc(elements * sizeof *A);
  B = malloc(elements * sizeof *B);
  /* C is zero-initialised because multiply_sequential may accumulate into it. */
  C = calloc(elements, sizeof *C);

  if (A == NULL || B == NULL || C == NULL) {
    fprintf(stderr, "Could not allocate three %d x %d matrices\n", size, size);
    free(A);
    free(B);
    free(C);
    return EXIT_FAILURE;
  }

  initializeMatrix(A, size);
  initializeMatrix(B, size);

  // Sequential
  double st1 = omp_get_wtime();
  multiply_sequential();
  double st2 = omp_get_wtime();

  // Parallel
  double mpt1 = omp_get_wtime();
  multiply_parallel();
  double mpt2 = omp_get_wtime();

  // Parallel collapse reduction
  double mpcrt1 = omp_get_wtime();
  multiply_parallel_collapse_reduction();
  double mpcrt2 = omp_get_wtime();

  double msft = st2-st1; // Sequential final time
  double mpft = mpt2-mpt1; // Parallel final time
  double mpcrft = mpcrt2-mpcrt1; // Parallel collapse reduction final time

  double speedup = msft/mpcrft;

  printf("Size: %d\tMSFT: %f\tMPFT: %f\tMPCRFT: %f\tSpeedUp: %f\tNum Threads: %d\n", size, msft, mpft, mpcrft, speedup, num_threads);

  free(A);
  free(B);
  free(C);

  return 0;
}

Writing mm.c


### Run the Code

In [99]:
!gcc mm.c -o mm -fopenmp

### Performance Analysis

In [100]:
%%writefile script.sh
#!/bin/bash
# Benchmark sweep: runs ./mm once for every (thread count, matrix size) pair.
# The interpreter must be bash: the C-style "for (( ... ))" loops below are a
# bash extension and fail under a plain POSIX /bin/sh.

# Thread counts: 2, 4, 8, 16, 32, 64
for ((i=2; i<=64; i*=2))
do
  # Matrix sizes: 200, 400, 600, 800, 1000
  for ((j=200; j<=1000; j+=200))
  do
    # OMP_NUM_THREADS is set for this single invocation only
    OMP_NUM_THREADS="$i" ./mm "$j"
  done
  echo "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-="
done

Writing script.sh


In [101]:
!bash script.sh 

Size: 200	MSFT: 0.028417	MPFT: 0.011712	MPCRFT: 0.006467	SpeedUp: 4.394053	Num Threads: 2
Size: 400	MSFT: 0.236149	MPFT: 0.089556	MPCRFT: 0.054004	SpeedUp: 4.372842	Num Threads: 2
Size: 600	MSFT: 0.787572	MPFT: 0.301314	MPCRFT: 0.187562	SpeedUp: 4.198989	Num Threads: 2
Size: 800	MSFT: 1.849147	MPFT: 0.719798	MPCRFT: 0.439701	SpeedUp: 4.205461	Num Threads: 2
Size: 1000	MSFT: 3.715425	MPFT: 1.426778	MPCRFT: 0.893393	SpeedUp: 4.158782	Num Threads: 2
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
Size: 200	MSFT: 0.028403	MPFT: 0.006350	MPCRFT: 0.003525	SpeedUp: 8.058443	Num Threads: 4
Size: 400	MSFT: 0.224047	MPFT: 0.046702	MPCRFT: 0.024593	SpeedUp: 9.110077	Num Threads: 4
Size: 600	MSFT: 0.790312	MPFT: 0.160473	MPCRFT: 0.103259	SpeedUp: 7.653711	Num Threads: 4
Size: 800	MSFT: 1.952867	MPFT: 0.373433	MPCRFT: 0.250817	SpeedUp: 7.786018	Num Threads: 4
Size: 1000	MSFT: 3.754693	MPFT: 0.755925	MPCRFT: 0.474902	SpeedUp: 7.906243	Num Threads: 4
=-=-=-=-

# Questions and Answers:

## Question 1:
**What is the behavior of execution time and speedup as the problem size varies? (Present the solution with tabular and graphical data).**

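Execution time grows roughly cubically with the problem size, as expected for an O(n³) algorithm: in the runs above the sequential time rises from about 0.03 s at size 200 to about 3.7 s at size 1000, and doubling the size (200 → 400) costs roughly 8×. For a fixed number of threads, the speedup of the collapse + reduction version stays fairly stable as the size changes (about 4.2–4.4 with 2 threads and about 7.7–9.1 with 4 threads). Some expected behaviors also occur: for a smaller problem and a larger number of threads the speedup is reduced, because the cost of creating and synchronizing the threads is no longer amortized, so parallelizing the code in these circumstances proves to be ineffective.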

## Question 2:
**What is the optimal number of threads for the best parallel solution?**

The optimal number of threads is 64.

## `Asynchronous Task`

In [102]:
%%writefile asyncTaskOpenMP.c
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define SIZE_MATRIX 10

/*
 * Column-block demo: each OpenMP thread scales its own block of columns.
 *
 * Usage: asyncTaskOpenMP <n> <block_size>
 *
 *   n          - order of the matrix actually used, 1..SIZE_MATRIX
 *   block_size - width in columns of the block owned by each thread (>= 1)
 *
 * The n x n matrix is filled with 5 and printed. Column block b, i.e. columns
 * [b * block_size, (b + 1) * block_size) cut off at n, is then multiplied by
 * factors[b], and the matrix is printed again. Five threads are requested so
 * that each one owns exactly one block; if the runtime grants fewer, the
 * remaining blocks are dealt out round-robin so none is skipped.
 *
 * Returns 0 on success and EXIT_FAILURE on missing or out-of-range arguments.
 */
int main(int argc, char **argv)
{
  /* One multiplier per column block. */
  static const int factors[] = {10, 20, 30, 40, 50};
  const int num_blocks = (int) (sizeof factors / sizeof factors[0]);

  if (argc < 3)
  {
    fprintf(stderr, "Usage: %s <n> <block_size>\n", argc > 0 ? argv[0] : "asyncTaskOpenMP");
    return EXIT_FAILURE;
  }

  int n = atoi(argv[1]);
  int block_size = atoi(argv[2]);

  /* The matrix lives in a fixed SIZE_MATRIX x SIZE_MATRIX array on the stack. */
  if (n < 1 || n > SIZE_MATRIX)
  {
    fprintf(stderr, "n must be between 1 and %d\n", SIZE_MATRIX);
    return EXIT_FAILURE;
  }

  if (block_size < 1)
  {
    fprintf(stderr, "block_size must be at least 1\n");
    return EXIT_FAILURE;
  }

  /* A block wider than the matrix covers the same columns as one of width n;
     clamping also keeps block * block_size small. */
  if (block_size > n)
    block_size = n;

  int matrix[SIZE_MATRIX][SIZE_MATRIX];

  for(int i = 0; i < n; i++)
  {
    for(int j = 0; j < n; j++)
    {
      matrix[i][j] = 5;
      printf("%d\t", matrix[i][j]);
    }
    printf("\n");
  }

  printf("\n\n");

  omp_set_num_threads(num_blocks);

  #pragma omp parallel
  {
    int id = omp_get_thread_num();
    int team = omp_get_num_threads();

    /* With a full team of num_blocks threads this loop runs exactly once per thread. */
    for(int block = id; block < num_blocks; block += team)
    {
      int first = block * block_size;
      int last = first + block_size;

      /* Never touch columns outside the n x n area that was initialised. */
      if(last > n)
        last = n;

      for(int row = 0; row < n; row++)
        for(int column = first; column < last; column++)
          matrix[row][column] *= factors[block];
    }
  }

  for(int i = 0; i < n; i++)
  {
    for(int j = 0; j < n; j++)
      printf("%d\t", matrix[i][j]);
    printf("\n");
  }

  return 0;
}

Writing asyncTaskOpenMP.c


### Run the Code

In [103]:
!gcc asyncTaskOpenMP.c -o asyncTaskOpenMP -fopenmp

In [104]:
!./asyncTaskOpenMP 10 2

5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	


50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	


# Questions and Answers:

## Question 1:
**What does the code do from the compilation and execution of the previous code?**

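The code fills an n × n matrix with the value 5 and splits it into column blocks of width `block_size`. Each of the five threads selects its own block from its thread ID and multiplies every element of that block by a different constant (10, 20, 30, 40 or 50), independently of the other threads, as the second printed matrix shows.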

## Question 2:
**How would it be possible to extend the code so that the five threads perform asynchronous tasks?**

We only need to adjust the range that each thread will follow and adjust the number of "if" statements to cover all possible IDs.


## References

M. Boratto. Hands-On Supercomputing with Parallel Computing. Available: https://github.com/muriloboratto/Hands-On-Supercomputing-with-Parallel-Computing. 2022.

B. Chapman, G. Jost and R. Pas. Using OpenMP: Portable Shared Memory Parallel Programming. The MIT Press, 2007, USA.