In [None]:
%%writefile mm.c
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

// Fills a size x size row-major matrix with pseudo-random integers.
//
// matrix: buffer holding at least size * size ints.
// size:   number of rows (and columns) of the square matrix.
//
// Each element is rand() % 9, i.e. a value in the range 0..8. Elements are
// written in row-major order, one rand() call per element.
void initializeMatrix(int *matrix, int size)
{
  const int modulus = 10 - 1;

  for (int row = 0; row < size; ++row)
  {
    int *cells = matrix + row * size; // start of the current row
    for (int col = 0; col < size; ++col)
      cells[col] = rand() % modulus;
  }
}

// Prints a size x size row-major matrix to stdout.
//
// matrix: buffer holding at least size * size ints.
// size:   number of rows (and columns) of the square matrix.
//
// Every element is followed by a tab, every row by a newline, and the whole
// matrix by one extra blank line.
void printMatrix(int *matrix, int size)
{
  int row = 0;

  while (row < size)
  {
    const int *cells = matrix + row * size; // start of the current row
    for (int col = 0; col < size; ++col)
      printf("%d\t", cells[col]);
    printf("\n");
    ++row;
  }
  printf("\n");
}

// Benchmarks a naive square matrix multiplication C = A * B, first
// sequentially and then with an OpenMP work-shared outer loop, and prints the
// average time of each version together with the resulting speedup.
//
// Usage: ./mm <size>
//   size: number of rows/columns of the square matrices (must be > 0).
//
// Returns 0 on success, 1 on a usage error or allocation failure.
int main (int argc, char **argv)
{
  if (argc < 2)
  {
    fprintf(stderr, "Usage: %s <size>\n", argc > 0 ? argv[0] : "mm");
    return 1;
  }

  int size = atoi(argv[1]);
  if (size <= 0)
  {
    fprintf(stderr, "Error: size must be a positive integer\n");
    return 1;
  }

  // Compute the byte count in size_t so that size * size cannot overflow int.
  size_t bytes = sizeof(int) * (size_t) size * (size_t) size;

  int *A = (int *) malloc(bytes);
  int *B = (int *) malloc(bytes);
  int *C = (int *) malloc(bytes);

  if (A == NULL || B == NULL || C == NULL)
  {
    fprintf(stderr, "Error: could not allocate three %dx%d matrices\n", size, size);
    free(A);
    free(B);
    free(C);
    return 1;
  }

  initializeMatrix(A, size);
  initializeMatrix(B, size);

  int num_threads = 1; // overwritten inside the parallel region
  int num_tests = 1;
  double t1, t2;

  // Sequential code
  double sft_total = 0;
  for (int g = 0; g < num_tests; g++)
  {
    t1 = omp_get_wtime();
    for (int i = 0; i < size; i++)
      for (int j = 0; j < size; j++)
      {
        // Accumulate locally and assign: C comes from malloc, so "+=" on it
        // would start from indeterminate values.
        int sum = 0;
        for (int k = 0; k < size; k++)
          sum += A[i * size + k] * B[k * size + j];
        C[i * size + j] = sum;
      }
    t2 = omp_get_wtime();

    sft_total += t2 - t1; // Sequential final time
  }

  // Parallel code
  double pft_total = 0;
  for (int g = 0; g < num_tests; g++)
  {
    t1 = omp_get_wtime();
    #pragma omp parallel
    {
      // Exactly one thread records the team size; no barrier is needed
      // because nothing inside the region reads num_threads.
      #pragma omp single nowait
      num_threads = omp_get_num_threads();

      // Rows are distributed among the threads. j, k and sum are declared
      // inside the loop body, so every thread has its own copies. C is
      // assigned rather than accumulated, so the sequential result does not
      // leak into the parallel one.
      #pragma omp for
      for (int i = 0; i < size; i++)
        for (int j = 0; j < size; j++)
        {
          int sum = 0;
          for (int k = 0; k < size; k++)
            sum += A[i * size + k] * B[k * size + j];
          C[i * size + j] = sum;
        }
    }
    t2 = omp_get_wtime();

    pft_total += t2 - t1; // Parallel final time
  }

  double sft_media = sft_total / num_tests;
  double pft_media = pft_total / num_tests;

  // Guard against a zero parallel time (timer resolution on tiny inputs).
  double speedup = pft_media > 0 ? sft_media / pft_media : 0;

  printf("Size: %d\tSFT: %f\t PFT: %f\t SpeedUp: %f\t Num Threads: %d\n",size, sft_media, pft_media, speedup, num_threads);

  //printMatrix(A,size);
  //printMatrix(B,size);
  //printMatrix(C,size);

  free(A);
  free(B);
  free(C);

  return 0;
}

### Run the Code

In [None]:
!gcc mm.c -o mm -fopenmp

### Performance Analysis

In [None]:
%%writefile script.sh
#!/bin/bash
# Runs the matrix-multiplication benchmark (./mm) for matrix sizes
# 100, 200, ..., 1000 with a fixed number of OpenMP threads.
#
# Usage: bash script.sh <num_threads>
#
# The shebang is bash, not sh: the C-style "for (( ... ))" loop below is a
# bash extension and is not available in a strict POSIX sh.

if [ -z "$1" ]; then
  echo "Usage: $0 <num_threads>" >&2
  exit 1
fi

for ((i=100; i<=1000; i+=100))
do
  OMP_NUM_THREADS="$1" ./mm "$i"
done

In [None]:
!bash script.sh 2

# !bash script.sh 4

# !bash script.sh 8

# !bash script.sh 16

# !bash script.sh 32

# !bash script.sh 64

# Questions and Answers:

## Question 1:
**What is the behavior of execution time and speedup as the problem size varies? (Present the solution with tabular and graphical data).**

Both the execution time and the speedup change as the problem size varies. As expected, when the problem is small and the number of threads is large, the speedup drops, because under these conditions the overhead of parallelizing the code outweighs the gain.

## Question 2:
**What is the optimal number of threads for the best parallel solution?**

The optimal number of threads is 64.

## `Asynchronous Task`

In [1]:
%%writefile asyncTaskOpenMP.c
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>

#define SIZE_MATRIX 10

// Fills an n x n matrix with the value 5, prints it, lets OpenMP threads
// scale consecutive blocks of columns by different constants, and prints the
// result.
//
// Usage: ./asyncTaskOpenMP <n> <block_size>
//   n:          number of rows/columns used, 1..SIZE_MATRIX.
//   block_size: number of columns in each block (must be > 0).
//
// Block b (b = 0..4) covers columns [b * block_size, (b + 1) * block_size),
// clamped to n, and is multiplied by 10, 20, 30, 40 and 50 respectively.
// Columns beyond the fifth block are left unchanged.
//
// Returns 0 on success, 1 on invalid arguments.
int main(int argc, char **argv)
{
  enum { NUM_BLOCKS = 5 };
  const int factors[NUM_BLOCKS] = {10, 20, 30, 40, 50};
  int matrix[SIZE_MATRIX][SIZE_MATRIX];
  int i, j;

  if (argc < 3)
  {
    fprintf(stderr, "Usage: %s <n> <block_size>\n", argc > 0 ? argv[0] : "asyncTaskOpenMP");
    return 1;
  }

  int n = atoi(argv[1]);
  int block_size = atoi(argv[2]);

  // The matrix lives in a fixed-size stack array, so n must fit inside it.
  if (n < 1 || n > SIZE_MATRIX)
  {
    fprintf(stderr, "Error: n must be between 1 and %d\n", SIZE_MATRIX);
    return 1;
  }

  if (block_size < 1)
  {
    fprintf(stderr, "Error: block_size must be a positive integer\n");
    return 1;
  }

  // A block wider than the matrix is equivalent to one of width n; clamping
  // also keeps block_size * block from overflowing.
  if (block_size > n)
    block_size = n;

  for(i = 0; i < n; i++)
  {
    for(j = 0; j < n; j++)
    {
      matrix[i][j] = 5;
      printf("%d\t", matrix[i][j]);
    }
    printf("\n");
  }

  printf("\n\n");

  omp_set_num_threads(NUM_BLOCKS);

  #pragma omp parallel
  {
    int id = omp_get_thread_num();
    int team = omp_get_num_threads();

    // With a full team of NUM_BLOCKS threads, thread id handles exactly
    // block id. If the runtime delivers fewer threads, the stride makes the
    // available threads share the remaining blocks instead of skipping them.
    for (int block = id; block < NUM_BLOCKS; block += team)
    {
      int first = block_size * block;
      int last = first + block_size;

      // Never touch columns outside the n x n area that was initialized.
      if (last > n)
        last = n;

      for (int row = 0; row < n; row++)
        for (int column = first; column < last; column++)
          matrix[row][column] *= factors[block];
    }
  }

  for(i = 0; i < n; i++)
  {
    for(j = 0; j < n; j++)
      printf("%d\t", matrix[i][j]);
    printf("\n");
  }

  return 0;
}

Writing asyncTaskOpenMP.c


### Run the Code

In [2]:
!gcc asyncTaskOpenMP.c -o asyncTaskOpenMP -fopenmp

In [3]:
!./asyncTaskOpenMP 10 2

5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	
5	5	5	5	5	5	5	5	5	5	


50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	
50	50	100	100	150	150	200	200	250	250	


# Questions and Answers:

## Question 1:
**What does the previous code do when it is compiled and executed?**

The code divides the matrix into blocks of columns and assigns each block to one thread; each thread then multiplies the elements of its own block by a different constant, independently of the other threads.

## Question 2:
**How would it be possible to extend the code so that the five threads perform asynchronous tasks?**

We only need to adjust the range that each thread will follow and adjust the number of "if" statements to cover all possible IDs.


## References

M. Boratto. Hands-On Supercomputing with Parallel Computing. Available: https://github.com/muriloboratto/Hands-On-Supercomputing-with-Parallel-Computing. 2022.

B. Chapman, G. Jost and R. Pas. Using OpenMP: Portable Shared Memory Parallel Programming. The MIT Press, 2007, USA.