# Shared Memory Parallelism

   - Shared memory
     - Implementations
       - Vectorization
       - POSIX Threads
       - PGAS
       - OpenMP
     - Forks and joins
     - SoA or AoS
     - Critical sections (locks)
     - GPUs
       - CUDA
       - OpenCL
       - OpenACC

### OpenMP

OpenMP is defined by a set of *directives* placed in the source code, which an OpenMP-aware compiler turns into multi-threaded code at compile time.

A simple hello world using OpenMP.  Here each thread fetches the total number of threads and prints out the unique ID it was given.
```fortran
program hello_world_omp

    ! Print a greeting from every thread of an OpenMP parallel region.
    ! When compiled without OpenMP the directives and the `!$` lines are
    ! ordinary comments, so the program reports a single thread (1 of 1).

    !$ use omp_lib

    implicit none
    integer :: num_threads, thread_id

    !$omp parallel private(num_threads, thread_id)

    ! Serial defaults.  Private copies are undefined on entry to the parallel
    ! region, so they have to be assigned inside it; the `!$` lines below
    ! overwrite them whenever OpenMP is enabled.
    num_threads = 1
    thread_id = 0
    !$ num_threads = omp_get_num_threads()
    !$ thread_id = omp_get_thread_num()

    ! Thread IDs start at zero; add 1 so the output counts from one.
    print *, 'Hello from thread number', thread_id + 1, &
             ' of ', num_threads, ' threads'

    !$omp end parallel

end program hello_world_omp

```

```fortran
program yeval

   ! Fill y(i) with a function evaluated at x = i*dx for i = 1..n, letting
   ! OpenMP split the loop iterations among threads.

   !$ use omp_lib

   implicit none

   integer(kind=8), parameter :: n = 2**16
   integer(kind=4) :: i, nthreads
   !$ integer :: ios
   real(kind=8), dimension(n) :: y
   real(kind=8) :: dx, x

   nthreads = 1       ! defined value for serial mode

   ! Specify number of threads to use (only when compiled with OpenMP).
   ! omp_set_num_threads needs a positive integer, so unreadable or
   ! non-positive input falls back to a single thread.
   !$ print *, "How many threads to use? "
   !$ read (*, *, iostat=ios) nthreads
   !$ if (ios /= 0) nthreads = 0      ! unreadable input: force the fallback
   !$ if (nthreads < 1) then
   !$    print *, "Invalid thread count; using 1 thread instead"
   !$    nthreads = 1
   !$ end if
   !$ call omp_set_num_threads(nthreads)
   !$ print "('Using OpenMP with ',i3,' threads')", nthreads

   dx = 1.d0 / (n+1.d0)

   ! x is a per-iteration scratch value, so every thread needs its own copy;
   ! the loop index i is private automatically and y is shared.
   !$omp parallel do private(x)
   do i = 1, n
      x = i * dx
      y(i) = exp(x) * cos(x) * sin(x) * sqrt(5.d0 * x + 6.d0)
   enddo
   !$omp end parallel do

   print *, "Filled vector y of length", n

end program yeval
```
*Modified from amath 583 - R.J. LeVeque - http://faculty.washington.edu/rjl/classes/am583s2014/notes/openmp.html*

#### Fine-Grain vs. Coarse-Grain Parallelism

Consider the problem of normalizing a vector which requires two steps:
1. Compute the norm of the vector, and
1. Divide each entry of the vector by the norm.

Unfortunately we need to loop over every entry in the vector to compute the norm **before** we can perform the division of each entry.  There are two ways to tackle this problem:
 - Let the compiler decide what thread takes what entries (fine grain) - large number of small tasks
 - Let the programmer explicitly control which entries are handled by each thread (coarse grain) - small number of large tasks

```fortran
program fine_grain

    ! Normalize the vector x in the 1-norm using fine-grain parallelism:
    ! each loop is a work-sharing construct and OpenMP decides which thread
    ! handles which iterations.

    !$ use omp_lib
    implicit none
    integer, parameter :: n = 1000
    integer :: i, nthreads

    real(kind=8), dimension(n) :: x, y
    real(kind=8) :: norm, ynorm

    ! Specify number of threads to use:
    nthreads = 1       ! need this value in serial mode
    !$ nthreads = 4
    !$ call omp_set_num_threads(nthreads)
    !$ print "('Using OpenMP with ',i3,' threads')", nthreads

    ! initialize x:
    !$omp parallel do
    do i = 1, n
        x(i) = real(i, kind=8)  ! convert to double float
    enddo
    !$omp end parallel do

    norm = 0.d0
    ynorm = 0.d0

    ! A single parallel region holds both loops so the team of threads is
    ! created only once.
    !$omp parallel private(i)

    ! Each thread accumulates a private partial sum; the reduction clause
    ! combines the partial sums into the shared norm when the loop ends.
    !$omp do reduction(+ : norm)
    do i = 1, n
        norm = norm + abs(x(i))
    enddo
    !$omp end do

    ! The work-sharing loop above already ends with an implicit barrier, so
    ! norm is complete here; the explicit barrier is shown for illustration.
    !$omp barrier   ! not needed (implicit)

    ! Second pass: scale every entry and accumulate the norm of the result.
    !$omp do reduction(+ : ynorm)
    do i = 1, n
        y(i) = x(i) / norm
        ynorm = ynorm + abs(y(i))
    enddo
    !$omp end do

    !$omp end parallel

    print *, "norm of x = ",norm, "  n(n+1)/2 = ",n*(n+1)/2
    print *, 'ynorm should be 1.0:   ynorm = ', ynorm

end program fine_grain

```
*Modified from amath 583 - R.J. LeVeque - http://faculty.washington.edu/rjl/classes/am583s2014/notes/openmp.html*

```fortran
program coarse_grain

    ! Normalize the vector x in the 1-norm using coarse-grain parallelism:
    ! the programmer assigns each thread one contiguous range of indices and
    ! combines the per-thread partial sums by hand in critical sections.

    !$ use omp_lib
    implicit none
    integer, parameter :: n = 1000
    real(kind=8), dimension(n) :: x, y
    real(kind=8) :: norm, norm_thread, ynorm, ynorm_thread
    integer :: nthreads, points_per_thread, thread_num
    integer :: i, istart, iend

    ! Specify number of threads to use:
    nthreads = 1       ! need this value in serial mode
    !$ nthreads = 4
    !$ call omp_set_num_threads(nthreads)
    !$ print "('Using OpenMP with ',i3,' threads')", nthreads

    ! Determine how many points to handle with each thread.
    ! Integer division truncates, so adding (nthreads - 1) to the numerator
    ! first yields the ceiling of n / nthreads.
    ! Rounding up, together with the min(...) in the definition of iend
    ! below, ensures that all points get distributed to some thread.
    ! NOTE(review): this assumes the parallel region really runs with
    ! nthreads threads; if the runtime provides fewer, the trailing index
    ! ranges are never processed -- confirm for the target environment.
    points_per_thread = (n + nthreads - 1) / nthreads
    print *, "points_per_thread = ",points_per_thread

    ! initialize x:
    do i = 1, n
        x(i) = dble(i)  ! convert to double float
    enddo

    norm = 0.d0
    ynorm = 0.d0

    !$omp parallel private(i,norm_thread, &
    !$omp                  istart,iend,thread_num,ynorm_thread)

    thread_num = 0     ! needed in serial mode
    !$ thread_num = omp_get_thread_num()    ! unique for each thread

    ! Determine start and end index for the set of points to be
    ! handled by this thread:
    istart = thread_num * points_per_thread + 1
    iend = min((thread_num+1) * points_per_thread, n)

    ! Serialize the output so lines from different threads do not interleave.
    ! The literals inside the format use apostrophes because the format
    ! string itself is delimited by double quotes.
    !$omp critical
    print "('Thread ',i2,' will take i = ',i6,' through i = ',i6)", &
          thread_num, istart, iend
    !$omp end critical

    norm_thread = 0.d0
    do i = istart, iend
        norm_thread = norm_thread + abs(x(i))
    enddo

    ! update global norm with value from each thread:
    !$omp critical
      norm = norm + norm_thread
      print *, "norm updated to: ",norm
    !$omp end critical

    ! make sure all have updated norm before proceeding:
    !$omp barrier

    ynorm_thread = 0.d0
    do i = istart, iend
        y(i) = x(i) / norm
        ynorm_thread = ynorm_thread + abs(y(i))
    enddo

    ! update global ynorm with value from each thread:
    !$omp critical
      ynorm = ynorm + ynorm_thread
      print *, "ynorm updated to: ",ynorm
    !$omp end critical

    ! Redundant with the implicit barrier at the end of the parallel region;
    ! kept to make the synchronization point explicit.
    !$omp barrier

    !$omp end parallel

    print *, "norm of x = ",norm, "  n(n+1)/2 = ",n*(n+1)/2
    print *, 'ynorm should be 1.0:   ynorm = ', ynorm

end program coarse_grain
```
*Modified from amath 583 - R.J. LeVeque - http://faculty.washington.edu/rjl/classes/am583s2014/notes/openmp.html*