# 1. Hello World

Key components of MPI:
  - `<mpi.h>`: Header file of MPI, must include this
  - `MPI_Init`: Initialization of MPI, must call before any other MPI routines.
  - `MPI_Comm_size`: Get the total number of processors. For example, when launched with `mpirun -np 4 ./main.ex` it returns `4`.
  - `MPI_Comm_rank`: Get the rank (the number 0, 1, 2 ... of P0, P1, P2, ...)
  - `MPI_Finalize()`: Pair with initialization.

In the following example, you will see the basic usage of MPI.
Note that the physical processor name (the machine a process runs on) is not the same as the logical processor rank.

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  int myid, numprocs;
  MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
  MPI_Comm_rank(MPI_COMM_WORLD, &myid);

  char processor_name[MPI_MAX_PROCESSOR_NAME];
  int name_len;
  MPI_Get_processor_name(processor_name, &name_len);

  std::cout << "Hello world from processor " << processor_name << ", rank " << myid << " out of " << numprocs << " processors" << std::endl;

  MPI_Finalize();
  return 0;
}

In [None]:
!mpicxx main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 4 ./main.ex

# 2. Combine MPI with OpenMP

You can combine MPI with OpenMP. Each MPI process is able to use OpenMP.

 - If you have np MPI processes with p OpenMP threads each, you will have np*p cores working together.
 - Try the following code and see if it works as you expected.

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <cstdlib>
#include <iostream>
#include <unistd.h>
#include <omp.h>

int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int myid, numprocs;
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);

   if(argc != 2)
   {
      omp_set_num_threads(2);
   }
   else
   {
      omp_set_num_threads(atoi(argv[1]));
   }
   if(myid == 0)
   {
      std::cout << "Number of processors: " << numprocs << std::endl;
      std::cout << "Number of threads: " << omp_get_max_threads() << std::endl;
   }

   int *x = new int[numprocs];
   for(int i = 0; i < numprocs; i++)
   {
      x[i] = i + myid * numprocs;
   }

   for(int id = 0 ; id < numprocs; id++)
   {
      if(myid == id)
      {
         #pragma omp parallel
         {
            int tid = omp_get_thread_num();
            #pragma omp critical
            {
               std::cout << "Processor " << myid << " thread " << tid << " has x = ";
               for(int i = 0; i < numprocs; i++)
               {
                  std::cout << x[i] << " ";
               }
               std::cout << std::endl;
            }
         }
      }
      MPI_Barrier(MPI_COMM_WORLD);
   }

   sleep(1);

   for(int i = 0; i < numprocs; i++)
   {
      if(myid == i)
      {
         #pragma omp parallel
         {
            int tid = omp_get_thread_num();
            #pragma omp critical
            {
               std::cout << "x[0] on processor " << myid << " thread " << tid << " is " << x[0] << std::endl;
            }
         }
      }
      MPI_Barrier(MPI_COMM_WORLD);
   }

   MPI_Finalize();
   return 0;
}

In [None]:
!mpicxx main.cpp -o main.ex -fopenmp

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 4 ./main.ex

# 3. MPI Communicator

 - A communicator is a group of processors. The same processor can have a different rank/myid in different communicators.
 - In the example below, we group processor i with i+1 for all even i. You can see different rank in MPI_COMM_WORLD and the new comm.
 - We will revisit this later.


In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  if (size % 2 != 0) {
    if (rank == 0) {
      std::cerr << "Error: Number of processors must be even." << std::endl;
    }
    MPI_Finalize();
    return 1;
  }

  int color = rank / 2;
  MPI_Comm new_comm;
  MPI_Comm_split(MPI_COMM_WORLD, color, rank, &new_comm);

  int new_rank, new_size;
  MPI_Comm_rank(new_comm, &new_rank);
  MPI_Comm_size(new_comm, &new_size);

  std::cout << "Processor " << rank << " (world rank) is now in communicator " << color
            << " with rank " << new_rank << " (local rank) and size " << new_size
            << std::endl;

  MPI_Comm_free(&new_comm);

  MPI_Finalize();
  return 0;
}

In [None]:
!mpiCC main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 4 ./main.ex

# 4. Next we introduce the bcast operation in MPI.

 - Bcast: send information from root to all other processors.
 - In the code below, we bcast the value of x from processor number 0 to all other processors. You can see a change of value afterward.

Bcast:
  - Pointer to the data: the data to be broadcast; this is also where the other processes receive the data. For a single number use & to get its address
  - Length of the data: for a single number it is 1. You can also send an array of longer size.
  - Data type: int, float
  - Root: the processor that is sending data out
  - MPI_Comm

Questions: How to modify the code to send from processor 1 instead of processor 0?

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>

int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int world_size = 0;
   int world_rank = 0;
   MPI_Comm_size(MPI_COMM_WORLD, &world_size);
   MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

   // Every process starts with its own rank as the value.
   int x = world_rank;

   // Prints the current x of every process, one process per turn; the barrier
   // after each turn makes the processes take turns in rank order.
   auto report_x = [&](const char* prefix)
   {
      for(int turn = 0; turn < world_size; ++turn)
      {
         if(turn == world_rank)
         {
            std::cout << prefix << world_rank << " has x = " << x << std::endl;
         }
         MPI_Barrier(MPI_COMM_WORLD);
      }
   };

   report_x("Before Bcast, processor ");

   // Root 0 sends its x; every other process overwrites its x with that value.
   const int root = 0;
   MPI_Bcast(&x, 1, MPI_INT, root, MPI_COMM_WORLD);

   report_x("After Bcast, processor ");

   MPI_Finalize();
   return 0;
}

In [None]:
!mpicxx main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 4 ./main.ex

# 5.1 Next, we start talking about send and recv

 - We start from the blocking version of send and receive.
 - These are the simplest routines.

Sender:
 - pointer to the data: for number use `&` to get address
 - send data length: length of the array. If a number, set to 1.
 - send data type: integer? fp32?
 - dest: where the message goes to
 - tag: message unique id. If the send and recv tags do not match, the data will not be received.
 - MPI_Comm: the mpi communicator

Receiver:
 - pointer to the data
 - recv data length:
 - recv data type: type of data that is being received
 - source: where the message is from
 - tag: message unique id.
 - MPI_Comm: the communicator
 - MPI_Status: special structure to get some useful information

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>

int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int world_size = 0;
   int world_rank = 0;
   MPI_Comm_size(MPI_COMM_WORLD, &world_size);
   MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

   // The example is written for one sender and one receiver only.
   if (world_size != 2) {
      if (world_rank == 0) {
        std::cerr << "Error: This program requires exactly 2 processors." << std::endl;
      }
      MPI_Finalize();
      return 1;
   }

   // Both sides must agree on the peer rank, the tag and the communicator.
   const int sender = 0;
   const int receiver = 1;
   const int tag = 0;

   if (world_rank == sender) {
      int payload = 123;
      MPI_Send(&payload, 1, MPI_INT, receiver, tag, MPI_COMM_WORLD);
      std::cout << "Processor 0 sent message: " << payload << std::endl;
   }
   else if (world_rank == receiver) {
      int inbox;
      // Blocks until the matching message from the sender has arrived.
      MPI_Recv(&inbox, 1, MPI_INT, sender, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      std::cout << "Processor 1 received message: " << inbox << std::endl;
   }

   MPI_Finalize();
   return 0;
}

In [None]:
!mpiCC main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 2 ./main.ex

# 5.2 Examples of deadlock

 - Unpaired communication (send with no receiver)
    - P0:
          send to P1
    - P1:
          no recv of the send
    - Be very careful in your code. This can be caused by tag mismatch, communicator mismatch, and so on.
 - Both processes blocked somewhere
    - P0:
          send message 0 to P1
          recv message 1 from P1
    - P1:
          send message 1 to P0
          recv message 0 from P0
    - The code can block at the send commands (whenever the MPI library does not buffer the message), because then neither process ever reaches its recv command
 - And many more

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>

// Dead lock due to blocking at the MPI_Recv
// Question: How to fix the following code?

// Intentional deadlock demo for exactly 2 processes: both ranks call the
// blocking MPI_Recv before their MPI_Send, so each one waits for a message
// that the other has not sent yet.
int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int myid, numprocs;
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);

   // The exchange below is hard-coded for ranks 0 and 1.
   if (numprocs != 2) {
      if (myid == 0) {
        std::cerr << "Error: This program requires exactly 2 processors." << std::endl;
      }
      MPI_Finalize();
      return 1;
   }

   if (myid == 0) {
      int message = 123;
      int received_message;
      // Waits for tag 1 from rank 1, which rank 1 only sends after its own
      // receive has completed.
      MPI_Recv(&received_message, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      std::cout << "Processor 0 received message: " << received_message << std::endl;
      // Not reached while the receive above is still waiting.
      MPI_Send(&message, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
      std::cout << "Processor 0 sent message: " << message << std::endl;
   }
   else if (myid == 1) {
      int message = 321;
      int received_message;
      // Mirror image of rank 0: waits for tag 0 from rank 0 before sending.
      MPI_Recv(&received_message, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      std::cout << "Processor 1 received message: " << received_message << std::endl;
      // Not reached while the receive above is still waiting.
      MPI_Send(&message, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
      std::cout << "Processor 1 sent message: " << message << std::endl;
   }

   MPI_Finalize();
   return 0;
}

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>

// Unpaired send recv due to tag mismatch

// Intentional hang demo for exactly 2 processes: rank 0 sends with tag 111
// while rank 1 waits for tag 222, so the receive is never matched.
int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int myid, numprocs;
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);

   // The exchange below is hard-coded for ranks 0 and 1.
   if (numprocs != 2) {
      if (myid == 0) {
        std::cerr << "Error: This program requires exactly 2 processors." << std::endl;
      }
      MPI_Finalize();
      return 1;
   }

   if (myid == 0) {
      int message = 123;
      // Sent with tag 111.
      // NOTE(review): whether this call returns before a matching receive
      // exists presumably depends on the MPI library buffering the message.
      MPI_Send(&message, 1, MPI_INT, 1, 111, MPI_COMM_WORLD);
      std::cout << "Processor 0 sent message: " << message << std::endl;
   }
   else if (myid == 1) {
      int received_message;
      // Asks for tag 222; the only message sent in this program has tag 111.
      MPI_Recv(&received_message, 1, MPI_INT, 0, 222, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      std::cout << "Processor 1 received message: " << received_message << std::endl;
   }

   MPI_Finalize();
   return 0;
}

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>

// Unpaired send recv due to communicator mismatch
// Even if they include same list of processors!

// Intentional hang demo for exactly 2 processes: the message is sent on a
// duplicate of MPI_COMM_WORLD but the receive is posted on MPI_COMM_WORLD
// itself, so the two calls never match.
int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int myid, numprocs;
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);

   // new_world is created from MPI_COMM_WORLD but is a separate communicator.
   // NOTE(review): new_world is never passed to MPI_Comm_free in this program.
   MPI_Comm new_world;
   MPI_Comm_dup(MPI_COMM_WORLD, &new_world);

   // The exchange below is hard-coded for ranks 0 and 1.
   if (numprocs != 2) {
      if (myid == 0) {
        std::cerr << "Error: This program requires exactly 2 processors." << std::endl;
      }
      MPI_Finalize();
      return 1;
   }

   if (myid == 0) {
      int message = 123;
      // Sent on new_world.
      MPI_Send(&message, 1, MPI_INT, 1, 0, new_world);
      std::cout << "Processor 0 sent message: " << message << std::endl;
   }
   else if (myid == 1) {
      int received_message;
      // Posted on MPI_COMM_WORLD; same source and tag, different communicator.
      MPI_Recv(&received_message, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      std::cout << "Processor 1 received message: " << received_message << std::endl;
   }

   MPI_Finalize();
   return 0;
}

In [None]:
!mpicxx main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 2 ./main.ex

# 5.3 Probe

 - In the following code we have three processes, mimicking the following scenario:
    - Three processes running some tasks in parallel
    - Processor 0 then needs to gather the results and do some post processing
 - In the first example, we use recv
    - Recv from P1 first
    - Must wait for P1 to complete
    - Total time:
      1. 5 seconds for P1 to finish parallel task
      2. 6 seconds (2*3) for handling data after that
      3. 11 seconds in total
 - In the second example, we use probe
    - Recv from whatever comes first. In our example it is P2
    - Total time:
      1. 2 seconds for P0 and P2 to both finish, then the first communication happens.
      2. 3 seconds for P0 to handle data from P2
      3. Now we can directly receive data from P1
      4. Total 8 seconds.

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>
#include <unistd.h>
#include <chrono>

// Recv version
// Switch to prob version by change below to true
bool use_probe = false;

int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int myid, numprocs;
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);

   if (numprocs != 3) {
      if (myid == 0) {
        std::cerr << "Error: This program requires exactly 3 processors." << std::endl;
      }
      MPI_Finalize();
      return 1;
   }

   auto start = std::chrono::high_resolution_clock::now();

   // parallel task:
   // pretend that different tasks take different time
   if(myid == 0) sleep(2);
   else if(myid == 1) sleep(5);
   else if(myid == 2) sleep(1);

   // gathering results to processor 0

   if (myid == 1) {
      int message = 123;
      MPI_Send(&message, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
      std::cout << "Processor 1 sent message: " << message << std::endl;
   }
   else if (myid == 2) {
      int message = 321;
      MPI_Send(&message, 1, MPI_INT, 0, 2, MPI_COMM_WORLD);
      std::cout << "Processor 2 sent message: " << message << std::endl;
   }
   else if (myid == 0) {
      int received_message;
      // in this loop, the data from processor 1 will be received first
      for(int i = 1 ; i <= 2; i++)
      {
         MPI_Status status;


         // Recv version
         if(!use_probe)
         {
            MPI_Recv(&received_message, 1, MPI_INT, i, i, MPI_COMM_WORLD, &status);
            std::cout << "Processor " << myid << " received message: " << received_message << " from processor " << status.MPI_SOURCE << " with tag " << status.MPI_TAG << std::endl;
         }
         // Probe version
         else
         {
            MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
            int count;
            MPI_Get_count(&status, MPI_INT, &count);
            MPI_Recv(&received_message, count, MPI_INT, status.MPI_SOURCE, status.MPI_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            std::cout << "Processor " << myid << " received message: " << received_message << " from processor " << status.MPI_SOURCE << " with tag " << status.MPI_TAG << std::endl;
         }


         // pretend that we need to process the received data
         sleep(3);
      }
      auto end = std::chrono::high_resolution_clock::now();
      std::chrono::duration<double> elapsed = end - start;
      std::cout << "Total time: " << elapsed.count() << " seconds" << std::endl;
   }

   MPI_Finalize();
   return 0;
}

In [None]:
!mpicxx main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 3 ./main.ex

# 5.4 Example of different send/recv type and count

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>
#include <vector>
#include <complex>

int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int myid, numprocs;
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);

   if (numprocs != 2) {
      if (myid == 0) {
        std::cerr << "Error: This program requires exactly 2 processors." << std::endl;
      }
      MPI_Finalize();
      return 1;
   }

   if (myid == 0) {
      std::vector<float> send(8);
      for(int i = 0; i < 8; i++)
      {
         send[i] = i;
      }
      std::cout << "Processor 0 sent message: " << std::endl;
      for(int i = 0; i < 8; i++)
      {
         std::cout << send[i] << " ";
      }
      std::cout << std::endl;
      MPI_Send(send.data(), 8, MPI_FLOAT, 1, 0, MPI_COMM_WORLD);
   }
   else if (myid == 1) {
      std::vector<std::complex<float>> received(4);
      MPI_Recv(received.data(), 4, MPI_COMPLEX, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
      std::cout << "Processor 1 received message: " << std::endl;
      for(int i = 0; i < 4; i++)
      {
         std::cout << received[i] << " ";
      }
      std::cout << std::endl;
   }

   MPI_Finalize();
   return 0;
}

In [None]:
!mpicxx main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 2 ./main.ex

# 6. Reduce

Reduction. In the following example:
 - If run with -np 4
 - x is 0 1 2 3
 - sum is initialized to 10086
 - Sum x and put results to sum on processor 0
 - So the sum on processor 0 is changed to 6

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>

int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int world_size = 0;
   int world_rank = 0;
   MPI_Comm_size(MPI_COMM_WORLD, &world_size);
   MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

   // Each process contributes its rank; sum starts with a recognizable
   // placeholder so it is visible which processes get it overwritten.
   int x = world_rank;
   int sum = 10086;

   // Prints one value per process, one process per turn; the barrier after
   // each turn makes the processes take turns in rank order.
   auto report = [&](const char* prefix, const char* label, int value)
   {
      for(int turn = 0; turn < world_size; ++turn)
      {
         if(turn == world_rank)
         {
            std::cout << prefix << world_rank << label << value << std::endl;
         }
         MPI_Barrier(MPI_COMM_WORLD);
      }
   };

   report("Before Reduce, processor ", " has x = ", x);

   // Add up x over all processes; only the root (processor 0) receives the result.
   const int root = 0;
   MPI_Reduce(&x, &sum, 1, MPI_INT, MPI_SUM, root, MPI_COMM_WORLD);

   report("After Reduce, processor ", " has sum = ", sum);
   MPI_Barrier(MPI_COMM_WORLD);

   MPI_Finalize();

   return 0;
}

In [None]:
!mpicxx main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 4 ./main.ex

# 7 Scatter and scatter v

Question: how to scatter 4 5 6 7 (on P1)?

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>
#include <unistd.h>

// MPI_Scatter demo: every process fills an array x of numprocs integers, then
// processor 0 (the root) hands out one element of ITS x to every process.
int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int myid, numprocs;
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);

   if(myid == 0)
   {
      std::cout << "Number of processors: " << numprocs << std::endl;
   }

   // Send buffer: process r holds r*numprocs, ..., r*numprocs + numprocs-1.
   // Only the root's copy is actually scattered.
   int *x = new int[numprocs];
   for(int i = 0; i < numprocs; i++)
   {
      x[i] = i + myid * numprocs;
   }

   // Receive slot: every process gets exactly one integer, so a plain local
   // variable is enough (no heap allocation needed).
   int y = 0;

   // One process prints per iteration; the barrier after each turn makes the
   // processes take turns in rank order.
   for(int id = 0 ; id < numprocs; id++)
   {
      if(myid == id)
      {
         std::cout << "Before Scatter, processor " << myid << " has x = ";
         for(int i = 0; i < numprocs; i++)
         {
            std::cout << x[i] << " ";
         }
         std::cout << std::endl;
      }
      MPI_Barrier(MPI_COMM_WORLD);
   }

   // Root 0 sends x[r] to process r (1 element each); the result lands in y.
   MPI_Scatter(x, 1, MPI_INT, &y, 1, MPI_INT, 0, MPI_COMM_WORLD);

   // Report the received element in the same rank-ordered way as above.
   for(int id = 0 ; id < numprocs; id++)
   {
      if(myid == id)
      {
         std::cout<<"Processor "<<myid<<" has " <<y <<std::endl;
      }
      MPI_Barrier(MPI_COMM_WORLD);
   }

   // Release the buffer allocated with new[] above.
   delete[] x;

   MPI_Finalize();
   return 0;
}

In [None]:
!mpicxx main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 4 ./main.ex

Scatterv example:
 - x: 0 1 2 3 4 5
 - P0:
    - disp: 0
    - length: 2
    - start from x[0], send x[0] and x[1]
 - P1:
    - disp: 2
    - length: 2
    - start from x[2], send x[2] and x[3]

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>
#include <unistd.h>

// MPI_Scatterv demo for exactly 4 processes: processor 0 (the root) splits
// the 6-element array x into pieces of different length (2, 2, 1, 1) and
// sends one piece to every process.
int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int myid, numprocs;
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);

   if(myid == 0)
   {
      std::cout << "Number of processors: " << numprocs << std::endl;
   }

   // sendcount and displs below have exactly 4 entries.
   if(numprocs != 4)
   {
      if(myid == 0)
      {
         std::cerr << "Error: This program requires exactly 4 processors." << std::endl;
      }
      MPI_Finalize();
      return 1;
   }

   // Piece r starts at x[displs[r]] and is sendcount[r] elements long.
   int sendcount[4] = {2,2,1,1};
   int displs[4] = {0,2,4,5};
   float x[6];
   for(int i = 0; i < 6; i++)
   {
      x[i] = i;
   }
   if(myid == 0)
   {
      std::cout << "Before Scatterv, processor " << myid << " has x = ";
      for(int i = 0; i < 6; i++)
      {
         std::cout << x[i] << " ";
      }
      std::cout << std::endl;
   }
   MPI_Barrier(MPI_COMM_WORLD);

   // Receive buffer sized for this process' own piece.
   float *y = new float[sendcount[myid]];
   MPI_Scatterv(x, sendcount, displs, MPI_FLOAT, y, sendcount[myid], MPI_FLOAT, 0, MPI_COMM_WORLD);

   // One process prints per iteration; the barrier after each turn makes the
   // processes take turns in rank order.
   for(int id = 0 ; id < numprocs; id++)
   {
      if(myid == id)
      {
         std::cout << "After Scatterv, processor " << myid << " has y = ";
         for(int i = 0; i < sendcount[myid]; i++)
         {
            std::cout << y[i] << " ";
         }
         std::cout << std::endl;
      }
      MPI_Barrier(MPI_COMM_WORLD);
   }

   // Release the buffer allocated with new[] above.
   delete[] y;

   MPI_Finalize();
   return 0;
}

In [None]:
!mpicxx main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 4 ./main.ex

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>
#include <unistd.h>

// MPI_Gather demo: every process owns 2 integers in y; processor 0 (the root)
// collects all of them, in rank order, into its array x.
int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int myid, numprocs;
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);

   if(myid == 0)
   {
      std::cout << "Number of processors: " << numprocs << std::endl;
   }

   // Receive buffer. MPI_Gather only fills it on the root, but every process
   // prints it below, so it is zero-initialized (note the trailing "()") to
   // avoid reading indeterminate values on the other processes.
   int *x = new int[2*numprocs]();

   // Send buffer: process r contributes 2r and 2r+1.
   int *y = new int[2];
   for(int i = 0; i < 2; i++)
   {
      y[i] = i + myid * 2;
   }

   // One process prints per iteration; the barrier after each turn makes the
   // processes take turns in rank order.
   for(int id = 0 ; id < numprocs; id++)
   {
      if(myid == id)
      {
         std::cout << "Before Gather, processor " << myid << " has y = ";
         for(int i = 0; i < 2; i++)
         {
            std::cout << y[i] << " ";
         }
         std::cout << std::endl;
      }
      MPI_Barrier(MPI_COMM_WORLD);
   }

   // Each process sends its 2 elements; root 0 stores the piece of process r
   // at x[2r] and x[2r+1].
   MPI_Gather(y, 2, MPI_INT, x, 2, MPI_INT, 0, MPI_COMM_WORLD);

   // Only processor 0 shows the gathered data; the others still show zeros.
   for(int id = 0 ; id < numprocs; id++)
   {
      if(myid == id)
      {
         std::cout << "After Gather, processor " << myid << " has x = ";
         for(int i = 0; i < 2*numprocs; i++)
         {
            std::cout << x[i] << " ";
         }
         std::cout << std::endl;
      }
      MPI_Barrier(MPI_COMM_WORLD);
   }

   // Release the buffers allocated with new[] above.
   delete[] y;
   delete[] x;

   MPI_Finalize();
   return 0;
}

In [None]:
!mpicxx main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 4 ./main.ex

# 8 All to all

Like a transpose

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>
#include <unistd.h>

// MPI_Alltoall demo: every process holds one row of a numprocs x numprocs
// matrix in x; after the exchange every process holds one column instead,
// i.e. the matrix has been transposed across the processes.
int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int myid, numprocs;
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);

   if(myid == 0)
   {
      std::cout << "Number of processors: " << numprocs << std::endl;
   }

   // Row r of the matrix: r*numprocs, ..., r*numprocs + numprocs-1.
   int *x = new int[numprocs];
   for(int i = 0; i < numprocs; i++)
   {
      x[i] = i + myid * numprocs;
   }

   // One process prints per iteration; the barrier after each turn makes the
   // processes take turns in rank order.
   for(int id = 0 ; id < numprocs; id++)
   {
      if(myid == id)
      {
         std::cout << "Before all-to-all broadcast, processor " << myid << " has x = ";
         for(int i = 0; i < numprocs; i++)
         {
            std::cout << x[i] << " ";
         }
         std::cout << std::endl;
      }
      MPI_Barrier(MPI_COMM_WORLD);
   }

   // Element j of this process goes to process j, and element i of the result
   // comes from process i. With MPI_IN_PLACE as the send buffer the data is
   // taken from and written back to x, and the send count/type arguments are
   // ignored.
   MPI_Alltoall(MPI_IN_PLACE, 1, MPI_INT, x, 1, MPI_INT, MPI_COMM_WORLD);

   for(int id = 0 ; id < numprocs; id++)
   {
      if(myid == id)
      {
         std::cout << "After all-to-all broadcast, processor " << myid << " has x = ";
         for(int i = 0; i < numprocs; i++)
         {
            std::cout << x[i] << " ";
         }
         std::cout << std::endl;
      }
      MPI_Barrier(MPI_COMM_WORLD);
   }

   // Release the buffer allocated with new[] above.
   delete[] x;

   MPI_Finalize();
   return 0;
}

In [None]:
!mpicxx main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 4 ./main.ex

# 9 Allreduce

Reduce + bcast

In [None]:
%%writefile main.cpp
#include <mpi.h>
#include <iostream>

int main(int argc, char** argv) {
   MPI_Init(&argc, &argv);

   int world_size = 0;
   int world_rank = 0;
   MPI_Comm_size(MPI_COMM_WORLD, &world_size);
   MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

   // Each process contributes its rank; sum starts with a recognizable
   // placeholder so it is visible that every process gets it overwritten.
   int x = world_rank;
   int sum = 10086;

   // Prints one value per process, one process per turn; the barrier after
   // each turn makes the processes take turns in rank order.
   auto report = [&](const char* prefix, const char* label, int value)
   {
      for(int turn = 0; turn < world_size; ++turn)
      {
         if(turn == world_rank)
         {
            std::cout << prefix << world_rank << label << value << std::endl;
         }
         MPI_Barrier(MPI_COMM_WORLD);
      }
   };

   report("Before Allreduce, processor ", " has x = ", x);

   // Add up x over all processes; unlike MPI_Reduce there is no root argument
   // and every process receives the result in sum.
   MPI_Allreduce(&x, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

   report("After Allreduce, processor ", " has sum = ", sum);
   MPI_Barrier(MPI_COMM_WORLD);

   MPI_Finalize();

   return 0;
}

In [None]:
!mpiCC main.cpp -o main.ex

In [None]:
!mpirun --oversubscribe --allow-run-as-root -np 4 ./main.ex