## MPI

In [1]:
%%writefile basic_mpi.c
#include <mpi.h>    // MPI header
#include <stdio.h>  // For printf
#include <stdlib.h> // For exit()

/*
 * Minimal MPI tour: initialization, rank/size queries, one point-to-point
 * message (rank 0 -> rank 1), a broadcast from rank 0, and a sum reduction
 * of all ranks onto rank 0.
 *
 * The point-to-point step needs at least two processes; when launched with
 * a single process it is skipped instead of sending to a non-existent rank.
 *
 * Returns 0 after MPI_Finalize().
 */
int main(int argc, char** argv) {

    // ===============================
    // 1. Initialize MPI environment
    // ===============================
    // Arguments: pointers to argc and argv
    // Returns: MPI_SUCCESS if successful
    MPI_Init(&argc, &argv);

    // ===============================
    // 2. Determine total number of processes
    // ===============================
    int world_size;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    // MPI_COMM_WORLD is the default communicator including all processes

    // ===============================
    // 3. Determine the rank of this process
    // ===============================
    int world_rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    // Rank is an integer from 0 to world_size-1, unique to each process

    // ===============================
    // 4. (Optional) Get processor name
    // ===============================
    // Queried only to demonstrate the call; the name is not printed below.
    char processor_name[MPI_MAX_PROCESSOR_NAME];
    int name_len;
    MPI_Get_processor_name(processor_name, &name_len);

    // ===============================
    // 5. Example: Point-to-Point Communication
    // ===============================
    // Send data from rank 0 to rank 1.
    // Rank 1 only exists when there are at least two processes; sending to
    // an invalid rank is an error that aborts the job under the default
    // error handler, so the example is skipped for single-process runs.
    if (world_size < 2) {
        if (world_rank == 0) {
            fprintf(stderr,
                    "Skipping point-to-point example: needs at least 2 processes (got %d)\n",
                    world_size);
        }
    } else if (world_rank == 0) {
        int number = 42;
        MPI_Send(&number,     // data to send
                 1,           // number of elements
                 MPI_INT,     // data type
                 1,           // destination rank
                 0,           // message tag
                 MPI_COMM_WORLD); // communicator
        printf("Process 0 sent number %d to process 1\n", number);
    } else if (world_rank == 1) {
        int number;
        MPI_Recv(&number,     // buffer to receive
                 1,           // number of elements
                 MPI_INT,     // data type
                 0,           // source rank
                 0,           // message tag
                 MPI_COMM_WORLD, // communicator
                 MPI_STATUS_IGNORE); // status (can store info about message)
        printf("Process 1 received number %d from process 0\n", number);
    }

    // ===============================
    // 6. Example: Collective Communication
    // ===============================
    // Broadcast a number from rank 0 to all processes.
    // Only the root sets the value; every other rank's buffer is filled in
    // by MPI_Bcast.
    int broadcast_number;
    if (world_rank == 0) {
        broadcast_number = 100;
    }
    MPI_Bcast(&broadcast_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
    printf("Process %d received broadcast number %d\n", world_rank, broadcast_number);

    // ===============================
    // 7. Example: Reduction Operation
    // ===============================
    // Sum all ranks into rank 0.
    // The receive buffer is only read on the root (rank 0) below.
    int sum_of_ranks;
    MPI_Reduce(&world_rank,       // send buffer
               &sum_of_ranks,     // receive buffer
               1,                 // number of elements
               MPI_INT,           // data type
               MPI_SUM,           // operation
               0,                 // root process
               MPI_COMM_WORLD);   // communicator
    if (world_rank == 0) {
        printf("Sum of ranks = %d\n", sum_of_ranks);
    }

    // ===============================
    // 8. Finalize MPI environment
    // ===============================
    MPI_Finalize();

    return 0;
}


Writing basic_mpi.c


In [2]:
!mpicc -o basic_mpi basic_mpi.c
!mpirun -np 4 ./basic_mpi


Process 0 sent number 42 to process 1
Process 0 received broadcast number 100
Process 1 received number 42 from process 0
Process 1 received broadcast number 100
Process 2 received broadcast number 100
Process 3 received broadcast number 100
Sum of ranks = 6


## CUDA

In [3]:
%%writefile cuda_example.cu
#include <cuda_runtime.h>  // CUDA runtime API
#include <stdio.h>          // Standard IO
#include <stdlib.h>         // For malloc/free

// ===============================
// 1. Define a CUDA kernel
// ===============================
// Element-wise integer addition: d_c[k] = d_a[k] + d_b[k] for 0 <= k < n.
// Uses only the x dimension of the grid and block; each thread handles at
// most one element, so the launch must supply at least n threads in total.
__global__ void addKernel(int *d_c, const int *d_a, const int *d_b, int n) {
    // Flat position of this thread across the whole 1D grid
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have nothing to do
    if (idx >= n) {
        return;
    }

    d_c[idx] = d_a[idx] + d_b[idx];
}

// ===============================
// 2. Host code (CPU)
// ===============================
// Evaluate a CUDA runtime call and terminate with a readable message
// (file, line, CUDA error string) if it did not return cudaSuccess.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cuda_check_err_));        \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Adds two 16-element integer arrays on the GPU with addKernel and prints
// each "a + b = c" line. Returns 0 on success; exits with EXIT_FAILURE if a
// host allocation or any CUDA runtime call fails.
int main() {
    const int n = 16; // Size of arrays
    const size_t size = n * sizeof(int);

    // -------------------------------
    // 2a. Allocate host memory
    // -------------------------------
    int *h_a = (int*)malloc(size);
    int *h_b = (int*)malloc(size);
    int *h_c = (int*)malloc(size);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(h_a); // free(NULL) is a no-op, so this is safe for any subset
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // Initialize host arrays
    for (int i = 0; i < n; i++) {
        h_a[i] = i;
        h_b[i] = i * 2;
    }

    // -------------------------------
    // 2b. Allocate device memory
    // -------------------------------
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc((void**)&d_a, size));  // Allocate array on GPU
    CUDA_CHECK(cudaMalloc((void**)&d_b, size));
    CUDA_CHECK(cudaMalloc((void**)&d_c, size));

    // -------------------------------
    // 2c. Copy data from host to device
    // -------------------------------
    CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

    // -------------------------------
    // 2d. Launch kernel
    // -------------------------------
    const int threadsPerBlock = 8;
    const int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; // Ceiling division
    addKernel<<<blocksPerGrid, threadsPerBlock>>>(d_c, d_a, d_b, n);
    // A launch does not return a status; query it explicitly so a bad
    // launch configuration is reported instead of silently ignored.
    CUDA_CHECK(cudaGetLastError());

    // -------------------------------
    // 2e. Copy result back to host
    // -------------------------------
    // Blocking copy: it completes after the kernel, and its return code
    // also reports any error raised while the kernel was running.
    CUDA_CHECK(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));

    // -------------------------------
    // 2f. Print results
    // -------------------------------
    for (int i = 0; i < n; i++) {
        printf("%d + %d = %d\n", h_a[i], h_b[i], h_c[i]);
    }

    // -------------------------------
    // 2g. Free device and host memory
    // -------------------------------
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}


Writing cuda_example.cu


In [5]:
!nvcc -o cuda_example cuda_example.cu   # Compile
!./cuda_example                         # Run


0 + 0 = 0
1 + 2 = 3
2 + 4 = 6
3 + 6 = 9
4 + 8 = 12
5 + 10 = 15
6 + 12 = 18
7 + 14 = 21
8 + 16 = 24
9 + 18 = 27
10 + 20 = 30
11 + 22 = 33
12 + 24 = 36
13 + 26 = 39
14 + 28 = 42
15 + 30 = 45


| Step                         | Function / Concept                                   | Arguments                            | Purpose                                                                         |
| ---------------------------- | ---------------------------------------------------- | ------------------------------------ | ------------------------------------------------------------------------------- |
| **Kernel declaration**       | `__global__ void kernel(...)`                        | `__global__`                         | Declares a function that runs on GPU, called from host                          |
| **Thread index**             | `threadIdx`, `blockIdx`, `blockDim`                  | `threadIdx.x`, `blockIdx.x`, `blockDim.x` | Compute unique global thread index: `i = threadIdx.x + blockIdx.x * blockDim.x` |
| **Host memory allocation**   | `malloc(size)`                                       | size in bytes                        | Allocate memory on CPU                                                          |
| **Device memory allocation** | `cudaMalloc((void**)&d_a, size)`                     | address of the device pointer, size in bytes | Allocate memory on GPU                                                          |
| **Memory copy H→D**          | `cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice)` | destination, source, size, direction | Copy data from CPU to GPU                                                       |
| **Kernel launch**            | `kernel<<<blocks, threads>>>(args)`                  | blocksPerGrid, threadsPerBlock       | Execute kernel on GPU with parallel threads                                     |
| **Memory copy D→H**          | `cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost)` | destination, source, size, direction | Copy results from GPU to CPU                                                    |
| **Free GPU memory**          | `cudaFree(d_a)`                                      | pointer                              | Release GPU memory                                                              |
| **Free host memory**         | `free(h_a)`                                          | pointer                              | Release CPU memory                                                              |
