# Sequential execution vs. parallel execution

## Sequential execution:

### First we need to import some libraries and declare some useful variables, such as folder and file paths:

In [1]:
import os

# Directory that holds the generated source file and the compiled binary.
folder_name = "sequential_matrix_multiplication"
os.makedirs(folder_name, exist_ok=True)

# The generated program is C++ (it includes <iostream>, <vector> and <chrono>),
# so it gets a .cpp extension; with a .c name, gcc and most editors would
# treat the file as C.
file_path = os.path.join(folder_name, "sequential_matrix_multiplication.cpp")

### Next we create our sequential matrix multiplication code:

In [2]:
%%writefile $file_path
#include <iostream>
#include <vector>
#include <chrono>

// Dimensions of the two operands: the first matrix is ROWS1 x COLS1 and the
// second is ROWS2 x COLS2, so the product is ROWS1 x COLS2.
const int ROWS1 = 1000;
const int COLS1 = 1000;
const int ROWS2 = 1000;
const int COLS2 = 1000;

// The product is only defined when the inner dimensions agree, so an
// inconsistent edit of the constants is rejected at compile time.
static_assert(COLS1 == ROWS2, "COLS1 must equal ROWS2 for the matrix product to be defined");

// Multiplies firstMatrix (n x m) by secondMatrix (m x p) and stores the
// n x p product in result.
//
// The dimensions are read from the arguments themselves, so the function
// works for any compatible pair of matrices. result is resized when needed;
// a result that already has the right shape is overwritten in place without
// reallocation.
//
// Precondition (not checked): both inputs are rectangular and every row of
// firstMatrix has exactly secondMatrix.size() elements.
void multiplyMatrices(const std::vector<std::vector<int>>& firstMatrix,
                      const std::vector<std::vector<int>>& secondMatrix,
                      std::vector<std::vector<int>>& result) {
    using Index = std::vector<int>::size_type;

    const Index rows = firstMatrix.size();
    const Index inner = secondMatrix.size();
    const Index cols = secondMatrix.empty() ? 0 : secondMatrix.front().size();

    result.resize(rows);
    for (Index i = 0; i < rows; i++) {
        result[i].resize(cols);
        for (Index j = 0; j < cols; j++) {
            // Accumulate in a local so result[i][j] is written exactly once.
            int sum = 0;
            for (Index k = 0; k < inner; k++) {
                sum += firstMatrix[i][k] * secondMatrix[k][j];
            }
            result[i][j] = sum;
        }
    }
}

// Writes the matrix to standard output: one line per row, with a single
// space after every element (the last element of a row included).
void displayMatrix(const std::vector<std::vector<int>>& matrix) {
    for (auto rowIt = matrix.begin(); rowIt != matrix.end(); ++rowIt) {
        for (auto valueIt = rowIt->begin(); valueIt != rowIt->end(); ++valueIt) {
            std::cout << *valueIt << " ";
        }
        std::cout << '\n';
    }
}

// Times one sequential multiplication of two constant-filled matrices and
// prints the elapsed time in seconds on stdout.
//
// Returns 0 on success, or 1 (with a message on stderr and nothing on stdout)
// if the computed product does not match the expected value.
int main() {
    // Every element of both inputs is set to this value, which makes the
    // expected product trivial to verify: COLS1 * fill_value * fill_value.
    const int fill_value = 1;

    // Create and initialize matrices
    std::vector<std::vector<int>> firstMatrix(ROWS1, std::vector<int>(COLS1, fill_value));
    std::vector<std::vector<int>> secondMatrix(ROWS2, std::vector<int>(COLS2, fill_value));
    std::vector<std::vector<int>> resultMatrix(ROWS1, std::vector<int>(COLS2, 0));

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments of the system clock while the multiplication runs.
    const auto begin = std::chrono::steady_clock::now();

    // Perform matrix multiplication
    multiplyMatrices(firstMatrix, secondMatrix, resultMatrix);

    // Stop measuring and convert the interval to seconds as a double
    const auto end = std::chrono::steady_clock::now();
    const std::chrono::duration<double> time_spent = end - begin;

    // Verify the product outside the timed region; a timing for a wrong
    // result would be meaningless.
    const int expected = COLS1 * fill_value * fill_value;
    for (const auto& row : resultMatrix) {
        for (int val : row) {
            if (val != expected) {
                std::cerr << "Verification failed: expected " << expected
                          << ", got " << val << "\n";
                return 1;
            }
        }
    }

    // Display the result
    //displayMatrix(resultMatrix);

    // Only the elapsed time goes to stdout, one value per run.
    std::cout << time_spent.count() << "\n";

    return 0;
}

Overwriting sequential_matrix_multiplication/sequential_matrix_multiplication.c


### Now we need to compile the code we wrote:

In [3]:
!g++ {file_path} -o {folder_name}/sequential_matrix_multiplication

### Execute the sequential matrix multiplication:

In [None]:
!./{folder_name}/sequential_matrix_multiplication >> result.txt
!cat result.txt

## Parallel execution

### Let's import the libraries and create some useful variables, such as folder and file names:

In [None]:
import os

# Output directory for the SYCL source file and its compiled binary.
folder_name = "dpcpp_matrix_multiplication"
# Path the source file is written to and compiled from.
file_path = os.path.join(folder_name, "dpcpp_matrix_multiplication.cpp")
os.makedirs(folder_name, exist_ok=True)

### Next we create our parallel matrix multiplication code:

In [None]:
%%writefile $file_path
#include <CL/sycl.hpp>
#include <iostream>
#include <vector>
#include <chrono>

// Multiplies two constant-filled matrices on a SYCL device and prints the
// kernel execution time in seconds on stdout.
//
// stdout carries only the kernel time reported by SYCL event profiling.
// The wall-clock time from submission to completion is reported on stderr.
// Returns 0 on success, or 1 (with a message on stderr and nothing on stdout)
// if the computed product is wrong.
int main() {
    // Matrix dimensions: A is M x K, B is K x N and C is M x N.
    const int M = 1000; // rows of A and C
    const int N = 1000; // cols of B and C
    const int K = 1000; // cols of A and rows of B

    // Matrices are stored flattened in row-major order. With A and B filled
    // with ones, every element of C must equal K, which is verified below.
    std::vector<int> A(M * K, 1); // Matrix A (MxK) with all values set to 1
    std::vector<int> B(K * N, 1); // Matrix B (KxN) with all values set to 1
    std::vector<int> C(M * N, 0); // Matrix C (MxN) as result matrix with all values set to 0

    // Create SYCL buffers for the matrices
    sycl::buffer<int> bufA(A.data(), sycl::range<1>(M * K));
    sycl::buffer<int> bufB(B.data(), sycl::range<1>(K * N));
    sycl::buffer<int> bufC(C.data(), sycl::range<1>(M * N));

    // Create a queue to submit work to, with profiling enabled
    sycl::queue queue(sycl::default_selector{}, sycl::property::queue::enable_profiling{});

    // Start timer for total operation time
    auto total_start = std::chrono::steady_clock::now();

    // Perform the matrix multiplication
    sycl::event event = queue.submit([&](sycl::handler& cgh) {
        // Get access to the buffers
        auto accA = bufA.get_access<sycl::access::mode::read>(cgh);
        auto accB = bufB.get_access<sycl::access::mode::read>(cgh);
        auto accC = bufC.get_access<sycl::access::mode::write>(cgh);

        // One work-item per element of C: in the 2-D range (M, N), idx[0] is
        // the row and idx[1] the column of the element being computed.
        cgh.parallel_for<class matrix_mult>(sycl::range<2>(M, N), [=](sycl::id<2> idx) {
            int row = idx[0];
            int col = idx[1];
            int sum = 0;
            for (int k = 0; k < K; ++k) {
                sum += accA[row * K + k] * accB[k * N + col];
            }
            accC[row * N + col] = sum;
        });
    });

    // Wait for the event to complete
    event.wait();

    // End timer for total operation time
    auto total_end = std::chrono::steady_clock::now();
    std::chrono::duration<double> total_time = total_end - total_start;

    // Calculate time taken to execute the kernel (profiling values are in nanoseconds)
    auto kernel_start_time = event.get_profiling_info<sycl::info::event_profiling::command_start>();
    auto kernel_end_time = event.get_profiling_info<sycl::info::event_profiling::command_end>();
    auto kernel_time_ns = kernel_end_time - kernel_start_time;

    // Read the result back on the host and verify it; a timing for a wrong
    // product would be meaningless. The accessor is scoped so that it is
    // released as soon as the check is done.
    {
        auto hostC = bufC.get_access<sycl::access::mode::read>();
        for (int i = 0; i < M * N; ++i) {
            if (hostC[i] != K) {
                std::cerr << "Verification failed at index " << i << ": expected "
                          << K << ", got " << hostC[i] << std::endl;
                return 1;
            }
        }
    }

    // Wall-clock time goes to stderr so that stdout keeps exactly one value.
    std::cerr << "Total time (submit to completion): " << total_time.count() << " s" << std::endl;

    // Print the profiling result (kernel time in seconds)
    std::cout << kernel_time_ns * 1e-9 << std::endl;

    return 0;
}

### Now we need to compile the code we wrote:

In [None]:
# Compile the SYCL source with icpx; -fsycl enables SYCL compilation.
# NOTE(review): -w appears to silence every compiler warning, which would also
# hide SYCL deprecation notices — confirm this is intended.
!icpx -fsycl {file_path} -w -o {folder_name}/dpcpp_matrix_multiplication

### Execute the parallel matrix multiplication:

In [None]:
# Append the kernel time so it becomes the last line of result.txt, which is
# the line the speedup cell reads with `tail -n 1`.
!./{folder_name}/dpcpp_matrix_multiplication >> result.txt

In [None]:
# Show the timings recorded so far, one value in seconds per line.
!cat result.txt

### Now let's calculate our speedup (sequential_execution_time / parallel_execution_time):

In [None]:
# Parse the timings with float() rather than piping them through bc: the C++
# programs print via std::cout, which switches to scientific notation
# (e.g. 5e-05) for small values, and bc cannot read that format.
with open("result.txt") as results_file:
    timings = [float(line) for line in results_file if line.strip()]

if len(timings) < 2:
    raise ValueError(
        "result.txt must contain the sequential and the parallel timing; "
        "run both programs first"
    )

# The first line is the sequential run and the last line the parallel run.
sequential_time, parallel_time = timings[0], timings[-1]
if parallel_time <= 0:
    raise ValueError(f"parallel time must be positive, got {parallel_time}")

print("SpeedUp: ")
print(f"{sequential_time / parallel_time:.10f}")