
# CUDA Exercise 09
> You should try to implement your own solution for matrix multiplication, and try to parallelize the computation.

This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_09.ipynb)


## Initialize the CUDA dev environment

In [1]:
# clone the code repo,
# !pip install git+git://github.com/depctg/nvcc4jupyter.git
# %load_ext nvcc_plugin
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp5gf4notc".


## Check the environment

In [2]:
!lsb_release -a
!nvcc --version
!nvidia-smi

No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 22.04.4 LTS
Release:	22.04
Codename:	jammy
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Sat Jun  7 03:22:48 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |


## Matrix Multiplication - Implimentation 01

In [3]:
%%writefile matrix_mul_01.cu
// %%cu
#include <stdio.h>

__global__ void matrix_mul(int *matrix_a, int *matrix_b, int *matrix_c,int matrix_a_row,int matrix_a_column,int matrix_b_column){
    int matrix_c_element = 0;
    for (int i = 0; i < matrix_a_column; i++){
      matrix_c_element += matrix_a[(threadIdx.x/matrix_b_column)*matrix_a_column+i] * matrix_b[threadIdx.x%matrix_b_column+i*matrix_b_column];
    }
    matrix_c[threadIdx.x]= matrix_c_element;
}

int main(int argc, char *argv[]){

    //===========================================================================
    // Below, there are three example case, which you should only uncomment one
    // of them, to run the test.
    /* Example 1
    int matrix_a[16] = {5,0,34,21,7,17,-12,28,8,-3,-3,-3,0,-3,5,9};
    int matrix_a_row = 4;
    int matrix_a_column = 4;
    int matrix_b[16] = {0,16,24,-90,-23,0,11,1,3,3,0,3,66,7,8,0};
    int matrix_b_row = 4;
    int matrix_b_column = 4;
    */

    /* Example 2
    int matrix_a[12] = {12,6,22,7,17,-12,36,9,9,0,-1,-2};
    int matrix_a_row = 4;
    int matrix_a_column = 3;
    int matrix_b[15] = {0,16,24,-1,4,-23,0,11,1,4,3,3,0,3,4};
    int matrix_b_row = 3;
    int matrix_b_column = 5;
    */

    // random initialization of larger matrixes
    // matrix_a_row * matrix_b_column <= 1024
    int matrix_a_row = 50;
    int matrix_a_column = 30;
    int *matrix_a = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_a_column));
    for(int i = 0; i < matrix_a_row; i++){
        for(int j = 0; j < matrix_a_column; j++)
        {
            int index = i * matrix_a_column+j;
            matrix_a[index] = 1;
        }
    }
    int matrix_b_row = 30;
    int matrix_b_column = 20;
    int *matrix_b = (int*) malloc(sizeof(int) * (matrix_b_row * matrix_b_column));
    for(int i = 0; i < matrix_b_row; i++){
        for(int j = 0; j < matrix_b_column; j++)
        {
            int index = i * matrix_b_column+j;
            matrix_b[index] = 2;
        }
    }

    //===========================================================================

    int *matrix_c = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_b_column));
    int *d_matrix_a, *d_matrix_b, *d_matrix_c;

    cudaMalloc((void**)&d_matrix_a,sizeof(int) * (matrix_a_row * matrix_a_column));
    cudaMalloc((void**)&d_matrix_b,sizeof(int) * (matrix_b_row * matrix_b_column));
    cudaMalloc((void**)&d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column));

    cudaMemcpy(d_matrix_a, matrix_a, sizeof(int) * (matrix_a_row * matrix_a_column), cudaMemcpyHostToDevice);
    cudaMemcpy(d_matrix_b, matrix_b, sizeof(int) * (matrix_b_row * matrix_b_column), cudaMemcpyHostToDevice);

    // implement 100 times for getting average execution time
    for(int i=0; i<100;i++){
      matrix_mul<<<1,matrix_a_row * matrix_b_column>>>(d_matrix_a, d_matrix_b, d_matrix_c, matrix_a_row,matrix_a_column, matrix_b_column);
    }

    cudaMemcpy(matrix_c, d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column), cudaMemcpyDeviceToHost);

    // print matrix_c to check correction
    for(int i = 0; i < matrix_a_row; i++){
        for(int j = 0; j < matrix_b_column; j++){
            int index = i * matrix_b_column +j;
            printf("%d, ",matrix_c[index]);
        }
        printf("\n");
    }
    cudaDeviceSynchronize();

    cudaFree(d_matrix_c);
    cudaFree(d_matrix_b);
    cudaFree(d_matrix_a);

    return 0;
}

Writing matrix_mul_01.cu


## Evaluation to collect enough information for the benchmark

In [4]:
!nvcc -o matrix_mul_01 matrix_mul_01.cu
!nvprof ./matrix_mul_01


==993== NVPROF is profiling process 993, command: ./matrix_mul_01
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0

## Matrix Multiplication - Implimentation 02

In [5]:
%%writefile matrix_mul_02.cu
//%%cu
#include <stdio.h>

__global__ void matrix_mul(int *matrix_a, int *matrix_b, int *matrix_c,int matrix_a_row,int matrix_a_column,int matrix_b_column){
    int matrix_c_element = 0;
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < matrix_a_column; i++){
      matrix_c_element += matrix_a[(tid/matrix_b_column)*matrix_a_column+i] * matrix_b[tid%matrix_b_column+i*matrix_b_column];
    }
    matrix_c[tid]= matrix_c_element;
}

int main(int argc, char *argv[]){

    //===========================================================================
    // Below, there are three example case, which you should only uncomment one
    // of them, to run the test.

    /* Example 1
    int matrix_a[16] = {5,0,34,21,7,17,-12,28,8,-3,-3,-3,0,-3,5,9};
    int matrix_a_row = 4;
    int matrix_a_column = 4;
    int matrix_b[16] = {0,16,24,-90,-23,0,11,1,3,3,0,3,66,7,8,0};
    int matrix_b_row = 4;
    int matrix_b_column = 4;
    */

    /* Example 2
    int matrix_a[12] = {12,6,22,7,17,-12,36,9,9,0,-1,-2};
    int matrix_a_row = 4;
    int matrix_a_column = 3;
    int matrix_b[15] = {0,16,24,-1,4,-23,0,11,1,4,3,3,0,3,4};
    int matrix_b_row = 3;
    int matrix_b_column = 5;
    */


    // random initialization of larger matrixes
    // matrix_a_row as number of blocks
    // matrix_b_column as number of threads per block
    int matrix_a_row = 50;
    int matrix_a_column = 30;
    int *matrix_a = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_a_column));
    for(int i = 0; i < matrix_a_row; i++){
        for(int j = 0; j < matrix_a_column; j++)
        {
            int index = i * matrix_a_column+j;
            matrix_a[index] = 1;
        }
    }
    int matrix_b_row = 30;
    int matrix_b_column = 20;
    int *matrix_b = (int*) malloc(sizeof(int) * (matrix_b_row * matrix_b_column));
    for(int i = 0; i < matrix_b_row; i++){
        for(int j = 0; j < matrix_b_column; j++)
        {
            int index = i * matrix_b_column+j;
            matrix_b[index] = 2;
        }
    }
    //===========================================================================


    int *matrix_c = (int*) malloc(sizeof(int) * (matrix_a_row * matrix_b_column));
    int *d_matrix_a, *d_matrix_b, *d_matrix_c;

    cudaMalloc((void**)&d_matrix_a,sizeof(int) * (matrix_a_row * matrix_a_column));
    cudaMalloc((void**)&d_matrix_b,sizeof(int) * (matrix_b_row * matrix_b_column));
    cudaMalloc((void**)&d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column));

    cudaMemcpy(d_matrix_a, matrix_a, sizeof(int) * (matrix_a_row * matrix_a_column), cudaMemcpyHostToDevice);
    cudaMemcpy(d_matrix_b, matrix_b, sizeof(int) * (matrix_b_row * matrix_b_column), cudaMemcpyHostToDevice);

    // implement 100 times for getting average execution time
    for(int i=0; i<100;i++){
    matrix_mul<<<matrix_a_row,matrix_b_column>>>(d_matrix_a, d_matrix_b, d_matrix_c, matrix_a_row,matrix_a_column, matrix_b_column);

    //for comparison with 01.cu
    //matrix_mul<<<1,matrix_a_row * matrix_b_column>>>(d_matrix_a, d_matrix_b, d_matrix_c, matrix_a_row,matrix_a_column, matrix_b_column);
    }

    cudaMemcpy(matrix_c, d_matrix_c,sizeof(int) * (matrix_a_row * matrix_b_column), cudaMemcpyDeviceToHost);

    // print matrix_c to check correction
    for(int i = 0; i < matrix_a_row; i++){
        for(int j = 0; j < matrix_b_column; j++){
            int index = i * matrix_b_column +j;
            printf("%d, ",matrix_c[index]);
        }
        printf("\n");
    }
    cudaDeviceSynchronize();

    cudaFree(d_matrix_c);
    cudaFree(d_matrix_b);
    cudaFree(d_matrix_a);

    return 0;
}

Writing matrix_mul_02.cu


## Evaluation to collect enough information for the benchmark

In [6]:
!nvcc -o matrix_mul_02 matrix_mul_02.cu
!nvprof ./matrix_mul_02

==1087== NVPROF is profiling process 1087, command: ./matrix_mul_02
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0,