
# CUDA Exercise 08
> You should try to implement your own solution for matrix vector multiplication, and try to parallelize the computation.

This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_08.ipynb)


## Initialize the CUDA dev environment

In [1]:
# clone the code repo,
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpbiyaytcd".


## Check the environment

In [2]:
!lsb_release -a
!nvcc --version
!nvidia-smi

No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 22.04.4 LTS
Release:	22.04
Codename:	jammy
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Thu Jun  5 12:11:16 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |


## Naive approach of matrix vector multiplication
Try to optimize it, you can do much better!

In [3]:
%%writefile matrix_vector_multiplication.cu
#include <stdio.h>
#include <assert.h>

#define M 100
#define N 100
#define MAX_ERR 1e-4

__global__ void matrix_vector_multiplication(float* vector_result, float *matrix_a, float *vector_b, int m_row, int n_col)
{
    extern __shared__ float temp[];

    // blockIdx.x => which row
    // blockDim.x => row length
    // threadIdx.x => which element in this row

    // Unique tid which can index each single element in the matrix
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // the condiction logic make sure, we only do the calculation in the matrix space
    int size_of_the_matrix = m_row*n_col;
    if(tid<size_of_the_matrix)
    {
        temp[tid] = matrix_a[tid] * vector_b[threadIdx.x]; // sum
    }

    __syncthreads(); // synchronize all threads

    // The accumulation only needs to happen at thread_0
    if (threadIdx.x == 0)
    {
        float sum = 0;
        int index = blockIdx.x * blockDim.x;
        for (int i = index; i < index + blockDim.x ; i++)
        {
            sum += temp[i];
        }
        vector_result[blockIdx.x] = sum;
    }
}

int main()
{
    float *martix_a, *martix_b, *vector_result;
    float *d_martix_a, *d_martix_b, *d_vector_result;

    martix_a = (float*)malloc(sizeof(float) * (M * N));
    martix_b = (float*)malloc(sizeof(float) * (N));
    vector_result = (float*)malloc(sizeof(float) * (M));

    // data initializtion
    for(int raw_num = 0; raw_num < M; raw_num++)
    {
        for(int col_num = 0; col_num < N; col_num++)
        {
            int index = raw_num*N+col_num;
            martix_a[index] = raw_num*3.14f+col_num;
        }
    }

    for(int col_num = 0; col_num < N; col_num++)
    {
        martix_b[col_num] = col_num+1;
    }

    // Allocate memory on GPU
    cudaMalloc((void**)&d_martix_a, sizeof(float) * (M * N));
    cudaMalloc((void**)&d_martix_b, sizeof(float) * N);
    cudaMalloc((void**)&d_vector_result, sizeof(float) * M);

    // copy operator to GPU
    cudaMemcpy(d_martix_a, martix_a, sizeof(float) * (M * N), cudaMemcpyHostToDevice);
    cudaMemcpy(d_martix_b, martix_b, sizeof(float) * N, cudaMemcpyHostToDevice);

    // GPU do the work, CPU waits
    matrix_vector_multiplication<<<M,N,sizeof(float) * (M * N)>>>(d_vector_result, d_martix_a, d_martix_b, M, N);

    // Get results from the GPU
    cudaMemcpy(vector_result, d_vector_result, sizeof(float) * M, cudaMemcpyDeviceToHost);

    // Test the result
    for(int i = 0; i < M; i++)
    {
        float temp_sum =0;
        for(int j = 0; j < N; j++)
        {
            int index = i*N+j;
            temp_sum = temp_sum + martix_a[index]*martix_b[j];
        }
        //printf("out[%d]: %f, %f\n", i, temp_sum, vector_result[i]);

        assert(fabs(vector_result[i] - temp_sum) < MAX_ERR);
    }
    printf("PASSED\n");

    // Free the memory
    cudaFree(d_martix_a);
    cudaFree(d_martix_b);
    cudaFree(d_vector_result);
    free(martix_a);
    free(martix_b);
    free(vector_result);

    return 0;
}

Writing matrix_vector_multiplication.cu


## Evaluation to collect enough information for the benchmark

In [4]:
!nvcc -o matrix_vector_multiplication matrix_vector_multiplication.cu
!nvprof ./matrix_vector_multiplication 0 0
!nvprof ./matrix_vector_multiplication 1 0
!nvprof ./matrix_vector_multiplication 2 0
!nvprof ./matrix_vector_multiplication 3 0
!nvprof ./matrix_vector_multiplication 4 0

==502== NVPROF is profiling process 502, command: ./matrix_vector_multiplication 0 0
matrix_vector_multiplication: matrix_vector_multiplication.cu:91: int main(): Assertion `fabs(vector_result[i] - temp_sum) < MAX_ERR' failed.
==502== Profiling application: ./matrix_vector_multiplication 0 0
==502== Profiling result:
No kernels were profiled.
No API activities were profiled.
==513== NVPROF is profiling process 513, command: ./matrix_vector_multiplication 1 0
matrix_vector_multiplication: matrix_vector_multiplication.cu:91: int main(): Assertion `fabs(vector_result[i] - temp_sum) < MAX_ERR' failed.
==513== Profiling application: ./matrix_vector_multiplication 1 0
==513== Profiling result:
No kernels were profiled.
No API activities were profiled.
==528== NVPROF is profiling process 528, command: ./matrix_vector_multiplication 2 0
matrix_vector_multiplication: matrix_vector_multiplication.cu:91: int main(): Assertion `fabs(vector_result[i] - temp_sum) < MAX_ERR' failed.
==528== Profiling