
# CUDA Exercise 07
> You should try to implement your own solution for vector dot product, and try to parallelize the computation.

This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_07.ipynb)


## Initialize the CUDA dev environment

In [1]:
# clone the code repo,
# !pip install git+git://github.com/depctg/nvcc4jupyter.git
# %load_ext nvcc_plugin
!pip install nvcc4jupyter
%load_ext nvcc4jupyter

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp2ugj0fnd".


## Check the environment

In [2]:
!lsb_release -a
!nvcc --version
!nvidia-smi

No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 22.04.4 LTS
Release:	22.04
Codename:	jammy
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Sat Jun  7 03:18:53 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |


## Naive approach of vector dot product

In [3]:
%%writefile exercise01.cu
#include <stdio.h>
#include <assert.h>

#define MAX_ERR 0.1
#define MULTI_TIMES_RUN 1

__global__ void vector_dot_product(float *result, float *vector_a, float *vector_b, int vertor_length)
{
    extern __shared__ float temp[];

    int index = threadIdx.x;    // index offset of this thread
    int stride = blockDim.x;    // stride step of each iteration

    // so if threadIdx.x=0, and blockDim.x=10,
    // then this thread is responsible for calculating temp[0], temp[10], temp[20]
    // similiarly, the following thread will calculate temp[1], temp[11], temp[21]
    for(int i = index; i < vertor_length; i += stride)
    {
        temp[i] = vector_a[i] * vector_b[i];
    }

    __syncthreads(); // synchronize all threads

    // The accumulation only needs to happen at thread_0
    if (threadIdx.x == 0)
    {
        float sum = 0;
        for (int i = 0; i < vertor_length; i++)
        {
            sum += temp[i];
        }
        *result=sum;
    }
}

int main(int argc, char *argv[])
{
    float *vector_a, *vector_b, *result;
    float *d_vector_a, *d_vector_b, *d_result;
    int list_of_thread_num[]={1,64,128,256,512,1024};
    int list_of_vector_length[]={100,200,1000,2000,10000};
    int thread_num = 1;
    int vector_length = 1000;

     if( argc == 3 ) {
      //printf("The argument supplied is %s\n", argv[1]);
      int arg1 = atoi(argv[1]);  //argv[0] is the program name
                                //atoi = ascii to int
      int arg2 = atoi(argv[2]);

      vector_length = list_of_vector_length[arg1];
      thread_num = list_of_thread_num[arg2];
    }
    else if( argc > 2 ) {
      printf("Too many arguments supplied.\n");
    }
    else {
      printf("One argument expected.\n");

    }

    // Allocate memory on CPU
    vector_a = (float*)malloc(sizeof(float) * vector_length);
    vector_b = (float*)malloc(sizeof(float) * vector_length);
    result = (float*)malloc(sizeof(float));

    // data initializtion
    for(int i = 0; i < vector_length; i++)
    {
        vector_a[i] = 0.1f;
        vector_b[i] = 0.9f;
    }

    // Allocate memory on GPU
    cudaMalloc((void**)&d_vector_a, sizeof(float) * vector_length);
    cudaMalloc((void**)&d_vector_b, sizeof(float) * vector_length);
    cudaMalloc((void**)&d_result, sizeof(float));

    // copy operator to GPU
    cudaMemcpy(d_vector_a, vector_a, sizeof(float) * vector_length, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vector_b, vector_b, sizeof(float) * vector_length, cudaMemcpyHostToDevice);

    // GPU do the work, CPU waits
#if MULTI_TIMES_RUN
    for(int i=0; i< 10; i++)
    {
#endif
        vector_dot_product<<<1,thread_num,sizeof(float) * vector_length>>>(d_result, d_vector_a, d_vector_b, vector_length);
#if MULTI_TIMES_RUN
    }
 #endif

    // Get results from the GPU
    cudaMemcpy(result, d_result, sizeof(float),
               cudaMemcpyDeviceToHost);

    // Test the result
    //assert(fabs(*result - vector_length*2*3.14) < MAX_ERR);

    // you only need them for checking if the math is correct
     printf("result[0] = %f\n", result[0]);
    // printf("PASSED\n");

    // Free the memory
    cudaFree(d_vector_a);
    cudaFree(d_vector_b);
    cudaFree(d_result);
    free(vector_a);
    free(vector_a);
    free(result);

}

Writing exercise01.cu


## Optimized approach of vector dot product

In [4]:
%%writefile exercise01.cu
#include <stdio.h>
#include <assert.h>

#define MAX_ERR 0.1
#define MULTI_TIMES_RUN 1

__global__ void vector_dot_product(float *result, float *vector_a, float *vector_b, int vertor_length)
{
    extern __shared__ float temp[];

    int index = threadIdx.x;    // index offset of this thread
    int stride = blockDim.x;    // stride step of each iteration

    temp[threadIdx.x] = 0;
    for(int i = index; i < vertor_length; i += stride)
    {
        temp[threadIdx.x] = temp[threadIdx.x] + vector_a[i] * vector_b[i];
    }

    __syncthreads(); // synchronize all threads

    // The accumulation only needs to happen at thread_0
    if (threadIdx.x == 0)
    {
        float sum = 0;
        int thread_num = (vertor_length+blockDim.x)/blockDim.x;
        for (int i = 0; i < thread_num; i++)
        {
            sum += temp[i];
        }
        *result=sum;
    }
}

int main(int argc, char *argv[])
{
    float *vector_a, *vector_b, *result;
    float *d_vector_a, *d_vector_b, *d_result;
    int list_of_thread_num[]={1,64,128,256,512,1024};
    int list_of_vector_length[]={100,200,1000,2000,10000};
    int thread_num = 1;
    int vector_length = 1000;

     if( argc == 3 ) {
      //printf("The arguments supplied are %s, %s\n", argv[1], argv[2]);
      int arg1 = atoi(argv[1]);  //argv[0] is the program name
                                //atoi = ascii to int
      int arg2 = atoi(argv[2]);

      vector_length = list_of_vector_length[arg1];
      thread_num = list_of_thread_num[arg2];
    }
    else if( argc > 2 ) {
      printf("Too many arguments supplied.\n");
    }
    else {
      printf("Two argument expected.\n");
      return 0;
    }

    // Allocate memory on CPU
    vector_a = (float*)malloc(sizeof(float) * vector_length);
    vector_b = (float*)malloc(sizeof(float) * vector_length);
    result = (float*)malloc(sizeof(float));

    // data initializtion
    for(int i = 0; i < vector_length; i++)
    {
        vector_a[i] = 0.1f;
        vector_b[i] = 0.9f;
    }

    // Allocate memory on GPU
    cudaMalloc((void**)&d_vector_a, sizeof(float) * vector_length);
    cudaMalloc((void**)&d_vector_b, sizeof(float) * vector_length);
    cudaMalloc((void**)&d_result, sizeof(float));

    // copy operator to GPU
    cudaMemcpy(d_vector_a, vector_a, sizeof(float) * vector_length, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vector_b, vector_b, sizeof(float) * vector_length, cudaMemcpyHostToDevice);

    // GPU do the work, CPU waits
#if MULTI_TIMES_RUN
    for(int i=0; i< 10; i++)
    {
#endif
        vector_dot_product<<<1,thread_num,sizeof(float) * thread_num>>>(d_result, d_vector_a, d_vector_b, vector_length);
#if MULTI_TIMES_RUN
    }
 #endif

    // Get results from the GPU
    cudaMemcpy(result, d_result, sizeof(float),
               cudaMemcpyDeviceToHost);

    // Test the result
    //assert(fabs(*result - vector_length*2*3.14) < MAX_ERR);

    // you only need them for checking if the math is correct
     printf("result[0] = %f\n", result[0]);
    // printf("PASSED\n");

    // Free the memory
    cudaFree(d_vector_a);
    cudaFree(d_vector_b);
    cudaFree(d_result);
    free(vector_a);
    free(vector_a);
    free(result);
}

Overwriting exercise01.cu


## Evaluation to collect enough information for the benchmark

In [5]:
!nvcc -o exercise01 exercise01.cu
!nvprof ./exercise01 0 0
!nvprof ./exercise01 1 0
!nvprof ./exercise01 2 0
!nvprof ./exercise01 3 0
!nvprof ./exercise01 4 0

==1380== NVPROF is profiling process 1380, command: ./exercise01 0 0
result[0] = 0.000000
double free or corruption (!prev)
==1380== Profiling application: ./exercise01 0 0
==1380== Profiling result:
No kernels were profiled.
No API activities were profiled.
==1395== NVPROF is profiling process 1395, command: ./exercise01 1 0
result[0] = 0.000000
free(): double free detected in tcache 2
==1395== Profiling application: ./exercise01 1 0
==1395== Profiling result:
No kernels were profiled.
No API activities were profiled.
==1406== NVPROF is profiling process 1406, command: ./exercise01 2 0
result[0] = 0.000000
double free or corruption (!prev)
==1406== Profiling application: ./exercise01 2 0
==1406== Profiling result:
No kernels were profiled.
No API activities were profiled.
==1421== NVPROF is profiling process 1421, command: ./exercise01 3 0
result[0] = 0.000000
double free or corruption (!prev)
==1421== Profiling application: ./exercise01 3 0
==1421== Profiling result:
No kernels were 

In [6]:
!nvcc -o exercise01 exercise01.cu
!nvprof ./exercise01 4 0
!nvprof ./exercise01 4 1
!nvprof ./exercise01 4 2
!nvprof ./exercise01 4 3
!nvprof ./exercise01 4 4

==1510== NVPROF is profiling process 1510, command: ./exercise01 4 0
result[0] = 0.000000
double free or corruption (!prev)
==1510== Profiling application: ./exercise01 4 0
==1510== Profiling result:
No kernels were profiled.
No API activities were profiled.
==1521== NVPROF is profiling process 1521, command: ./exercise01 4 1
result[0] = 0.000000
double free or corruption (!prev)
==1521== Profiling application: ./exercise01 4 1
==1521== Profiling result:
No kernels were profiled.
No API activities were profiled.
==1532== NVPROF is profiling process 1532, command: ./exercise01 4 2
result[0] = 0.000000
double free or corruption (!prev)
==1532== Profiling application: ./exercise01 4 2
==1532== Profiling result:
No kernels were profiled.
No API activities were profiled.
==1549== NVPROF is profiling process 1549, command: ./exercise01 4 3
result[0] = 0.000000
double free or corruption (!prev)
==1549== Profiling application: ./exercise01 4 3
==1549== Profiling result:
No kernels were profile