# CUDA Exercise 03
> Vector dot product(inner product) example on GPU, only applied with single thread.

This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_03.ipynb)

## Initialize the CUDA dev environment

In [1]:
# clone the code repo,
# !pip install git+git://github.com/depctg/nvcc4jupyter.git
# %load_ext nvcc_plugin
!pip install nvcc4jupyter
%load_ext nvcc4jupyter
# Check the environment
!lsb_release -a
!nvcc --version
!nvidia-smi

Collecting nvcc4jupyter
  Downloading nvcc4jupyter-1.2.1-py3-none-any.whl.metadata (5.1 kB)
Downloading nvcc4jupyter-1.2.1-py3-none-any.whl (10 kB)
Installing collected packages: nvcc4jupyter
Successfully installed nvcc4jupyter-1.2.1
Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp2n5gzcgc".
No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 22.04.4 LTS
Release:	22.04
Codename:	jammy
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Sat Jun  7 03:07:35 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persiste

## Vector Dot Production

In [4]:
%%cuda
#include <stdio.h>
#include <assert.h>

#define VECTOR_LENGTH 10
#define MAX_ERR 1e-5

__global__ void vector_dot_product(float *out, float *a, float *b, int n)
{
    float sum=0;
    for(int i = 0; i < n; i++)
    {
        sum = sum +  a[i] * b[i];
    }
    *out = sum;
}

void test_vector_dot_product(void)
{
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    // Allocate memory on CPU
    a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);
    b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);
    out = (float*)malloc(sizeof(float));

    // data initializtion
    for(int i = 0; i < VECTOR_LENGTH; i++)
    {
        a[i] = 3.14f;
        b[i] = 2.0f;
    }

    // Allocate memory on GPU
    cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);
    cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);
    cudaMalloc((void**)&d_out, sizeof(float));

    // copy operator to GPU
    cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);

    // GPU do the work, CPU waits
    vector_dot_product<<<1,1>>>(d_out, d_a, d_b, VECTOR_LENGTH);

    // Get results from the GPU
    cudaMemcpy(out, d_out, sizeof(float),
               cudaMemcpyDeviceToHost);

    // Test the result
    // assert(fabs(*out - 20*3.14) < MAX_ERR);

    printf("out[0] = %f\n", out[0]);
    printf("PASSED\n");

    // Free the memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);
}

int main()
{
    test_vector_dot_product();
}

   51 |     # assert(fabs(*out - 20*3.14) < MAX_ERR);
      |       ^~~~~~
/tmp/tmp2n5gzcgc/7f7df3de-e063-478c-a77f-5325e4a5a25f/single_file.cu:51:13: error: predicate must be an identifier
   51 |     # assert(fabs(*out - 20*3.14) < MAX_ERR);
      |             ^

