# CUDA Exercise 02
> Vector add example with CPU and GPU, only applied with single thread.

This Jupyter Notebook can also be open by the google colab, so you don't have to buy a PC with a graphic card to play with CUDA. To launch the Google Colab, please click the below Icon.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg#left)](https://colab.research.google.com/github/SuperChange001/CUDA_Learning/blob/main/Solution/Exercise_02.ipynb)

## Initialize the CUDA dev environment

In [9]:
# clone the code repo,
# !pip install git+git://github.com/depctg/nvcc4jupyter.git
# %load_ext nvcc_plugin
!pip install nvcc4jupyter
%load_ext nvcc4jupyter
# Check the environment
!lsb_release -a
!nvcc --version
!nvidia-smi

The nvcc4jupyter extension is already loaded. To reload it, use:
  %reload_ext nvcc4jupyter
No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 22.04.4 LTS
Release:	22.04
Codename:	jammy
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Sat Jun  7 03:05:07 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |


## Vector Add

In [8]:
%%cuda

#include <stdio.h>
#include <assert.h>

#define VECTOR_LENGTH 10
#define MAX_ERR 1e-3

__global__ void vector_add(float *out, float *a, float *b, int n)
{
    for(int i = 0; i < n; i++)
    {
        out[i] = a[i] + b[i];
    }
}

int main()
{
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    //===================步骤1===================
    // Allocate memory on CPU
    a = (float*)malloc(sizeof(float) * VECTOR_LENGTH);
    b = (float*)malloc(sizeof(float) * VECTOR_LENGTH);
    out = (float*)malloc(sizeof(float) * VECTOR_LENGTH);

    // data initializtion
    for(int i = 0; i < VECTOR_LENGTH; i++)
    {
        a[i] = 3.0f;
        b[i] = 2.0f;
    }
    //===================步骤1===================

    //===================步骤2===================
    // Allocate memory on GPU
    cudaMalloc((void**)&d_a, sizeof(float) * VECTOR_LENGTH);
    cudaMalloc((void**)&d_b, sizeof(float) * VECTOR_LENGTH);
    cudaMalloc((void**)&d_out, sizeof(float) * VECTOR_LENGTH);
    //===================步骤2===================

    //===================步骤3===================
    // copy operator to GPU
    cudaMemcpy(d_a, a, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, sizeof(float) * VECTOR_LENGTH, cudaMemcpyHostToDevice);
    //===================步骤3===================

    //===================步骤4===================
    // GPU do the work, CPU waits
    vector_add<<<1,1>>>(d_out, d_a, d_b, VECTOR_LENGTH);
    //===================步骤4===================

    //===================步骤5===================
    // Get results from the GPU
    cudaMemcpy(out, d_out, sizeof(float) * VECTOR_LENGTH, cudaMemcpyDeviceToHost);

    // Test the result
    for(int i = 0; i < VECTOR_LENGTH; i++)
    {
        // assert(fabs(out[i] - a[i] - b[i]) < MAX_ERR);
        printf("out[%d] is %f\n", i, out[i]);
    }
    printf("out[0] is %f\n", out[0]);
    printf("PASSED\n");
    //===================步骤5===================

    //===================步骤6===================
    // Free the memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);
    //===================步骤6===================
}

out[0] is 0.000000
out[1] is 0.000000
out[2] is 0.000000
out[3] is 0.000000
out[4] is 0.000000
out[5] is 0.000000
out[6] is 0.000000
out[7] is 0.000000
out[8] is 0.000000
out[9] is 0.000000
out[0] is 0.000000
PASSED

