<a href="https://colab.research.google.com/github/lparis/Algo/blob/master/CUDA_VECTOR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
%%writefile vector/vector_add.cu
#include <iostream>  // Standard C++ header for input/output

// Kernel - runs on GPU
// __global__ marks it as callable from CPU, executed on GPU
//
// Element-wise vector addition: c[i] = a[i] + b[i].
//   a, b - input arrays (only read, hence const)
//   c    - output array, written once per thread
//
// Launch layout: each thread handles the element at its threadIdx.x.
// threadIdx.x is a built-in CUDA variable that is unique only *within* a
// block, so this kernel is meant for a single 1-D block with one thread per
// element. There is no bounds check: launching more threads than there are
// elements makes the extra threads access past the end of the arrays.
__global__ void add(const int *a, const int *b, int *c) {
    const int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
//   status - value returned by the CUDA runtime call
//   what   - short label for the step, included in the message
// Returns true if status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Standard C++ entry point, runs on CPU
// Adds two small vectors on the GPU and prints the element-wise sums.
// Returns 0 on success, 1 if any CUDA runtime call or the kernel fails.
int main() {
    // Single source of truth for the element count, so allocation sizes,
    // copy sizes and the launch configuration cannot drift apart.
    const int n = 3;
    const size_t bytes = n * sizeof(int);

    // Host (CPU) memory - regular C++ arrays
    int a[n] = {1, 2, 3};
    int b[n] = {4, 5, 6};
    int c[n] = {0, 0, 0};

    // Device (GPU) memory
    // d_ prefix is convention meaning "device"
    // Start as null so the cudaFree calls below are safe even when an
    // allocation never happened.
    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;

    // Allocate on the GPU and copy the inputs over.
    // && short-circuits: the first failing step stops the chain.
    bool ok = cudaOk(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)") &&
              cudaOk(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)") &&
              cudaOk(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)") &&
              cudaOk(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(a -> d_a)") &&
              cudaOk(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(b -> d_b)");

    if (ok) {
        // Launch kernel on the GPU
        // <<<1, n>>> means "1 block, n threads per block":
        // one thread per element, all running in parallel.
        add<<<1, n>>>(d_a, d_b, d_c);

        // A launch returns no status itself: cudaGetLastError() reports
        // launch/configuration errors, and cudaDeviceSynchronize() waits for
        // the asynchronous kernel and reports errors raised while it ran.
        ok = cudaOk(cudaGetLastError(), "add kernel launch") &&
             cudaOk(cudaDeviceSynchronize(), "add kernel execution") &&
             // Copy results back from GPU to CPU
             cudaOk(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(d_c -> c)");
    }

    if (ok) {
        // Print results - standard C++ (space-separated, newline-terminated)
        for (int i = 0; i < n; ++i) {
            if (i > 0) {
                std::cout << " ";
            }
            std::cout << c[i];
        }
        std::cout << std::endl;
    }

    // Free GPU memory on every path. Each cudaFree is evaluated before
    // "&& ok" so all three buffers are released even after a failure.
    ok = cudaOk(cudaFree(d_a), "cudaFree(d_a)") && ok;
    ok = cudaOk(cudaFree(d_b), "cudaFree(d_b)") && ok;
    ok = cudaOk(cudaFree(d_c), "cudaFree(d_c)") && ok;

    return ok ? 0 : 1;
}

Overwriting vector/vector_add.cu


In [38]:
!nvcc -arch=sm_75 vector/vector_add.cu -o vector/vector_add && ./vector/vector_add

5 7 9
