In [32]:
#include <iostream>
#include <cmath>

__global__ void vectorAdd(const float* A, const float* B, float* C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

int main() {
    const int N = 10;
    float A[N], B[N], C[N];

    // Initialize A and B with some values
    for (int i = 0; i < N; ++i) {
        A[i] = i;
        B[i] = i * 2;
    }

    // Print vectors A and B
    std::cout << "Vector A:" << std::endl;
    for (int i = 0; i < N; ++i) {
        std::cout << A[i] << " ";
    }
    std::cout << std::endl;

    std::cout << "Vector B:" << std::endl;
    for (int i = 0; i << N; ++i) {
        std::cout << B[i] << " ";
    }
    std::cout << std::endl;

    float *d_a, *d_b,*d_c;
    cudaMalloc(&d_a,N*sizeof(float));
    cudaMalloc(&d_b,N*sizeof(float));
    cudaMalloc(&d_c,N*sizeof(float));
    cudaMemcpy(d_a,A,N*sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(d_b,B,N*sizeof(float),cudaMemcpyHostToDevice);

    std::cout << "Launching CUDA kernel..." << std::endl;
    int blocksize=256;
    int gridsize=static_cast<int>(std::ceil(static_cast<float>(N)/blocksize)); // Use std::ceil for ceiling calculation
    vectorAdd<<<gridsize,blocksize>>>(d_a,d_b,d_c,N);
    cudaDeviceSynchronize(); // Wait for the kernel to finish
    std::cout << "CUDA kernel finished." << std::endl;

    cudaMemcpy(C,d_c,N*sizeof(float),cudaMemcpyDeviceToHost);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Print the resulting vector C
    std::cout << "Resulting vector C:" << std::endl;
    for (int i = 0; i < N; ++i) {
        std::cout << C[i] << " ";
    }
    std::cout << std::endl;

    return 0;
}

Overwriting vector_add.cu


In [34]:
!nvcc vector_add.cu -o vectAdd -gencode arch=compute_75,code=sm_75 -lcudart

# Run the executable
!./vectAdd

Vector A:
0 1 2 3 4 5 6 7 8 9 
Vector B:
0 2 4 6 8 10 12 14 16 18 
Launching CUDA kernel...
CUDA kernel finished.
Resulting vector C:
0 0 0 0 0 0 0 0 0 0 
