In [8]:
%%bash
cat > vector_add.cu << 'EOF'
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The launch
// must supply at least n threads in total (e.g. blocks = ceil(n / blockDim.x));
// threads whose global index is >= n do nothing. No shared memory is used.
//
// a, b : input arrays, read at indices [0, n); must be accessible from device
//        code
// c    : output array, written at indices [0, n); must be accessible from
//        device code
// n    : number of elements to process; n <= 0 results in no work
__global__ void vector_add(const float *a, const float *b, float *c, int n) {
  // Compute the global index in 64 bits. blockIdx.x * blockDim.x is an
  // unsigned 32-bit product; narrowing it to int can wrap to a negative value
  // when n is close to INT_MAX, and a negative index would pass the bounds
  // check below and access memory out of range.
  const long long idx =
      static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < n) {
    c[idx] = a[idx] + b[idx];
  }
}

// Evaluates a CUDA runtime call and, if it did not return cudaSuccess, prints
// the failing location and the runtime's error string to stderr and exits
// with EXIT_FAILURE. Outstanding allocations are left for process teardown.
#define CUDA_CHECK(call)                                                     \
  do {                                                                       \
    cudaError_t cuda_check_err_ = (call);                                    \
    if (cuda_check_err_ != cudaSuccess) {                                    \
      std::fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                   cudaGetErrorString(cuda_check_err_));                     \
      std::exit(EXIT_FAILURE);                                               \
    }                                                                        \
  } while (0)

// Adds two vectors of N floats on the GPU and verifies the result on the host.
//
// Fills a with 1.0f and b with 2.0f, computes c = a + b with the vector_add
// kernel, then checks that every element of c equals 3.0f within 1e-5.
// Prints "Vector addition: PASS" or "Vector addition: FAIL".
//
// Returns EXIT_SUCCESS when verification passes; EXIT_FAILURE when it fails,
// when a host allocation fails, or (via CUDA_CHECK) when any CUDA call fails.
int main() {
  const int N = 1 << 20;  // 1 << 20 == 2^20 == 1,048,576 elements
  const size_t size = N * sizeof(float);  // bytes per array

  // Allocate host (CPU) memory.
  float *h_a = static_cast<float *>(std::malloc(size));
  float *h_b = static_cast<float *>(std::malloc(size));
  float *h_c = static_cast<float *>(std::malloc(size));
  if (h_a == nullptr || h_b == nullptr || h_c == nullptr) {
    std::fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
    // free(nullptr) is a no-op, so releasing all three is safe.
    std::free(h_a);
    std::free(h_b);
    std::free(h_c);
    return EXIT_FAILURE;
  }

  // Initialize host inputs so that every expected output element is 3.0f.
  for (int i = 0; i < N; i++) {
    h_a[i] = 1.0f;
    h_b[i] = 2.0f;
  }

  // Allocate device (GPU) memory.
  float *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
  CUDA_CHECK(cudaMalloc(&d_a, size));
  CUDA_CHECK(cudaMalloc(&d_b, size));
  CUDA_CHECK(cudaMalloc(&d_c, size));

  // Copy inputs to the GPU.
  CUDA_CHECK(cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice));

  // Launch the kernel with one thread per element; the block count is the
  // ceiling of N / threads so the tail of the array is covered.
  const int threads = 256;
  const int blocks = (N + threads - 1) / threads;

  vector_add<<<blocks, threads>>>(d_a, d_b, d_c, N);
  // A kernel launch returns no status itself: cudaGetLastError reports
  // launch-configuration errors, and the synchronize reports errors raised
  // while the kernel was executing.
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaDeviceSynchronize());

  // Copy the output back to the host.
  CUDA_CHECK(cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost));

  // Verify the result, stopping at the first mismatch.
  bool pass = true;
  for (int i = 0; i < N; ++i) {
    if (std::fabs(h_c[i] - 3.0f) > 1e-5f) {
      std::printf("Error at %d: %f\n", i, h_c[i]);
      pass = false;
      break;
    }
  }
  std::printf("Vector addition: %s\n", pass ? "PASS" : "FAIL");

  // Cleanup.
  std::free(h_a);
  std::free(h_b);
  std::free(h_c);
  CUDA_CHECK(cudaFree(d_a));
  CUDA_CHECK(cudaFree(d_b));
  CUDA_CHECK(cudaFree(d_c));

  // Surface a verification failure through the process exit status.
  return pass ? EXIT_SUCCESS : EXIT_FAILURE;
}
EOF

In [9]:
!nvcc -arch=sm_75 vector_add.cu -o vector_add
!./vector_add

Vector addition: PASS
