In [1]:
!nvidia-smi
!nvcc --version


Tue Dec 16 08:50:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   56C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [22]:
%%writefile add.cu
#include <cstdlib>
#include <iostream>
#include <time.h>

#include <cuda_runtime.h>

using namespace std;

#define N (1<<24)   // 2^24 = 16,777,216 elements (~16.8 million)

// Element-wise integer addition on the device: c[i] = a[i] + b[i] for 0 <= i < n.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global index
// and advances by the total number of launched threads, so any grid/block
// size produces a complete result (the launch does not have to cover n).
//
// Parameters:
//   a, b  input arrays with at least n readable elements (device-accessible)
//   c     output array with at least n writable elements (device-accessible);
//         may alias a or b, since each element is read before it is written
//   n     number of elements to process; nothing is written when n <= 0
__global__ void add_gpu(const int *a, const int *b, int *c, int n)
{
    // Index math is done in 64 bits: the 32-bit product blockIdx.x * blockDim.x
    // can wrap for very large grids, and a signed 32-bit `i += stride` can
    // overflow when n is close to INT_MAX.
    const long long stride = static_cast<long long>(blockDim.x) * gridDim.x;
    const long long first  = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (long long i = first; i < n; i += stride)
    {
        c[i] = a[i] + b[i];
    }
}

// Host reference implementation: stores a[k] + b[k] into c[k] for every
// k in [0, n), visiting elements in ascending order. Does nothing when n <= 0.
void add_cpu(int *a, int *b, int *c, int n)
{
    int k = 0;
    while (k < n)
    {
        const int sum = a[k] + b[k];
        c[k] = sum;
        ++k;
    }
}

// Reads CLOCK_MONOTONIC and returns the reading in seconds as a double
// (whole seconds plus nanoseconds scaled by 1e-9). Only differences between
// two readings are meaningful.
double get_time()
{
    timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const double whole_seconds = now.tv_sec;
    const double fractional    = now.tv_nsec * 1e-9;
    return whole_seconds + fractional;
}

// Prints `context` followed by the CUDA error description to stderr and
// terminates the process when `status` is not cudaSuccess; otherwise no-op.
static void check_cuda(cudaError_t status, const char *context)
{
    if (status != cudaSuccess)
    {
        cerr << context << cudaGetErrorString(status) << endl;
        exit(EXIT_FAILURE);
    }
}

// Benchmarks element-wise addition of two N-element int arrays on the CPU
// (wall clock via get_time) and on the GPU (kernel time via CUDA events),
// then verifies the GPU result against a[i] + b[i] on the host.
//
// Returns EXIT_SUCCESS when every GPU element matches, EXIT_FAILURE on a
// mismatch. Any failing CUDA runtime call terminates the program.
int main()
{
    const size_t bytes  = static_cast<size_t>(N) * sizeof(int);
    const int blocks    = 256;
    const int threads   = 256;

    int *a = new int[N];
    int *b = new int[N];
    int *c = new int[N];

    for (int i = 0; i < N; i++)
    {
        a[i] = i;
        b[i] = i;
    }

    // CPU timing
    double t1 = get_time();
    add_cpu(a, b, c, N);
    double t2 = get_time();
    cout << "CPU Time (ms): " << (t2 - t1) * 1000 << endl;

    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    check_cuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a) failed: ");
    check_cuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b) failed: ");
    check_cuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c) failed: ");

    check_cuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a) failed: ");
    check_cuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b) failed: ");

    // Untimed warm-up launch so one-time first-launch costs are not attributed
    // to the measured run. It writes the same values the timed launch will.
    add_gpu<<<blocks, threads>>>(d_a, d_b, d_c, N);
    check_cuda(cudaGetLastError(), "CUDA kernel error: ");
    check_cuda(cudaDeviceSynchronize(), "CUDA kernel error: ");

    // GPU timing using CUDA events
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate(start) failed: ");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate(stop) failed: ");

    // The launch (blocks * threads threads) is smaller than N; add_gpu's
    // stride loop is what makes it cover every element.
    check_cuda(cudaEventRecord(start), "cudaEventRecord(start) failed: ");
    add_gpu<<<blocks, threads>>>(d_a, d_b, d_c, N);
    check_cuda(cudaGetLastError(), "CUDA kernel error: ");

    // Record `stop` right behind the kernel in the same stream. A host-side
    // cudaDeviceSynchronize() in between would add host wake-up and enqueue
    // latency to the measured interval.
    check_cuda(cudaEventRecord(stop), "cudaEventRecord(stop) failed: ");
    // Waiting on `stop` also waits for the kernel queued before it.
    check_cuda(cudaEventSynchronize(stop), "CUDA kernel error: ");

    float gpu_ms = 0;
    check_cuda(cudaEventElapsedTime(&gpu_ms, start, stop), "cudaEventElapsedTime failed: ");
    cout << "GPU Kernel Time (ms): " << gpu_ms << endl;

    // Overwrites the CPU result in `c` with the GPU result.
    check_cuda(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c) failed: ");

    cout << "Check: " << c[0] << " " << c[1] << " " << c[2] << endl;

    // Full verification: compare every GPU element with the host-side sum.
    int exit_code = EXIT_SUCCESS;
    for (int i = 0; i < N; i++)
    {
        const int expected = a[i] + b[i];
        if (c[i] != expected)
        {
            cerr << "Mismatch at index " << i << ": got " << c[i]
                 << ", expected " << expected << endl;
            exit_code = EXIT_FAILURE;
            break;
        }
    }

    check_cuda(cudaEventDestroy(start), "cudaEventDestroy(start) failed: ");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy(stop) failed: ");

    check_cuda(cudaFree(d_a), "cudaFree(d_a) failed: ");
    check_cuda(cudaFree(d_b), "cudaFree(d_b) failed: ");
    check_cuda(cudaFree(d_c), "cudaFree(d_c) failed: ");

    delete[] a;
    delete[] b;
    delete[] c;

    return exit_code;
}


Overwriting add.cu


In [25]:
!nvcc add.cu -O2 -arch=sm_75 -o add


In [26]:
!./add


CPU Time (ms): 39.4761
GPU Kernel Time (ms): 0.933792
Check: 0 2 4
