In [27]:
!nvidia-smi
!nvcc --version


Tue Dec 16 08:58:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [37]:
%%writefile add.cu
#include <iostream>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <time.h>
using namespace std;
#define N (1<<24)   // ~1 million elements
#define BD_X 256
#define BD_Y 16
#define BD_Z 16
#define BLOCK_SIZE_1D 256
__global__ void add(int *a, int *b, int *c, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;

    for (int i = idx; i < n; i += stride)
    {
        c[i] = a[i] + b[i];
    }
}

__global__ void add_3d(int *a, int *b, int *c, int nx, int ny, int nz) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int k = blockIdx.z * blockDim.z + threadIdx.z;
    // 3 adds, 3 multiplies, 3 stores

    if (i < nx && j < ny && k < nz) {
        int idx = i + j * nx + k * nx * ny;
        if (idx < nx * ny * nz) {
            c[idx] = a[idx] + b[idx];
        }
    }
    // you get the point...
}
void add_cpu(int *a, int *b, int *c, int n)
{
    for (int i = 0; i < n; i++)
        c[i] = a[i] + b[i];
}


double get_time()
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC,&ts);
    return ts.tv_sec +ts.tv_nsec*1e-9;
}

int main()
{
    int *a = new int[N];
    int *b = new int[N];
    int *c = new int[N];

    for (int i = 0; i < N; i++)
    {
        a[i] = i;
        b[i] = i;
    }

    struct timespec ts;

    int *d_a, *d_b, *d_c;
    int nx = (1<<12) , ny = (1<<6), nz = (1<<6);
    dim3 blockdim(BD_X,BD_Y,BD_Z);
    dim3 numblock((nx+blockdim.x-1)/blockdim.x, (ny+blockdim.y-1)/blockdim.y,(nz+blockdim.z-1)/blockdim.z);

    cudaMalloc((void**)&d_a, N * sizeof(int));
    cudaMalloc((void**)&d_b, N * sizeof(int));
    cudaMalloc((void**)&d_c, N * sizeof(int));

    double t1 = get_time();
    add_cpu(a, b, c, N);
    double t2 = get_time();
    cout << "CPU Time msec = " << (t2 - t1) * 1000 << endl;


    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaMemcpy(d_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
    int num_blocks_1d = (N + BLOCK_SIZE_1D - 1) / BLOCK_SIZE_1D;
    cudaEventRecord(start);
    add<<<1, num_blocks_1d>>>(d_a, d_b, d_c, N);
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);

    float gpu_ms = 0;
    cudaEventElapsedTime(&gpu_ms, start, stop);

    cout << "GPU Kernel Time msec 1d = " << gpu_ms << endl;
    cudaMemcpy(c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);

    cout << "Check: " << c[0] << " " << c[1] << " " << c[2] << endl;
    cudaEventRecord(start);
    add_3d<<<numblock,blockdim>>>(d_a,d_b,d_c,nx,ny,nz);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&gpu_ms, start, stop);

    cout << "GPU Kernel Time msec 3d = " << gpu_ms << endl;

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    return 0;
}


Overwriting add.cu


In [38]:
!nvcc add.cu -O2 -arch=sm_75 -o add


      struct timespec ts;
                      ^




In [39]:
!./add


CPU Time msec = 40.528
GPU Kernel Time msec 1d = 0.52
Check: 0 0 0
GPU Kernel Time msec 3d = 0.07152
