In [None]:
# Show the attached GPU, driver version and CUDA version before compiling anything
!nvidia-smi

Tue Nov  4 05:06:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   54C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
%%writefile example.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// CUDA kernel: element-wise addition of two int vectors, c[i] = a[i] + b[i].
// Runs on the GPU; each thread computes at most one element of the result.
//
// Parameters:
//   a, b - input vectors, each holding at least n ints
//   c    - output vector, holding at least n ints
//   n    - number of elements to process
__global__ void vectorAdd(int *a, int *b, int *c, int n) {

    // Calculate the global thread ID
    // blockIdx.x  = which block this thread belongs to
    // blockDim.x  = number of threads per block
    // threadIdx.x = thread index within its block
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    //printf("Thread %2d in block %2d: idx=%2d\n", threadIdx.x, blockIdx.x, idx);

    // The grid may contain more threads than there are elements (the launch
    // size is rounded up to whole blocks), so threads past the end must do
    // nothing instead of touching memory out of bounds.
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

// Abort with a readable message if a CUDA runtime call does not return
// cudaSuccess. Wrapped in do/while(0) so it behaves like a single statement.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Host entry point: builds two input vectors, adds them on the GPU with
// vectorAdd, copies the result back and prints every element.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or any
// CUDA runtime call fails.
int main(void) {
    // Vector size
    int n = 32;
    size_t bytes = (size_t)n * sizeof(int);

    // Host (CPU) pointers
    int *h_a, *h_b, *h_c;

    // Device (GPU) pointers
    int *d_a, *d_b, *d_c;

    // Allocate host memory (h_c receives the result from the GPU)
    h_a = (int*)malloc(bytes);
    h_b = (int*)malloc(bytes);
    h_c = (int*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        // free(NULL) is a no-op, so this is safe whichever allocation failed
        free(h_a);
        free(h_b);
        free(h_c);
        return EXIT_FAILURE;
    }

    // Initialize input vectors
    for (int i = 0; i < n; i++) {
        h_a[i] = i;
        h_b[i] = i;
    }

    // Allocate device memory
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    // Copy data from host to device
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    // Each block is split into warps of 32 threads
    int threadsPerBlock = 32;
    // Round up so every element gets a thread even when n is not a
    // multiple of threadsPerBlock (1 block for n = 32)
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    printf("Launching kernel with %d blocks and %d threads per block\n", blocksPerGrid, threadsPerBlock);

    // Launch the kernel on the GPU
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_a, d_b, d_c, n);
    // A kernel launch returns no status itself; launch errors are
    // retrieved with cudaGetLastError
    CUDA_CHECK(cudaGetLastError());

    // Block the main thread on the CPU until the kernel is done; errors
    // raised while the kernel was running are reported here
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < n; i++) { printf("i -> %2d,   res -> %2d\n", i, h_c[i]); }

    // Free device memory
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    // Free host memory
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

Writing example.cu


In [None]:
# Compile example.cu with nvcc for the sm_75 architecture, then run the resulting binary
!nvcc -arch=sm_75 -o example example.cu
!./example

Launching kernel with 1 blocks and 32 threads per block
i ->  0,   res ->  0
i ->  1,   res ->  2
i ->  2,   res ->  4
i ->  3,   res ->  6
i ->  4,   res ->  8
i ->  5,   res -> 10
i ->  6,   res -> 12
i ->  7,   res -> 14
i ->  8,   res -> 16
i ->  9,   res -> 18
i -> 10,   res -> 20
i -> 11,   res -> 22
i -> 12,   res -> 24
i -> 13,   res -> 26
i -> 14,   res -> 28
i -> 15,   res -> 30
i -> 16,   res -> 32
i -> 17,   res -> 34
i -> 18,   res -> 36
i -> 19,   res -> 38
i -> 20,   res -> 40
i -> 21,   res -> 42
i -> 22,   res -> 44
i -> 23,   res -> 46
i -> 24,   res -> 48
i -> 25,   res -> 50
i -> 26,   res -> 52
i -> 27,   res -> 54
i -> 28,   res -> 56
i -> 29,   res -> 58
i -> 30,   res -> 60
i -> 31,   res -> 62


In [None]:
# Rebuild with debug info: -g adds CPU debug symbols, -G adds GPU debug symbols
!nvcc -arch=sm_75 -g -G example.cu -o example

In [None]:
# set style enabled off   <works around the ANSI color issue in Colab>
# compile with -g for CPU debug symbols and -G for GPU debug symbols
# set cuda break_on_launch all
# info cuda threads
# info cuda warps
# cuda thread
# cuda lane
# cuda lane 15
# info locals
# print a[idx]
# print b[idx]
# print c[idx]