In [None]:
# Print the version banner of the nvcc compiler driver found on PATH.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [2]:
%%bash
cat > hello2_debug.cu << 'EOF'
#include <cstdio>

// Kernel: every thread stores its own flat global index into the slot of
// `out` at that same index.
//
// Launch layout: 1D grid of 1D blocks. There is no bounds guard, so `out`
// must hold at least gridDim.x * blockDim.x ints.
__global__ void hello_write(int *out) {
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;
    out[gid] = gid;
}

// Host entry point: allocates an N-element managed buffer, launches
// hello_write with one block of N threads, waits for completion, and prints
// the buffer contents from the CPU.
//
// Returns 0 on success, 1 if any CUDA runtime call or the kernel fails.
// Diagnostics go to stderr; the buffer is released on every exit path.
int main() {
    const int N = 8;
    int *out = nullptr;

    // Allocate unified memory. The result must be checked: on failure `out`
    // is not a valid pointer and must not be handed to the kernel.
    cudaError_t err = cudaMallocManaged(&out, N * sizeof(int));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    // Launch 1 block of N threads
    hello_write<<<1, N>>>(out);

    // Check for launch errors (a launch itself returns no status)
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(err));
        cudaFree(out);  // best-effort cleanup; the launch error is what we report
        return 1;
    }

    // Wait for GPU, check for runtime errors raised while the kernel ran
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(err));
        cudaFree(out);  // best-effort cleanup; the execution error is what we report
        return 1;
    }

    // Print results on the CPU (safe only after the synchronize above)
    for (int i = 0; i < N; i++) {
        printf("out[%d] = %d\n", i, out[i]);
    }

    err = cudaFree(out);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    return 0;
}
EOF


In [3]:
# Compile hello2_debug.cu for the sm_75 GPU architecture, then run the binary.
!nvcc -arch=sm_75 hello2_debug.cu -o hello2_debug
!./hello2_debug

out[0] = 0
out[1] = 1
out[2] = 2
out[3] = 3
out[4] = 4
out[5] = 5
out[6] = 6
out[7] = 7
