In [None]:
%%writefile cuda_check.cu

#include <iostream>
#include <cuda_runtime.h>

// Lists every CUDA device visible to the runtime together with its main
// hardware limits (compute capability, memory sizes, launch limits).
//
// Returns 0 when at least one device was found and every device could be
// queried; returns 1 when a runtime call fails or no device is present.
int main() {
    // Zero-initialised so the value is well defined even if the query fails.
    int deviceCount = 0;
    cudaError_t status = cudaGetDeviceCount(&deviceCount);
    if (status != cudaSuccess) {
        std::cerr << "cudaGetDeviceCount failed: " << cudaGetErrorString(status) << std::endl;
        return 1;
    }

    if (deviceCount == 0) {
        std::cerr << "No CUDA devices found." << std::endl;
        return 1;
    }

    for (int i = 0; i < deviceCount; ++i) {
        cudaDeviceProp prop;
        status = cudaGetDeviceProperties(&prop, i);
        if (status != cudaSuccess) {
            // Without this check the fields below would be printed from an
            // unfilled struct.
            std::cerr << "cudaGetDeviceProperties failed for device " << i << ": "
                      << cudaGetErrorString(status) << std::endl;
            return 1;
        }

        std::cout << "# Device " << i << ": " << prop.name << std::endl;
        std::cout << "#   Compute capability: " << prop.major << "." << prop.minor << std::endl;
        // totalGlobalMem is reported in bytes; shown here in MiB.
        std::cout << "#   Total global memory: " << prop.totalGlobalMem / (1024 * 1024) << " MB" << std::endl;
        std::cout << "#   Max threads per block: " << prop.maxThreadsPerBlock << std::endl;
        std::cout << "#   Max threads per multiprocessor: " << prop.maxThreadsPerMultiProcessor << std::endl;
        std::cout << "#   Number of multiprocessors: " << prop.multiProcessorCount << std::endl;
        // sharedMemPerBlock is reported in bytes; shown here in KiB.
        std::cout << "#   Shared memory per block: " << prop.sharedMemPerBlock / 1024 << " KB" << std::endl;
        std::cout << "#   Registers per block: " << prop.regsPerBlock << std::endl;
        std::cout << "#   Warp size: " << prop.warpSize << std::endl;
        std::cout << "#   Max block dimensions: [" << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", " << prop.maxThreadsDim[2] << "]" << std::endl;
        std::cout << "#   Max grid dimensions: [" << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", " << prop.maxGridSize[2] << "]" << std::endl << std::endl;
    }

    return 0;
}

Writing cuda_check.cu


In [None]:
!nvcc cuda_check.cu -o cuda_check && ./cuda_check

# Device 0: Tesla T4
#   Compute capability: 7.5
#   Total global memory: 15095 MB
#   Max threads per block: 1024
#   Max threads per multiprocessor: 1024
#   Number of multiprocessors: 40
#   Shared memory per block: 48 KB
#   Registers per block: 65536
#   Warp size: 32
#   Max block dimensions: [1024, 1024, 64]
#   Max grid dimensions: [2147483647, 65535, 65535]



In [None]:
%%writefile main.cu

#include <stdio.h>
#include <cuda_runtime.h>

// Element-wise vector addition: result[i] = array1[i] + array2[i] for every
// i in [0, size).
//
// Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
// thread index and advances by the total number of launched threads, so any
// grid/block size covers the whole range, including grids with fewer threads
// than elements. No shared memory is used.
//
// Preconditions: all three pointers are device-accessible and hold at least
// `size` floats. A non-positive `size` makes the kernel a no-op.
__global__ void add_kernel(float *array1, float *array2, float *result, int size)
{
    // Index math is done in 64 bits. With 32-bit ints, gridDim.x * blockDim.x
    // and the running `i += stride` can wrap once size approaches INT_MAX,
    // producing negative indices and out-of-bounds accesses.
    const long long start = (long long)blockDim.x * blockIdx.x + threadIdx.x;
    const long long stride = (long long)gridDim.x * blockDim.x;

    for (long long i = start; i < size; i += stride) {
        result[i] = array1[i] + array2[i];
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_call_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds two host arrays element-wise on the GPU: result[i] = array1[i] + array2[i].
//
// array1, array2: host input buffers holding `size` floats each.
// result:         host output buffer with room for `size` floats.
// size:           number of elements; a non-positive value is a no-op.
//
// The inputs are copied to freshly allocated device buffers, add_kernel is
// launched over them, and the device result is copied back into `result`.
// If any CUDA call fails, a diagnostic is written to stderr, the remaining
// steps are skipped, and `result` is left unmodified. Device buffers are
// released on every path.
void add(float *array1, float *array2, float *result, int size)
{
    // Nothing to compute; also avoids a zero-block launch (invalid
    // configuration) and a negative size turning into a huge byte count.
    if (size <= 0) {
        return;
    }

    const size_t bytes = sizeof(float) * (size_t)size;

    // Null-initialised so the unconditional cudaFree calls at the end are safe
    // even when an allocation never happened (cudaFree(nullptr) is a no-op).
    float *cuda_array1 = nullptr;
    float *cuda_array2 = nullptr;
    float *cuda_result = nullptr;

    // Allocate memory on GPU
    bool ok = cuda_call_ok(cudaMalloc(&cuda_array1, bytes), "cudaMalloc(array1)")
           && cuda_call_ok(cudaMalloc(&cuda_array2, bytes), "cudaMalloc(array2)")
           && cuda_call_ok(cudaMalloc(&cuda_result, bytes), "cudaMalloc(result)");

    // Copy array1 and array2 from CPU to GPU
    ok = ok
      && cuda_call_ok(cudaMemcpy(cuda_array1, array1, bytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(array1, host to device)")
      && cuda_call_ok(cudaMemcpy(cuda_array2, array2, bytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(array2, host to device)");

    if (ok) {
        // Run the kernel with one thread per element, rounded up to whole blocks.
        const int block_dim = 1024;
        // Ceil-div written so it cannot overflow int for size near INT_MAX.
        const int grid_dim = (size - 1) / block_dim + 1;
        add_kernel<<<grid_dim, block_dim>>>(cuda_array1, cuda_array2, cuda_result, size);

        // A launch does not return a status: configuration errors surface via
        // cudaGetLastError, execution faults at the next synchronising call.
        ok = cuda_call_ok(cudaGetLastError(), "add_kernel launch")
          && cuda_call_ok(cudaDeviceSynchronize(), "add_kernel execution");
    }

    // Copy cuda_result from GPU to CPU
    if (ok) {
        cuda_call_ok(cudaMemcpy(result, cuda_result, bytes, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(result, device to host)");
    }

    // Free GPU memory
    cudaFree(cuda_array1);
    cudaFree(cuda_array2);
    cudaFree(cuda_result);
}

// Host driver: fills two 2^20-element arrays with constants, adds them on the
// GPU via add(), and prints how many output elements differ from the expected
// sum.
//
// Returns 1 if the host buffers cannot be allocated; otherwise falls off the
// end of main (exit status 0) after printing the mismatch count.
int main() {
    const int size = 1 << 20;
    const size_t bytes = sizeof(float) * size;

    float *array1 = (float*) malloc(bytes);
    float *array2 = (float*) malloc(bytes);
    float *result = (float*) malloc(bytes);

    // malloc may return NULL; bail out instead of dereferencing it below.
    if (array1 == NULL || array2 == NULL || result == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", bytes);
        // free(NULL) is a no-op, so releasing all three is always safe.
        free(array1);
        free(array2);
        free(result);
        return 1;
    }

    for (int i = 0; i < size; i++)
    {
        array1[i] = 2.0f;
        array2[i] = 3.0f;
    }

    add(array1, array2, result, size);

    // 2 + 3 is exactly representable in float, so an exact comparison is valid.
    int errors = 0;
    for (int i = 0; i < size; i++)
    {
        if (result[i] != 5.0f)
        {
            errors++;
        }
    }
    printf("Errors = %d\n", errors);

    free(array1);
    free(array2);
    free(result);
}

Writing main.cu


In [None]:
!nvcc main.cu -o main -arch=sm_75 && nvprof ./main

==957== NVPROF is profiling process 957, command: ./main
Errors = 0
==957== Profiling application: ./main
==957== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   52.62%  1.7883ms         1  1.7883ms  1.7883ms  1.7883ms  [CUDA memcpy DtoH]
                   45.14%  1.5340ms         2  767.02us  766.49us  767.55us  [CUDA memcpy HtoD]
                    2.23%  75.936us         1  75.936us  75.936us  75.936us  add_kernel(float*, float*, float*, int)
      API calls:   96.66%  227.84ms         3  75.948ms  88.745us  227.65ms  cudaMalloc
                    2.57%  6.0637ms         3  2.0212ms  988.06us  3.3135ms  cudaMemcpy
                    0.34%  793.50us         1  793.50us  793.50us  793.50us  cuDeviceGetPCIBusId
                    0.24%  575.07us         3  191.69us  136.39us  223.11us  cudaFree
                    0.08%  184.58us       114  1.6190us     181ns  74.374us  cuDeviceGetAttribute
                    