In [1]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


Program 1:Execute the following program and check the properties of your GPGPU.

In [None]:
# Step 1: Write the CUDA code
%%writefile cuda_device_info.cu
#include <stdio.h>
#include <cuda_runtime.h>

int main()
{
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    printf("Checking device count...\n"); // Debug statement
    if (deviceCount == 0)
    {
        printf("There is no device supporting CUDA\n");
        return 0; // Exit the program
    }
    printf("Device Count: %d\n", deviceCount); // Debug statement
    int dev;
    for (dev = 0; dev < deviceCount; ++dev)
    {
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        if (dev == 0)
        {
            if (deviceProp.major < 1)
            {
                printf("There is no device supporting CUDA.\n");
            }
            else if (deviceCount == 1)
            {
                printf("There is 1 device supporting CUDA\n");
            }
            else
            {
                printf("There are %d devices supporting CUDA\n", deviceCount);
            }
        }
        printf("  Device %d: \"%s\"\n", dev, deviceProp.name);
        printf("  Major revision number:                         %d\n", deviceProp.major);
        printf("  Minor revision number:                         %d\n", deviceProp.minor);
        printf("  Total amount of global memory:                 %ld bytes\n", deviceProp.totalGlobalMem);
        printf("  Total amount of constant memory:               %ld bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %ld bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Multiprocessor count:                          %d\n", deviceProp.multiProcessorCount);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %ld bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %ld bytes\n", deviceProp.textureAlignment);
        printf("  Clock rate:                                    %d kilohertz\n", deviceProp.clockRate);
    }
    return 0;
}


Writing cuda_device_info.cu


In [None]:
# Step 2: Compile the code
!nvcc cuda_device_info.cu -o cuda_device_info

In [None]:
# Step 3: Run the executable
!./cuda_device_info

Checking device count...
Device Count: 1
There is 1 device supporting CUDA
  Device 0: "Tesla T4"
  Major revision number:                         7
  Minor revision number:                         5
  Total amount of global memory:                 15835660288 bytes
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Multiprocessor count:                          40
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     2147483647 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Clock rate:                                    1590000 kilohertz


Program 2: Write a program to where each thread prints its thread ID along with hello world. Lauch the kernel with one block and multiple threads.

In [None]:
%%writefile cuda_device_info.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void helloWorldKernel() {
    // Get the block ID and thread ID
    int blockId = blockIdx.x;
    int threadId = threadIdx.x;
    printf("Hello World from block %d, thread %d\n", blockId, threadId);
}

int main() {
    // Launch the kernel with 5 blocks and 10 threads per block
    helloWorldKernel<<<5, 10>>>(); // 5 blocks, 10 threads per block
    cudaDeviceSynchronize(); // Wait for the kernel to finish
    return 0;
}


Overwriting cuda_device_info.cu


In [None]:
!nvcc cuda_device_info.cu -o cuda_device_info


In [None]:
# Step 3: Run the executable
!./cuda_device_info

Hello World from block 4, thread 0
Hello World from block 4, thread 1
Hello World from block 4, thread 2
Hello World from block 4, thread 3
Hello World from block 4, thread 4
Hello World from block 4, thread 5
Hello World from block 4, thread 6
Hello World from block 4, thread 7
Hello World from block 4, thread 8
Hello World from block 4, thread 9
Hello World from block 1, thread 0
Hello World from block 1, thread 1
Hello World from block 1, thread 2
Hello World from block 1, thread 3
Hello World from block 1, thread 4
Hello World from block 1, thread 5
Hello World from block 1, thread 6
Hello World from block 1, thread 7
Hello World from block 1, thread 8
Hello World from block 1, thread 9
Hello World from block 3, thread 0
Hello World from block 3, thread 1
Hello World from block 3, thread 2
Hello World from block 3, thread 3
Hello World from block 3, thread 4
Hello World from block 3, thread 5
Hello World from block 3, thread 6
Hello World from block 3, thread 7
Hello World from blo

Program 3:Write a program to where each thread prints its thread ID along with hello world. Lauch the kernel with multiple blocks and multiple threads.

In [None]:
%%writefile cuda_device_info.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void helloWorldKernel() {
    // Get the block ID and thread ID within the block
    int blockId = blockIdx.x;
    int threadIdInBlock = threadIdx.x;

    // Get the global thread ID across all blocks
    int globalThreadId = blockId * blockDim.x + threadIdInBlock;

    printf("Hello World from block %d, thread %d (global thread ID: %d)\n", blockId, threadIdInBlock, globalThreadId);
}

int main() {
    int numBlocks = 2;          // Number of blocks
    int threadsPerBlock = 5;    // Number of threads per block

    // Launch the kernel with multiple blocks and multiple threads
    helloWorldKernel<<<numBlocks, threadsPerBlock>>>();

    // Synchronize the device
    cudaDeviceSynchronize(); // Wait for the kernel to finish

    return 0;
}


Overwriting cuda_device_info.cu


In [None]:
# Step 2: Compile the code
!nvcc cuda_device_info.cu -o cuda_device_info

In [None]:
# Step 3: Run the executable
!./cuda_device_info

Hello World from block 1, thread 0 (global thread ID: 5)
Hello World from block 1, thread 1 (global thread ID: 6)
Hello World from block 1, thread 2 (global thread ID: 7)
Hello World from block 1, thread 3 (global thread ID: 8)
Hello World from block 1, thread 4 (global thread ID: 9)
Hello World from block 0, thread 0 (global thread ID: 0)
Hello World from block 0, thread 1 (global thread ID: 1)
Hello World from block 0, thread 2 (global thread ID: 2)
Hello World from block 0, thread 3 (global thread ID: 3)
Hello World from block 0, thread 4 (global thread ID: 4)


Program 4: Write a program to where each thread prints its thread ID along with hello world. Lauch the kernel with 2D blocks and 2D threads.

In [3]:
%%writefile cuda_device_info.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void helloWorldKernel() {
    // Get the 2D thread ID within the block
    int threadIdX = threadIdx.x;
    int threadIdY = threadIdx.y;

    // Get the 2D block ID
    int blockIdX = blockIdx.x;
    int blockIdY = blockIdx.y;

    // Get the global thread ID in 2D grid
    int globalThreadIdX = blockIdX * blockDim.x + threadIdX;
    int globalThreadIdY = blockIdY * blockDim.y + threadIdY;

    printf("Hello World from block (%d, %d), thread (%d, %d) (global thread ID: (%d, %d))\n",
            blockIdX, blockIdY, threadIdX, threadIdY, globalThreadIdX, globalThreadIdY);
}

int main() {
    dim3 threadsPerBlock(2, 2); // Size of the block (2x2 threads)
    dim3 numBlocks(2, 2);       // Number of blocks (2x2 blocks)

    // Launch the kernel with 2D blocks and 2D threads
    helloWorldKernel<<<numBlocks, threadsPerBlock>>>();
    cudaDeviceSynchronize(); // Wait for the kernel to finish

    return 0;
}


Writing cuda_device_info.cu


In [4]:
# Step 2: Compile the code
!nvcc cuda_device_info.cu -o cuda_device_info

In [5]:
# Step 3: Run the executable
!./cuda_device_info

Hello World from block (0, 1), thread (0, 0) (global thread ID: (0, 2))
Hello World from block (0, 1), thread (1, 0) (global thread ID: (1, 2))
Hello World from block (0, 1), thread (0, 1) (global thread ID: (0, 3))
Hello World from block (0, 1), thread (1, 1) (global thread ID: (1, 3))
Hello World from block (0, 0), thread (0, 0) (global thread ID: (0, 0))
Hello World from block (0, 0), thread (1, 0) (global thread ID: (1, 0))
Hello World from block (0, 0), thread (0, 1) (global thread ID: (0, 1))
Hello World from block (0, 0), thread (1, 1) (global thread ID: (1, 1))
Hello World from block (1, 1), thread (0, 0) (global thread ID: (2, 2))
Hello World from block (1, 1), thread (1, 0) (global thread ID: (3, 2))
Hello World from block (1, 1), thread (0, 1) (global thread ID: (2, 3))
Hello World from block (1, 1), thread (1, 1) (global thread ID: (3, 3))
Hello World from block (1, 0), thread (0, 0) (global thread ID: (2, 0))
Hello World from block (1, 0), thread (1, 0) (global thread ID: 

In [6]:
%%writefile cuda_device_info.cu
#include <stdio.h>
#include <cuda_runtime.h>

__global__ void helloWorldKernel() {
    // Get the 2D thread ID within the block
    int threadIdX = threadIdx.x;
    int threadIdY = threadIdx.y;

    // Get the 2D block ID
    int blockIdX = blockIdx.x;
    int blockIdY = blockIdx.y;

    // Get the global thread ID in 2D grid
    int globalThreadIdX = blockIdX * blockDim.x + threadIdX;
    int globalThreadIdY = blockIdY * blockDim.y + threadIdY;

    // Calculate the 1D global ID
    int globalThreadId1D = (blockIdY * gridDim.x + blockIdX) * (blockDim.x * blockDim.y) + (threadIdY * blockDim.x) + threadIdX;

    printf("Hello World from block (%d, %d), thread (%d, %d) (global thread ID: (%d, %d), 1D global ID: %d)\n",
            blockIdX, blockIdY, threadIdX, threadIdY, globalThreadIdX, globalThreadIdY, globalThreadId1D);
}

int main() {
    dim3 threadsPerBlock(2, 2); // Size of the block (2x2 threads)
    dim3 numBlocks(2, 2);       // Number of blocks (2x2 blocks)

    // Launch the kernel with 2D blocks and 2D threads
    helloWorldKernel<<<numBlocks, threadsPerBlock>>>();
    cudaDeviceSynchronize(); // Wait for the kernel to finish

    return 0;
}


Overwriting cuda_device_info.cu


In [7]:
# Step 2: Compile the code
!nvcc cuda_device_info.cu -o cuda_device_info

In [8]:
# Step 3: Run the executable
!./cuda_device_info

Hello World from block (0, 1), thread (0, 0) (global thread ID: (0, 2), 1D global ID: 8)
Hello World from block (0, 1), thread (1, 0) (global thread ID: (1, 2), 1D global ID: 9)
Hello World from block (0, 1), thread (0, 1) (global thread ID: (0, 3), 1D global ID: 10)
Hello World from block (0, 1), thread (1, 1) (global thread ID: (1, 3), 1D global ID: 11)
Hello World from block (0, 0), thread (0, 0) (global thread ID: (0, 0), 1D global ID: 0)
Hello World from block (0, 0), thread (1, 0) (global thread ID: (1, 0), 1D global ID: 1)
Hello World from block (0, 0), thread (0, 1) (global thread ID: (0, 1), 1D global ID: 2)
Hello World from block (0, 0), thread (1, 1) (global thread ID: (1, 1), 1D global ID: 3)
Hello World from block (1, 1), thread (0, 0) (global thread ID: (2, 2), 1D global ID: 12)
Hello World from block (1, 1), thread (1, 0) (global thread ID: (3, 2), 1D global ID: 13)
Hello World from block (1, 1), thread (0, 1) (global thread ID: (2, 3), 1D global ID: 14)
Hello World from