In [None]:
 # Print the version banner of the nvcc CUDA compiler driver found on PATH.
 !nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [None]:
# Problem Statement 1:
# Execute the following program and check the properties of your GPGPU.

In [None]:
%%writefile device_query.cu

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>  // Include CUDA runtime header

/*
 * Enumerate the CUDA devices visible to the runtime and print a fixed set of
 * cudaDeviceProp fields for each of them.
 *
 * Returns EXIT_SUCCESS when the query completed (including the case where no
 * device is present) and EXIT_FAILURE when a CUDA runtime call reports an
 * error, in which case the error text is written to stderr.
 */
int main()
{
    // Initialised so that a failed query can never leave an indeterminate count.
    int deviceCount = 0;
    cudaError_t status = cudaGetDeviceCount(&deviceCount);

    if (status == cudaErrorNoDevice) {
        // "No device" is reported through the status code; fold it into the
        // zero-device path so the user still gets the friendly message below.
        deviceCount = 0;
    } else if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return EXIT_FAILURE;
    }

    if (deviceCount == 0) {
        printf("There is no device supporting CUDA\n");
        return EXIT_SUCCESS;
    }

    for (int dev = 0; dev < deviceCount; ++dev) {
        cudaDeviceProp deviceProp;
        status = cudaGetDeviceProperties(&deviceProp, dev);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties failed for device %d: %s\n",
                    dev, cudaGetErrorString(status));
            return EXIT_FAILURE;
        }

        // The summary line is printed once, before the first device's details.
        if (dev == 0) {
            if (deviceProp.major < 1) {
                printf("There is no device supporting CUDA.\n");
            } else if (deviceCount == 1) {
                printf("There is 1 device supporting CUDA\n");
            } else {
                printf("There are %d devices supporting CUDA\n", deviceCount);
            }
        }

        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
        printf("  Major revision number:                         %d\n", deviceProp.major);
        printf("  Minor revision number:                         %d\n", deviceProp.minor);
        // The size_t fields below are printed with %zu.
        printf("  Total amount of global memory:                 %zu bytes\n", deviceProp.totalGlobalMem);
        printf("  Total amount of constant memory:               %zu bytes\n", deviceProp.totalConstMem);
        printf("  Total amount of shared memory per block:       %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf("  Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf("  Warp size:                                     %d\n", deviceProp.warpSize);
        printf("  Multiprocessor count:                          %d\n", deviceProp.multiProcessorCount);
        printf("  Maximum number of threads per block:           %d\n", deviceProp.maxThreadsPerBlock);
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
        printf("  Maximum memory pitch:                          %zu bytes\n", deviceProp.memPitch);
        printf("  Texture alignment:                             %zu bytes\n", deviceProp.textureAlignment);
        // NOTE(review): clockRate appears to be marked deprecated in newer CUDA
        // toolkits - confirm before building against a release newer than 12.x.
        printf("  Clock rate:                                    %d kilohertz\n", deviceProp.clockRate);
    }

    return EXIT_SUCCESS;
}

Writing device_query.cu


In [None]:
# Compile device_query.cu with nvcc into the executable ./device_query.
!nvcc device_query.cu -o device_query

In [None]:
# Run the device_query executable from the current working directory.
!./device_query

There is 1 device supporting CUDA

Device 0: "Tesla T4"
  Major revision number:                         7
  Minor revision number:                         5
  Total amount of global memory:                 15835660288 bytes
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Multiprocessor count:                          40
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     2147483647 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Clock rate:                                    1590000 kilohertz


In [None]:
# Problem Statement 2:
# Write a program in which each thread prints its thread ID along with "Hello World". Launch the kernel with one block and multiple threads.

In [None]:
%%writefile hello_world.cu

#include <stdio.h>
#include <cuda_runtime.h>  // Include CUDA runtime header

// Kernel: every thread prints one greeting line tagged with its x-index
// inside the block (threadIdx.x).
__global__ void helloWorld() {
    const int tid = static_cast<int>(threadIdx.x);
    printf("Hello World from thread %d\n", tid);
}

/*
 * Launch helloWorld on a single block of 10 threads and wait for it to finish.
 *
 * Returns 0 on success and 1 if the launch or the synchronization reports a
 * CUDA error (the error text is written to stderr).
 */
int main() {
    helloWorld<<<1, 10>>>();

    // The launch statement itself yields no status; launch errors are
    // retrieved with cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Ensure that the kernel completes before the process exits; errors raised
    // while the kernel was running are reported here.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}

Writing hello_world.cu


In [None]:
# Compile hello_world.cu with nvcc into the executable ./hello_world.
!nvcc hello_world.cu -o hello_world

In [None]:
# Run the hello_world executable from the current working directory.
!./hello_world

Hello World from thread 0
Hello World from thread 1
Hello World from thread 2
Hello World from thread 3
Hello World from thread 4
Hello World from thread 5
Hello World from thread 6
Hello World from thread 7
Hello World from thread 8
Hello World from thread 9


In [None]:
# Problem Statement 3:
# Write a program in which each thread prints its thread ID along with "Hello World". Launch the kernel with multiple blocks and multiple threads.

In [None]:
%%writefile hello_world_blocks.cu

#include <stdio.h>
#include <cuda_runtime.h>  // Include CUDA runtime header

// Kernel: every thread prints a greeting with its grid-wide x-index
// (blockIdx.x * blockDim.x + threadIdx.x) and the index of its block.
__global__ void helloWorld() {
    const int globalId = blockDim.x * blockIdx.x + threadIdx.x;
    printf("Hello World from thread %d (Block %d)\n", globalId, blockIdx.x);
}

/*
 * Launch helloWorld on 5 blocks of 10 threads each and wait for it to finish.
 *
 * Returns 0 on success and 1 if the launch or the synchronization reports a
 * CUDA error (the error text is written to stderr).
 */
int main() {
    helloWorld<<<5, 10>>>();

    // The launch statement itself yields no status; launch errors are
    // retrieved with cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Wait for the kernel to finish; errors raised while it was running are
    // reported by this call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}

Writing hello_world_blocks.cu


In [None]:
# Compile hello_world_blocks.cu with nvcc into the executable ./hello_world_blocks.
!nvcc hello_world_blocks.cu -o hello_world_blocks

In [None]:
# Run the hello_world_blocks executable from the current working directory.
!./hello_world_blocks

Hello World from thread 40 (Block 4)
Hello World from thread 41 (Block 4)
Hello World from thread 42 (Block 4)
Hello World from thread 43 (Block 4)
Hello World from thread 44 (Block 4)
Hello World from thread 45 (Block 4)
Hello World from thread 46 (Block 4)
Hello World from thread 47 (Block 4)
Hello World from thread 48 (Block 4)
Hello World from thread 49 (Block 4)
Hello World from thread 10 (Block 1)
Hello World from thread 11 (Block 1)
Hello World from thread 12 (Block 1)
Hello World from thread 13 (Block 1)
Hello World from thread 14 (Block 1)
Hello World from thread 15 (Block 1)
Hello World from thread 16 (Block 1)
Hello World from thread 17 (Block 1)
Hello World from thread 18 (Block 1)
Hello World from thread 19 (Block 1)
Hello World from thread 30 (Block 3)
Hello World from thread 31 (Block 3)
Hello World from thread 32 (Block 3)
Hello World from thread 33 (Block 3)
Hello World from thread 34 (Block 3)
Hello World from thread 35 (Block 3)
Hello World from thread 36 (Block 3)
H

In [None]:
# Problem Statement 4:
# Write a program in which each thread prints its thread ID along with "Hello World". Launch the kernel with a 2D grid of blocks and 2D blocks of threads.

In [None]:
%%writefile hello_world_2D.cu

#include <stdio.h>
#include <cuda_runtime.h>  // Include CUDA runtime header

// Kernel: every thread prints a greeting with its grid-wide (x, y) coordinate,
// computed per axis as blockIdx * blockDim + threadIdx.
__global__ void helloWorld() {
    const int gx = blockIdx.x * blockDim.x + threadIdx.x;
    const int gy = blockIdx.y * blockDim.y + threadIdx.y;
    printf("Hello World from thread (%d, %d)\n", gx, gy);
}

/*
 * Launch helloWorld on a 2x2 grid of 4x4-thread blocks and wait for it to
 * finish.
 *
 * Returns 0 on success and 1 if the launch or the synchronization reports a
 * CUDA error (the error text is written to stderr).
 */
int main() {
    dim3 grid(2, 2);   // 2D grid (2x2 blocks)
    dim3 block(4, 4);  // 2D block (4x4 threads)
    helloWorld<<<grid, block>>>();

    // The launch statement itself yields no status; launch errors are
    // retrieved with cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "Kernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Wait for the kernel to finish; errors raised while it was running are
    // reported by this call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "Kernel execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}

Writing hello_world_2D.cu


In [None]:
# Compile hello_world_2D.cu with nvcc into the executable ./hello_world_2D.
!nvcc hello_world_2D.cu -o hello_world_2D

In [None]:
# Run the hello_world_2D executable from the current working directory.
!./hello_world_2D

Hello World from thread (4, 0)
Hello World from thread (5, 0)
Hello World from thread (6, 0)
Hello World from thread (7, 0)
Hello World from thread (4, 1)
Hello World from thread (5, 1)
Hello World from thread (6, 1)
Hello World from thread (7, 1)
Hello World from thread (4, 2)
Hello World from thread (5, 2)
Hello World from thread (6, 2)
Hello World from thread (7, 2)
Hello World from thread (4, 3)
Hello World from thread (5, 3)
Hello World from thread (6, 3)
Hello World from thread (7, 3)
Hello World from thread (4, 4)
Hello World from thread (5, 4)
Hello World from thread (6, 4)
Hello World from thread (7, 4)
Hello World from thread (4, 5)
Hello World from thread (5, 5)
Hello World from thread (6, 5)
Hello World from thread (7, 5)
Hello World from thread (4, 6)
Hello World from thread (5, 6)
Hello World from thread (6, 6)
Hello World from thread (7, 6)
Hello World from thread (4, 7)
Hello World from thread (5, 7)
Hello World from thread (6, 7)
Hello World from thread (7, 7)
Hello Wo