In [1]:
!nvidia-smi

Sun Nov  3 14:57:27 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...    Off |   00000000:01:00.0 Off |                  N/A |
| N/A   41C    P3             17W /   30W |      14MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Thu_Nov_18_09:45:30_PST_2021
Cuda compilation tools, release 11.5, V11.5.119
Build cuda_11.5.r11.5/compiler.30672275_0


In [3]:
!git clone https://github.com/NVIDIA/cuda-samples.git

Cloning into 'cuda-samples'...
remote: Enumerating objects: 19507, done.
remote: Counting objects: 100% (10080/10080), done.
remote: Compressing objects: 100% (569/569), done.
remote: Total 19507 (delta 9605), reused 9714 (delta 9511), pack-reused 9427 (from 1)
Receiving objects: 100% (19507/19507), 133.56 MiB | 5.80 MiB/s, done.
Resolving deltas: 100% (17060/17060), done.
Updating files: 100% (4026/4026), done.


In [4]:
!cd cuda-samples/Samples/1_Utilities/deviceQuery && make

/usr/local/cuda/bin/nvcc -ccbin g++ -I../../../Common -m64 --threads 0 --std=c++11 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 -o deviceQuery.o -c deviceQuery.cpp


/usr/local/cuda/bin/nvcc -ccbin g++ -m64 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90 -o deviceQuery deviceQuery.o 
mkdir -p ../../../bin/x86_64/linux/release
cp deviceQuery ../../../bin/x86_64/linux/release


In [5]:
!cd cuda-samples/Samples/1_Utilities/deviceQuery && ls
!cuda-samples/Samples/1_Utilities/deviceQuery/./deviceQuery

deviceQuery		deviceQuery_vs2017.vcxproj  deviceQuery_vs2022.vcxproj
deviceQuery.cpp		deviceQuery_vs2019.sln	    Makefile
deviceQuery.o		deviceQuery_vs2019.vcxproj  NsightEclipse.xml
deviceQuery_vs2017.sln	deviceQuery_vs2022.sln	    README.md


cuda-samples/Samples/1_Utilities/deviceQuery/./deviceQuery Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "NVIDIA GeForce RTX 3050 Laptop GPU"
  CUDA Driver Version / Runtime Version          12.6 / 12.6
  CUDA Capability Major/Minor version number:    8.6
  Total amount of global memory:                 3801 MBytes (3985833984 bytes)
  (016) Multiprocessors, (128) CUDA Cores/MP:    2048 CUDA Cores
  GPU Max Clock rate:                            1500 MHz (1.50 GHz)
  Memory Clock rate:                             6001 Mhz
  Memory Bus Width:                              128-bit
  L2 Cache Size:                                 1572864 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount

In [6]:
!pip install nvcc4jupyter

Defaulting to user installation because normal site-packages is not writeable


In [7]:
%load_ext nvcc4jupyter

Source files will be saved in "/tmp/tmp73rf_o0i".


In [17]:
%%cuda

// 1D convolution of an array (signal) with a small filter kernel

#include <iostream>
#include <cuda_runtime.h>

/**
 * 1D convolution (correlation form) with out-of-range samples treated as zero.
 *
 * Each thread computes one output element:
 *   output[idx] = sum over k in [0, kernelSize) of
 *                 input[idx + k - kernelSize / 2] * kernel[k]
 * Taps whose input index falls outside [0, dataSize) are skipped.
 *
 * Launch layout: 1D grid of 1D blocks providing at least dataSize threads;
 * surplus threads return without touching memory.
 *
 * @param input      device-accessible buffer of dataSize floats
 * @param kernel     device-accessible buffer of kernelSize filter taps
 * @param output     device-accessible buffer receiving dataSize floats
 * @param dataSize   number of input/output elements
 * @param kernelSize number of filter taps (odd or even)
 */
__global__ void conv1D(float* input, float* kernel, float* output, int dataSize, int kernelSize) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= dataSize) {
        return;
    }

    int halfKernel = kernelSize / 2;
    float sum = 0.0f;

    // Iterate over the taps themselves so kernel[] is never indexed past
    // kernelSize - 1. Looping an offset over [-halfKernel, halfKernel] visits
    // 2 * halfKernel + 1 taps, which is one too many when kernelSize is even.
    // For odd kernelSize the visited taps and their order are unchanged.
    for (int k = 0; k < kernelSize; k++) {
        int index = idx + k - halfKernel;
        if (index >= 0 && index < dataSize) {
            sum += input[index] * kernel[k];
        }
    }
    output[idx] = sum;
}

/**
 * Host driver for conv1D: builds a 1024-element signal and a 5-tap averaging
 * filter, runs the kernel once, and prints the first few output samples.
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main() {
    const int dataSize = 1024;
    const int kernelSize = 5;
    const size_t dataBytes = dataSize * sizeof(float);
    const size_t kernelBytes = kernelSize * sizeof(float);

    float *h_input = new float[dataSize];
    float *h_kernel = new float[kernelSize];
    float *h_output = new float[dataSize];

    // Give the inputs defined values; new[] leaves float arrays uninitialized.
    for (int i = 0; i < dataSize; i++) {
        h_input[i] = static_cast<float>(i % 10);
    }
    for (int k = 0; k < kernelSize; k++) {
        h_kernel[k] = 1.0f / kernelSize;
    }

    // Prints a diagnostic for a failed CUDA call; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            return false;
        }
        return true;
    };

    // Null-initialized so the cudaFree calls at the end are safe on every path.
    float *d_input = nullptr, *d_kernel = nullptr, *d_output = nullptr;

    bool good = ok(cudaMalloc(&d_input, dataBytes), "cudaMalloc(d_input)")
             && ok(cudaMalloc(&d_kernel, kernelBytes), "cudaMalloc(d_kernel)")
             && ok(cudaMalloc(&d_output, dataBytes), "cudaMalloc(d_output)")
             && ok(cudaMemcpy(d_input, h_input, dataBytes, cudaMemcpyHostToDevice), "copy input to device")
             && ok(cudaMemcpy(d_kernel, h_kernel, kernelBytes, cudaMemcpyHostToDevice), "copy kernel to device");

    if (good) {
        int threadsPerBlock = 256;
        int blocksPerGrid = (dataSize + threadsPerBlock - 1) / threadsPerBlock;
        conv1D<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_kernel, d_output, dataSize, kernelSize);

        // cudaGetLastError reports launch errors; the blocking cudaMemcpy that
        // follows reports errors raised while the kernel was running.
        good = ok(cudaGetLastError(), "conv1D launch")
            && ok(cudaMemcpy(h_output, d_output, dataBytes, cudaMemcpyDeviceToHost), "copy output to host");
    }

    if (good) {
        std::cout << "output[0..4]:";
        for (int i = 0; i < 5; i++) {
            std::cout << " " << h_output[i];
        }
        std::cout << std::endl;
    }

    delete[] h_input;
    delete[] h_kernel;
    delete[] h_output;
    cudaFree(d_input);
    cudaFree(d_kernel);
    cudaFree(d_output);

    return good ? 0 : 1;
}





In [9]:
%%cuda

// matrix to vector and perform element-wise operation

#include <iostream>
#include <cuda_runtime.h>

/**
 * Copies a flattened rows x cols matrix into a vector, doubling each element.
 *
 * One thread handles one element; expects a 1D launch with at least
 * rows * cols threads. Threads beyond the data do nothing.
 */
__global__ void matrixToVector(float* matrix, float* vector, int rows, int cols) {
    const int elementCount = rows * cols;
    const int flat = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (flat >= elementCount) {
        return;
    }

    vector[flat] = 2.0f * matrix[flat]; // can use any other operation
}

/**
 * Host driver for matrixToVector: fills a 32 x 32 matrix, runs the kernel over
 * its flattened storage, and prints the first few results.
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main() {
    const int rows = 32, cols = 32;
    const int count = rows * cols;
    const size_t size = count * sizeof(float);

    float *h_matrix = new float[count];
    float *h_vector = new float[count];

    // Give the input defined values; new[] leaves float arrays uninitialized.
    for (int i = 0; i < count; i++) {
        h_matrix[i] = static_cast<float>(i);
    }

    // Prints a diagnostic for a failed CUDA call; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            return false;
        }
        return true;
    };

    // Null-initialized so the cudaFree calls at the end are safe on every path.
    float *d_matrix = nullptr, *d_vector = nullptr;

    bool good = ok(cudaMalloc(&d_matrix, size), "cudaMalloc(d_matrix)")
             && ok(cudaMalloc(&d_vector, size), "cudaMalloc(d_vector)")
             && ok(cudaMemcpy(d_matrix, h_matrix, size, cudaMemcpyHostToDevice), "copy matrix to device");

    if (good) {
        int threadsPerBlock = 256;
        int blocksPerGrid = (count + threadsPerBlock - 1) / threadsPerBlock;
        matrixToVector<<<blocksPerGrid, threadsPerBlock>>>(d_matrix, d_vector, rows, cols);

        // Launch errors come from cudaGetLastError; execution errors surface
        // at the blocking copy back to the host.
        good = ok(cudaGetLastError(), "matrixToVector launch")
            && ok(cudaMemcpy(h_vector, d_vector, size, cudaMemcpyDeviceToHost), "copy vector to host");
    }

    if (good) {
        std::cout << "vector[0..3]:";
        for (int i = 0; i < 4; i++) {
            std::cout << " " << h_vector[i];
        }
        std::cout << std::endl;
    }

    delete[] h_matrix;
    delete[] h_vector;
    cudaFree(d_matrix);
    cudaFree(d_vector);

    return good ? 0 : 1;
}





In [10]:
%%cuda

// apply a non-linear function (sigmoid) element-wise to the input

#include <iostream>
#include <cuda_runtime.h>

/**
 * Element-wise logistic sigmoid: output[i] = 1 / (1 + exp(-input[i])).
 *
 * One thread per element; expects a 1D launch with at least `size` threads.
 * Threads beyond the data do nothing.
 */
__global__ void applySigmoid(float* input, float* output, int size) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (i >= size) {
        return;
    }

    const float x = input[i];
    const float denominator = 1.0f + expf(-x);
    output[i] = 1.0f / denominator;
}

/**
 * Host driver for applySigmoid: evaluates the sigmoid on 1024 samples spread
 * over [-4, 4) and prints the value at the midpoint (input 0).
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main() {
    const int size = 1024;
    const size_t bytes = size * sizeof(float);

    float *h_input = new float[size];
    float *h_output = new float[size];

    // Give the input defined values; new[] leaves float arrays uninitialized.
    for (int i = 0; i < size; i++) {
        h_input[i] = (i - size / 2) / 128.0f;
    }

    // Prints a diagnostic for a failed CUDA call; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            return false;
        }
        return true;
    };

    // Null-initialized so the cudaFree calls at the end are safe on every path.
    float *d_input = nullptr, *d_output = nullptr;

    bool good = ok(cudaMalloc(&d_input, bytes), "cudaMalloc(d_input)")
             && ok(cudaMalloc(&d_output, bytes), "cudaMalloc(d_output)")
             && ok(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice), "copy input to device");

    if (good) {
        int threadsPerBlock = 256;
        int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
        applySigmoid<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, size);

        // Launch errors come from cudaGetLastError; execution errors surface
        // at the blocking copy back to the host.
        good = ok(cudaGetLastError(), "applySigmoid launch")
            && ok(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost), "copy output to host");
    }

    if (good) {
        std::cout << "sigmoid(" << h_input[size / 2] << ") = " << h_output[size / 2] << std::endl;
    }

    delete[] h_input;
    delete[] h_output;
    cudaFree(d_input);
    cudaFree(d_output);

    return good ? 0 : 1;
}





In [11]:
%%cuda

// stream compaction, stage 1: flag each element with 1/0 based on a boolean condition

#include <iostream>
#include <cuda_runtime.h>

/**
 * Predicate stage of a stream compaction: writes 1 to output[i] when
 * input[i] > 0.5f and 0 otherwise.
 *
 * One thread per element; expects a 1D launch with at least `size` threads.
 * Threads beyond the data do nothing.
 */
__global__ void streamCompactStage1(float* input, int* output, int size) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (i >= size) {
        return;
    }

    // Example boolean condition
    if (input[i] > 0.5f) {
        output[i] = 1;
    } else {
        output[i] = 0;
    }
}

/**
 * Host driver for streamCompactStage1: flags 1024 samples against the kernel's
 * condition and prints how many were flagged.
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main() {
    const int size = 1024;

    float *h_input = new float[size];
    int *h_output = new int[size];

    // Give the input defined values (0, 0.25, 0.5, 0.75 repeating); new[]
    // leaves float arrays uninitialized.
    for (int i = 0; i < size; i++) {
        h_input[i] = (i % 4) * 0.25f;
    }

    // Prints a diagnostic for a failed CUDA call; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            return false;
        }
        return true;
    };

    // Null-initialized so the cudaFree calls at the end are safe on every path.
    float *d_input = nullptr;
    int *d_output = nullptr;

    bool good = ok(cudaMalloc(&d_input, size * sizeof(float)), "cudaMalloc(d_input)")
             && ok(cudaMalloc(&d_output, size * sizeof(int)), "cudaMalloc(d_output)")
             && ok(cudaMemcpy(d_input, h_input, size * sizeof(float), cudaMemcpyHostToDevice), "copy input to device");

    if (good) {
        int threadsPerBlock = 256;
        int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
        streamCompactStage1<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, size);

        // Launch errors come from cudaGetLastError; execution errors surface
        // at the blocking copy back to the host.
        good = ok(cudaGetLastError(), "streamCompactStage1 launch")
            && ok(cudaMemcpy(h_output, d_output, size * sizeof(int), cudaMemcpyDeviceToHost), "copy flags to host");
    }

    if (good) {
        int flagged = 0;
        for (int i = 0; i < size; i++) {
            flagged += h_output[i];
        }
        std::cout << "flagged elements: " << flagged << " of " << size << std::endl;
    }

    delete[] h_input;
    delete[] h_output;
    cudaFree(d_input);
    cudaFree(d_output);

    return good ? 0 : 1;
}





In [12]:
%%cuda

// matrix-vector multiplication

#include <iostream>
#include <cuda_runtime.h>

/**
 * Matrix-vector product output = matrix * vector for a row-major
 * rows x cols matrix.
 *
 * One thread computes one output row; expects a 1D launch with at least
 * `rows` threads. Threads beyond the last row do nothing.
 */
__global__ void matrixVectorMultiply(float* matrix, float* vector, float* output, int rows, int cols) {
    const int r = threadIdx.x + blockDim.x * blockIdx.x;

    // Surplus threads in the last block have no row to process.
    if (r >= rows) {
        return;
    }

    const int rowStart = r * cols;  // offset of this row in the flat matrix
    float acc = 0.0f;
    int c = 0;
    while (c < cols) {
        acc += matrix[rowStart + c] * vector[c];
        ++c;
    }
    output[r] = acc;
}

/**
 * Host driver for matrixVectorMultiply: multiplies a 32 x 32 matrix whose row
 * r is filled with the value r by an all-ones vector and prints a few results
 * (row r should yield r * cols).
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main() {
    const int rows = 32, cols = 32;
    const size_t matrixBytes = rows * cols * sizeof(float);
    const size_t vectorBytes = cols * sizeof(float);
    const size_t outputBytes = rows * sizeof(float);

    float *h_matrix = new float[rows * cols];
    float *h_vector = new float[cols];
    float *h_output = new float[rows];

    // Give the inputs defined values; new[] leaves float arrays uninitialized.
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            h_matrix[r * cols + c] = static_cast<float>(r);
        }
    }
    for (int c = 0; c < cols; c++) {
        h_vector[c] = 1.0f;
    }

    // Prints a diagnostic for a failed CUDA call; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            return false;
        }
        return true;
    };

    // Null-initialized so the cudaFree calls at the end are safe on every path.
    float *d_matrix = nullptr, *d_vector = nullptr, *d_output = nullptr;

    bool good = ok(cudaMalloc(&d_matrix, matrixBytes), "cudaMalloc(d_matrix)")
             && ok(cudaMalloc(&d_vector, vectorBytes), "cudaMalloc(d_vector)")
             && ok(cudaMalloc(&d_output, outputBytes), "cudaMalloc(d_output)")
             && ok(cudaMemcpy(d_matrix, h_matrix, matrixBytes, cudaMemcpyHostToDevice), "copy matrix to device")
             && ok(cudaMemcpy(d_vector, h_vector, vectorBytes, cudaMemcpyHostToDevice), "copy vector to device");

    if (good) {
        int threadsPerBlock = 256;
        int blocksPerGrid = (rows + threadsPerBlock - 1) / threadsPerBlock;
        matrixVectorMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_matrix, d_vector, d_output, rows, cols);

        // Launch errors come from cudaGetLastError; execution errors surface
        // at the blocking copy back to the host.
        good = ok(cudaGetLastError(), "matrixVectorMultiply launch")
            && ok(cudaMemcpy(h_output, d_output, outputBytes, cudaMemcpyDeviceToHost), "copy output to host");
    }

    if (good) {
        std::cout << "output[0..3]:";
        for (int r = 0; r < 4; r++) {
            std::cout << " " << h_output[r];
        }
        std::cout << std::endl;
    }

    delete[] h_matrix;
    delete[] h_vector;
    delete[] h_output;
    cudaFree(d_matrix);
    cudaFree(d_vector);
    cudaFree(d_output);

    return good ? 0 : 1;
}





In [None]:
%%cuda

// sum reduction on a flattened matrix (one partial sum per thread block)

#include <iostream>
#include <cuda_runtime.h>

/**
 * Block-level sum reduction: each block adds up its blockDim.x-wide slice of
 * `input` and stores the result in output[blockIdx.x].
 *
 * Requires blockDim.x * sizeof(float) bytes of dynamic shared memory.
 * The halving loop pairs element t with element t + stride, so it covers every
 * element only when blockDim.x is a power of two.
 * Slots past `size` contribute 0.
 */
__global__ void matrixReduction(float* input, float* output, int size) {
    extern __shared__ float sharedData[];

    const int lane = threadIdx.x;
    const int globalIdx = blockDim.x * blockIdx.x + lane;

    // Stage one value per thread; pad the tail of the last block with zeros.
    if (globalIdx < size) {
        sharedData[lane] = input[globalIdx];
    } else {
        sharedData[lane] = 0.0f;
    }
    __syncthreads();

    // Fold the upper half onto the lower half until one value remains.
    int stride = blockDim.x / 2;
    while (stride > 0) {
        if (lane < stride) {
            sharedData[lane] = sharedData[lane] + sharedData[lane + stride];
        }
        // Every thread reaches the barrier, including those that did not add.
        __syncthreads();
        stride /= 2;
    }

    // The first thread of the block publishes the block's partial sum.
    if (lane == 0) {
        output[blockIdx.x] = sharedData[0];
    }
}


/**
 * Host driver for matrixReduction: sums a 32 x 32 matrix of ones by reducing
 * per block on the device and adding the per-block partial sums on the host,
 * then prints the total (expected: rows * cols).
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main() {
    const int rows = 32, cols = 32;
    const int size = rows * cols;

    // Launch geometry is fixed first so every buffer size derives from it.
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    float *h_matrix = new float[size];
    float *h_output = new float[blocksPerGrid];

    // Give the input defined values; new[] leaves float arrays uninitialized.
    for (int i = 0; i < size; i++) {
        h_matrix[i] = 1.0f;
    }

    // Prints a diagnostic for a failed CUDA call; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            return false;
        }
        return true;
    };

    // Null-initialized so the cudaFree calls at the end are safe on every path.
    float *d_matrix = nullptr, *d_partialOutput = nullptr;

    bool good = ok(cudaMalloc(&d_matrix, size * sizeof(float)), "cudaMalloc(d_matrix)")
             && ok(cudaMalloc(&d_partialOutput, blocksPerGrid * sizeof(float)), "cudaMalloc(d_partialOutput)")
             && ok(cudaMemcpy(d_matrix, h_matrix, size * sizeof(float), cudaMemcpyHostToDevice), "copy matrix to device");

    if (good) {
        // Third launch argument: one float of dynamic shared memory per thread.
        matrixReduction<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(float)>>>(d_matrix, d_partialOutput, size);

        // Launch errors come from cudaGetLastError; execution errors surface
        // at the blocking copy back to the host.
        good = ok(cudaGetLastError(), "matrixReduction launch")
            && ok(cudaMemcpy(h_output, d_partialOutput, blocksPerGrid * sizeof(float), cudaMemcpyDeviceToHost), "copy partial sums to host");
    }

    if (good) {
        // perform only if needed or specified in the question
        float totalSum = 0.0f;
        for (int i = 0; i < blocksPerGrid; i++) {
            totalSum += h_output[i];
        }
        std::cout << "total sum: " << totalSum << std::endl;
    }

    delete[] h_matrix;
    delete[] h_output;
    cudaFree(d_matrix);
    cudaFree(d_partialOutput);

    return good ? 0 : 1;
}


In [None]:
%%cuda

// matrix transpose using shared memory

#include <cuda_runtime.h>
#include <iostream>

#define TILE_DIM 32
#define BLOCK_ROWS 8

/**
 * Transposes a row-major matrix with `height` rows and `width` columns into a
 * row-major matrix with `width` rows and `height` columns, staging one
 * TILE_DIM x TILE_DIM tile per block in shared memory.
 *
 * Launch layout: blockDim.x must equal TILE_DIM; blockDim.y may be any value
 * from 1 to TILE_DIM (e.g. BLOCK_ROWS), because each thread strides over the
 * tile rows in steps of blockDim.y. The grid needs ceil(width / TILE_DIM) x
 * ceil(height / TILE_DIM) blocks.
 *
 * The tile is padded by one column (TILE_DIM + 1) so that the column-wise
 * reads in the store phase do not all hit the same shared-memory bank.
 *
 * @param input  device-accessible source, height * width floats
 * @param output device-accessible destination, width * height floats
 * @param width  number of columns of the input
 * @param height number of rows of the input
 */
__global__ void transposeSharedMemory(float *input, float *output, int width, int height) {
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];

    // Load phase: cover all TILE_DIM rows of the tile even when the block has
    // fewer than TILE_DIM rows of threads. Moving only one element per thread
    // would leave most of the tile unloaded for a TILE_DIM x BLOCK_ROWS block.
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    for (int j = threadIdx.y; j < TILE_DIM; j += blockDim.y) {
        int y = blockIdx.y * TILE_DIM + j;
        if (x < width && y < height) {
            tile[j][threadIdx.x] = input[y * width + x];
        }
    }

    // All loads must be visible before any thread reads another thread's data.
    __syncthreads();

    // Store phase: block indices are swapped so the tile lands at its
    // transposed position; the tile itself is read transposed.
    x = blockIdx.y * TILE_DIM + threadIdx.x;
    for (int j = threadIdx.y; j < TILE_DIM; j += blockDim.y) {
        int y = blockIdx.x * TILE_DIM + j;
        if (x < height && y < width) {
            output[y * height + x] = tile[threadIdx.x][j];
        }
    }
}

/**
 * Host driver for transposeSharedMemory: transposes a 64 x 64 matrix holding
 * 0, 1, 2, ... in row-major order, prints the top-left 8 x 8 corner of the
 * result, and checks every element against the expected transpose.
 *
 * Returns 0 when all CUDA calls succeed and the result matches, 1 otherwise.
 */
int main() {
    const int width = 64;
    const int height = 64;
    const int count = width * height;
    const size_t bytes = count * sizeof(float);

    float *h_input = new float[count];
    float *h_output = new float[count];
    for (int i = 0; i < count; i++) {
        h_input[i] = static_cast<float>(i);
    }

    // Prints a diagnostic for a failed CUDA call; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            return false;
        }
        return true;
    };

    // Null-initialized so the cudaFree calls at the end are safe on every path.
    float *d_input = nullptr, *d_output = nullptr;

    bool good = ok(cudaMalloc(&d_input, bytes), "cudaMalloc(d_input)")
             && ok(cudaMalloc(&d_output, bytes), "cudaMalloc(d_output)")
             && ok(cudaMemcpy(d_input, h_input, bytes, cudaMemcpyHostToDevice), "copy input to device");

    if (good) {
        dim3 threadsPerBlock(TILE_DIM, BLOCK_ROWS);
        dim3 blocksPerGrid((width + TILE_DIM - 1) / TILE_DIM, (height + TILE_DIM - 1) / TILE_DIM);

        transposeSharedMemory<<<blocksPerGrid, threadsPerBlock>>>(d_input, d_output, width, height);

        // Launch errors come from cudaGetLastError; execution errors surface
        // at the blocking copy back to the host.
        good = ok(cudaGetLastError(), "transposeSharedMemory launch")
            && ok(cudaMemcpy(h_output, d_output, bytes, cudaMemcpyDeviceToHost), "copy output to host");
    }

    if (good) {
        std::cout << "Transposed matrix:\n";
        for (int row = 0; row < 8; row++) {
            for (int col = 0; col < 8; col++) {
                std::cout << h_output[row * height + col] << " ";
            }
            std::cout << "\n";
        }

        // The printed corner alone cannot prove correctness, so compare the
        // whole result: element (x, y) of the output must equal element
        // (y, x) of the input.
        int mismatches = 0;
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                if (h_output[x * height + y] != h_input[y * width + x]) {
                    mismatches++;
                }
            }
        }
        if (mismatches != 0) {
            std::cerr << "transpose check failed: " << mismatches << " of "
                      << count << " elements differ" << std::endl;
            good = false;
        }
    }

    delete[] h_input;
    delete[] h_output;
    cudaFree(d_input);
    cudaFree(d_output);

    return good ? 0 : 1;
}


In [None]:
%%cuda

// matrix-matrix multiplication using standard CUDA kernel with flattened matrices

#include <iostream>
#include <cuda_runtime.h>

/**
 * Dense product C = A * B of two n x n matrices stored row-major in flat
 * arrays.
 *
 * One thread computes one element of C: the x dimension of the launch maps to
 * columns and the y dimension to rows. Expects a 2D launch covering at least
 * n x n threads; threads outside the matrix do nothing.
 */
__global__ void matrixMultiply(float* A, float* B, float* C, int n) {
    const int c = threadIdx.x + blockDim.x * blockIdx.x;
    const int r = threadIdx.y + blockDim.y * blockIdx.y;

    // Surplus threads along either axis have no element to compute.
    if (r >= n || c >= n) {
        return;
    }

    // Dot product of row r of A with column c of B.
    float acc = 0.0f;
    int step = 0;
    while (step < n) {
        acc += A[r * n + step] * B[step * n + c];
        ++step;
    }
    C[r * n + c] = acc;
}

/**
 * Host driver for matrixMultiply: multiplies two N x N matrices (N = 32, i.e.
 * 1024 elements each) stored as flat row-major arrays and prints C[0][0].
 *
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main() {
    // Matrix dimension passed to the kernel; `size` is the element count of
    // one matrix. The kernel indexes up to N * N elements, so the two must
    // stay consistent.
    const int N = 32;
    const int size = N * N;
    const size_t bytes = size * sizeof(float);

    // Heap buffers instead of variable-length arrays, which are not standard C++.
    float *h_A = new float[size];
    float *h_B = new float[size];
    float *h_C = new float[size];

    for (int i = 0; i < size; i++) {
        h_A[i] = static_cast<float>(i % 100);
        h_B[i] = static_cast<float>((i + 1) % 100);
    }

    // Prints a diagnostic for a failed CUDA call; returns true on success.
    auto ok = [](cudaError_t status, const char* what) -> bool {
        if (status != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
            return false;
        }
        return true;
    };

    // Null-initialized so the cudaFree calls at the end are safe on every path.
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;

    bool good = ok(cudaMalloc(&d_A, bytes), "cudaMalloc(d_A)")
             && ok(cudaMalloc(&d_B, bytes), "cudaMalloc(d_B)")
             && ok(cudaMalloc(&d_C, bytes), "cudaMalloc(d_C)")
             && ok(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice), "copy A to device")
             && ok(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice), "copy B to device");

    if (good) {
        // The grid covers N rows and N columns, not `size` of them.
        dim3 threadsPerBlock(16, 16);
        dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                           (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

        matrixMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);

        good = ok(cudaGetLastError(), "matrixMultiply launch")
            && ok(cudaDeviceSynchronize(), "matrixMultiply execution")
            && ok(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (good) {
        std::cout << "C[0][0] = " << h_C[0] << std::endl;
    }

    delete[] h_A;
    delete[] h_B;
    delete[] h_C;
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    return good ? 0 : 1;
}

In [None]:
%%cuda

// using the Thrust library to perform matrix-vector multiplication

#include <iostream>
#include <cuda_runtime.h>

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/inner_product.h>

const int rows = 32;
const int cols = 32;

/**
 * Functor that computes one entry of a matrix-vector product: the dot product
 * of row `row_idx` of a row-major matrix with a vector of `cols` elements.
 *
 * Both pointers are dereferenced in device code, so they must refer to
 * device-accessible memory that outlives the algorithm using the functor.
 */
struct RowDotProduct {
    const float* matrix;  // row-major matrix, `cols` elements per row
    const float* vector;  // vector with `cols` elements
    int cols;

    RowDotProduct(const float* mat, const float* vec, int cols)
        : matrix(mat), vector(vec), cols(cols) {}

    __device__ float operator()(const int& row_idx) const {
        // The row comes from the matrix, never from the vector buffer, which
        // holds only `cols` elements.
        const float* row = matrix + row_idx * cols;
        float sum = 0.0f;
        for (int j = 0; j < cols; j++) {
            sum += row[j] * vector[j];
        }
        return sum;
    }
};

/**
 * Multiplies a rows x cols matrix whose row r is filled with the value r by an
 * all-ones vector using thrust::transform over the row indices, then prints
 * each row's result (row r should yield r * cols).
 */
int main() {
    float h_matrix[rows * cols];
    float h_vector[cols];
    float h_output[rows];

    // Give the inputs defined values; local arrays start out uninitialized.
    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            h_matrix[r * cols + c] = static_cast<float>(r);
        }
    }
    for (int c = 0; c < cols; c++) {
        h_vector[c] = 1.0f;
    }

    thrust::device_vector<float> d_matrix(h_matrix, h_matrix + rows * cols);
    thrust::device_vector<float> d_vector(h_vector, h_vector + cols);
    thrust::device_vector<float> d_output(rows);

    // One functor invocation per row index 0 .. rows - 1.
    thrust::transform(
        thrust::make_counting_iterator(0),
        thrust::make_counting_iterator(rows),
        d_output.begin(),
        RowDotProduct(thrust::raw_pointer_cast(d_matrix.data()),
                      thrust::raw_pointer_cast(d_vector.data()),
                      cols)
    );

    thrust::copy(d_output.begin(), d_output.end(), h_output);

    for (int i = 0; i < rows; i++) {
        std::cout << "Row " << i << " result: " << h_output[i] << std::endl;
    }

    return 0;
}
