<a href="https://colab.research.google.com/github/Kunal726/LP5/blob/main/cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-8zt_2p_0
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-8zt_2p_0
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4287 sha256=f6b704a71b925a4511d0216ee02a2c48cd0e3e774384fce18908b0ecba560025
  Stored in directory: /tmp/pip-ephem-wheel-cache-z8ndtrgs/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collecte

In [None]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


# Addition of large vectors

In [None]:
%%cu

#include <iostream>
#include <cuda.h>

// Element-wise vector sum: c[i] = a[i] + b[i] for every i in [0, size).
//
// Each thread handles at most one element, selected by its global index
// computed from the x components of blockIdx/blockDim/threadIdx. Threads
// whose index falls at or beyond `size` exit without touching memory, so
// the launch may contain more threads than there are elements.
__global__ void vectorAddition(float* a, float* b, float* c, int size) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have nothing to do.
    if (gid >= size) {
        return;
    }

    c[gid] = a[gid] + b[gid];
}

// Checks the status of a CUDA runtime call.
// Returns true when `status` is cudaSuccess. Otherwise writes `what` and the
// runtime's description of the error to stderr and returns false.
static bool cudaOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error (" << what << "): "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Writes `title` followed by the first `count` entries of `values`,
// space-separated, and ends the line.
static void printVector(const char* title, const float* values, int count) {
    std::cout << title;
    for (int i = 0; i < count; i++) {
        std::cout << values[i] << " ";
    }
    std::cout << std::endl;
}

// Adds two small float vectors on the GPU and prints the inputs and the sum.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed
// (in which case the result vector is not printed).
int main() {
    const int size = 10;  // Number of elements in each vector
    // Byte count kept in size_t so it cannot overflow int for larger sizes.
    const size_t memSize = static_cast<size_t>(size) * sizeof(float);

    // Allocate memory on the host
    float* hostA = new float[size];
    float* hostB = new float[size];
    float* hostC = new float[size];

    // Initialize the input vectors: A = 0,1,2,...  B = 0,2,4,...
    for (int i = 0; i < size; i++) {
        hostA[i] = i;
        hostB[i] = i * 2;
    }

    // Device pointers start out null so the cudaFree calls below are safe
    // no-ops for any buffer whose allocation never happened.
    float* deviceA = nullptr;
    float* deviceB = nullptr;
    float* deviceC = nullptr;

    // Allocate device buffers and upload the inputs; && short-circuits, so
    // the first failure stops the chain.
    bool ok =
        cudaOk(cudaMalloc((void**)&deviceA, memSize), "cudaMalloc deviceA") &&
        cudaOk(cudaMalloc((void**)&deviceB, memSize), "cudaMalloc deviceB") &&
        cudaOk(cudaMalloc((void**)&deviceC, memSize), "cudaMalloc deviceC") &&
        cudaOk(cudaMemcpy(deviceA, hostA, memSize, cudaMemcpyHostToDevice),
               "copy A to device") &&
        cudaOk(cudaMemcpy(deviceB, hostB, memSize, cudaMemcpyHostToDevice),
               "copy B to device");

    if (ok) {
        // Launch enough 256-thread blocks to cover every element (ceil-div).
        const int blockSize = 256;
        const int gridSize = (size + blockSize - 1) / blockSize;
        vectorAddition<<<gridSize, blockSize>>>(deviceA, deviceB, deviceC, size);

        // A launch does not return a status itself: query it explicitly.
        // The blocking device-to-host copy then waits for the kernel and
        // reports any error raised while it was running.
        ok = cudaOk(cudaGetLastError(), "vectorAddition launch") &&
             cudaOk(cudaMemcpy(hostC, deviceC, memSize, cudaMemcpyDeviceToHost),
                    "copy result to host");
    }

    // Print the inputs, and the result only if it was actually produced
    // (hostC is uninitialized otherwise).
    printVector("Vector A : \n", hostA, size);
    printVector("\nVector B : \n", hostB, size);
    if (ok) {
        printVector("\nResult : \n", hostC, size);
    }

    // Free device memory
    cudaFree(deviceA);
    cudaFree(deviceB);
    cudaFree(deviceC);

    // Free host memory
    delete[] hostA;
    delete[] hostB;
    delete[] hostC;

    return ok ? 0 : 1;
}


Vector A : 
0 1 2 3 4 5 6 7 8 9 

Vector B : 
0 2 4 6 8 10 12 14 16 18 

Result : 
0 3 6 9 12 15 18 21 24 27 



# Matrix Multiplication

In [None]:
%%cu

#include <iostream>

// Multiplies two N x N integer matrices stored as flat row-major arrays:
// C = A * B.
//
// Each thread produces at most one output element. Its row comes from the y
// components of blockIdx/blockDim/threadIdx and its column from the x
// components. Threads that land outside the N x N output return immediately.
__global__ void matmul(int* A, int* B, int* C, int N)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Nothing to compute for threads past the matrix edge.
    if (row >= N || col >= N)
    {
        return;
    }

    const int* rowOfA = A + row * N;  // start of this thread's row in A
    const int* colOfB = B + col;      // top of this thread's column in B

    // Dot product of A's row with B's column.
    int dot = 0;
    for (int k = 0; k < N; ++k)
    {
        dot += rowOfA[k] * colOfB[k * N];
    }

    C[row * N + col] = dot;
}

// Checks the status of a CUDA runtime call.
// Returns true when `status` is cudaSuccess. Otherwise writes `what` and the
// runtime's description of the error to stderr and returns false.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << "CUDA error (" << what << "): "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Prints `title` on its own line, then the N x N row-major matrix `M`,
// one row per line with a space after every element.
static void printMatrix(const char* title, const int* M, int N)
{
    std::cout << title << std::endl;
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            std::cout << M[i * N + j] << " ";
        }
        std::cout << std::endl;
    }
}

// Multiplies two 3 x 3 integer matrices on the GPU and prints the inputs and
// the product. Returns 0 on success and 1 if any CUDA call or the kernel
// launch failed (in which case the product is not printed).
int main()
{
    const int N = 3;
    // Byte count kept in size_t so it cannot overflow int for larger N.
    const size_t size = static_cast<size_t>(N) * N * sizeof(int);

    // Allocate memory on host
    int* A = new int[N * N];
    int* B = new int[N * N];
    int* C = new int[N * N];

    // Device pointers start out null so the cudaFree calls below are safe
    // no-ops for any buffer whose allocation never happened.
    int* dev_A = nullptr;
    int* dev_B = nullptr;
    int* dev_C = nullptr;

    // Initialize matrices: A counts up in row-major order, B is its transpose.
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            A[i * N + j] = i * N + j;
            B[i * N + j] = j * N + i;
        }
    }

    // Allocate device buffers and upload the inputs; && short-circuits, so
    // the first failure stops the chain.
    bool ok =
        cudaOk(cudaMalloc((void**)&dev_A, size), "cudaMalloc dev_A") &&
        cudaOk(cudaMalloc((void**)&dev_B, size), "cudaMalloc dev_B") &&
        cudaOk(cudaMalloc((void**)&dev_C, size), "cudaMalloc dev_C") &&
        cudaOk(cudaMemcpy(dev_A, A, size, cudaMemcpyHostToDevice),
               "copy A to device") &&
        cudaOk(cudaMemcpy(dev_B, B, size, cudaMemcpyHostToDevice),
               "copy B to device");

    if (ok)
    {
        // 16 x 16 thread blocks; enough blocks in each dimension to cover
        // all N rows and columns (ceil-div).
        dim3 dimBlock(16, 16);
        dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                     (N + dimBlock.y - 1) / dimBlock.y);

        // Launch the kernel for matrix multiplication
        matmul<<<dimGrid, dimBlock>>>(dev_A, dev_B, dev_C, N);

        // A launch does not return a status itself: query it explicitly.
        // The blocking device-to-host copy then waits for the kernel and
        // reports any error raised while it was running.
        ok = cudaOk(cudaGetLastError(), "matmul launch") &&
             cudaOk(cudaMemcpy(C, dev_C, size, cudaMemcpyDeviceToHost),
                    "copy result to host");
    }

    // Print the inputs, and the product only if it was actually produced
    // (C is uninitialized otherwise).
    printMatrix("Matrix A:", A, N);
    printMatrix("Matrix B:", B, N);
    if (ok)
    {
        printMatrix("Resultant Matrix:", C, N);
    }

    // Free device memory
    cudaFree(dev_A);
    cudaFree(dev_B);
    cudaFree(dev_C);

    // Free host memory
    delete[] A;
    delete[] B;
    delete[] C;

    return ok ? 0 : 1;
}


Matrix A:
0 1 2 
3 4 5 
6 7 8 
Matrix B:
0 3 6 
1 4 7 
2 5 8 
Resultant Matrix:
5 14 23 
14 50 86 
23 86 149 

