# PCAP Lab - 6: CUDA

<h4>Parthivi Choubey</h4>

*180905456*

CSE - B - 6th sem

Roll no.: 60

# CUDA installation and update

In [None]:
%%bash
# Remove any CUDA / NVIDIA packages that are already installed.
# -y: a %%bash cell has no interactive stdin, so apt's confirmation prompt
#     could never be answered and the command would abort.
# The package patterns are quoted so the shell hands them to apt-get verbatim
# instead of expanding them against files in the current directory.
apt-get --purge remove -y cuda 'nvidia*' 'libnvidia-*'
# -r: skip dpkg entirely when no cuda-* package is listed.
dpkg -l | grep cuda- | awk '{print $2}' | xargs -r -n1 dpkg --purge
apt-get remove -y 'cuda-*'
apt autoremove -y
apt-get update

In [None]:
%%bash
# Download the CUDA 9.2 local repository package, register it with apt and
# install the toolkit from it.
# Every step depends on the previous one, so stop at the first failure.
set -e
wget https://developer.nvidia.com/compute/cuda/9.2/Prod/local_installers/cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64 -O cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
dpkg -i cuda-repo-ubuntu1604-9-2-local_9.2.88-1_amd64.deb
apt-key add /var/cuda-repo-9-2-local/7fa2af80.pub
apt-get update
# -y: the cell is non-interactive, so the install prompt cannot be confirmed.
apt-get install -y cuda-9.2

In [None]:
!nvcc --version
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Wed_Jul_22_19:09:09_PDT_2020
Cuda compilation tools, release 11.0, V11.0.221
Build cuda_11.0_bu.TC445_37.28845127_0
Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-5nipz6r8
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-5nipz6r8
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-cp37-none-any.whl size=4307 sha256=12919c4982e338a94fb41619ae98037496816c7089f55cf454235c9d1ac8feb8
  Stored in directory: /tmp/pip-ephem-wheel-cache-7jigab20/wheels/10/c2/05/ca241da37bff77d60d31a9174f988109c61ba989e4d4650516
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2
created output directory

# Question 1

Write a CUDA program to add two matrices according to each of the following specifications:  
	
    a. Each row of resultant matrix to be computed by one thread.  
	
    b. Each column of resultant matrix to be computed by one thread.  
	
    c. Each element of resultant matrix to be computed by one thread. 

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

/*
 * Kernel: C = A + B, one thread per matrix row.
 *
 * A, B and C are flat int buffers indexed as row * M + column, where M is the
 * number of ints per row. The row handled by a thread is its threadIdx.x;
 * blockIdx is not consulted and the row count is not checked, so the launch
 * must supply exactly one thread per row.
 */
__global__ void add_row(int *A, int *B, int *C , int M)
{
    const int rowStart = threadIdx.x * M;   // offset of this thread's row
    int col = 0;
    while (col < M)
    {
        const int idx = rowStart + col;
        C[idx] = A[idx] + B[idx];
        ++col;
    }
}

/*
 * Kernel: C = A + B, one thread per matrix column.
 *
 * A, B and C are flat int buffers indexed as row * M + column. The column
 * handled by a thread is its threadIdx.x. M is used both as the row stride and
 * as the number of rows visited, so the kernel covers M rows of M ints each
 * (i.e. it treats the matrix as M x M).
 */
__global__ void add_col(int *A, int *B, int *C , int M)
{
    const int col = threadIdx.x;
    int row = 0;
    while (row < M)
    {
        const int idx = row * M + col;
        C[idx] = A[idx] + B[idx];
        ++row;
    }
}

/*
 * Kernel: C = A + B, one thread per matrix element.
 *
 * A, B and C are flat int buffers; each thread adds the single element whose
 * flat index equals the thread's global linear id.
 *
 * The id is built from both threadIdx.x and threadIdx.y so the kernel is
 * correct for 1-D blocks and for 2-D blocks such as dim3(N, M). Using only
 * threadIdx.x with a 2-D block would touch just the first blockDim.x elements.
 * For a 1-D block (blockDim.y == 1) the id reduces to
 * blockIdx.x * blockDim.x + threadIdx.x.
 *
 * M is accepted for signature compatibility and is not used. There is no
 * bounds check, so the launch must supply exactly one thread per element.
 */
__global__ void add_matrix(int *A, int *B, int *C , int M)
{
    const int threadsPerBlock = blockDim.x * blockDim.y;
    const int localId = threadIdx.y * blockDim.x + threadIdx.x;
    const int id = blockIdx.x * threadsPerBlock + localId;
    C[id]=A[id]+B[id];
}

/* Abort with a readable message when a CUDA runtime call reports an error. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Print a rows x cols matrix stored as a flat array, one row per line. */
static void printMatrix(const int *mat, int rows, int cols)
{
    for(int i=0;i<rows;i++)
    {
        for(int j=0;j<cols;j++)
            printf("%d ",mat[i*cols+j]);
        printf("\n");
    }
}

/*
 * Fills two N x M matrices with random digits, adds them on the GPU with each
 * of the three kernels (per row, per column, per element) and prints every
 * result.
 *
 * d_C is zeroed before each launch so that every printed result comes from the
 * kernel named in its heading, not from an earlier kernel's output.
 */
int main()
{
    const int N=3, M=3;
    int A[N][M], B[N][M], C[N][M];
    srand(time(0));

    for(int i=0;i<N;i++)
    {
        for(int j=0;j<M;j++)
        {
            A[i][j]=(rand()%10);
            B[i][j]=(rand()%10);
        }
    }
    printf("Matrix A:\n");
    printMatrix(&A[0][0],N,M);
    printf("\n");
    printf("Matrix B:\n");
    printMatrix(&B[0][0],N,M);

    const size_t size=sizeof(int)*N*M;
    int *d_A=NULL,*d_B=NULL,*d_C=NULL;

    checkCuda(cudaMalloc((void**)&d_A,size),"cudaMalloc d_A");
    checkCuda(cudaMalloc((void**)&d_B,size),"cudaMalloc d_B");
    checkCuda(cudaMalloc((void**)&d_C,size),"cudaMalloc d_C");

    checkCuda(cudaMemcpy(d_A,A,size,cudaMemcpyHostToDevice),"copy A to device");
    checkCuda(cudaMemcpy(d_B,B,size,cudaMemcpyHostToDevice),"copy B to device");

    // a. one thread per row
    checkCuda(cudaMemset(d_C,0,size),"clear d_C");
    add_row<<<1,N>>>(d_A,d_B,d_C,M);
    checkCuda(cudaGetLastError(),"add_row launch");
    checkCuda(cudaMemcpy(C,d_C,size,cudaMemcpyDeviceToHost),"copy C from device");
    printf("\n\n");
    printf("Matrix added using 1 thread for each row:\n");
    printMatrix(&C[0][0],N,M);

    // b. one thread per column (add_col walks M rows, so it expects N == M)
    checkCuda(cudaMemset(d_C,0,size),"clear d_C");
    add_col<<<1,M>>>(d_A,d_B,d_C,M);
    checkCuda(cudaGetLastError(),"add_col launch");
    checkCuda(cudaMemcpy(C,d_C,size,cudaMemcpyDeviceToHost),"copy C from device");
    printf("\n\n");
    printf("Matrix added using 1 thread for each column:\n");
    printMatrix(&C[0][0],N,M);

    // c. one thread per element: a 1-D block of N*M threads gives every
    //    element its own linear thread id.
    checkCuda(cudaMemset(d_C,0,size),"clear d_C");
    add_matrix<<<1,N*M>>>(d_A,d_B,d_C,M);
    checkCuda(cudaGetLastError(),"add_matrix launch");
    checkCuda(cudaMemcpy(C,d_C,size,cudaMemcpyDeviceToHost),"copy C from device");
    printf("\n\n");
    printf("Matrix added using 1 thread for each element:\n");
    printMatrix(&C[0][0],N,M);

    checkCuda(cudaFree(d_A),"cudaFree d_A");
    checkCuda(cudaFree(d_B),"cudaFree d_B");
    checkCuda(cudaFree(d_C),"cudaFree d_C");
    return 0;
}

Matrix A:
7 0 0 
3 6 1 
5 1 6 

Matrix B:
5 4 0 
9 8 1 
6 5 8 


Matrix added using 1 thread for each row:
12 4 0 
12 14 2 
11 6 14 


Matrix added using 1 thread for each column:
12 4 0 
12 14 2 
11 6 14 


Matrix added using 1 thread for each element:
12 4 0 
12 14 2 
11 6 14 



# Question 2a

Write a CUDA program to multiply two matrices according to the following specification:   
	
    Each row of resultant matrix to be computed by one thread.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

/*
 * Kernel: C = A * B, one thread per row of the result.
 *
 * A is indexed as row * WA + k, B as k * WB + col and C as row * WB + col,
 * so WA is the width of A (and the number of rows of B read) and WB is the
 * width of B and C. The row handled by a thread is its threadIdx.x; the row
 * count is not checked, so the launch must supply one thread per row of A.
 *
 * Each dot product is accumulated in a local variable and then stored, so the
 * result does not depend on the previous contents of C (freshly cudaMalloc'ed
 * memory is not zero-initialised).
 */
__global__ void mul_row(int *A, int *B , int *C,int WA, int WB)
{
    const int rid = threadIdx.x;
    for(int cid=0;cid<WB;cid++)
    {
        int sum = 0;
        for(int k=0;k<WA;k++)
            sum += A[rid*WA+k]*B[k*WB+cid];
        C[rid*WB+cid] = sum;
    }
}

/* Abort with a readable message when a CUDA runtime call reports an error. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills A (HA x WA) and B (HB x WB) with random digits, multiplies them on the
 * GPU with one thread per result row and prints the HA x WB product.
 */
int main()
{
    const int HA=3, WA=3, HB=3, WB=3;
    int A[HA][WA], B[HB][WB], C[HA][WB];
    if(WA!=HB)
    {
        printf("Incorrect dimensions!");
        return 0;
    }

    srand(time(0));
    printf("Matrix A:\n");
    for(int i=0;i<HA;i++)
    {
        // A has WA columns; bounding this loop by WB would under- or
        // over-run the row whenever WA != WB.
        for(int j=0;j<WA;j++)
        {
            A[i][j]=(rand()%10);
            printf("%d ",A[i][j]);
        }
        printf("\n");
    }
    printf("\n");
    printf("Matrix B:\n");
    for(int i=0;i<HB;i++)
    {
        for(int j=0;j<WB;j++)
        {
            B[i][j]=(rand()%10);
            printf("%d ",B[i][j]);
        }
        printf("\n");
    }

    const size_t sizeA=sizeof(int)*HA*WA;
    const size_t sizeB=sizeof(int)*HB*WB;
    const size_t sizeC=sizeof(int)*HA*WB;
    int *d_A=NULL,*d_B=NULL,*d_C=NULL;

    checkCuda(cudaMalloc((void**)&d_A,sizeA),"cudaMalloc d_A");
    checkCuda(cudaMalloc((void**)&d_B,sizeB),"cudaMalloc d_B");
    checkCuda(cudaMalloc((void**)&d_C,sizeC),"cudaMalloc d_C");

    checkCuda(cudaMemcpy(d_A,A,sizeA,cudaMemcpyHostToDevice),"copy A to device");
    checkCuda(cudaMemcpy(d_B,B,sizeB,cudaMemcpyHostToDevice),"copy B to device");
    // cudaMalloc does not zero memory; start the result buffer from 0.
    checkCuda(cudaMemset(d_C,0,sizeC),"clear d_C");

    mul_row<<<1,HA>>>(d_A,d_B,d_C,WA,WB);
    checkCuda(cudaGetLastError(),"mul_row launch");
    checkCuda(cudaMemcpy(C,d_C,sizeC,cudaMemcpyDeviceToHost),"copy C from device");

    printf("\n\n");
    printf("Multiplied matrix:\n");
    for(int i=0;i<HA;i++)
    {
        for(int j=0;j<WB;j++)
        {
            printf("%d ",C[i][j]);
        }
        printf("\n");
    }

    checkCuda(cudaFree(d_A),"cudaFree d_A");
    checkCuda(cudaFree(d_B),"cudaFree d_B");
    checkCuda(cudaFree(d_C),"cudaFree d_C");
    return 0;
}

Matrix A:
5 3 5 
5 8 6 
5 2 2 

Matrix B:
5 1 8 
6 6 5 
9 3 3 


Multiplied matrix:
88 38 70 
127 71 98 
55 23 56 



# Question 2b

Write a CUDA program to multiply two matrices according to the following specification:   
	
    Each column of resultant matrix to be computed by one thread.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

/*
 * Kernel: C = A * B, one thread per column of the result.
 *
 * A is indexed as row * WA + k, B as k * WB + col and C as row * WB + col.
 * HA is the number of rows of A (and C) and WA the width of A. The width of
 * B and C is taken from blockDim.x, so the kernel must be launched with a
 * single block of exactly WB threads; the column handled is threadIdx.x.
 *
 * Each dot product is accumulated in a local variable and then stored, so the
 * result does not depend on the previous contents of C (freshly cudaMalloc'ed
 * memory is not zero-initialised).
 */
__global__ void mul_col(int *A, int *B , int *C, int HA, int WA)
{
    const int cid = threadIdx.x;
    const int WB = blockDim.x;
    for(int rid=0;rid<HA;rid++)
    {
        int sum = 0;
        for(int k=0;k<WA;k++)
            sum += A[rid*WA+k]*B[k*WB+cid];
        C[rid*WB+cid] = sum;
    }
}

/* Abort with a readable message when a CUDA runtime call reports an error. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills A (HA x WA) and B (HB x WB) with random digits, multiplies them on the
 * GPU with one thread per result column and prints the HA x WB product.
 */
int main()
{
    const int HA=3, WA=3, HB=3, WB=3;
    int A[HA][WA], B[HB][WB], C[HA][WB];
    if(WA!=HB)
    {
        printf("Incorrect dimensions!");
        return 0;
    }

    srand(time(0));
    printf("Matrix A:\n");
    for(int i=0;i<HA;i++)
    {
        // A has WA columns; bounding this loop by WB would under- or
        // over-run the row whenever WA != WB.
        for(int j=0;j<WA;j++)
        {
            A[i][j]=(rand()%10);
            printf("%d ",A[i][j]);
        }
        printf("\n");
    }
    printf("\n");
    printf("Matrix B:\n");
    for(int i=0;i<HB;i++)
    {
        for(int j=0;j<WB;j++)
        {
            B[i][j]=(rand()%10);
            printf("%d ",B[i][j]);
        }
        printf("\n");
    }

    const size_t sizeA=sizeof(int)*HA*WA;
    const size_t sizeB=sizeof(int)*HB*WB;
    const size_t sizeC=sizeof(int)*HA*WB;
    int *d_A=NULL,*d_B=NULL,*d_C=NULL;

    checkCuda(cudaMalloc((void**)&d_A,sizeA),"cudaMalloc d_A");
    checkCuda(cudaMalloc((void**)&d_B,sizeB),"cudaMalloc d_B");
    checkCuda(cudaMalloc((void**)&d_C,sizeC),"cudaMalloc d_C");

    checkCuda(cudaMemcpy(d_A,A,sizeA,cudaMemcpyHostToDevice),"copy A to device");
    checkCuda(cudaMemcpy(d_B,B,sizeB,cudaMemcpyHostToDevice),"copy B to device");
    // cudaMalloc does not zero memory; start the result buffer from 0.
    checkCuda(cudaMemset(d_C,0,sizeC),"clear d_C");

    // One block of WB threads: mul_col reads the result width from blockDim.x.
    mul_col<<<1,WB>>>(d_A,d_B,d_C,HA,WA);
    checkCuda(cudaGetLastError(),"mul_col launch");
    checkCuda(cudaMemcpy(C,d_C,sizeC,cudaMemcpyDeviceToHost),"copy C from device");

    printf("\n\n");
    printf("Multiplied matrix:\n");
    for(int i=0;i<HA;i++)
    {
        for(int j=0;j<WB;j++)
        {
            printf("%d ",C[i][j]);
        }
        printf("\n");
    }

    checkCuda(cudaFree(d_A),"cudaFree d_A");
    checkCuda(cudaFree(d_B),"cudaFree d_B");
    checkCuda(cudaFree(d_C),"cudaFree d_C");
    return 0;
}

Matrix A:
7 1 7 
4 8 9 
2 8 4 

Matrix B:
2 0 6 
1 6 6 
6 8 0 


Multiplied matrix:
57 62 48 
70 120 72 
36 80 60 



# Question 2c

Write a CUDA program to multiply two matrices according to the following specification:   
	
    Each element of resultant matrix to be computed by one thread.

In [None]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <time.h>

/*
 * Kernel: C = A * B, one thread per element of the result.
 *
 * A is indexed as row * WA + k, B as k * WB + col and C as row * WB + col.
 * The thread at (threadIdx.x, threadIdx.y) computes C[row = y][col = x]. The
 * width of B and C is taken from blockDim.x, so the kernel must be launched
 * with a single 2-D block of (WB, HA) threads.
 *
 * The dot product is accumulated in a local variable and then stored, so the
 * result does not depend on the previous contents of C (freshly cudaMalloc'ed
 * memory is not zero-initialised).
 */
__global__ void mul_matrix(int * A, int *B , int *C,int WA)
{
    const int row = threadIdx.y;
    const int col = threadIdx.x;
    const int WB = blockDim.x;
    int sum = 0;
    for(int k=0;k<WA;k++)
        sum += A[row*WA+k]*B[k*WB+col];
    C[row*WB+col] = sum;
}

/* Abort with a readable message when a CUDA runtime call reports an error. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Fills A (HA x WA) and B (HB x WB) with random digits, multiplies them on the
 * GPU with one thread per result element and prints the HA x WB product.
 */
int main()
{
    const int HA=3, WA=3, HB=3, WB=3;
    int A[HA][WA], B[HB][WB], C[HA][WB];
    if(WA!=HB)
    {
        printf("Incorrect dimensions!");
        return 0;
    }

    srand(time(0));
    printf("Matrix A:\n");
    for(int i=0;i<HA;i++)
    {
        // A has WA columns; bounding this loop by WB would under- or
        // over-run the row whenever WA != WB.
        for(int j=0;j<WA;j++)
        {
            A[i][j]=(rand()%10);
            printf("%d ",A[i][j]);
        }
        printf("\n");
    }
    printf("\n");
    printf("Matrix B:\n");
    for(int i=0;i<HB;i++)
    {
        for(int j=0;j<WB;j++)
        {
            B[i][j]=(rand()%10);
            printf("%d ",B[i][j]);
        }
        printf("\n");
    }

    const size_t sizeA=sizeof(int)*HA*WA;
    const size_t sizeB=sizeof(int)*HB*WB;
    const size_t sizeC=sizeof(int)*HA*WB;
    int *d_A=NULL,*d_B=NULL,*d_C=NULL;

    checkCuda(cudaMalloc((void**)&d_A,sizeA),"cudaMalloc d_A");
    checkCuda(cudaMalloc((void**)&d_B,sizeB),"cudaMalloc d_B");
    checkCuda(cudaMalloc((void**)&d_C,sizeC),"cudaMalloc d_C");

    checkCuda(cudaMemcpy(d_A,A,sizeA,cudaMemcpyHostToDevice),"copy A to device");
    checkCuda(cudaMemcpy(d_B,B,sizeB,cudaMemcpyHostToDevice),"copy B to device");
    // cudaMalloc does not zero memory; start the result buffer from 0.
    checkCuda(cudaMemset(d_C,0,sizeC),"clear d_C");

    // One block shaped (WB, HA): x selects the column, y selects the row.
    // Named grid/block to avoid confusion with the gridDim/blockDim built-ins.
    dim3 grid(1,1);
    dim3 block(WB,HA);
    mul_matrix<<<grid,block>>>(d_A,d_B,d_C,WA);
    checkCuda(cudaGetLastError(),"mul_matrix launch");
    checkCuda(cudaMemcpy(C,d_C,sizeC,cudaMemcpyDeviceToHost),"copy C from device");

    printf("\n\n");
    printf("Multiplied matrix :\n");
    for(int i=0;i<HA;i++)
    {
        for(int j=0;j<WB;j++)
            printf("%d ",C[i][j]);
        printf("\n");
    }

    checkCuda(cudaFree(d_A),"cudaFree d_A");
    checkCuda(cudaFree(d_B),"cudaFree d_B");
    checkCuda(cudaFree(d_C),"cudaFree d_C");
    return 0;
}

Matrix A:
5 2 5 
3 3 6 
7 2 7 

Matrix B:
2 0 4 
3 9 8 
8 7 2 


Multiplied matrix :
56 53 46 
63 69 48 
76 67 58 

