In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [3]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-fr8ygxol
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-fr8ygxol
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4287 sha256=5adcd8e07eb653c52c30f75cf3e7f7644155c62cf00efc58f554cdb3785b424a
  Stored in directory: /tmp/pip-ephem-wheel-cache-vpy48cnd/wheels/a8/b9/18/23f8ef71ceb0f63297dd1903aedd067e6243a68ea756d6feea
Successfully built NVCCPlugin
Installing collecte

In [4]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [None]:
%%cu
#include<stdio.h>
#include<cuda.h>
/*
 * Element-wise integer sum: n[k] = l[k] + m[k].
 *
 * Launch layout: a 2D grid in which every block handles exactly one element.
 * Only blockIdx is consulted (threadIdx is never read), so the kernel is meant
 * to be launched with a single thread per block.
 * There is no bounds check: gridDim.x * gridDim.y must not exceed the number
 * of elements in l, m and n.
 */
__global__ void matadd(int *l,int *m, int *n)
{
    // Flatten the block's (x, y) grid position; gridDim.x acts as the row width.
    const int flat = blockIdx.y * gridDim.x + blockIdx.x;

    n[flat] = l[flat] + m[flat];
}
/* Report a failed CUDA runtime call on stderr; returns true only when `status` is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host driver: fills two 2x3 matrices with a[i][j] = b[i][j] = i + j, adds
 * them on the GPU with matadd (one block per element) and prints the sum.
 *
 * Returns 0 on success, 1 if any CUDA runtime call fails.
 */
int main()
{
    const int ROWS = 2, COLS = 3;
    const size_t bytes = ROWS * COLS * sizeof(int);   // size of one matrix in bytes
    int a[ROWS][COLS];
    int b[ROWS][COLS];
    int c[ROWS][COLS];
    int *d = NULL, *e = NULL, *f = NULL;              // device buffers for a, b and the result
    int i, j;

    for (i = 0; i < ROWS; i++)
    {
        for (j = 0; j < COLS; j++)
        {
            a[i][j] = i + j;
            b[i][j] = i + j;
        }
    }

    // Short-circuit chain: the first failing call stops the remaining setup.
    bool ok = cudaOk(cudaMalloc((void **)&d, bytes), "cudaMalloc(d)")
           && cudaOk(cudaMalloc((void **)&e, bytes), "cudaMalloc(e)")
           && cudaOk(cudaMalloc((void **)&f, bytes), "cudaMalloc(f)")
           && cudaOk(cudaMemcpy(d, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d <- a)")
           && cudaOk(cudaMemcpy(e, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(e <- b)");

    if (ok)
    {
        /* Two-dimensional grid of blocks: dim3 grid(no. of columns, no. of rows). */
        dim3 grid(COLS, ROWS);
        matadd<<<grid, 1>>>(d, e, f);

        // A kernel launch returns no status; launch errors are read back explicitly,
        // and the blocking copy reports faults raised while the kernel ran.
        ok = cudaOk(cudaGetLastError(), "matadd launch")
          && cudaOk(cudaMemcpy(c, f, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c <- f)");
    }

    if (ok)
    {
        printf("\nSum of two matrices:\n ");
        for (i = 0; i < ROWS; i++)
        {
            for (j = 0; j < COLS; j++)
            {
                printf("%d\t", c[i][j]);
            }
            printf("\n");
        }
    }

    // cudaFree(NULL) is a no-op, so this is safe after a partial setup.
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);
    return ok ? 0 : 1;
}



Sum of two matrices:
 0	2	4	
2	4	6	



In [None]:
%%cu
#include<stdio.h>
#include<cuda.h>


/*
 * 2x2 integer matrix product n = l * m, one block per output element.
 *
 * blockIdx.x selects the output column and blockIdx.y the output row. All
 * three matrices are indexed with a fixed row stride of 2, so the kernel is
 * only valid for 2x2 operands launched on a 2x2 grid. threadIdx is never
 * read, so one thread per block is expected.
 */
__global__ void matproduct(int *l,int *m, int *n)
{
    const int DIM = 2;              // row stride and inner dimension of every matrix
    const int col = blockIdx.x;
    const int row = blockIdx.y;
    int k;

    // Accumulate in a local variable and store once, instead of doing a
    // read-modify-write on n[] in global memory on every iteration.
    int sum = 0;
    for (k = 0; k < DIM; k++)
    {
        sum += l[DIM * row + k] * m[DIM * k + col];
    }
    n[DIM * row + col] = sum;
}

/* Report a failed CUDA runtime call on stderr; returns true only when `status` is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host driver: builds two 2x2 matrices with a[i][j] = b[i][j] = i + j, prints
 * both, multiplies them on the GPU with matproduct and prints the product.
 *
 * Returns 0 on success, 1 if any CUDA runtime call fails.
 */
int main()
{
    // const so the host arrays below are ordinary fixed-size arrays, not VLAs.
    const int row1 = 2, col1 = 2, row2 = 2, col2 = 2;
    int a[row1][col1];
    int b[row2][col2];
    int c[row1][col2];
    int *d = NULL, *e = NULL, *f = NULL;   // device buffers for a, b and the product
    int i, j;

    for (i = 0; i < row1; i++)
    {
        for (j = 0; j < col1; j++)
        {
            a[i][j] = i + j;
        }
    }
    for (i = 0; i < row1; i++)
    {
        for (j = 0; j < col1; j++)
        {
            printf("%d\t", a[i][j]);
        }
        printf("\n");
    }

    for (i = 0; i < row2; i++)
    {
        for (j = 0; j < col2; j++)
        {
            b[i][j] = i + j;
        }
    }
    for (i = 0; i < row2; i++)
    {
        for (j = 0; j < col2; j++)
        {
            printf("%d\t", b[i][j]);
        }
        printf("\n");
    }

    // Short-circuit chain: the first failing call stops the remaining setup.
    bool ok = cudaOk(cudaMalloc((void **)&d, row1 * col1 * sizeof(int)), "cudaMalloc(d)")
           && cudaOk(cudaMalloc((void **)&e, row2 * col2 * sizeof(int)), "cudaMalloc(e)")
           && cudaOk(cudaMalloc((void **)&f, row1 * col2 * sizeof(int)), "cudaMalloc(f)")
           && cudaOk(cudaMemcpy(d, a, row1 * col1 * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(d <- a)")
           && cudaOk(cudaMemcpy(e, b, row2 * col2 * sizeof(int), cudaMemcpyHostToDevice), "cudaMemcpy(e <- b)");

    if (ok)
    {
        /* Two-dimensional grid of blocks: dim3 grid(no. of columns, no. of rows). */
        dim3 grid(col2, row1);
        matproduct<<<grid, 1>>>(d, e, f);

        // A kernel launch returns no status; launch errors are read back explicitly,
        // and the blocking copy reports faults raised while the kernel ran.
        ok = cudaOk(cudaGetLastError(), "matproduct launch")
          && cudaOk(cudaMemcpy(c, f, row1 * col2 * sizeof(int), cudaMemcpyDeviceToHost), "cudaMemcpy(c <- f)");
    }

    if (ok)
    {
        printf("\nProduct of two matrices:\n ");
        for (i = 0; i < row1; i++)
        {
            for (j = 0; j < col2; j++)
            {
                printf("%d\t", c[i][j]);
            }
            printf("\n");
        }
    }

    // cudaFree(NULL) is a no-op, so this is safe after a partial setup.
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);

    return ok ? 0 : 1;
}

0	1	
1	2	
0	1	
1	2	

Product of two matrices:
 1	2	
2	5	



In [None]:
%%cu
#include<stdio.h>
#include<cuda.h>

/*
 * Computes one element of the 2x2 integer matrix product n = l * m.
 *
 * Each block produces the element at row blockIdx.y, column blockIdx.x.
 * The row stride and the inner dimension are both fixed at 2, so the operands
 * must be 2x2 and the launch grid must be 2x2. threadIdx is never read, so
 * one thread per block is expected.
 */
__global__ void matproduct(int *l, int *m, int *n)
{
    const int N = 2;          // matrix dimension assumed by all index arithmetic below
    int x = blockIdx.x;       // output column
    int y = blockIdx.y;       // output row
    int k;

    // Keep the running dot product in a local variable so global memory is
    // written once, rather than read and rewritten on every loop iteration.
    int acc = 0;

    // Dot product of row y of l with column x of m.
    for(k = 0; k < N; k++)
    {
        acc += l[N*y+k] * m[N*k+x];
    }

    n[N*y+x] = acc;
}

/* Report a failed CUDA runtime call on stderr; returns true only when `status` is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host driver for the 2x2 matrix product demo.
 *
 * Initialises a and b with a[i][j] = b[i][j] = i + j, prints both, runs
 * matproduct on a (col2 x row1) grid with one thread per block, and prints
 * the result. Returns 0 on success, 1 if any CUDA runtime call fails.
 */
int main()
{
    // Declared const so the arrays below have compile-time bounds (no VLAs).
    const int row1 = 2, col1 = 2, row2 = 2, col2 = 2;
    const size_t bytesA = row1 * col1 * sizeof(int);
    const size_t bytesB = row2 * col2 * sizeof(int);
    const size_t bytesC = row1 * col2 * sizeof(int);
    int a[row1][col1];
    int b[row2][col2];
    int c[row1][col2];
    int *d = NULL, *e = NULL, *f = NULL;   // device buffers for a, b and the product
    int i, j;

    // Initialize and print the first matrix
    for(i = 0; i < row1; i++)
    {
        for(j = 0; j < col1; j++)
        {
            a[i][j] = i + j;
        }
    }
    for(i = 0; i < row1; i++)
    {
        for(j = 0; j < col1; j++)
        {
            printf("%d\t", a[i][j]);
        }
        printf("\n");
    }

    // Initialize and print the second matrix
    for(i = 0; i < row2; i++)
    {
        for(j = 0; j < col2; j++)
        {
            b[i][j] = i + j;
        }
    }
    for(i = 0; i < row2; i++)
    {
        for(j = 0; j < col2; j++)
        {
            printf("%d\t", b[i][j]);
        }
        printf("\n");
    }

    // Allocate the device buffers and copy the inputs host -> device.
    // The && chain stops at the first call that fails.
    bool ok = cudaOk(cudaMalloc((void **)&d, bytesA), "cudaMalloc(d)")
           && cudaOk(cudaMalloc((void **)&e, bytesB), "cudaMalloc(e)")
           && cudaOk(cudaMalloc((void **)&f, bytesC), "cudaMalloc(f)")
           && cudaOk(cudaMemcpy(d, a, bytesA, cudaMemcpyHostToDevice), "cudaMemcpy(d <- a)")
           && cudaOk(cudaMemcpy(e, b, bytesB, cudaMemcpyHostToDevice), "cudaMemcpy(e <- b)");

    if (ok)
    {
        /* Two-dimensional grid (a collection of blocks).
           Syntax is dim3 grid(no. of columns, no. of rows) */
        dim3 grid(col2, row1);

        // Launch the matrix multiplication kernel
        matproduct<<<grid, 1>>>(d, e, f);

        // The launch itself returns nothing, so query the launch status, then
        // copy the result back; the blocking copy reports execution faults.
        ok = cudaOk(cudaGetLastError(), "matproduct launch")
          && cudaOk(cudaMemcpy(c, f, bytesC, cudaMemcpyDeviceToHost), "cudaMemcpy(c <- f)");
    }

    // Print the resulting product matrix
    if (ok)
    {
        printf("\nProduct of two matrices:\n ");
        for(i = 0; i < row1; i++)
        {
            for(j = 0; j < col2; j++)
            {
                printf("%d\t", c[i][j]);
            }
            printf("\n");
        }
    }

    // Free the device buffers; cudaFree(NULL) is a no-op after a partial setup.
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);

    return ok ? 0 : 1;
}


0	1	
1	2	
0	1	
1	2	

Product of two matrices:
 1	2	
2	5	



In [None]:
%%cu
#include <stdio.h>
#include <cuda.h>

/*
 * Element-wise integer sum: n[id] = l[id] + m[id], one block per element.
 * The element index is blockIdx.y * gridDim.x + blockIdx.x; threadIdx is not
 * read, so a single thread per block is expected. No bounds check is made.
 */
void __global__ metadd(int *l,int *m, int *n){
    int x=blockIdx.x;
    int y=blockIdx.y;
    int id=gridDim.x*y+x;
    n[id]=l[id]+m[id];
}

/* Report a failed CUDA runtime call on stderr; returns true only when `status` is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what){
    if(status==cudaSuccess){
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host driver: adds two 2x3 matrices (a[i][j] = b[i][j] = i + j) on the GPU
 * with metadd and prints the sum.
 * Returns 0 on success, 1 if any CUDA runtime call fails.
 */
int main(){
    const size_t bytes = 2*3*sizeof(int);
    int a[2][3], b[2][3], c[2][3];
    int *d = NULL, *e = NULL, *f = NULL;   // device buffers for a, b and the result
    for(int i=0;i<2;i++){
        for(int j=0;j<3;j++){
            a[i][j]=i+j;
        }
    }
    for(int i=0;i<2;i++){
        for(int j=0;j<3;j++){
            b[i][j]=i+j;
        }
    }

    // The && chain stops at the first call that fails.
    bool ok = cudaOk(cudaMalloc((void **)&d, bytes), "cudaMalloc(d)")
           && cudaOk(cudaMalloc((void **)&e, bytes), "cudaMalloc(e)")
           && cudaOk(cudaMalloc((void **)&f, bytes), "cudaMalloc(f)")
           && cudaOk(cudaMemcpy(d, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d <- a)")
           && cudaOk(cudaMemcpy(e, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(e <- b)");

    if(ok){
        dim3 grid(3,2);   // dim3(columns, rows): one block per matrix element
        metadd<<<grid,1>>>(d,e,f);
        // A launch returns no status; read it back, then copy the result to the host.
        ok = cudaOk(cudaGetLastError(), "metadd launch")
          && cudaOk(cudaMemcpy(c, f, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c <- f)");
    }

    if(ok){
        printf("\nSum of two matrices:\n ");
        for(int i=0;i<2;i++){
            for(int j=0;j<3;j++){
                printf("%d\t",c[i][j]);
            }
            printf("\n");
        }
    }

    // cudaFree(NULL) is a no-op, so this is safe after a partial setup.
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);
    return ok ? 0 : 1;
}

In [None]:
%%cu
#include<stdio.h>
#include<cuda.h>

 /*
  * Adds two integer arrays element by element: n = l + m.
  * One block per element on a 2D grid; the element handled by a block is
  * blockIdx.y * gridDim.x + blockIdx.x. threadIdx is not used and no bounds
  * check is performed.
  */
 __global__ void matadd(int *l, int *m, int *n){
    const int col = blockIdx.x;
    const int row = blockIdx.y;
    const int pos = gridDim.x * row + col;

    const int lhs = l[pos];
    const int rhs = m[pos];
    n[pos] = lhs + rhs;
}

/* Report a failed CUDA runtime call on stderr; returns true only when `status` is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what){
    if(status==cudaSuccess){
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host driver: adds two 2x3 matrices (a[i][j] = b[i][j] = i + j) on the GPU
 * with matadd and prints the sum.
 * Returns 0 on success, 1 if any CUDA runtime call fails.
 */
int main(){
    const size_t bytes = 2*3*sizeof(int);
    int a[2][3], b[2][3], c[2][3];
    int *d = NULL, *e = NULL, *f = NULL;   // device buffers for a, b and the result
    for(int i=0;i<2;i++){
        for(int j=0;j<3;j++){
            a[i][j]=i+j;
        }
    }
    for(int i=0;i<2;i++){
        for(int j=0;j<3;j++){
            b[i][j]=i+j;
        }
    }

    // The && chain stops at the first call that fails.
    bool ok = cudaOk(cudaMalloc((void **)&d,bytes), "cudaMalloc(d)")
           && cudaOk(cudaMalloc((void **)&e,bytes), "cudaMalloc(e)")
           && cudaOk(cudaMalloc((void **)&f,bytes), "cudaMalloc(f)")
           && cudaOk(cudaMemcpy(d,a,bytes,cudaMemcpyHostToDevice), "cudaMemcpy(d <- a)")
           && cudaOk(cudaMemcpy(e,b,bytes,cudaMemcpyHostToDevice), "cudaMemcpy(e <- b)");

    if(ok){
        dim3 grid (3,2);                // 3 columns x 2 rows of blocks, one per element
        matadd<<<grid,1>>>(d,e,f);      // one thread per block
        // A launch returns no status; read it back, then copy the result to the host.
        ok = cudaOk(cudaGetLastError(), "matadd launch")
          && cudaOk(cudaMemcpy(c,f,bytes,cudaMemcpyDeviceToHost), "cudaMemcpy(c <- f)");
    }

    if(ok){
        printf("\nSum of two matrices:\n ");
        for(int i=0;i<2;i++)
        {
            for(int j=0;j<3;j++)
            {
                printf("%d\t",c[i][j]);
            }
            printf("\n");
        }
    }

    // cudaFree(NULL) is a no-op, so this is safe after a partial setup.
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);
    return ok ? 0 : 1;
}


Sum of two matrices:
 0	2	4	
2	4	6	



In [None]:
%%cu
#include<stdio.h>
#include<cuda.h>
/*
 * Matrix addition kernel: writes l[i] + m[i] into n[i].
 *
 * Work distribution: every block of a 2D grid owns one element. The kernel
 * reads only blockIdx and gridDim, so the index below identifies a block, not
 * an individual thread; it is intended for launches with one thread per block.
 * No bounds check is performed.
 */
__global__ void matadd(int *l,int *m, int *n)
{
    const int blockCol = blockIdx.x;   // position of this block along the grid's x-axis
    const int blockRow = blockIdx.y;   // position of this block along the grid's y-axis

    // Combine both coordinates into one linear offset (gridDim.x blocks per row).
    const int offset = gridDim.x * blockRow + blockCol;

    int *dst = n + offset;
    *dst = *(l + offset) + *(m + offset);
}
/* Report a failed CUDA runtime call on stderr; returns true only when `status` is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host driver: fills two 2x3 matrices with a[i][j] = b[i][j] = i + j, adds
 * them on the GPU with matadd and prints the sum.
 *
 * Returns 0 on success, 1 if any CUDA runtime call fails.
 */
int main()
{
    const int nRows = 2, nCols = 3;
    // sizeof(int) is the size of one element; times nRows * nCols gives the
    // number of bytes needed to hold a whole matrix.
    const size_t matBytes = nRows * nCols * sizeof(int);
    int a[nRows][nCols];
    int b[nRows][nCols];
    int c[nRows][nCols];
    int *d = NULL, *e = NULL, *f = NULL;   // device buffers for a, b and the result
    int i, j;

    for (i = 0; i < nRows; i++)
    {
        for (j = 0; j < nCols; j++)
        {
            a[i][j] = i + j;
        }
    }

    for (i = 0; i < nRows; i++)
    {
        for (j = 0; j < nCols; j++)
        {
            b[i][j] = i + j;
        }
    }

    // cudaMalloc receives the address of each pointer and stores the device
    // allocation in it. The && chain stops at the first call that fails.
    bool ok = cudaOk(cudaMalloc((void **)&d, matBytes), "cudaMalloc(d)")
           && cudaOk(cudaMalloc((void **)&e, matBytes), "cudaMalloc(e)")
           && cudaOk(cudaMalloc((void **)&f, matBytes), "cudaMalloc(f)")
           && cudaOk(cudaMemcpy(d, a, matBytes, cudaMemcpyHostToDevice), "cudaMemcpy(d <- a)")
           && cudaOk(cudaMemcpy(e, b, matBytes, cudaMemcpyHostToDevice), "cudaMemcpy(e <- b)");

    if (ok)
    {
        /* Two-dimensional grid (collection of blocks).
           Syntax is dim3 grid(no. of columns, no. of rows) */
        dim3 grid(nCols, nRows);

        matadd<<<grid, 1>>>(d, e, f);

        // A kernel launch returns no status; launch errors are read back explicitly,
        // and the blocking copy reports faults raised while the kernel ran.
        ok = cudaOk(cudaGetLastError(), "matadd launch")
          && cudaOk(cudaMemcpy(c, f, matBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c <- f)");
    }

    if (ok)
    {
        printf("\nSum of two matrices:\n ");
        for (i = 0; i < nRows; i++)
        {
            for (j = 0; j < nCols; j++)
            {
                printf("%d\t", c[i][j]);
            }
            printf("\n");
        }
    }

    // cudaFree(NULL) is a no-op, so this is safe after a partial setup.
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);
    return ok ? 0 : 1;
}



Sum of two matrices:
 0	2	4	
2	4	6	



In [None]:
%%cu
#include<stdio.h>
#include<cuda.h>

 /*
  * n = l + m, element by element.
  * Each block of the 2D launch grid handles the single element whose flat
  * index is blockIdx.y * gridDim.x + blockIdx.x; threadIdx is ignored and the
  * index is not range-checked.
  */
 __global__ void matadd(int *l, int *m, int *n){
    const int elem = gridDim.x * blockIdx.y + blockIdx.x;
    const int total = l[elem] + m[elem];
    n[elem] = total;
}

/* Report a failed CUDA runtime call on stderr; returns true only when `status` is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what){
    if(status==cudaSuccess){
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host driver: adds two 2x3 matrices (a[i][j] = b[i][j] = i + j) on the GPU
 * with matadd and prints the sum, one row per line.
 * Returns 0 on success, 1 if any CUDA runtime call fails.
 */
int main(){
    const size_t bytes = 2*3*sizeof(int);
    int a[2][3], b[2][3], c[2][3];
    int *d = NULL, *e = NULL, *f = NULL;   // device buffers for a, b and the result
    int i, j;
    for(i=0;i<2;i++){
        for(j=0;j<3;j++){
            a[i][j] = i+j;
        }
    }
    for(i=0;i<2;i++){
        for(j=0;j<3;j++){
            b[i][j] = i+j;
        }
    }

    // The && chain stops at the first call that fails.
    bool ok = cudaOk(cudaMalloc((void **)&d, bytes), "cudaMalloc(d)")
           && cudaOk(cudaMalloc((void **)&e, bytes), "cudaMalloc(e)")
           && cudaOk(cudaMalloc((void **)&f, bytes), "cudaMalloc(f)")
           && cudaOk(cudaMemcpy(d,a,bytes,cudaMemcpyHostToDevice), "cudaMemcpy(d <- a)")
           && cudaOk(cudaMemcpy(e,b,bytes,cudaMemcpyHostToDevice), "cudaMemcpy(e <- b)");

    if(ok){
        dim3 grid(3,2);   // 3 columns x 2 rows of blocks, one per element
        matadd<<<grid, 1>>>(d,e,f);
        // A launch returns no status; read it back, then copy the result to the host.
        ok = cudaOk(cudaGetLastError(), "matadd launch")
          && cudaOk(cudaMemcpy(c,f,bytes,cudaMemcpyDeviceToHost), "cudaMemcpy(c <- f)");
    }

    if(ok){
        for(i=0;i<2;i++){
            for(j=0;j<3;j++){
                printf("%d ",c[i][j]);
            }
            printf("\n");
        }
    }

    // Release the device buffers (previously leaked); cudaFree(NULL) is a no-op.
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);
    return ok ? 0 : 1;
}

0 2 4 
2 4 6 



In [7]:
%%cu
#include<stdio.h>
#include<cuda.h>

/*
 * Integer matrix addition, one block per element.
 * The output element is selected from the block's grid coordinates only
 * (blockIdx.x, blockIdx.y), flattened with gridDim.x as the row width.
 * threadIdx is not read and there is no bounds check.
 */
__global__ void matadd(int *l, int *m, int *n)
{
    const int gx = blockIdx.x;
    const int gy = blockIdx.y;
    const int k = gridDim.x * gy + gx;

    const int *left = l + k;
    const int *right = m + k;
    n[k] = *left + *right;
}

/* Report a failed CUDA runtime call on stderr; returns true only when `status` is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host driver: adds two 2x3 matrices (a[i][j] = b[i][j] = i + j) on the GPU
 * with matadd and prints the sum.
 *
 * Returns 0 on success, 1 if any CUDA runtime call fails.
 */
int main()
{
    const int R = 2, C = 3;
    const size_t bytes = R * C * sizeof(int);
    int a[R][C], b[R][C], c[R][C];
    int *d = NULL, *e = NULL, *f = NULL;   // device buffers for a, b and the result
    int i,j;

    for(i=0;i<R;i++)
    {
        for(j=0;j<C;j++)
        {
            a[i][j]=i+j;
        }
    }

    for(i=0;i<R;i++)
    {
        for(j=0;j<C;j++)
        {
            b[i][j]=i+j;
        }
    }

    // The && chain stops at the first call that fails.
    bool ok = cudaOk(cudaMalloc((void**)&d,bytes), "cudaMalloc(d)")
           && cudaOk(cudaMalloc((void**)&e,bytes), "cudaMalloc(e)")
           && cudaOk(cudaMalloc((void**)&f,bytes), "cudaMalloc(f)")
           && cudaOk(cudaMemcpy(d,a,bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d <- a)")
           && cudaOk(cudaMemcpy(e,b,bytes, cudaMemcpyHostToDevice), "cudaMemcpy(e <- b)");

    if(ok)
    {
        dim3 grid(C,R);   // dim3(columns, rows): one block per matrix element
        matadd<<<grid, 1>>>(d,e,f);

        // A launch returns no status; read it back, then copy the result to the host.
        ok = cudaOk(cudaGetLastError(), "matadd launch")
          && cudaOk(cudaMemcpy(c,f,bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c <- f)");
    }

    if(ok)
    {
        for(i=0;i<R;i++)
        {
            for(j=0;j<C;j++)
            {
                printf("%d \t", c[i][j]);
            }
            printf("\n");
        }
    }

    // Release the device buffers (previously leaked); cudaFree(NULL) is a no-op.
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);
    return ok ? 0 : 1;
}

0 	2 	4 	
2 	4 	6 	



In [6]:
%%cu
#include<stdio.h>
#include<cuda.h>

/*
 * Stores l[i] + m[i] in n[i] for the single element i owned by this block.
 * i is derived from the block's position in a 2D grid
 * (i = gridDim.x * blockIdx.y + blockIdx.x). The kernel never reads
 * threadIdx and does not range-check i.
 */
__global__ void matadd(int *l, int *m, int *n)
{
    const int column = blockIdx.x;
    const int line = blockIdx.y;
    const int i = gridDim.x * line + column;

    int result = l[i];
    result += m[i];
    n[i] = result;
}

/* Report a failed CUDA runtime call on stderr; returns true only when `status` is cudaSuccess. */
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host driver: adds two 2x3 matrices (a[i][j] = b[i][j] = i + j) on the GPU
 * with matadd and prints the sum.
 *
 * Returns 0 on success, 1 if any CUDA runtime call fails.
 */
int main()
{
    const int rows = 2, cols = 3;
    const size_t bytes = rows * cols * sizeof(int);
    int a[rows][cols], b[rows][cols], c[rows][cols];
    int *d = NULL, *e = NULL, *f = NULL;   // device buffers for a, b and the result
    int i,j;

    for(i=0;i<rows;i++)
    {
        for(j=0;j<cols;j++)
        {
            a[i][j]=i+j;
        }
    }

    for(i=0;i<rows;i++)
    {
        for(j=0;j<cols;j++)
        {
            b[i][j]=i+j;
        }
    }

    // The && chain stops at the first call that fails.
    bool ok = cudaOk(cudaMalloc((void**)&d,bytes), "cudaMalloc(d)")
           && cudaOk(cudaMalloc((void**)&e,bytes), "cudaMalloc(e)")
           && cudaOk(cudaMalloc((void**)&f,bytes), "cudaMalloc(f)")
           && cudaOk(cudaMemcpy(d,a,bytes, cudaMemcpyHostToDevice), "cudaMemcpy(d <- a)")
           && cudaOk(cudaMemcpy(e,b,bytes, cudaMemcpyHostToDevice), "cudaMemcpy(e <- b)");

    if(ok)
    {
        dim3 grid(cols,rows);   // dim3(columns, rows): one block per matrix element
        matadd<<<grid, 1>>>(d,e,f);

        // A launch returns no status; read it back, then copy the result to the host.
        ok = cudaOk(cudaGetLastError(), "matadd launch")
          && cudaOk(cudaMemcpy(c,f,bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c <- f)");
    }

    if(ok)
    {
        // Reuses the outer i and j rather than redeclaring shadowing copies.
        for(i=0;i<rows;i++)
        {
            for(j=0;j<cols;j++)
            {
                printf("%d \t", c[i][j]);
            }
            printf("\n");
        }
    }

    // Release the device buffers (previously leaked); cudaFree(NULL) is a no-op.
    cudaFree(d);
    cudaFree(e);
    cudaFree(f);
    return ok ? 0 : 1;
}

0 	2 	4 	
2 	4 	6 	

