<a href="https://colab.research.google.com/github/mu06905/GPU-Accelerated-Programming-in-Cuda-2023/blob/main/Week5/MatrixMultiplication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
%load_ext nvcc_plugin

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-33ugldbm
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-33ugldbm
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4304 sha256=aa96c30c19c4376b15aa3aa920729303390d0405435334c3d45f5e056f53f2cd
  Stored in directory: /tmp/pip-ephem-wheel-cache-az84w22p/wheels/f3/08/cc/e2b5b0e1c92df07dbb50a6f024a68ce090f5e7b2316b41756d
Successfully built NVCCPlugin
Installing collecte

In [2]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>

/*
 * Naive square matrix product: P = M * N.
 *
 * M, N and P are Width x Width int matrices indexed in row-major order
 * (element (r, c) lives at [r * Width + c]).
 *
 * Launch layout: 2D grid of 2D blocks; x maps to the output column and
 * y maps to the output row. Each thread produces at most one element of P.
 * Threads mapped outside the Width x Width range do nothing, so the grid
 * may overshoot the matrix.
 */
__global__ void matrix_multiplication(int* M, int* N, int* P, int Width){
    const int outCol = threadIdx.x + blockIdx.x * blockDim.x;
    const int outRow = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard clause: threads past the matrix edge have no element to compute.
    if (outRow >= Width || outCol >= Width) {
        return;
    }

    // Base of this thread's row in M and of its output row in P.
    const int rowBase = outRow * Width;
    const int* mRow = M + rowBase;

    // Dot product of row outRow of M with column outCol of N.
    int dot = 0;
    int k = 0;
    while (k < Width) {
        dot += mRow[k] * N[k * Width + outCol];
        ++k;
    }

    P[rowBase + outCol] = dot;
}


/*
 * Tiled square matrix product using shared memory: P = M * N.
 *
 * M, N and P are Width x Width int matrices indexed in row-major order
 * (element (r, c) lives at [r * Width + c]).
 *
 * Launch layout: 2D grid of 2D blocks; x maps to the output column and
 * y maps to the output row. Precondition: blockDim.x == blockDim.y == 16
 * (TILE_WIDTH), because every thread loads exactly one element of each
 * 16x16 shared tile and then reads a full tile row/column.
 *
 * Shared memory: two static 16x16 int tiles per block.
 *
 * Width does not have to be a multiple of TILE_WIDTH: tile elements that
 * fall outside the matrices are loaded as 0 (contributing nothing to the
 * dot product) and threads outside the matrix do not store a result.
 */
__global__ void matrix_multiplication_tiled(int* M, int* N, int* P, int Width){
    const int TILE_WIDTH = 16;
    __shared__ int dM[TILE_WIDTH][TILE_WIDTH];
    __shared__ int dN[TILE_WIDTH][TILE_WIDTH];

    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    int pVal = 0;

    // Ceil-division so a trailing partial tile is still processed.
    const int numPhases = (Width + TILE_WIDTH - 1) / TILE_WIDTH;

    for(int ph = 0; ph < numPhases; ph++){
        // Column of M / row of N that this thread stages for the current phase.
        const int mCol = TILE_WIDTH * ph + threadIdx.x;
        const int nRow = TILE_WIDTH * ph + threadIdx.y;

        // Out-of-range elements are zero-padded instead of read, which avoids
        // out-of-bounds global loads while keeping every thread on the same
        // control path up to the barrier below.
        dM[threadIdx.y][threadIdx.x] =
            (Row < Width && mCol < Width) ? M[Row * Width + mCol] : 0;
        dN[threadIdx.y][threadIdx.x] =
            (nRow < Width && Col < Width) ? N[nRow * Width + Col] : 0;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for(int k = 0; k < TILE_WIDTH; k++){
            pVal += dM[threadIdx.y][k] * dN[k][threadIdx.x];
        }

        // Nobody may overwrite the tiles until every thread finished reading.
        __syncthreads();
    }

    // Only threads that map to a real output element store a result.
    if ((Row < Width) && (Col < Width)) {
        P[Row * Width + Col] = pVal;
    }
}

/* Aborts the program with a diagnostic if a CUDA runtime call failed. */
static void checkCuda(cudaError_t status, const char* what){
    if(status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Multiplies two N x N int matrices three ways (CPU reference, naive GPU
 * kernel, tiled GPU kernel), prints the first 10 elements of each result
 * and reports whether all three results agree element by element.
 *
 * Returns 0 on completion, 1 if host memory could not be allocated; any
 * CUDA runtime failure terminates the process via checkCuda().
 */
int main(int argc, char** argv){

    int* h_m = 0;
    int* h_n = 0;
    int* h_p = 0;

    int* d_m = 0;
    int* d_n = 0;
    int* d_p = 0;
    int* d_p2 = 0;

    const int N = 1024;
    const int size = N * N * sizeof(int);
    // Block edge; must equal TILE_WIDTH inside matrix_multiplication_tiled.
    const int TILE = 16;

    h_m = (int*)malloc(size);
    h_n = (int*)malloc(size);
    h_p = (int*)malloc(size);      // CPU reference result
    int* h_p2 = (int*)malloc(size); // naive kernel result
    int* h_p3 = (int*)malloc(size); // tiled kernel result

    if(!h_m || !h_n || !h_p || !h_p2 || !h_p3){
        fprintf(stderr, "Host allocation failed\n");
        free(h_m);
        free(h_n);
        free(h_p);
        free(h_p2);
        free(h_p3);
        return 1;
    }

    for(int i = 0; i<N*N;i++){
        h_m[i] = 1;
        h_n[i] = i;
        h_p[i] = 0;
        h_p2[i] = 0;
        h_p3[i] = 0;
    }

    // CPU reference product used to validate both kernels.
    for(int i = 0; i<N;i++){
        for(int j = 0; j<N;j++){
            int pVal = 0;
            for(int k = 0; k<N; k++){
                pVal += h_m[i*N+k] * h_n[k*N+j];
            }
            h_p[i * N + j] = pVal;
        }
    }

    printf("********************\n");
    for(int i = 0; i<10;i++){
        printf("%d\n",h_p[i]);
    }

    checkCuda(cudaMalloc((void**)&d_m, size), "cudaMalloc(d_m)");
    checkCuda(cudaMalloc((void**)&d_n, size), "cudaMalloc(d_n)");
    checkCuda(cudaMalloc((void**)&d_p, size), "cudaMalloc(d_p)");
    checkCuda(cudaMalloc((void**)&d_p2, size), "cudaMalloc(d_p2)");

    checkCuda(cudaMemcpy(d_m, h_m, size, cudaMemcpyHostToDevice), "copy M to device");
    checkCuda(cudaMemcpy(d_n, h_n, size, cudaMemcpyHostToDevice), "copy N to device");

    dim3 threadsBlock(TILE, TILE, 1);
    // Integer ceil-division: ceil(N/16) would truncate before rounding and
    // under-size the grid whenever N is not a multiple of the block edge.
    dim3 blocksGrids((N + TILE - 1) / TILE, (N + TILE - 1) / TILE, 1);

    matrix_multiplication<<<blocksGrids, threadsBlock>>>(d_m, d_n, d_p, N);
    // Launch-configuration errors are only visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "matrix_multiplication launch");
    // The blocking copy also surfaces any fault raised while the kernel ran.
    checkCuda(cudaMemcpy(h_p2, d_p, size, cudaMemcpyDeviceToHost), "copy naive result to host");

    printf("********************\n");
    for(int i = 0; i<10;i++){
        printf("%d\n",h_p2[i]);
    }

    matrix_multiplication_tiled<<<blocksGrids, threadsBlock>>>(d_m,d_n,d_p2,N);
    checkCuda(cudaGetLastError(), "matrix_multiplication_tiled launch");
    checkCuda(cudaMemcpy(h_p3, d_p2, size, cudaMemcpyDeviceToHost), "copy tiled result to host");

    printf("********************\n");
    for(int i = 0; i<10;i++){
        printf("%d\n",h_p3[i]);
    }

    // Stop at the first disagreement so a later matching element cannot
    // clear the flag again.
    int mismatch = 0;
    for(int i = 0; i<N*N && !mismatch; i++){
        if(h_p[i] != h_p2[i] || h_p2[i] != h_p3[i]){
            mismatch = 1;
        }
    }
    if(mismatch)printf("Incorrect Results\n");
    else{
        printf("Correct Results\n");
    }

    checkCuda(cudaFree(d_m), "cudaFree(d_m)");
    checkCuda(cudaFree(d_n), "cudaFree(d_n)");
    checkCuda(cudaFree(d_p), "cudaFree(d_p)");
    checkCuda(cudaFree(d_p2), "cudaFree(d_p2)");

    free(h_m);
    free(h_n);
    free(h_p);
    free(h_p2);
    free(h_p3);

    return 0;
}

********************
536346624
536347648
536348672
536349696
536350720
536351744
536352768
536353792
536354816
536355840
********************
536346624
536347648
536348672
536349696
536350720
536351744
536352768
536353792
536354816
536355840
********************
536346624
536347648
536348672
536349696
536350720
536351744
536352768
536353792
536354816
536355840
Correct Results

