<a href="https://colab.research.google.com/github/m-pandey5/CUDA/blob/main/Matrixmul.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
%%writefile matrixmul.cu
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <vector>

using std::cout;
using std::generate;
using std::vector;
// Naive square matrix multiply: c = a * b, where a, b and c are N x N
// matrices of int stored row-major (element (r, col) lives at index r*N + col).
//
// Launch layout: a 2D grid of 2D blocks in which each thread produces exactly
// one output element; x indexes the column and y indexes the row. The grid may
// cover more than N threads per dimension: surplus threads exit immediately.
//
// Parameters:
//   a, b  input matrices (read only)
//   c     output matrix; every element (row, col) with row, col < N is overwritten
//   N     matrix dimension
__global__ void matrixmul(const int *a, const int *b, int *c, int N){
  // Compute each thread's global row and column index
  int row = blockIdx.y*blockDim.y + threadIdx.y;
  int col = blockIdx.x*blockDim.x + threadIdx.x;

  // Threads past the matrix edge must do nothing: without this guard a thread
  // with col >= N would alias an element of the next row, and a thread with
  // row >= N would read and write past the end of the buffers.
  if (row >= N || col >= N){
    return;
  }

  // Walk along the row of a and down the column of b, accumulating in a local
  // variable so that c is written once instead of being read and rewritten
  // on every iteration.
  int sum = 0;
  for (int k = 0; k < N; k++){
    sum += a[row*N + k]*b[k*N + col];
  }
  c[row*N + col] = sum;
}
  // Host-side reference check: recompute the product of the two N x N
  // row-major matrices a and b one element at a time and assert that each
  // value equals the corresponding entry of c. Relies on assert(), so the
  // comparison disappears when the file is compiled with NDEBUG.
  void verify_result(vector<int>&a, vector<int>&b, vector<int>&c, int N){
    for (int row = 0; row < N; ++row){
      // Offset of the first element of this row in a and in c
      const int row_base = row*N;
      for (int col = 0; col < N; ++col){
        // Dot product of row `row` of a with column `col` of b
        int expected = 0;
        int k = 0;
        while (k < N){
          expected += a[row_base + k]*b[k*N + col];
          ++k;
        }
        // The supplied result must match the value recomputed here
        assert(c[row_base + col] == expected);
      }
    }
  }
  // Print a diagnostic and terminate if a CUDA runtime call did not succeed.
  // `what` names the operation so the failure can be located.
  static void check_cuda(cudaError_t status, const char *what){
    if (status != cudaSuccess){
      std::cerr << "CUDA error during " << what << ": "
                << cudaGetErrorString(status) << std::endl;
      std::exit(EXIT_FAILURE);
    }
  }

  // Fills two N x N int matrices with pseudo-random values in [0, 99],
  // multiplies them on the device with the matrixmul kernel, copies the result
  // back and checks it against a host-side recomputation.
  // Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
  int main(){
    // Matrix size of 1024 x 1024
    constexpr int N = 1 << 10;
    // Threads per block in each dimension (32 x 32 threads per block)
    constexpr int Threads = 32;
    // Blocks per grid in each dimension
    constexpr int Blocks = N/Threads;
    // The integer division above drops any remainder, so a non-multiple N
    // would leave the last rows and columns without threads.
    static_assert(N % Threads == 0, "N must be a multiple of Threads");

    // Element and byte counts, computed in size_t so the product cannot
    // overflow int when N is increased.
    const size_t elems = static_cast<size_t>(N)*N;
    const size_t bytes = elems*sizeof(int);

    // Host vectors
    vector<int> h_a(elems);
    vector<int> h_b(elems);
    vector<int> h_c(elems);

    // Initialise the input matrices
    generate(h_a.begin(), h_a.end(), [](){return rand()%100;});
    generate(h_b.begin(), h_b.end(), [](){return rand()%100;});

    // Allocate device memory
    int *d_a = nullptr, *d_b = nullptr, *d_c = nullptr;
    check_cuda(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc(&d_c, bytes), "cudaMalloc(d_c)");

    // Copy the inputs to the device
    check_cuda(cudaMemcpy(d_a, h_a.data(), bytes, cudaMemcpyHostToDevice),
               "copy of a to device");
    check_cuda(cudaMemcpy(d_b, h_b.data(), bytes, cudaMemcpyHostToDevice),
               "copy of b to device");

    // Use dim3 structs for block and grid dimensions
    dim3 threads(Threads, Threads);
    dim3 blocks(Blocks, Blocks);

    // Launch the kernel. A launch returns no status itself, so query the
    // launch error explicitly, then synchronise so that a fault raised while
    // the kernel runs is reported here rather than by a later call.
    matrixmul<<<blocks,threads>>>(d_a, d_b, d_c, N);
    check_cuda(cudaGetLastError(), "matrixmul launch");
    check_cuda(cudaDeviceSynchronize(), "matrixmul execution");

    // Copy the result back to the host
    check_cuda(cudaMemcpy(h_c.data(), d_c, bytes, cudaMemcpyDeviceToHost),
               "copy of c to host");

    // Check the result against the host computation
    verify_result(h_a, h_b, h_c, N);
    cout<<"completed successfullu";

    // Free memory on the device
    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_c), "cudaFree(d_c)");
    return 0;
  }

Overwriting matrixmul.cu


In [4]:
!nvcc -arch=sm_75 matrixmul.cu -o matrixmul

In [6]:
!./matrixmul

completed successfullu