<a href="https://colab.research.google.com/github/park-geun-hyeong/CUDA/blob/main/EX8_1014.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!nvidia-smi

Wed Oct 19 06:08:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


## CPU Matrix Multiplication

In [None]:
%%writefile matrix_mul.cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define Length 12

void MatMul(int *M, int *N, int *P);
void printResult(int *P);

/*
 * CPU reference demo: builds two Length x Length matrices of small random
 * integers, multiplies them with MatMul() and prints the product.
 *
 * Returns 0 on success, 1 if any host buffer could not be allocated.
 */
int main()
{
  srand((unsigned int)time(NULL));

  const int Matsize = Length * Length;
  const size_t Buffersize = (size_t)Matsize * sizeof(int);

  int *M = (int*)malloc(Buffersize);
  int *N = (int*)malloc(Buffersize);
  int *P = (int*)malloc(Buffersize);
  if (M == NULL || N == NULL || P == NULL)
  {
    /* free(NULL) is a no-op, so releasing all three is safe here. */
    fprintf(stderr, "failed to allocate %lu bytes for a matrix\n",
            (unsigned long)Buffersize);
    free(M); free(N); free(P);
    return 1;
  }

  for(int i=0; i<Matsize; i++)
  {
    M[i] = rand()%4;
    N[i] = rand()%4;
    P[i] = 0;   /* MatMul accumulates with +=, so P must start at zero. */
  }

  MatMul(M,N,P);
  printResult(P);

  free(M); free(N); free(P);
  return 0;
}


/*
 * Accumulates the product of two square matrices into P on the CPU.
 *
 * M, N and P are Length x Length matrices stored row-major. Each product
 * term is added to the existing content of P (P += M * N), so the caller
 * must clear P beforehand to obtain the plain product.
 */
void MatMul(int *M, int *N, int *P)
{
  for(int r = 0; r < Length; ++r)
  {
    const int *lhsRow = M + r * Length;   /* row r of M */
    int *outRow = P + r * Length;         /* row r of P */

    for(int c = 0; c < Length; ++c)
    {
      for(int k = 0; k < Length; ++k)
      {
        /* N is walked down column c: element (k, c). */
        outRow[c] += lhsRow[k] * N[k * Length + c];
      }
    }
  }
}

/*
 * Writes a Length x Length row-major matrix to stdout, one matrix row per
 * output line, with a single space after every element.
 */
void printResult(int *P)
{
  const int *cursor = P;   /* walks the matrix in storage order */

  for(int r = 0; r < Length; ++r)
  {
    const int *rowEnd = cursor + Length;
    while(cursor < rowEnd)
    {
      printf("%d ", *cursor);
      ++cursor;
    }
    printf("\n");
  }
}


Writing matrix_mul.cu


In [None]:
!nvcc -o matrix_mul matrix_mul.cu

In [None]:
!./matrix_mul

14 35 26 20 15 18 39 36 15 23 27 30 
19 47 27 32 24 24 48 49 22 26 42 40 
14 21 24 15 22 18 38 32 22 19 27 20 
19 28 25 18 15 11 37 24 14 22 31 24 
14 31 27 24 22 14 40 35 16 25 26 23 
11 8 11 18 27 9 35 18 16 20 16 8 
9 29 18 21 16 13 27 28 11 21 15 18 
27 49 41 35 30 32 60 52 34 41 46 38 
21 34 30 27 25 32 50 46 29 32 39 25 
21 32 23 24 24 27 42 37 26 27 37 22 
16 30 24 23 22 15 40 29 22 27 30 31 
29 46 34 31 20 25 47 36 32 32 39 33 


In [None]:
%%writefile matmul.c
#include<stdio.h>

/*
 * Computes P = M * N for rectangular, row-major integer matrices.
 *
 *   M        : m_height rows x m_width columns
 *   N        : n_height rows x n_width columns
 *   P        : m_height rows x n_width columns (output, fully overwritten)
 *
 * The inner dimensions must agree (m_width == n_height). If they do not,
 * a message is written to stderr and P is left untouched.
 */
void MatMul(int *M, int *N, int *P, int m_width, int m_height, int n_width, int n_height)
{
  if (m_width != n_height)
  {
    fprintf(stderr, "MatMul: inner dimensions differ (%d vs %d)\n",
            m_width, n_height);
    return;
  }

  for(int row = 0; row < m_height; row ++)
  {
    for(int col = 0; col< n_width; col++)
    {
      /* Dot product of row `row` of M with column `col` of N. */
      int sum = 0;
      for(int i=0; i<m_width; i++)
      {
        /* Row stride is the matrix WIDTH: m_width for M, n_width for N. */
        sum += M[row*m_width + i] * N[i*n_width + col];
      }
      /* P has n_width columns, so that is its row stride. */
      P[row*n_width + col] = sum;
    }
  }
}



Writing matmul.c


## GPU Matrix Multiplication

In [None]:
%%writefile matmul_gpu.cu

#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#define LENGTH 12

/*
 * Kernel: P = M * N for LENGTH x LENGTH row-major integer matrices.
 *
 * Expects a 2D launch; each thread computes one element of P, with the
 * global x index selecting the column and the global y index the row.
 * The launch may contain more threads than matrix elements: threads whose
 * index falls outside the matrix exit without touching memory.
 *
 * The matrix size comes from LENGTH, not from the launch dimensions, so the
 * result does not depend on how the grid and blocks are shaped.
 */
__global__ void Matmul(int *M, int *N, int*P)
{
  const int tx = blockDim.x * blockIdx.x + threadIdx.x;   // column of P
  const int ty = blockDim.y * blockIdx.y + threadIdx.y;   // row of P

  // Guard: the grid need not divide the matrix evenly.
  if (tx >= LENGTH || ty >= LENGTH)
  {
    return;
  }

  int Value = 0;
  for(int k = 0; k<LENGTH; k++)
  {
    // Row ty of M times column tx of N.
    Value += M[ty * LENGTH + k] * N[k * LENGTH + tx];
  }
  P[ty * LENGTH + tx] = Value;
}

void printResult(int *P);


/* Aborts with a readable message when a CUDA runtime call reports failure. */
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

/*
 * GPU demo: fills two LENGTH x LENGTH matrices with small random integers,
 * multiplies them on the device and prints M, N and the product P.
 *
 * Returns 0 on success; exits with a failure status if a host allocation,
 * a CUDA runtime call or the kernel launch fails.
 */
int main()
{
  srand((unsigned int)time(NULL));
  const int MatrixSize = LENGTH*LENGTH;
  const size_t BufferSize = (size_t)MatrixSize * sizeof(int);

  int *M = (int*)malloc(BufferSize);
  int *N = (int*)malloc(BufferSize);
  int *P = (int*)malloc(BufferSize);
  if (M == NULL || N == NULL || P == NULL)
  {
    fprintf(stderr, "failed to allocate host matrices\n");
    free(M); free(N); free(P);
    return EXIT_FAILURE;
  }

  for(int i=0; i<MatrixSize; i++)
  {
    M[i] = rand()%4;
    N[i] = rand()%8;
    P[i] = 0;
  }

  int *dev_M = NULL, *dev_N = NULL, *dev_p = NULL;
  checkCuda(cudaMalloc((void**)&dev_M, BufferSize), "cudaMalloc(dev_M)");
  checkCuda(cudaMalloc((void**)&dev_N, BufferSize), "cudaMalloc(dev_N)");
  checkCuda(cudaMalloc((void**)&dev_p, BufferSize), "cudaMalloc(dev_p)");

  checkCuda(cudaMemcpy(dev_M, M, BufferSize, cudaMemcpyHostToDevice), "copy M to device");
  checkCuda(cudaMemcpy(dev_N, N, BufferSize, cudaMemcpyHostToDevice), "copy N to device");

  // Launch exactly LENGTH x LENGTH threads, one per element of P:
  // (LENGTH / Tile) blocks of Tile threads along each axis.
  const int Tile = 6;
  if (LENGTH % Tile != 0)
  {
    fprintf(stderr, "LENGTH (%d) must be a multiple of the tile size %d\n", LENGTH, Tile);
    return EXIT_FAILURE;
  }
  dim3 Db(Tile, Tile, 1);
  dim3 Dg(LENGTH / Tile, LENGTH / Tile, 1);

  Matmul<<<Dg,Db>>>(dev_M, dev_N, dev_p);
  // A bad launch configuration is only reported through cudaGetLastError().
  checkCuda(cudaGetLastError(), "Matmul launch");
  // The blocking copy also surfaces any fault raised while the kernel ran.
  checkCuda(cudaMemcpy(P, dev_p, BufferSize, cudaMemcpyDeviceToHost), "copy P to host");

  printf("\n[M Matrix]\n");
  printResult(M);

  printf("\n[N Matrix]\n");
  printResult(N);

  printf("\n[P Matrix]\n");
  printResult(P);

  checkCuda(cudaFree(dev_M), "cudaFree(dev_M)");
  checkCuda(cudaFree(dev_N), "cudaFree(dev_N)");
  checkCuda(cudaFree(dev_p), "cudaFree(dev_p)");
  free(M); free(N); free(P);

  return 0;
}

/*
 * Prints a LENGTH x LENGTH row-major matrix to stdout: every element is
 * followed by one space, and every matrix row ends with a newline.
 */
void printResult(int *P)
{
  for(int r = 0; r < LENGTH; ++r)
  {
    const int *rowBase = P + r * LENGTH;   /* first element of row r */
    for(int c = 0; c < LENGTH; ++c)
    {
      printf("%d ", rowBase[c]);
    }
    printf("\n");
  }
}

Overwriting matmul_gpu.cu


In [None]:
!nvcc -o matmul_gpu matmul_gpu.cu

In [None]:
!./matmul_gpu


[M Matrix]
3 1 0 3 2 3 2 1 0 1 1 3 
0 0 0 2 3 2 3 3 2 1 1 2 
3 3 3 3 3 3 2 3 0 0 3 1 
1 1 1 3 0 0 0 2 2 3 2 1 
0 1 0 0 1 3 1 2 1 0 1 1 
3 3 3 3 1 2 0 2 0 0 0 3 
1 2 2 2 1 3 2 0 3 1 1 1 
1 2 3 2 1 3 0 0 2 2 2 0 
0 1 0 2 3 2 2 3 1 2 3 3 
2 2 3 2 3 1 1 1 0 2 0 2 
3 2 0 0 2 3 0 1 0 2 1 1 
3 1 1 0 0 3 0 3 1 0 3 0 

[N Matrix]
4 3 4 0 4 7 3 2 0 5 7 7 
5 4 7 7 7 7 1 2 6 0 6 5 
0 0 6 3 7 3 1 1 5 3 7 5 
6 6 0 4 6 7 3 0 1 0 6 2 
1 3 3 5 5 6 6 5 6 1 6 3 
5 1 4 7 1 5 2 5 5 5 6 1 
7 6 4 5 5 4 2 7 6 7 7 1 
1 1 1 4 4 7 5 1 3 4 2 1 
1 1 6 5 1 6 0 3 6 5 7 5 
5 0 6 7 3 7 3 0 3 7 1 1 
5 2 5 6 2 1 1 7 2 5 3 2 
3 3 3 4 2 4 7 2 2 1 5 7 

[P Matrix]
50 35 57 46 42 51 19 55 41 64 72 45 
43 30 45 63 51 75 48 15 40 29 47 43 
54 45 84 72 72 81 39 75 75 78 111 69 
75 45 63 99 69 111 63 30 60 51 78 51 
16 7 27 26 14 12 4 25 17 23 23 16 
20 15 15 23 15 26 27 6 10 10 22 24 
36 28 51 49 45 42 24 52 46 47 63 36 
43 29 32 56 38 64 45 20 33 28 50 35 
27 19 42 40 24 31 7 38 39 42 48 26 
29 14 26 41 27 50 36 6 20 31 23 2

In [None]:
%%writefile matmul_gpu.cu

#include<stdio.h>
#include<stdlib.h>
#include<time.h>

/*
 * Kernel: P = M * N for rectangular, row-major integer matrices.
 *
 *   M : M_height rows x M_width columns
 *   N : M_width  rows x N_width columns
 *   P : M_height rows x N_width columns (output)
 *
 * Expects a 2D launch; each thread computes one element of P, with the
 * global x index selecting the column and the global y index the row.
 * The launch may contain more threads than P has elements: threads outside
 * the M_height x N_width output exit without touching memory.
 */
__global__ void Matmul(int *M, int *N, int*P, int M_height,int M_width, int N_width)
{
  const int tx = blockDim.x * blockIdx.x + threadIdx.x;   // column of P
  const int ty = blockDim.y * blockIdx.y + threadIdx.y;   // row of P

  // Guard: the grid rarely divides the matrix evenly.
  if (tx >= N_width || ty >= M_height)
  {
    return;
  }

  int Value = 0;
  for(int k = 0; k<M_width; k++)
  {
    // Row ty of M times column tx of N.
    Value += M[ty * M_width + k] * N[k * N_width + tx];
  }
  // P has N_width columns, so that (not the launch width) is its row stride.
  P[ty * N_width + tx] = Value;
}

void fillArray(int *matrix, const int height, const int width, bool dst);
void printResult(int *P, const int height, const int width);

/* Aborts with a readable message when a CUDA runtime call reports failure. */
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

/* Prints `prompt`, then reads one strictly positive int from stdin; exits on bad input. */
static int readPositiveInt(const char *prompt)
{
  int value = 0;
  printf("%s", prompt);
  if (scanf("%d", &value) != 1 || value <= 0)
  {
    fprintf(stderr, "\nexpected a positive integer\n");
    exit(EXIT_FAILURE);
  }
  return value;
}

/* Returns the largest divisor of `value` that does not exceed `limit` (at least 1). */
static int largestDivisorAtMost(int value, int limit)
{
  for (int d = (value < limit ? value : limit); d > 1; d--)
  {
    if (value % d == 0)
    {
      return d;
    }
  }
  return 1;
}

/*
 * GPU demo for rectangular matrices: reads the dimensions from stdin,
 * fills M (M_height x M_width) and N (M_width x N_width) with random
 * values, computes P = M * N on the device and prints all three matrices.
 *
 * Returns 0 on success; exits with a failure status on invalid input, a
 * failed host allocation, a failed CUDA runtime call or a failed launch.
 */
int main()
{
  // The dimensions are filled in at run time, so they cannot be initialized consts.
  const int M_height = readPositiveInt("M_height: ");
  const int M_width  = readPositiveInt("M_width & N_height: ");
  const int N_height = M_width;   // inner dimensions must agree
  const int N_width  = readPositiveInt("N_width: ");

  const size_t M_bufferSize = (size_t)M_height * M_width * sizeof(int);
  const size_t N_bufferSize = (size_t)N_height * N_width * sizeof(int);
  const size_t P_bufferSize = (size_t)M_height * N_width * sizeof(int);

  int *M = (int*)malloc(M_bufferSize);
  int *N = (int*)malloc(N_bufferSize);
  int *P = (int*)malloc(P_bufferSize);
  if (M == NULL || N == NULL || P == NULL)
  {
    fprintf(stderr, "failed to allocate host matrices\n");
    free(M); free(N); free(P);
    return EXIT_FAILURE;
  }
  fillArray(M, M_height, M_width, false);
  fillArray(N, N_height, N_width, false);
  fillArray(P, M_height, N_width, true);

  int *dev_M = NULL, *dev_N = NULL, *dev_p = NULL;
  checkCuda(cudaMalloc((void**)&dev_M, M_bufferSize), "cudaMalloc(dev_M)");
  checkCuda(cudaMalloc((void**)&dev_N, N_bufferSize), "cudaMalloc(dev_N)");
  checkCuda(cudaMalloc((void**)&dev_p, P_bufferSize), "cudaMalloc(dev_p)");

  checkCuda(cudaMemcpy(dev_M, M, M_bufferSize, cudaMemcpyHostToDevice), "copy M to device");
  checkCuda(cudaMemcpy(dev_N, N, N_bufferSize, cudaMemcpyHostToDevice), "copy N to device");

  // Pick block dimensions that divide the output dimensions exactly, so the
  // launched thread grid is exactly N_width x M_height: one thread per
  // element of P and no thread mapped outside it. At most 16 x 16 threads
  // per block.
  const int blockX = largestDivisorAtMost(N_width, 16);
  const int blockY = largestDivisorAtMost(M_height, 16);
  dim3 Db(blockX, blockY, 1);
  dim3 Dg(N_width / blockX, M_height / blockY, 1);

  Matmul<<<Dg,Db>>>(dev_M, dev_N, dev_p, M_height, M_width, N_width);
  // A bad launch configuration is only reported through cudaGetLastError().
  checkCuda(cudaGetLastError(), "Matmul launch");
  // The blocking copy also surfaces any fault raised while the kernel ran.
  checkCuda(cudaMemcpy(P, dev_p, P_bufferSize, cudaMemcpyDeviceToHost), "copy P to host");

  printf("\n[M Matrix]\n");
  printResult(M, M_height, M_width);

  printf("\n[N Matrix]\n");
  printResult(N, N_height, N_width);

  printf("\n[P Matrix]\n");
  printResult(P, M_height, N_width);

  checkCuda(cudaFree(dev_M), "cudaFree(dev_M)");
  checkCuda(cudaFree(dev_N), "cudaFree(dev_N)");
  checkCuda(cudaFree(dev_p), "cudaFree(dev_p)");
  free(M); free(N); free(P);

  return 0;
}

/*
 * Initializes a row-major matrix of `height` rows and `width` columns.
 *
 *   dst == true  : every element is set to 0 (destination/output matrix).
 *   dst == false : every element is set to rand() % 4, i.e. a value in 0..3.
 *
 * The random generator is seeded from the clock only on the first random
 * fill. Re-seeding on every call would restart the same sequence whenever
 * two calls fall within the same second, making the matrices identical.
 */
void fillArray(int *matrix, const int height, const int width, bool dst)
{
  static bool seeded = false;
  if(!dst && !seeded)
  {
    srand((unsigned int)time(NULL));
    seeded = true;
  }

  for(int i=0; i<height; i++)
  {
    for(int j=0; j<width; j++)
    {
      // Row stride is the number of columns (width), not the number of rows.
      int loc = width * i + j;
      if(dst == true)
      {
        matrix[loc] = 0;
      }
      else
      {
        matrix[loc] =  rand()%4;
      }
    }
  }
}

/*
 * Prints a row-major matrix of `height` rows and `width` columns to stdout:
 * every element is followed by one space, and each matrix row ends with a
 * newline.
 */
void printResult(int *P, const int height, const int width)
{
  for(int row = 0; row < height; row ++)
  {
    for(int col = 0; col< width; col++)
    {
      // Row stride is the number of columns (width), not the number of rows.
      int idx = row * width + col;
      printf("%d ", P[idx]);
    }
    printf("\n");
  }
}

Overwriting matmul_gpu.cu


In [None]:
!nvcc -o matmul_gpu matmul_gpu.cu

In [None]:
!./matmul_gpu