<a href="https://colab.research.google.com/github/park-geun-hyeong/CUDA/blob/main/EX7_1012.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi 

Fri Oct 14 04:36:59 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


## CPU Matrix Addition

In [3]:
%%writefile matrix_addition.cu

#include <stdio.h>
#define LENGTH 12

void MatrixAdd(int *M, int *N, int *P, int width, int height);
void printResult(int *M, int*N, int* P, int width, int height);

int main()
{ 
  int width; int height; 
  printf("width: ");
  scanf("%d", &width);
  printf("height ");
  scanf("%d", &height);

  const int MatrixSize = height * width;
  const int BufferSize = MatrixSize * sizeof(int);

  int *M, *N, *P;
  M = (int*)malloc(BufferSize);
  N = (int*)malloc(BufferSize);
  P = (int*)malloc(BufferSize);
  
  for(int i=0; i<MatrixSize; i++)
  {
    M[i] = 1;
    N[i] = 1;
    P[i] = 0; 
  }

  MatrixAdd(M,N,P,width, height);
  printResult(M,N,P,width, height);

  free(M); free(N); free(P);
  return 0;
}

void MatrixAdd(int *M, int *N, int *P, int width, int height)
{
  for(int row =0; row<height; row++)
  {
    for(int col = 0; col<width; col++)
    {
      int Destindex = row*height + col;
      P[Destindex] = M[Destindex] + N[Destindex];
    }
  }
}

void printResult(int *M, int*N, int* P, int width, int height)
{
  for(int row =0; row<height; row++)
  {
    for(int col = 0; col<width; col++)
    {
      int Destindex = row*height + col;
      printf("%d ", P[Destindex]);
    }
    printf("\n");
  }
}

Writing matrix_addition.cu


In [4]:
!nvcc -o matrix_add_cpu matrix_addition.cu

In [5]:
!./matrix_add_cpu

width: 6
height 8
2 2 2 2 2 2 
2 2 2 2 2 2 
2 2 2 2 2 2 
2 2 2 2 2 2 
2 2 2 2 2 2 
2 2 2 2 2 2 
3 3 3 3 3 3 
3 3 3 3 3 3 


## CPU Matrix Addition

In [21]:
%%writefile matrix_addition_gpu.cu

#include<stdio.h>
#define LENGTH 14

__global__ void MatrixAdd(int *m, int *n, int *p)
{
  int tid, tx, ty;
  tx = blockDim.x * blockIdx.x + threadIdx.x;
  ty = blockDim.y * blockIdx.y + threadIdx.y;
  tid = gridDim.x * ty + tx;

  p[tid] = m[tid] + n[tid];
}

void printResult(int *m, int *n, int *p);

int main()
{

  const int MatrixSize = LENGTH*LENGTH;
  const int BufferSize = MatrixSize * sizeof(int);

  int *M, *N, *P;
  M = (int*)malloc(BufferSize);
  N = (int*)malloc(BufferSize);
  P = (int*)malloc(BufferSize);
  for(int i=0; i<MatrixSize; i++)
  {
    M[i] = 1;
    N[i] = 1;
    P[i] = 0; 
  }
 
  int *dev_M, *dev_N, *dev_p;
  cudaMalloc((void**)&dev_M, BufferSize);
  cudaMalloc((void**)&dev_N, BufferSize);
  cudaMalloc((void**)&dev_p, BufferSize);

  cudaMemcpy(dev_M, M, BufferSize, cudaMemcpyHostToDevice);
  cudaMemcpy(dev_N, N, BufferSize, cudaMemcpyHostToDevice);

  dim3 Dg(3,3,1);
  dim3 Db(8,6,1);
  //int DimX = 3*8;

  MatrixAdd<<<Dg,Db>>>(dev_M, dev_N, dev_p);
  cudaMemcpy(P, dev_p, BufferSize, cudaMemcpyDeviceToHost);

  printResult(M,N,P);
  cudaFree(dev_M); cudaFree(dev_N); cudaFree(dev_p);
  free(M); free(N); free(P);
  
  return 0;
}


void printResult(int *m, int *n, int *p)
{
  for(int i = 0; i<LENGTH; i++)
  {
    for(int j=0; j<LENGTH; j++)
    {
      int dstIndex = i*LENGTH + j;
      printf("%d ", p[i]);
    }
    printf("\n");
  }

}

Overwriting matrix_addition_gpu.cu


In [22]:
!nvcc -o matrix_addition_gpu matrix_addition_gpu.cu




In [23]:
!./matrix_addition_gpu

2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
2 2 2 2 2 2 2 2 2 2 2 2 2 2 
