<a href="https://colab.research.google.com/github/park-geun-hyeong/CUDA/blob/main/EX6_1005.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Fri Oct  7 07:22:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


## CPU 위에서 벡터의 합 계산

In [None]:
%%writefile vector_addition.cu
#include<stdio.h>
#define N 512

void host_add(int *a, int *b, int *c);
void fill_array(int *data);
void print_output(int *a, int *b, int *c);

/*
 * Entry point of the CPU reference version.
 *
 * Allocates three N-element int arrays, fills a and b with fill_array,
 * computes c = a + b with host_add, prints every sum and releases the arrays.
 *
 * Returns 0 on success, 1 if a host allocation fails.
 */
int main()
{
  int *a = (int*)malloc(N*sizeof(int));
  int *b = (int*)malloc(N*sizeof(int));
  int *c = (int*)malloc(N*sizeof(int));

  // malloc returns NULL on failure; stop before any array is written to.
  if (a == NULL || b == NULL || c == NULL)
  {
    fprintf(stderr, "host allocation failed\n");
    free(a); free(b); free(c);   // free(NULL) is a no-op
    return 1;
  }

  fill_array(a);
  fill_array(b);

  host_add(a,b,c);
  print_output(a,b,c);
  free(a); free(b); free(c);

  return 0;
}

// Host-side element-wise sum over N elements: c[k] = a[k] + b[k].
void host_add(int *a, int *b, int *c)
{
  int k = 0;
  while (k < N)
  {
    const int sum = a[k] + b[k];
    c[k] = sum;
    ++k;
  }
}

// Stores the ramp 0, 1, ..., N-1 into data, i.e. data[k] = k.
void fill_array(int *data)
{
  int value = 0;
  while (value != N)
  {
    data[value] = value;
    ++value;
  }
}

// Prints one "a + b = c" line per element, for all N elements in index order.
void print_output(int *a, int *b, int *c)
{
  int row = 0;
  while (row < N)
  {
    const int lhs = a[row];
    const int rhs = b[row];
    const int sum = c[row];
    printf("%d + %d = %d\n", lhs, rhs, sum);
    ++row;
  }
}

Overwriting vector_addition.cu


In [None]:
!nvcc -o vector_add_cpu vector_addition.cu

In [None]:
!./vector_add_cpu

In [None]:
### GPU 위에서 벡터의 합 (블록 개수 N, 블록마다 Thread 1개)

In [None]:
%%writefile vector_addition2.cu
#include<stdio.h>
#define N 512

// Kernel: adds one pair of elements, selected by the block index.
// There is no bounds check, so blockIdx.x must be a valid index into a, b and c.
__global__ void device_add(int *a, int *b , int *c)
{
  const unsigned int elem = blockIdx.x;
  c[elem] = a[elem] + b[elem];
}

void fill_array(int *data);
void print_output(int *a, int *b, int *c);

// Prints a diagnostic and terminates the program when a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(1);
  }
}

/*
 * GPU vector addition with N blocks of one thread each.
 *
 * Fills two N-element host arrays, copies them to the device, launches
 * device_add, copies the result back and prints every sum.
 *
 * Returns 0 on success, 1 if a host allocation fails; any failing CUDA call
 * terminates the program through check_cuda.
 */
int main()
{
  const size_t bytes = N*sizeof(int);

  int *a, *b, *c;
  int *dev_a, *dev_b, *dev_c;
  a = (int*)malloc(bytes);
  b = (int*)malloc(bytes);
  c = (int*)malloc(bytes);
  if (a == NULL || b == NULL || c == NULL)
  {
    fprintf(stderr, "host allocation failed\n");
    free(a); free(b); free(c);   // free(NULL) is a no-op
    return 1;
  }
  fill_array(a);
  fill_array(b);

  check_cuda(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)");
  check_cuda(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)");
  check_cuda(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)");

  check_cuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> dev_a)");
  check_cuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> dev_b)");

  // One block per element, one thread per block; the kernel indexes by blockIdx.x.
  device_add<<<N,1>>>(dev_a, dev_b, dev_c);
  // A launch does not return a status; configuration errors are read back here.
  check_cuda(cudaGetLastError(), "device_add launch");

  // The blocking copy also reports errors raised while the kernel was running.
  check_cuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_c -> c)");
  print_output(a,b,c);

  free(a); free(b); free(c);
  cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
  return 0;
}

// Initialises data with its own indices: data[k] = k for k in [0, N).
void fill_array(int *data)
{
  int k = 0;
  while (k < N)
  {
    data[k] = k;
    k += 1;
  }
}

// Writes "a + b = c" to stdout for each of the N elements, in index order.
void print_output(int *a, int *b, int *c)
{
  for(int row = 0; row != N; ++row)
  {
    const int lhs = a[row];
    const int rhs = b[row];
    printf("%d + %d = %d\n", lhs, rhs, c[row]);
  }
}

Overwriting vector_addition2.cu


In [None]:
!nvcc -o vector_addition_gpu vector_addition2.cu

In [None]:
!./vector_addition_gpu

In [None]:
%%writefile vector_addition_gpu_block_thread.cu

#include<stdio.h>
#define N 32

// Kernel: each thread derives a flat global index from its block and thread
// coordinates, records that index in id, adds one element pair and reports
// how the index was composed.
// There is no bounds check, so the launch must not create more threads than
// the arrays have elements.
__global__ void device_add(int *a, int *b, int *c, int *id)
{
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;

  id[gid] = gid;
  c[gid] = a[gid] + b[gid];

  // The printed lines show that threads are processed in parallel, not in idx order.
  printf("[Thread Id: %d] => [ThreadIdx.x: %d] + [BlockIdx.x: %d] x [BlockDim.x: %d]\n",
         gid, threadIdx.x, blockIdx.x, blockDim.x);
}

void fill_array(int *data);
void print_output(int *a, int *b, int *c, int *id);

// Prints a diagnostic and terminates the program when a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(1);
  }
}

/*
 * GPU vector addition using several blocks of several threads.
 *
 * Fills two N-element host arrays, copies them to the device, launches
 * device_add with N/thread_per_block blocks of thread_per_block threads,
 * then copies back both the sums and the per-thread global ids and prints them.
 *
 * Returns 0 on success, 1 if a host allocation fails; any failing CUDA call
 * terminates the program through check_cuda.
 */
int main()
{
  const size_t bytes = N*sizeof(int);

  int *a, *b, *c, *id;
  int *dev_a, *dev_b, *dev_c, *dev_id;
  a = (int*)malloc(bytes);
  b = (int*)malloc(bytes);
  c = (int*)malloc(bytes);
  id = (int*)malloc(bytes);
  if (a == NULL || b == NULL || c == NULL || id == NULL)
  {
    fprintf(stderr, "host allocation failed\n");
    free(a); free(b); free(c); free(id);   // free(NULL) is a no-op
    return 1;
  }
  fill_array(a);
  fill_array(b);

  check_cuda(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)");
  check_cuda(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)");
  check_cuda(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)");
  check_cuda(cudaMalloc((void**)&dev_id, bytes), "cudaMalloc(dev_id)");

  check_cuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> dev_a)");
  check_cuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> dev_b)");

  // The kernel has no bounds check, so the grid must cover exactly N threads;
  // this integer division is exact only when thread_per_block divides N.
  int thread_per_block = 4;
  int N_blocks = N/thread_per_block;

  device_add<<<N_blocks,thread_per_block>>>(dev_a, dev_b, dev_c, dev_id);
  // A launch does not return a status; configuration errors are read back here.
  check_cuda(cudaGetLastError(), "device_add launch");

  // The blocking copies also report errors raised while the kernel was running.
  check_cuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_c -> c)");
  check_cuda(cudaMemcpy(id, dev_id, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_id -> id)");

  print_output(a,b,c,id);

  free(a); free(b); free(c); free(id);
  cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c); cudaFree(dev_id);
  return 0;
}

// Fills data with the sequence 0..N-1 so that data[k] == k.
void fill_array(int *data)
{
  int next = 0;
  while (next < N)
  {
    data[next] = next;
    ++next;
  }
}

// Prints a separator line, then for each of the N elements the id recorded
// for it followed by "a + b = c".
void print_output(int *a, int *b, int *c, int* id)
{
  printf("===========================================================\n");

  int row = 0;
  while (row < N)
  {
    const int who = id[row];
    const int lhs = a[row];
    const int rhs = b[row];
    printf("[ThreadId: %d ]%d + %d = %d\n", who, lhs, rhs, c[row]);
    ++row;
  }
}

Overwriting vector_addition_gpu_block_thread.cu


In [None]:
!nvcc -o vector_addition_gpu_block_thread vector_addition_gpu_block_thread.cu

In [None]:
!./vector_addition_gpu_block_thread

[Thread Id: 24] => [ThreadIdx.x: 0] + [BlockIdx.x: 6] x [BlockDim.x: 4]
[Thread Id: 25] => [ThreadIdx.x: 1] + [BlockIdx.x: 6] x [BlockDim.x: 4]
[Thread Id: 26] => [ThreadIdx.x: 2] + [BlockIdx.x: 6] x [BlockDim.x: 4]
[Thread Id: 27] => [ThreadIdx.x: 3] + [BlockIdx.x: 6] x [BlockDim.x: 4]
[Thread Id: 16] => [ThreadIdx.x: 0] + [BlockIdx.x: 4] x [BlockDim.x: 4]
[Thread Id: 17] => [ThreadIdx.x: 1] + [BlockIdx.x: 4] x [BlockDim.x: 4]
[Thread Id: 18] => [ThreadIdx.x: 2] + [BlockIdx.x: 4] x [BlockDim.x: 4]
[Thread Id: 19] => [ThreadIdx.x: 3] + [BlockIdx.x: 4] x [BlockDim.x: 4]
[Thread Id: 4] => [ThreadIdx.x: 0] + [BlockIdx.x: 1] x [BlockDim.x: 4]
[Thread Id: 5] => [ThreadIdx.x: 1] + [BlockIdx.x: 1] x [BlockDim.x: 4]
[Thread Id: 6] => [ThreadIdx.x: 2] + [BlockIdx.x: 1] x [BlockDim.x: 4]
[Thread Id: 7] => [ThreadIdx.x: 3] + [BlockIdx.x: 1] x [BlockDim.x: 4]
[Thread Id: 28] => [ThreadIdx.x: 0] + [BlockIdx.x: 7] x [BlockDim.x: 4]
[Thread Id: 29] => [ThreadIdx.x: 1] + [BlockIdx.x: 7] x [BlockDim.x:

## 유휴코어 최소화
- 미리 그래픽카드 성능향상을 생각하지 않고 블록 수를 적게 만들면 유휴 SM이 늘어나게 됨
- 가급적 많은 블록 및 스레드를 생성할 것 

In [None]:
%%writefile vector_addition_large.cu

#include<stdio.h>

// Kernel: one thread per element; the flat index is built from the block and
// thread coordinates. There is no bounds check, so the launch must not create
// more threads than the arrays have elements.
__global__ void device_add(int *a, int *b, int *c)
{
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
  const int sum = a[gid] + b[gid];
  c[gid] = sum;
}

void fill_array(int *data);

// Prints a diagnostic and terminates the program when a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(1);
  }
}

/*
 * GPU vector addition over 512*65535 elements (65535 blocks of 512 threads).
 *
 * Fills two host arrays with fill_array, adds them on the device and prints
 * the first and last 20 results.
 *
 * Returns 0 on success, 1 if a host allocation fails; any failing CUDA call
 * terminates the program through check_cuda.
 */
int main()
{
  // Must stay equal to the element count hard-coded inside fill_array.
  const int size = 512*65535;
  const int BufferSize = size*sizeof(int);

  int *a, *b, *c;
  int *dev_a, *dev_b, *dev_c;
  a = (int*)malloc(BufferSize);
  b = (int*)malloc(BufferSize);
  c = (int*)malloc(BufferSize);
  // Each buffer is over 100 MB, so a failed allocation is a realistic case.
  if (a == NULL || b == NULL || c == NULL)
  {
    fprintf(stderr, "host allocation failed\n");
    free(a); free(b); free(c);   // free(NULL) is a no-op
    return 1;
  }
  fill_array(a);
  fill_array(b);

  check_cuda(cudaMalloc((void**)&dev_a, BufferSize), "cudaMalloc(dev_a)");
  check_cuda(cudaMalloc((void**)&dev_b, BufferSize), "cudaMalloc(dev_b)");
  check_cuda(cudaMalloc((void**)&dev_c, BufferSize), "cudaMalloc(dev_c)");

  check_cuda(cudaMemcpy(dev_a, a, BufferSize, cudaMemcpyHostToDevice), "cudaMemcpy(a -> dev_a)");
  check_cuda(cudaMemcpy(dev_b, b, BufferSize, cudaMemcpyHostToDevice), "cudaMemcpy(b -> dev_b)");

  // The kernel has no bounds check; size is an exact multiple of the block size.
  int thread_per_block = 512;
  int N_blocks = size/thread_per_block;

  device_add<<<N_blocks,thread_per_block>>>(dev_a, dev_b, dev_c);
  // A launch does not return a status; configuration errors are read back here.
  check_cuda(cudaGetLastError(), "device_add launch");

  // The blocking copy also reports errors raised while the kernel was running.
  check_cuda(cudaMemcpy(c, dev_c, BufferSize, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_c -> c)");

  for(int i = 0; i < 20; i++)
    printf(" Result[%d] : %d\n",i,c[i]);

  printf(" ......\n");

  for(int i = size-20; i < size; i++)
    printf(" Result[%d] : %d\n",i,c[i]);

  free(a); free(b); free(c);
  cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
  return 0;
}

// Writes data[k] = k for the first 512*65535 elements of data.
// The element count is fixed here, so the caller's buffer must hold at least that many ints.
void fill_array(int *data)
{
  const int N = 512*65535;
  int k = 0;
  while (k < N)
  {
    data[k] = k;
    ++k;
  }
}

Overwriting vector_addition_large.cu


In [None]:
!nvcc -o vector_addition_large vector_addition_large.cu

In [None]:
!./vector_addition_large

 Result[0] : 0
 Result[1] : 2
 Result[2] : 4
 Result[3] : 6
 Result[4] : 8
 Result[5] : 10
 Result[6] : 12
 Result[7] : 14
 Result[8] : 16
 Result[9] : 18
 Result[10] : 20
 Result[11] : 22
 Result[12] : 24
 Result[13] : 26
 Result[14] : 28
 Result[15] : 30
 Result[16] : 32
 Result[17] : 34
 Result[18] : 36
 Result[19] : 38
 ......
 Result[33553900] : 67107800
 Result[33553901] : 67107802
 Result[33553902] : 67107804
 Result[33553903] : 67107806
 Result[33553904] : 67107808
 Result[33553905] : 67107810
 Result[33553906] : 67107812
 Result[33553907] : 67107814
 Result[33553908] : 67107816
 Result[33553909] : 67107818
 Result[33553910] : 67107820
 Result[33553911] : 67107822
 Result[33553912] : 67107824
 Result[33553913] : 67107826
 Result[33553914] : 67107828
 Result[33553915] : 67107830
 Result[33553916] : 67107832
 Result[33553917] : 67107834
 Result[33553918] : 67107836
 Result[33553919] : 67107838


## 블록 및 스레드 동적 설정
- G80기준 블록 생성 최대개수 : 65535 개
- G80기준 블록당 스레드 생성 최대개수 : 512
- 만약 원소수 65535*512*3개가 있는 벡터합은 인덱스 할당을 어떻게 해주어야 할까?

In [3]:
%%writefile vector_addition_shift.cu

#include<stdio.h>

// Kernel: element-wise c[i] = a[i] + b[i] for i in [0, arr_cnt).
//
// Each thread starts at its flat global index and advances by the total
// number of threads in the grid until it passes arr_cnt, so any 1-D launch
// configuration covers every element exactly once and never indexes past the
// end. When the grid holds exactly arr_cnt/3 threads, each thread handles
// the elements idx, idx + arr_cnt/3 and idx + 2*(arr_cnt/3).
//
// a, b, c : device arrays holding at least arr_cnt ints each
// arr_cnt : number of elements to add; values <= 0 do nothing
__global__ void device_add(int *a, int *b, int *c, int arr_cnt)
{
  if (arr_cnt <= 0) return;

  // size_t keeps the index arithmetic from overflowing for large arrays.
  const size_t total  = (size_t)arr_cnt;
  const size_t stride = (size_t)gridDim.x * blockDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < total; i += stride)
  {
    c[i] = a[i] + b[i];
  }
}

// Prints a diagnostic and terminates the program when a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess)
  {
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    exit(1);
  }
}

/*
 * Adds two vectors of 65535*512*3 elements with a grid of only 65535*512
 * threads, so every thread has to process several elements, then verifies the
 * result on the host and prints "we did it" or "we fail".
 *
 * Returns 0 after verification (whatever its outcome), 1 if a host allocation
 * fails; any failing CUDA call terminates the program through check_cuda.
 */
int main()
{
  int *a, *b, *c;
  int *dev_a, *dev_b, *dev_c;

  int arr_cnt = 65535*512*3;
  const size_t bytes = arr_cnt*sizeof(int);

  a = (int*)malloc(bytes);
  b = (int*)malloc(bytes);
  c = (int*)malloc(bytes);
  // Each buffer is several hundred MB, so a failed allocation is a realistic case.
  if (a == NULL || b == NULL || c == NULL)
  {
    fprintf(stderr, "host allocation failed\n");
    free(a); free(b); free(c);   // free(NULL) is a no-op
    return 1;
  }

  check_cuda(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)");
  check_cuda(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)");
  check_cuda(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)");

  for (int i=0; i<arr_cnt; i++) {
    a[i] = i;
    b[i] = i;
  }

  check_cuda(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> dev_a)");
  check_cuda(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> dev_b)");

  // The grid holds arr_cnt/3 threads; the kernel walks the rest of the array itself.
  int thread_per_block = 512;
  int N_blocks = 65535;

  device_add<<<N_blocks, thread_per_block>>>(dev_a, dev_b, dev_c, arr_cnt);
  // A launch does not return a status; configuration errors are read back here.
  check_cuda(cudaGetLastError(), "device_add launch");

  // The blocking copy also reports errors raised while the kernel was running.
  check_cuda(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_c -> c)");

  bool success = true;
  for(int i = 0; i < arr_cnt; i++)
  {
    if((a[i]+b[i])!=c[i])
    {
      //printf("Error: %d + %d != %d\n", a[i],b[i],c[i]);
      success=false;
      break;   // one mismatch already decides the verdict
    }
  }
  if(success) printf("we did it\n");
  else printf("we fail\n");

  free(a); free(b); free(c);
  cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
  return 0;
}

Writing vector_addition_shift.cu


In [4]:
!!nvcc -o vector_addition_shift vector_addition_shift.cu 

[]

In [5]:
!./vector_addition_shift

we did it
