<a href="https://colab.research.google.com/github/park-geun-hyeong/CUDA/blob/main/EX4_0923.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Kernel Function

In [1]:
!nvidia-smi

Wed Sep 28 06:22:29 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   50C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2020 NVIDIA Corporation
Built on Mon_Oct_12_20:09:46_PDT_2020
Cuda compilation tools, release 11.1, V11.1.105
Build cuda_11.1.TC455_06.29190527_0


- CUDA 에서는 접두어(prefix)를 사용하여 CPU와 GPU 코드를 구분한다

### `__host__`
- 호스트(CPU)에서 호출 및 실행하는 함수 접두어
- 별다른 접두어가 없는 경우 `__host__`가 생략된 것으로 간주

### `__device__`
- 디바이스(GPU)에서 호출 및 실행하는 함수 접두어

### `__global__`
- 호스트(CPU)에서 호출
- 디바이스(GPU)에서 실행
- 호스트와 디바이스를 연결하는 다리 역할 수행

In [3]:
%%writefile kernel_example.cu

// __global__ function : add()
// __host__ function   : main()

#include<stdio.h>

// Kernel: stores the sum of the two by-value integers `a` and `b` into the
// single int that `c` points to. Every launched thread performs the same
// store, so it is meant to be launched with one block of one thread.
__global__ void add(int a, int b, int *c)
{
  const int sum = a + b;
  c[0] = sum;
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `status` is cudaSuccess, 0 otherwise.
static int cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
  {
    return 1;
  }
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return 0;
}

// Host entry point: allocates one int on the device, launches add() to
// compute 2 + 7 into it, copies the result back and prints it.
// Returns 0 on success and 1 if any CUDA call fails.
int main(){
  int c = 0;
  int *dev_c = NULL;

  if (!cudaOk(cudaMalloc((void**)&dev_c, sizeof(int)), "cudaMalloc(dev_c)"))
  {
    return 1;
  }

  // A kernel launch returns to the host immediately (asynchronous execution).
  // The cudaMemcpy() below waits until add() has finished writing dev_c, which
  // is why the program appears to run synchronously.
  add<<<1,1>>>(2,7,dev_c); // <<<number of blocks, threads per block>>>

  // The launch itself returns no status; launch errors are read back here.
  int ok = cudaOk(cudaGetLastError(), "add kernel launch");
  ok = ok && cudaOk(cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");

  // Release the device allocation on both the success and the failure path.
  cudaFree(dev_c);

  if (!ok)
  {
    return 1;
  }

  printf("2 + 7 = %d\n", c);
  return 0;
}

Writing kernel_example.cu


In [4]:
!nvcc -o kernel_example kernel_example.cu

In [5]:
!./kernel_example

2 + 7 = 9


### 단일 CPU 기반 Vector 합 계산 (for loop)

In [6]:
%%writefile vector_add_only_cpu.cu

#include<stdio.h>
#define N 10

// Sequential baseline: builds a[i] = i and b[i] = 100*i for N elements,
// stores their element-wise sum in c, then prints one line per element.
int main(){
  int a[N], b[N], c[N];
  int idx = 0;

  // Fill the inputs and compute the sum in a single pass.
  while(idx < N)
  {
    a[idx] = idx;
    b[idx] = 100*idx;
    c[idx] = a[idx] + b[idx];
    ++idx;
  }

  // Print the results in index order.
  for(idx = 0; idx != N; ++idx)
  {
    printf("%d + %d = %d\n", a[idx], b[idx], c[idx]);
  }

  return 0;
}

Writing vector_add_only_cpu.cu


In [7]:
!nvcc -o vector_add_only_cpu vector_add_only_cpu.cu

In [8]:
!./vector_add_only_cpu

0 + 0 = 0
1 + 100 = 101
2 + 200 = 202
3 + 300 = 303
4 + 400 = 404
5 + 500 = 505
6 + 600 = 606
7 + 700 = 707
8 + 800 = 808
9 + 900 = 909



## Multi CPU 기반 Vector 합 계산(for loop)
- 단일코어지만 멀티코어 계산하는 것처럼 코드 작성

In [9]:
!lscpu

Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              2
On-line CPU(s) list: 0,1
Thread(s) per core:  2
Core(s) per socket:  1
Socket(s):           1
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               79
Model name:          Intel(R) Xeon(R) CPU @ 2.20GHz
Stepping:            0
CPU MHz:             2199.998
BogoMIPS:            4399.99
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            256K
L3 cache:            56320K
NUMA node0 CPU(s):   0,1
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_sin

In [10]:
%%writefile vector_add_multi_cpu.cu

#include<stdio.h>

#define N 60
#define K 4

// Adds the slice of the vectors owned by worker `myid`: indices
// myid, myid + K, myid + 2K, ... up to (but not including) N.
//   myid    - index of the first element this call handles
//   a, b    - input arrays, read at the visited indices
//   c       - output array, c[i] = a[i] + b[i] at the visited indices
void add(int myid, int *a, int *b, int *c)
{
  for(int idx = myid; idx < N; idx += K)
  {
    c[idx] = a[idx] + b[idx];
  }
}

// Emulates a multi-core vector add on a single core: fills a[i] = i and
// b[i] = 100*i for N elements, calls add() once per emulated worker so the
// calls together cover every index, then prints one line per element.
int main(){

  int a[N], b[N], c[N];

  for(int i = 0; i<N; i++)
  {
    a[i] = i;
    b[i] = 100*i;
  }

  // One add() call per emulated worker. Driving the loop from K (instead of
  // writing out four calls by hand) keeps the number of workers in step with
  // the stride add() advances by, so no index is skipped if K changes.
  for(int worker = 0; worker < K; worker++)
  {
    add(worker, a, b, c);
  }

  for(int i = 0; i < N; i++)
  {
    printf("%d + %d = %d\n", a[i], b[i], c[i]);
  }
  return 0;
}

Writing vector_add_multi_cpu.cu


In [11]:
!nvcc -o vector_add_multi_cpu vector_add_multi_cpu.cu

In [12]:
!./vector_add_multi_cpu

0 + 0 = 0
1 + 100 = 101
2 + 200 = 202
3 + 300 = 303
4 + 400 = 404
5 + 500 = 505
6 + 600 = 606
7 + 700 = 707
8 + 800 = 808
9 + 900 = 909
10 + 1000 = 1010
11 + 1100 = 1111
12 + 1200 = 1212
13 + 1300 = 1313
14 + 1400 = 1414
15 + 1500 = 1515
16 + 1600 = 1616
17 + 1700 = 1717
18 + 1800 = 1818
19 + 1900 = 1919
20 + 2000 = 2020
21 + 2100 = 2121
22 + 2200 = 2222
23 + 2300 = 2323
24 + 2400 = 2424
25 + 2500 = 2525
26 + 2600 = 2626
27 + 2700 = 2727
28 + 2800 = 2828
29 + 2900 = 2929
30 + 3000 = 3030
31 + 3100 = 3131
32 + 3200 = 3232
33 + 3300 = 3333
34 + 3400 = 3434
35 + 3500 = 3535
36 + 3600 = 3636
37 + 3700 = 3737
38 + 3800 = 3838
39 + 3900 = 3939
40 + 4000 = 4040
41 + 4100 = 4141
42 + 4200 = 4242
43 + 4300 = 4343
44 + 4400 = 4444
45 + 4500 = 4545
46 + 4600 = 4646
47 + 4700 = 4747
48 + 4800 = 4848
49 + 4900 = 4949
50 + 5000 = 5050
51 + 5100 = 5151
52 + 5200 = 5252
53 + 5300 = 5353
54 + 5400 = 5454
55 + 5500 = 5555
56 + 5600 = 5656
57 + 5700 = 5757
58 + 5800 = 5858
59 + 5900 = 5959


## 병렬 GPU 기반 CUDA 연산

In [27]:
%%writefile vector_add_gpu.cu

#include<stdio.h>
#define N 20

// Kernel: element-wise vector add, c[i] = a[i] + b[i], one element per thread.
//   a, b - device pointers to the input arrays (N ints each)
//   c    - device pointer to the output array (N ints)
// Launch layout: 1D grid of 1D blocks. The element index is the flat global
// thread index, so <<<N,1>>> behaves exactly as before, and any other 1D
// configuration that supplies at least N threads is also valid.
__global__ void add(int *a, int *b, int *c)
{
  int tid = blockIdx.x * blockDim.x + threadIdx.x;

  // Guard the tail: threads past the end of the arrays must not touch memory.
  if (tid < N)
  {
    c[tid] = a[tid] + b[tid];
  }
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when `status` is cudaSuccess, 0 otherwise.
static int cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
  {
    return 1;
  }
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return 0;
}

// Host entry point: builds a[i] = i and b[i] = 100*i for N elements, copies
// them to the device, launches add() with N blocks of one thread, copies the
// sums back and prints one line per element.
// Returns 0 on success and 1 if any CUDA call fails (nothing is printed then,
// because c would not hold valid results).
int main(){

  const size_t size = N*sizeof(int);
  int a[N], b[N], c[N];

  // Start as NULL so the cleanup below is safe even if an allocation fails.
  int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

  for(int i=0; i<N; i++)
  {
    a[i] = i;
    b[i] = 100*i;
  }

  // && short-circuits, so the first failing step stops the sequence.
  int ok = cudaOk(cudaMalloc((void**)&dev_a, size), "cudaMalloc(dev_a)")
        && cudaOk(cudaMalloc((void**)&dev_b, size), "cudaMalloc(dev_b)")
        && cudaOk(cudaMalloc((void**)&dev_c, size), "cudaMalloc(dev_c)")
        && cudaOk(cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)")
        && cudaOk(cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)");

  if (ok)
  {
    add<<<N,1>>>(dev_a, dev_b, dev_c);

    // The launch itself returns no status; launch errors are read back here,
    // and execution errors surface through the blocking copy that follows.
    ok = cudaOk(cudaGetLastError(), "add kernel launch")
      && cudaOk(cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
  }

  // Release device memory on every path; freeing a NULL pointer is a no-op.
  cudaFree(dev_a);
  cudaFree(dev_b);
  cudaFree(dev_c);

  if (!ok)
  {
    return 1;
  }

  for(int i=0; i<N; i++)
  {
    printf("%d + %d = %d\n", a[i], b[i],c[i]);
  }

  return 0;
}

Overwriting vector_add_gpu.cu


In [28]:
!nvcc -o vector_add_gpu vector_add_gpu.cu

In [30]:
!./vector_add_gpu

0 + 0 = 0
1 + 100 = 101
2 + 200 = 202
3 + 300 = 303
4 + 400 = 404
5 + 500 = 505
6 + 600 = 606
7 + 700 = 707
8 + 800 = 808
9 + 900 = 909
10 + 1000 = 1010
11 + 1100 = 1111
12 + 1200 = 1212
13 + 1300 = 1313
14 + 1400 = 1414
15 + 1500 = 1515
16 + 1600 = 1616
17 + 1700 = 1717
18 + 1800 = 1818
19 + 1900 = 1919
