**Sprawdźmy dostępną kartę graficzną i zainstalowaną wersję nvcc:**


In [1]:
!nvidia-smi
!lsb_release -a
!nvcc --version

Mon May 13 07:07:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              14W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    


**Konieczna jest też instalacja pluginu, który umożliwi nam pisanie kodu w języku C++ w Colab:**


In [2]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-1vsvmi81
  Running command git clone --filter=blob:none --quiet https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-1vsvmi81
  Resolved https://github.com/andreinechaev/nvcc4jupyter.git to commit 326b0a57a80c6d0b4bad25ca7adf8138419ef1cb
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
%load_ext nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmpuf8_1b9q".


**Teraz możemy wyświetlić informację o dostępnej karcie graficznej. Warto zwrócić uwagę m.in. na liczbę wątków dostępnych w bloku (może się przydać do uzyskania dużej wydajności w zadaniu domowym):**

In [6]:
%%cuda
#include <memory>
#include <iostream>
#include <cuda_runtime.h>
// Main Program
int main(void)
{
    int device_Count = 0;
    cudaGetDeviceCount(&device_Count);
    // This function returns count of number of CUDA enable devices and 0 if there are no CUDA capable devices.
    if (device_Count == 0)
    {
        printf("There are no available device(s) that support CUDA\n");
    }
    else
    {
        printf("Detected %d CUDA Capable device(s)\n", device_Count);
    }

    int device = 0;
    int driver_Version, runtime_Version;

    cudaDeviceProp device_Property;
    cudaGetDeviceProperties(&device_Property, device);
    printf("\nDevice %d: \"%s\"\n", device, device_Property.name);
    cudaDriverGetVersion(&driver_Version);
    cudaRuntimeGetVersion(&runtime_Version);
    printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driver_Version / 1000, (driver_Version % 100) / 10, runtime_Version / 1000, (runtime_Version % 100) / 10);
    printf( " Total amount of global memory: %.0f MBytes (%llu bytes)\n",
    (float)device_Property.totalGlobalMem / 1048576.0f, (unsigned long long) device_Property.totalGlobalMem);
    printf(" (%2d) Multiprocessors", device_Property.multiProcessorCount );
    printf("  GPU Max Clock rate: %.0f MHz (%0.2f GHz)\n", device_Property.clockRate * 1e-3f, device_Property.clockRate * 1e-6f);
    printf("\n");
    printf( " Total amount of global memory: %.0f MBytes (%llu bytes)\n",
    (float)device_Property.totalGlobalMem / 1048576.0f, (unsigned long long) device_Property.totalGlobalMem);
    printf(" Memory Clock rate: %.0f Mhz\n", device_Property.memoryClockRate * 1e-3f);
    printf(" Memory Bus Width: %d-bit\n", device_Property.memoryBusWidth);
    if (device_Property.l2CacheSize)
    {
        printf(" L2 Cache Size: %d bytes\n", device_Property.l2CacheSize);
    }
    printf(" Total amount of constant memory: %lu bytes\n",         device_Property.totalConstMem);
    printf(" Total amount of shared memory per block: %lu bytes\n", device_Property.sharedMemPerBlock);
    printf(" Total number of registers available per block: %d\n", device_Property.regsPerBlock);
    printf("\n");
    printf(" Maximum number of threads per multiprocessor: %d\n",              device_Property.maxThreadsPerMultiProcessor);
    printf(" Maximum number of threads per block: %d\n",         device_Property.maxThreadsPerBlock);
    printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
        device_Property.maxThreadsDim[0],
        device_Property.maxThreadsDim[1],
        device_Property.maxThreadsDim[2]);
    printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
        device_Property.maxGridSize[0],
        device_Property.maxGridSize[1],
        device_Property.maxGridSize[2]);
}

Detected 1 CUDA Capable device(s)

Device 0: "Tesla T4"
 CUDA Driver Version / Runtime Version 12.2 / 12.2
 Total amount of global memory: 15102 MBytes (15835660288 bytes)
 (40) Multiprocessors  GPU Max Clock rate: 1590 MHz (1.59 GHz)

 Total amount of global memory: 15102 MBytes (15835660288 bytes)
 Memory Clock rate: 5001 Mhz
 Memory Bus Width: 256-bit
 L2 Cache Size: 4194304 bytes
 Total amount of constant memory: 65536 bytes
 Total amount of shared memory per block: 49152 bytes
 Total number of registers available per block: 65536

 Maximum number of threads per multiprocessor: 1024
 Maximum number of threads per block: 1024
 Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
 Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)




**Pierwszy przykład pokazuje jak wykonać operacje na karcie graficznej. Napiszemy funkcję, która dodaje dwie liczby i zwraca wynik:**


In [7]:
%%cuda
#include <stdio.h>
#include <stdlib.h>
__global__ void add(int *a, int *b, int *c) {
  *c = *a + *b;
}

int main() {
  int a, b, c;
  // host copies of variables a, b & c
  int *d_a, *d_b, *d_c;
  // device copies of variables a, b & c
  int size = sizeof(int);
  // Allocate space for device copies of a, b, c
  cudaMalloc((void **)&d_a, size);
  cudaMalloc((void **)&d_b, size);
  cudaMalloc((void **)&d_c, size);
  // Setup input values
  c = 0;
  a = 3;
  b = 5;
  // Copy inputs to device
  cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
  // Launch add() kernel on GPU
  add<<<1,1>>>(d_a, d_b, d_c);
  // Copy result back to host
  cudaError err = cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);
  if(err!=cudaSuccess) {
    printf("CUDA error copying to Host: %s\n", cudaGetErrorString(err));
  }
  printf("result is %d\n",c);
  // Cleanup
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  return 0;
}

result is 8



**W kolejnym przykładzie napiszemy kernel, który wykonuje operację dodawania na całej tablicy i porównamy z szybkością wykonywania tej samej funkcji na CPU:**

In [8]:
%%cuda
#include <iostream>
#include <vector>
#include <random>
#include <chrono>

#define N 10000000

void vector_add(float *out, float *a, float *b, int n) {
    for(int i = 0; i < n; i++){
        out[i] = a[i] + b[i];
    }
}

// Kernel function to add the elements of two arrays
__global__ void add(int n, float *x, float *y, float *out)
{
  for (int i = 0; i < n; i++)
    out[i] = x[i] + y[i];
}

int main(){
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    // Allocate memory
    a   = (float*)malloc(sizeof(float) * N);
    b   = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    // Initialize array
    for(int i = 0; i < N; i++){
        a[i] = 1.0f; b[i] = 2.0f;
    }

    std::chrono::steady_clock::time_point beginCPU = std::chrono::steady_clock::now();
    // add vectors
    vector_add(out, a, b, N);
    std::chrono::steady_clock::time_point endCPU = std::chrono::steady_clock::now();
    std::cout << "Time difference for CPU= " <<
    std::chrono::duration_cast<std::chrono::microseconds>(endCPU - beginCPU).count() << "[µs]\n";

    // print result
    for(int i = 0; i < 10; i++){
        std::cout << out[i] << "\n";
    }

    // GPU CUDA
    // Allocate Unified Memory – accessible from CPU or GPU
    cudaMallocManaged(&d_a, N*sizeof(float));
    cudaMallocManaged(&d_b, N*sizeof(float));
    cudaMallocManaged(&d_out, N*sizeof(float));

    // Initialize array
    for(int i = 0; i < N; i++){
        d_a[i] = 1.0f; d_b[i] = 2.0f;
    }

    std::chrono::steady_clock::time_point beginCUDA = std::chrono::steady_clock::now();
    // add vectors
    add<<<1, 1>>>(N, d_a, d_b, d_out);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    std::chrono::steady_clock::time_point endCUDA = std::chrono::steady_clock::now();
    std::cout << "Time difference for GPU= " <<
    std::chrono::duration_cast<std::chrono::microseconds>(endCUDA - beginCUDA).count() << "[µs]\n";

    // print result
    for(int i = 0; i < 10; i++){
        std::cout << d_out[i] << "\n";
    }

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);

    std::cout << "Finished\n";
}

Time difference for CPU= 54391[µs]
3
3
3
3
3
3
3
3
3
3
Time difference for GPU= 647003[µs]
3
3
3
3
3
3
3
3
3
3
Finished



**Wywołanie tego samego kodu na GPU jest wielokrotnie wolniejsze niż na CPU. Kluczowe jest tutaj wywołanie kernela "add<<<1, 1>>>". Drugi parametr oznacza liczbę wątków. W tym przypadku kernel wykonany jest w jednym wątku. Procesory na karcie graficznej są wolniejsze niż CPU, do tego dochodzi kopiowanie danych, dlatego w tym przypadku wywołanie kodu na GPU jest wielokrotnie wolniejsze. W kolejnym przykładzie zwiększymy liczbę wątków do 256:**

In [9]:
%%cuda
#include <iostream>
#include <vector>
#include <random>
#include <chrono>

#define N 10000000

void vector_add(float *out, float *a, float *b, int n) {
    for(int i = 0; i < n; i++){
        out[i] = a[i] + b[i];
    }
}

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y, float *out)
{
  int index = threadIdx.x;
  int stride = blockDim.x;
  // stride == 256
  // cała tablica dzielona jest na 256 bloków
  for (int i = index; i < n; i += stride)
      out[i] = x[i] + y[i];
}

int main(){
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    // Allocate memory
    a   = (float*)malloc(sizeof(float) * N);
    b   = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    // Initialize array
    for(int i = 0; i < N; i++){
        a[i] = 1.0f; b[i] = 2.0f;
    }

    std::chrono::steady_clock::time_point beginCPU = std::chrono::steady_clock::now();
    // add vectors
    vector_add(out, a, b, N);
    std::chrono::steady_clock::time_point endCPU = std::chrono::steady_clock::now();
    std::cout << "Time difference for CPU= " <<
    std::chrono::duration_cast<std::chrono::microseconds>(endCPU - beginCPU).count() << "[µs]\n";

    // print result
    for(int i = 0; i < 10; i++){
        std::cout << out[i] << "\n";
    }

    // GPU CUDA
    // Allocate Unified Memory – accessible from CPU or GPU
    cudaMallocManaged(&d_a, N*sizeof(float));
    cudaMallocManaged(&d_b, N*sizeof(float));
    cudaMallocManaged(&d_out, N*sizeof(float));

    // Initialize array
    for(int i = 0; i < N; i++){
        d_a[i] = 1.0f; d_b[i] = 2.0f;
    }

    std::chrono::steady_clock::time_point beginCUDA = std::chrono::steady_clock::now();
    // add vectors
    add<<<1, 256>>>(N, d_a, d_b, d_out);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    std::chrono::steady_clock::time_point endCUDA = std::chrono::steady_clock::now();
    std::cout << "Time difference for GPU= " <<
    std::chrono::duration_cast<std::chrono::microseconds>(endCUDA - beginCUDA).count() << "[µs]\n";

    // print result
    for(int i = 0; i < 10; i++){
        std::cout << d_out[i] << "\n";
    }

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);

    std::cout << "Finished\n";
}

Time difference for CPU= 51431[µs]
3
3
3
3
3
3
3
3
3
3
Time difference for GPU= 78645[µs]
3
3
3
3
3
3
3
3
3
3
Finished



**Dzięki wykorzystaniu 256 wątków (add<<<1, 256>>>) udało się znacząco przyspieszyć obliczenia na GPU. Powinny być nieznacznie szybsze niż obliczenia na CPU. Kluczowa jest również zawartość kernela:**

```
  int index = threadIdx.x;
  int stride = blockDim.x;
  for (int i = index; i < n; i += stride)
      out[i] = x[i] + y[i];
```

**Dzięki temu, każy z wątków niezależnie operuje na fragmentach tablicy.**

Zadanie 1: **Dla lepszego zrozumienia podziału tablicy w kernelu wyświetl wartości 'index', 'stride' i ewentualnie zmienną 'i' za pomocą funkcji 'printf' w kernelu.**

**Karty graficzne z CUDA GPU mają wiele procesorów pogrupowanych w tzw. Streaming Multiprocessors (SMs). Każdy z nich wspiera n wątków. Żeby w pełni wykorzystać możliwości karty graficznej, konieczne jest rozdzielenie zadania na bloki i na wątki wewnątrz bloków <<<liczba_blokow, liczba_watkow>>> (<<<liczba_blokow, rozmiar_bloku>>>):**


In [14]:
%%cuda
#include <iostream>
#include <vector>
#include <random>
#include <chrono>

#define N 100000

void vector_add(float *out, float *a, float *b, int n) {
    for(int i = 0; i < n; i++){
        out[i] = a[i] + b[i];
    }
}

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y, float *out)
{
    // zwróć uwagę jak obliczany jest index
    // Czy nie przypomina odwołania do mapy zajętości (tablicy dwuwymiarowej)?
    // Liczbą kolumn w wierszu jest wtedy 'blockDim.x', numerem kolumny 'threadIdx.x',
    // a numerem wiersza jest 'blockIdx.x'.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    printf("index %d \n", index);
    int stride = blockDim.x * gridDim.x;
    printf("stride %d \n", stride);
    for (int i = index; i < n; i += stride)
    {
      printf("i %d \n", i);
      out[i] = x[i] + y[i];
    }
}

int main(){
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    // Allocate memory
    a   = (float*)malloc(sizeof(float) * N);
    b   = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    // Initialize array
    for(int i = 0; i < N; i++){
        a[i] = 1.0f; b[i] = 2.0f;
    }

    std::chrono::steady_clock::time_point beginCPU = std::chrono::steady_clock::now();
    // add vectors
    vector_add(out, a, b, N);
    std::chrono::steady_clock::time_point endCPU = std::chrono::steady_clock::now();
    std::cout << "Time difference for CPU= " <<
    std::chrono::duration_cast<std::chrono::microseconds>(endCPU - beginCPU).count() << "[µs]\n";

    // print result
    for(int i = 0; i < 10; i++){
        std::cout << out[i] << "\n";
    }

    // GPU CUDA
    // Allocate Unified Memory – accessible from CPU or GPU
    cudaMallocManaged(&d_a, N*sizeof(float));
    cudaMallocManaged(&d_b, N*sizeof(float));
    cudaMallocManaged(&d_out, N*sizeof(float));

    // Initialize array
    for(int i = 0; i < N; i++){
        d_a[i] = 1.0f; d_b[i] = 2.0f;
    }

    std::chrono::steady_clock::time_point beginCUDA = std::chrono::steady_clock::now();
    // add vectors
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;
    std::cout << "numBlocks " << numBlocks << "\n";
    std::cout << "blockSize " << blockSize << "\n";
    add<<<numBlocks, blockSize>>>(N, d_a, d_b, d_out);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    std::chrono::steady_clock::time_point endCUDA = std::chrono::steady_clock::now();
    std::cout << "Time difference for GPU= " <<
    std::chrono::duration_cast<std::chrono::microseconds>(endCUDA - beginCUDA).count() << "[µs]\n";

    // print result
    for(int i = 0; i < 10; i++){
        std::cout << d_out[i] << "\n";
    }

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);

    std::cout << "Finished\n";
}

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
i 89893 
i 89894 
i 89895 
i 89896 
i 89897 
i 89898 
i 89899 
i 89900 
i 89901 
i 89902 
i 89903 
i 89904 
i 89905 
i 89906 
i 89907 
i 89908 
i 89909 
i 89910 
i 89911 
i 89912 
i 89913 
i 89914 
i 89915 
i 89916 
i 89917 
i 89918 
i 89919 
i 89856 
i 89857 
i 89858 
i 89859 
i 89860 
i 89861 
i 89862 
i 89863 
i 89864 
i 89865 
i 89866 
i 89867 
i 89868 
i 89869 
i 89870 
i 89871 
i 89872 
i 89873 
i 89874 
i 89875 
i 89876 
i 89877 
i 89878 
i 89879 
i 89880 
i 89881 
i 89882 
i 89883 
i 89884 
i 89885 
i 89886 
i 89887 
i 99168 
i 99169 
i 99170 
i 99171 
i 99172 
i 99173 
i 99174 
i 99175 
i 99176 
i 99177 
i 99178 
i 99179 
i 99180 
i 99181 
i 99182 
i 99183 
i 99184 
i 99185 
i 99186 
i 99187 
i 99188 
i 99189 
i 99190 
i 99191 
i 99192 
i 99193 
i 99194 
i 99195 
i 99196 
i 99197 
i 99198 
i 99199 
i 99200 
i 99201 
i 99202 
i 99203 
i 99204 
i 99205 
i 99206 
i 99207 
i 99208 
i 99209 
i 99210 
i 99211 
i 99212 

**Tym razem udało się uzyskać ponad dwukrotne przyspieszenie.**

**W poprzednich przykładach używaliśmy funkcji cudaMallocManaged() do zarezerwowania pamięci na zmienne przetwarzane przez GPU. Jednocześnie używaliśmy CPU do inicjalizacji tych zmiennych. W tym przypadku używaliśmy tzw. "unified memory", do której ma dostęp CPU i GPU. W tym przypadku drivery do karty graficznej same decydują, gdzie fizycznie umieścić dane. Możliwe jest też używanie funkcji 'cudaMemPrefetchAsync()', która zarezerwuje pamięć na karcie graficznej (nie wszystkie karty to wspierają). W takiej sytaucji inicjalizacja zmiennych przez CPU nie będzie możliwa.**

**W kolejnym przykładzie zmierzymy również czas potrzebny na inicjalizację danych. W przypadku GPU zainicjalizujemy zmienne również na karcie graficznej. Dlatego dodany został kernel do inicjalizacji tych zmiennych na GPU:**

In [30]:
%%cuda
#include <iostream>
#include <vector>
#include <random>
#include <chrono>

#define N 100000

void vector_add(float *out, float *a, float *b, int n) {
    for(int i = 0; i < n; i++){
        out[i] = a[i] + b[i];
    }
}

// Kernel function to initialize three arrays
__global__
void init(int n, float *x, float *y, float *out)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride){
        x[i] = 1.0;
        y[i] = 2.0;
        out[i] = 0.0;
    }
}

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y, float *out)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride){
      out[i] = x[i] + y[i];
    }
}

int main(){
    float *a, *b, *out;
    float *d_a, *d_b, *d_out;

    // Allocate memory
    a   = (float*)malloc(sizeof(float) * N);
    b   = (float*)malloc(sizeof(float) * N);
    out = (float*)malloc(sizeof(float) * N);

    std::chrono::steady_clock::time_point beginCPU = std::chrono::steady_clock::now();
    // Initialize array
    for(int i = 0; i < N; i++){
        a[i] = 1.0f; b[i] = 2.0f;
    }

    // add vectors
    vector_add(out, a, b, N);
    std::chrono::steady_clock::time_point endCPU = std::chrono::steady_clock::now();
    std::cout << "Time difference for CPU= " <<
    std::chrono::duration_cast<std::chrono::microseconds>(endCPU - beginCPU).count() << "[µs]\n";

    // print result
    for(int i = 0; i < 10; i++){
        std::cout << out[i] << "\n";
    }

    // GPU CUDA
    // Allocate Unified Memory – accessible from CPU or GPU
    cudaMallocManaged(&d_a, N*sizeof(float));
    cudaMallocManaged(&d_b, N*sizeof(float));
    cudaMallocManaged(&d_out, N*sizeof(float));

    // Initialize and add arrays
    std::chrono::steady_clock::time_point beginCUDA = std::chrono::steady_clock::now();
    // add vectors
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;
    init<<<numBlocks, blockSize>>>(N, d_a, d_b, d_out);
    add<<<numBlocks, blockSize>>>(N, d_a, d_b, d_out);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    std::chrono::steady_clock::time_point endCUDA = std::chrono::steady_clock::now();
    std::cout << "Time difference for GPU= " <<
    std::chrono::duration_cast<std::chrono::microseconds>(endCUDA - beginCUDA).count() << "[µs]\n";

    // print result
    for(int i = 0; i < 10; i++){
        std::cout << d_out[i] << "\n";
    }

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_out);
    free(a);
    free(b);
    free(out);

    std::cout << "Finished\n";
}

Time difference for CPU= 1117[µs]
3
3
3
3
3
3
3
3
3
3
Time difference for GPU= 4858[µs]
3
3
3
3
3
3
3
3
3
3
Finished



**Tym razem różnica pomiędzy GPU a CPU jest znacząca na korzyść GPU.**

**Zadanie: zaimplementować prostą sieć neuronową składającą się z neuronów typu LeakyReLU. Wyjście z poprzedniego neuronu jest podawane na wejście neuronu w kolejnej warstwie z wagą równą 1. Parametry neuronów, liczba warstw i liczba neuronów w każdej warstwie powinna być taka sama jak w instrukcji dotyczącej OpenCL. Porównaj implementację CPU i GPU (czas wykonywania i wartości na wyjściu dla 5 pierwszych neuronów).**


In [20]:
%%cuda
#include <iostream>
#include <vector>
#include <random>
#include <chrono>
#include <cmath>

#define N 1000
#define LEAKY_ALPHA 0.01f

// LeakyReLU activation function
__device__ float leaky_relu(float x) {
    return x > 0 ? x : LEAKY_ALPHA * x;
}

// Kernel function to initialize weights and biases
__global__
void init_params(float *weights, float *biases, int input_size, int output_size) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < input_size * output_size; i += stride) {
        weights[i] = 0.1f;  // initialize weights to 0.1
    }
    if (index < output_size) {
        biases[index] = 0.0f;  // initialize biases to 0.0
    }
}

// Kernel function to perform forward pass through the neural network
__global__
void forward_pass(float *input, float *output, float *weights, float *biases, int input_size, int output_size) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < output_size; i += stride) {
        float neuron_output = biases[i];
        for (int j = 0; j < input_size; j++) {
            neuron_output += input[j] * weights[j * output_size + i];
        }
        output[i] = leaky_relu(neuron_output);
    }
}

void cpu_init_params_here(float *weights, float *biases, int input_size, int output_size) {
    for (int i = 0; i < input_size * output_size; i++) {
        weights[i] = 0.1f;  // initialize weights to 0.1
    }
    for (int i = 0; i < output_size; i++) {
        biases[i] = 0.0f;  // initialize biases to 0.0
    }
}

void cpu_forward_pass(float *input, float *output, float *weights, float *biases, int input_size, int output_size) {
    for (int i = 0; i < output_size; i++) {
        float neuron_output = biases[i];
        for (int j = 0; j < input_size; j++) {
            neuron_output += input[j] * weights[j * output_size + i];
        }
        if (neuron_output > 0){
            output[i] = LEAKY_ALPHA * neuron_output;
        }
        else{
            output[i] = 0;
        }
    }
}

int main() {
    float *input, *output;
    float *weights, *biases;
    float *d_input, *d_output, *d_weights, *d_biases;
    float *c_input, *c_output, *c_weights, *c_biases;

    // Allocate memory for input, output, weights, and biases
    input   = (float*)malloc(sizeof(float) * N);
    output  = (float*)malloc(sizeof(float) * N);
    weights = (float*)malloc(sizeof(float) * N);
    biases  = (float*)malloc(sizeof(float) * N);

    // Initialize input (for simplicity, initialize to 1.0)
    for (int i = 0; i < N; i++) {
        input[i] = 1.0f;
    }

    // Allocate Unified Memory for input, output, weights, and biases
    cudaMallocManaged(&d_input, sizeof(float) * N);
    cudaMallocManaged(&d_output, sizeof(float) * N);
    cudaMallocManaged(&d_weights, sizeof(float) * N * N);
    cudaMallocManaged(&d_biases, sizeof(float) * N);

    // Initialize parameters (weights and biases)
    int blockSize = 256;
    int numBlocks = (N + blockSize - 1) / blockSize;
    init_params<<<numBlocks, blockSize>>>(d_weights, d_biases, N, N);

    // some issue with cpu initialization
    // cpu_init_params_here(c_weights, c_biases, N, N);

    // Perform forward pass on CPU
    auto beginCPU = std::chrono::steady_clock::now();
    cpu_forward_pass(input, output, weights, biases, N, N);
    auto endCPU = std::chrono::steady_clock::now();
    std::cout << "Time difference for CPU: " <<
    std::chrono::duration_cast<std::chrono::microseconds>(endCPU - beginCPU).count() << " µs\n";

    // Perform forward pass on GPU
    cudaMemcpy(d_input, input, sizeof(float) * N, cudaMemcpyHostToDevice);
    auto beginGPU = std::chrono::steady_clock::now();
    forward_pass<<<numBlocks, blockSize>>>(d_input, d_output, d_weights, d_biases, N, N);
    cudaDeviceSynchronize();
    auto endGPU = std::chrono::steady_clock::now();
    std::cout << "Time difference for GPU: " <<
    std::chrono::duration_cast<std::chrono::microseconds>(endGPU - beginGPU).count() << " µs\n";

    // Print initialized weights and biases on GPU
    //std::cout << "Initialized weights and biases on GPU:\n";
    //for (int i = 0; i < 5; i++) {
    //    std::cout << "Weight " << i << ": " << d_weights[i] << ", Bias " << i << ": " << d_biases[i] << "\n";
    //}

    // Print initialized weights and biases on CPU
    //std::cout << "Initialized weights and biases on CPU:\n";
    //for (int i = 0; i < 5; i++) {
    //    std::cout << "Weight " << i << ": " << weights[i] << ", Bias " << i << ": " << biases[i] << "\n";
    //}


    // Print first 5 outputs
    std::cout << "Outputs for CPU:\n";
    for (int i = 0; i < 5; i++) {
        std::cout << output[i] << "\n";
    }

    std::cout << "Outputs for GPU:\n";
    for (int i = 0; i < 5; i++) {
        std::cout << d_output[i] << "\n";
    }

    // Cleanup
    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_weights);
    cudaFree(d_biases);
    free(input);
    free(output);
    free(weights);
    free(biases);

    std::cout << "Finished\n";

    return 0;
}


Time difference for CPU: 15159 µs
Time difference for GPU: 101 µs
Outputs for CPU:
0
0
0
0
0
Outputs for GPU:
99.999
99.999
99.999
99.999
99.999
Finished



**ACKNOWLEDGEMENTS**
- https://stackoverflow.com/questions/51194303/how-to-run-a-python-script-in-a-py-file-from-a-google-colab-notebook
- https://devblogs.nvidia.com/even-easier-introduction-cuda/
- https://devblogs.nvidia.com/unified-memory-cuda-beginners/
