In [3]:
!nvidia-smi

Wed Jan 26 20:30:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
!pip install git+git://github.com/andreinechaev/nvcc4jupyter.git

Collecting git+git://github.com/andreinechaev/nvcc4jupyter.git
  Cloning git://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-6yjh378a
  Running command git clone -q git://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-6yjh378a
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=fdf5d657acfc8abc52ea8b3a73cd8bf3f9ef54367288e5f86eb7a21884ac5176
  Stored in directory: /tmp/pip-ephem-wheel-cache-h29cepr2/wheels/c5/2b/c0/87008e795a14bbcdfc7c846a00d06981916331eb980b6c8bdf
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [5]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [6]:
%cd /usr/local/

/usr/local


In [10]:
%ls

[0m[01;34mbin[0m/        [01;34mcuda-11.0[0m/  [01;34m_gcs_config_ops.so[0m/  [01;34mlicensing[0m/  [01;34mshare[0m/
[01;34mcuda-10.0[0m/  [01;34mcuda-11.1[0m/  [01;34minclude[0m/             [01;36mman[0m@        [01;34msrc[0m/
[01;34mcuda-10.1[0m/  [01;34metc[0m/        [01;34mlib[0m/                 [01;34msbin[0m/       [01;34mxgboost[0m/
[01;36mcuda-11[0m@    [01;34mgames[0m/      LICENSE.txt          setup.cfg


In [12]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [9]:
!rm -rf cuda

In [11]:
!ln -s /usr/local/cuda-10.1 /usr/local/cuda

In [48]:
%%cu

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
 
#define SRAND_VALUE 2137
#define BLOCK_SIZE 128

// na podstawie https://www.olcf.ornl.gov/tutorials/cuda-game-of-life/

__global__ void ghostRows(int dim, int *grid)
{
    // granice góra dół
    int id = blockDim.x * blockIdx.x + threadIdx.x + 1;
    if (id <= dim)
    {
        grid[(dim+2)*(dim+1)+id] = grid[(dim+2)+id];
        grid[id] = grid[(dim+2)*dim + id];
    }
}


__global__ void ghostCols(int dim, int *grid)
{
    // granice lewo prawo
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    if (id <= dim+1)
    {
        grid[id*(dim+2)+dim+1] = grid[id*(dim+2)+1];
        grid[id*(dim+2)] = grid[id*(dim+2) + dim];   
    }
}


__global__ void GOL(int dim, int *grid, int *newGrid)
{
    int iy = blockDim.y * blockIdx.y + threadIdx.y + 1;
    int ix = blockDim.x * blockIdx.x + threadIdx.x + 1;
    int id = iy * (dim+2) + ix;
    int numNeighbors;
 
    if (iy <= dim && ix <= dim) {
        // check sąsiadów
        numNeighbors = grid[id+(dim+2)] + grid[id-(dim+2)]
                     + grid[id+1] + grid[id-1]             
                     + grid[id+(dim+3)] + grid[id-(dim+3)]
                     + grid[id-(dim+1)] + grid[id+(dim+1)];
        int cell = grid[id];
        // zasady gry w życia
        if (cell == 1 && numNeighbors < 2)
            newGrid[id] = 0;
        else if (cell == 1 && (numNeighbors == 2 || numNeighbors == 3))
            newGrid[id] = 1;
        else if (cell == 1 && numNeighbors > 3)
            newGrid[id] = 0;
        else if (cell == 0 && numNeighbors == 3)
            newGrid[id] = 1;
        else
            newGrid[id] = cell;
    }
}


int main()
{
    // deklaracje
    int i,j,iter;
    int* h_grid; //Grid on host
    int* d_grid; //Grid on device
    int* d_newGrid; //Second grid used on device only
    int* d_tmpGrid; //tmp grid pointer used to switch between grid and newGrid
 
    // stałe
    int dim = 1024;
    /*printf("Enter nmber of game steps: ");
    int steps;
    scanf("%d", &steps);
    printf("\nYou entered: %d ", steps);*/
    int steps = 1<<10;
 
    // tablica do gry z paskami granicznymi
    size_t bytes = sizeof(int)*(dim+2)*(dim+2);
    h_grid = (int*)malloc(bytes);
 

    /****************************
    M E T O D A   P I E R W S Z A
    ****************************/
 
    clock_t t;
    t = clock();

    cudaMalloc(&d_grid, bytes);
    cudaMalloc(&d_newGrid, bytes);
 
    // losowy stan początkowy
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            h_grid[i*(dim+2)+j] = rand() % 2;
        }
    }
 
    // kopiowanie
    cudaMemcpy(d_grid, h_grid, bytes, cudaMemcpyHostToDevice);
 
    // bloki, gridy i inne - wystarczy raz!
    dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE,1);
    int linGrid = (int)ceil(dim/(float)BLOCK_SIZE);
    dim3 gridSize(linGrid,linGrid,1);
    dim3 cpyBlockSize(BLOCK_SIZE,1,1);
    dim3 cpyGridRowsGridSize((int)ceil(dim/(float)cpyBlockSize.x),1,1);
    dim3 cpyGridColsGridSize((int)ceil((dim+2)/(float)cpyBlockSize.x),1,1);
 
    // gra w życie
    for (iter = 0; iter<steps; iter++) {
        ghostRows<<<cpyGridRowsGridSize, cpyBlockSize>>>(dim, d_grid);
        ghostCols<<<cpyGridColsGridSize, cpyBlockSize>>>(dim, d_grid);
        GOL<<<gridSize, blockSize>>>(dim, d_grid, d_newGrid);
        // iterowanie
        d_tmpGrid = d_grid;
        d_grid = d_newGrid;
        d_newGrid = d_tmpGrid;
    }
 
    // wyniki
    cudaMemcpy(h_grid, d_grid, bytes, cudaMemcpyDeviceToHost);
    int total1 = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total1 += h_grid[i*(dim+2)+j];
        }
    }
    printf("Total Alive: %d\n", total1);

    memset(h_grid, 0, bytes);
    free(h_grid);

    t = clock() - t;
    double time_taken = ((double)t)/CLOCKS_PER_SEC;
    printf("Metoda 1 took %f seconds to execute \n", time_taken);


    /****************************
       M E T O D A   D R U G A
    ****************************/
      
    t = clock();
 
    cudaMallocHost(&h_grid, bytes);
 
    // losowy stan początkowy
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            h_grid[i*(dim+2)+j] = rand() % 2;
        }
    }
 
    cudaMemcpy(d_grid, h_grid, bytes, cudaMemcpyHostToDevice);

    // gra w życie
    for (iter = 0; iter<steps; iter++) {
        ghostRows<<<cpyGridRowsGridSize, cpyBlockSize>>>(dim, d_grid);
        ghostCols<<<cpyGridColsGridSize, cpyBlockSize>>>(dim, d_grid);
        GOL<<<gridSize, blockSize>>>(dim, d_grid, d_newGrid);
        // iterowanie
        d_tmpGrid = d_grid;
        d_grid = d_newGrid;
        d_newGrid = d_tmpGrid;
    }
 
    // wyniki
    cudaMemcpy(h_grid, d_grid, bytes, cudaMemcpyDeviceToHost);
    int total2 = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total2 += h_grid[i*(dim+2)+j];
        }
    }
    printf("Total Alive: %d\n", total2);

    memset(h_grid, 0, bytes);
    cudaFreeHost(h_grid);
 
    t = clock() - t;
    time_taken = ((double)t)/CLOCKS_PER_SEC;
    printf("Metoda 2 took %f seconds to execute \n", time_taken);


    /****************************
     M E T O D A   T R Z E C I A
    ****************************/
 
    t = clock();
 
    cudaHostAlloc(&h_grid, bytes, cudaHostAllocMapped);
    cudaHostGetDevicePointer(&d_grid, h_grid, 0);
    cudaHostGetDevicePointer(&d_newGrid, h_grid, 0);
    cudaHostGetDevicePointer(&d_tmpGrid, h_grid, 0);

    // losowy stan początkowy
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            h_grid[i*(dim+2)+j] = rand() % 2;
        }
    }

    // gra w życie
    for (iter = 0; iter<steps; iter++) {
        ghostRows<<<cpyGridRowsGridSize, cpyBlockSize>>>(dim, d_grid);
        ghostCols<<<cpyGridColsGridSize, cpyBlockSize>>>(dim, d_grid);
        GOL<<<gridSize, blockSize>>>(dim, d_grid, d_newGrid);
        // iterowanie
        d_tmpGrid = d_grid;
        d_grid = d_newGrid;
        d_newGrid = d_tmpGrid;
    }
    cudaDeviceSynchronize();
 
    // wyniki
    cudaMemcpy(h_grid, d_grid, bytes, cudaMemcpyDeviceToHost);
    int total3 = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total3 += h_grid[i*(dim+2)+j];
        }
    }
    printf("Total Alive: %d\n", total3);
 
    memset(h_grid, 0, bytes);
    cudaFreeHost(h_grid);
 
    t = clock() - t;
    time_taken = ((double)t)/CLOCKS_PER_SEC;
    printf("Metoda 3 took %f seconds to execute \n", time_taken);
 

    /****************************
     M E T O D A   C Z W A R T A
    ****************************/
 
    /*t = clock();
 
    cudaMallocManaged(&h_grid, bytes);
 
    // losowy stan początkowy
    srand(SRAND_VALUE);
    for(i = 1; i<=dim; i++) {
        for(j = 1; j<=dim; j++) {
            h_grid[i*(dim+2)+j] = rand() % 2;
        }
    }

    // gra w życie
    for (iter = 0; iter<steps; iter++) {
        ghostRows<<<cpyGridRowsGridSize, cpyBlockSize>>>(dim, d_grid);
        ghostCols<<<cpyGridColsGridSize, cpyBlockSize>>>(dim, d_grid);
        GOL<<<gridSize, blockSize>>>(dim, d_grid, d_newGrid);
        // iterowanie
        d_tmpGrid = d_grid;
        d_grid = d_newGrid;
        d_newGrid = d_tmpGrid;
    }
 
    // wyniki
    cudaMemcpy(h_grid, d_grid, bytes, cudaMemcpyDeviceToHost);
    int total4 = 0;
    for (i = 1; i<=dim; i++) {
        for (j = 1; j<=dim; j++) {
            total4 += h_grid[i*(dim+2)+j];
        }
    }
    printf("Total Alive: %d\n", total4);
 
    memset(h_grid, 0, bytes);
    cudaFree(h_grid);
 
    t = clock() - t;
    time_taken = ((double)t)/CLOCKS_PER_SEC;
    printf("Metoda 4 took %f seconds to execute \n", time_taken);*/

    // ostatnie usuwania
    cudaFree(d_grid);
    cudaFree(d_newGrid);
    cudaFree(d_tmpGrid);
    return 0;
}

Total Alive: 523969
Metoda 1 took 0.163928 seconds to execute 
Total Alive: 523969
Metoda 2 took 0.046805 seconds to execute 
Total Alive: 523969
Metoda 3 took 0.085711 seconds to execute 

