**ZADATAK 1**: *Jednostavno zbrajanje.*

In [None]:
%%writefile z1.cu

#include <iostream>

// Device kernel: reads the single integers that a and b point to and writes
// their sum to the location c points to. Performs no indexing, so every
// launched thread does the same one-element addition.
__global__ void add(int *a, int *b, int *c) {
  const int lhs = *a;
  const int rhs = *b;
  *c = lhs + rhs;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error in " << what << ": "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two integers on the GPU with a single-thread kernel launch and prints
// "a+b=c". Returns 0 on success and 1 if any CUDA call fails.
int main(void) {
  int a = 2, b = 7, c = 0; // host copies of a, b, c
  int *d_a = 0, *d_b = 0, *d_c = 0; // device copies of a, b, c
  int size = sizeof(int);

  // Allocate space for device copies of a, b, c
  bool ok = cudaOk(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)") &&
            cudaOk(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)") &&
            cudaOk(cudaMalloc((void **)&d_c, size), "cudaMalloc(d_c)");

  // Copy inputs to device
  // cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice)
  ok = ok &&
       cudaOk(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)") &&
       cudaOk(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

  if (ok) {
    // Launch add() kernel on GPU
    add<<<1,1>>>(d_a, d_b, d_c);
    // A kernel launch returns nothing; launch errors are read back explicitly.
    ok = cudaOk(cudaGetLastError(), "add kernel launch");
  }

  // Copy result back to host (this blocking copy also surfaces kernel faults)
  ok = ok &&
       cudaOk(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)");

  // Only print the result when it was actually computed
  if (ok)
    std::cout << a << "+" << b << "=" << c << std::endl;

  // Cleanup (pointers that were never allocated are still null, which cudaFree accepts)
  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

  return ok ? 0 : 1;
}

Overwriting z1.cu


In [None]:
!nvcc z1.cu -o z1.out
!./z1.out

2+7=9


**ZADATAK 2**: *Umjesto izvođenja funkcije add() jednom, izvrši N puta paralelno.*

In [None]:
%%writefile z2.cu

#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>
#define N 64

// Device kernel: element-wise vector addition where each block handles the
// one element selected by its blockIdx.x. No bounds check is performed, so
// the arrays must hold at least gridDim.x elements.
__global__ void add(int *a, int *b, int *c) {
  const unsigned int element = blockIdx.x;
  c[element] = a[element] + b[element];
}

// Fills the first N elements of a with rand() % N.
void random_ints(int *a){
  for (int *end = a + N; a != end; ++a)
    *a = rand() % N;
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error in " << what << ": "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two random N-element vectors on the GPU using N single-thread blocks,
// times the kernel with CUDA events, and prints every sum followed by the
// elapsed time. Returns 0 on success and 1 if an allocation or CUDA call fails.
int main(void) {
  int *a = 0, *b = 0, *c = 0; // host copies of a, b, c
  int *d_a = 0, *d_b = 0, *d_c = 0; // device copies of a, b, c
  int size = N * sizeof(int);
  cudaEvent_t start, stop;
  bool haveStart = false, haveStop = false; // which events need destroying
  float milliseconds = 0;

  // Alloc space for device copies of a, b, c
  bool ok = cudaOk(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)") &&
            cudaOk(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)") &&
            cudaOk(cudaMalloc((void **)&d_c, size), "cudaMalloc(d_c)");

  // Alloc space for host copies of a, b, c
  a = (int *)malloc(size);
  b = (int *)malloc(size);
  c = (int *)malloc(size);
  if (!a || !b || !c) {
    std::cerr << "Host allocation failed" << std::endl;
    ok = false;
  }

  if (ok) {
    // Setup input values; a is filled before b so the rand() sequence is unchanged
    random_ints(a);
    random_ints(b);
  }

  // Copy inputs to device
  ok = ok &&
       cudaOk(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)") &&
       cudaOk(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

  // Create the timing events
  if (ok) {
    haveStart = cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)");
    ok = haveStart;
  }
  if (ok) {
    haveStop = cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    ok = haveStop;
  }

  // Start time measurement
  ok = ok && cudaOk(cudaEventRecord(start), "cudaEventRecord(start)");

  if (ok) {
    // Launch add() kernel on GPU with N blocks
    add<<<N,1>>>(d_a, d_b, d_c);
    // A kernel launch returns nothing; launch errors are read back explicitly.
    ok = cudaOk(cudaGetLastError(), "add kernel launch");
  }

  // Stop time measurement
  ok = ok &&
       cudaOk(cudaEventRecord(stop), "cudaEventRecord(stop)") &&
       cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)") &&
       cudaOk(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

  // Copy result back to host
  ok = ok &&
       cudaOk(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)");

  if (ok) {
    // print results, eight sums per line
    for(int i = 0; i < N; i++){
      printf("%i + %i = %i \t",a[i], b[i], c[i]);
      if (i%8 == 7)
        printf("\n");
    }

    std::cout << "Execution time: " << milliseconds << " milliseconds" << std::endl;
  }

  // Cleanup
  if (haveStart) cudaEventDestroy(start);
  if (haveStop) cudaEventDestroy(stop);
  free(a); free(b); free(c);
  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

  return ok ? 0 : 1;
}


Overwriting z2.cu


In [None]:
!nvcc z2.cu -o z2.out
!./z2.out

39 + 48 = 87 	6 + 41 = 47 	41 + 62 = 103 	51 + 33 = 84 	17 + 1 = 18 	63 + 33 = 96 	10 + 60 = 70 	44 + 39 = 83 	
41 + 62 = 103 	13 + 1 = 14 	58 + 62 = 120 	43 + 23 = 66 	50 + 42 = 92 	59 + 28 = 87 	35 + 43 = 78 	6 + 22 = 28 	
60 + 15 = 75 	2 + 56 = 58 	20 + 28 = 48 	56 + 42 = 98 	27 + 44 = 71 	40 + 48 = 88 	39 + 59 = 98 	13 + 59 = 72 	
54 + 50 = 104 	26 + 47 = 73 	46 + 60 = 106 	35 + 20 = 55 	51 + 44 = 95 	31 + 24 = 55 	9 + 27 = 36 	26 + 28 = 54 	
38 + 2 = 40 	50 + 26 = 76 	13 + 62 = 75 	55 + 3 = 58 	49 + 59 = 108 	24 + 58 = 82 	35 + 42 = 77 	26 + 58 = 84 	
37 + 59 = 96 	29 + 41 = 70 	5 + 17 = 22 	23 + 38 = 61 	24 + 5 = 29 	41 + 60 = 101 	30 + 60 = 90 	20 + 20 = 40 	
43 + 53 = 96 	50 + 24 = 74 	13 + 62 = 75 	6 + 33 = 39 	27 + 9 = 36 	52 + 57 = 109 	20 + 28 = 48 	17 + 59 = 76 	
14 + 40 = 54 	2 + 25 = 27 	52 + 15 = 67 	1 + 21 = 22 	33 + 49 = 82 	61 + 43 = 104 	28 + 49 = 77 	7 + 51 = 58 	
Execution time: 0.192768 milliseconds


**ZADATAK 3**: *Računanje koristeći niti.*

In [None]:
%%writefile z3.cu

#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>

// Device kernel: element-wise vector addition where each thread handles the
// one element selected by its threadIdx.x. blockIdx is not used, so every
// block of a launch works on the same elements.
__global__ void add(int *a, int *b, int *c) {
  const unsigned int lane = threadIdx.x;
  c[lane] = a[lane] + b[lane];
}

#define N 64

// Writes rand() % 1000 into each of the first n elements of a, in order.
// Does nothing when n is zero or negative.
void random_ints(int *a, int n) {
    int remaining = n;
    while (remaining-- > 0) {
        *a++ = rand() % 1000;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error in " << what << ": "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two random N-element vectors on the GPU using one block of N threads,
// times the kernel with CUDA events, and prints every sum followed by the
// elapsed time. Returns 0 on success and 1 if an allocation or CUDA call fails.
int main(void) {
  int *a = 0, *b = 0, *c = 0; // host copies of a, b, c
  int *d_a = 0, *d_b = 0, *d_c = 0; // device copies of a, b, c
  int size = N * sizeof(int);
  cudaEvent_t start, stop;
  bool haveStart = false, haveStop = false; // which events need destroying
  float milliseconds = 0;

  // Alloc space for device copies of a, b, c
  bool ok = cudaOk(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)") &&
            cudaOk(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)") &&
            cudaOk(cudaMalloc((void **)&d_c, size), "cudaMalloc(d_c)");

  // Alloc space for host copies of a, b, c
  a = (int *)malloc(size);
  b = (int *)malloc(size);
  c = (int *)malloc(size);
  if (!a || !b || !c) {
    std::cerr << "Host allocation failed" << std::endl;
    ok = false;
  }

  if (ok) {
    // Setup input values; a is filled before b so the rand() sequence is unchanged
    random_ints(a, N);
    random_ints(b, N);
  }

  // Copy inputs to device
  ok = ok &&
       cudaOk(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)") &&
       cudaOk(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

  // Create the timing events
  if (ok) {
    haveStart = cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)");
    ok = haveStart;
  }
  if (ok) {
    haveStop = cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    ok = haveStop;
  }

  // Start time measurement
  ok = ok && cudaOk(cudaEventRecord(start), "cudaEventRecord(start)");

  if (ok) {
    // Launch add() kernel on GPU with N threads
    add<<<1,N>>>(d_a, d_b, d_c);
    // A kernel launch returns nothing; launch errors are read back explicitly.
    ok = cudaOk(cudaGetLastError(), "add kernel launch");
  }

  // Stop time measurement
  ok = ok &&
       cudaOk(cudaEventRecord(stop), "cudaEventRecord(stop)") &&
       cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)") &&
       cudaOk(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

  // Copy result back to host
  ok = ok &&
       cudaOk(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)");

  if (ok) {
    // print results, eight sums per line
    for(int i = 0; i < N; i++){
      printf("%i + %i = %i \t",a[i], b[i], c[i]);
      if (i%8 == 7)
        printf("\n");
    }
    std::cout << "Execution time: " << milliseconds << " milliseconds" << std::endl;
  }

  // Cleanup
  if (haveStart) cudaEventDestroy(start);
  if (haveStop) cudaEventDestroy(stop);
  free(a); free(b); free(c);
  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

  return ok ? 0 : 1;
}


Overwriting z3.cu


In [None]:
!nvcc z3.cu -o z3.out
!./z3.out

383 + 336 = 719 	886 + 505 = 1391 	777 + 846 = 1623 	915 + 729 = 1644 	793 + 313 = 1106 	335 + 857 = 1192 	386 + 124 = 510 	492 + 895 = 1387 	
649 + 582 = 1231 	421 + 545 = 966 	362 + 814 = 1176 	27 + 367 = 394 	690 + 434 = 1124 	59 + 364 = 423 	763 + 43 = 806 	926 + 750 = 1676 	
540 + 87 = 627 	426 + 808 = 1234 	172 + 276 = 448 	736 + 178 = 914 	211 + 788 = 999 	368 + 584 = 952 	567 + 403 = 970 	429 + 651 = 1080 	
782 + 754 = 1536 	530 + 399 = 929 	862 + 932 = 1794 	123 + 60 = 183 	67 + 676 = 743 	135 + 368 = 503 	929 + 739 = 1668 	802 + 12 = 814 	
22 + 226 = 248 	58 + 586 = 644 	69 + 94 = 163 	167 + 539 = 706 	393 + 795 = 1188 	456 + 570 = 1026 	11 + 434 = 445 	42 + 378 = 420 	
229 + 467 = 696 	373 + 601 = 974 	421 + 97 = 518 	919 + 902 = 1821 	784 + 317 = 1101 	537 + 492 = 1029 	198 + 652 = 850 	324 + 756 = 1080 	
315 + 301 = 616 	370 + 280 = 650 	413 + 286 = 699 	526 + 441 = 967 	91 + 865 = 956 	980 + 689 = 1669 	956 + 444 = 1400 	873 + 619 = 1492 	
862 + 440 = 1302 	170 + 729 = 89

**ZADATAK 4**: *Kombinacija blokova i niti.*

In [None]:
%%writefile z4.cu

#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>

#define N (32*32)
#define THREADS_PER_BLOCK 32

// Device kernel: element-wise vector addition over a 1D grid of 1D blocks.
// Each thread handles the element at its flat global index
// (blockIdx.x * blockDim.x + threadIdx.x). No bounds check is performed, so
// the arrays must hold at least gridDim.x * blockDim.x elements.
__global__ void add(int *a, int *b, int *c) {
  const int index = blockIdx.x * blockDim.x + threadIdx.x;
  const int sum = a[index] + b[index];
  c[index] = sum;
}

// Stores rand() % N in each of the first N elements of a, in order.
void random_ints(int *a){
  int i = 0;
  while (i < N) {
    a[i] = rand() % N;
    ++i;
  }
}

// The launch below uses N/THREADS_PER_BLOCK blocks, and the add() kernel has
// no bounds check, so a remainder would silently leave tail elements unprocessed.
#if (N % THREADS_PER_BLOCK) != 0
#error "N must be a multiple of THREADS_PER_BLOCK"
#endif

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess)
    return true;
  std::cerr << "CUDA error in " << what << ": "
            << cudaGetErrorString(status) << std::endl;
  return false;
}

// Adds two random N-element vectors on the GPU using N/THREADS_PER_BLOCK
// blocks of THREADS_PER_BLOCK threads, times the kernel with CUDA events, and
// prints every sum followed by the elapsed time. Returns 0 on success and 1
// if an allocation or CUDA call fails.
int main(void) {
  int *a = 0, *b = 0, *c = 0; // host copies of a, b, c
  int *d_a = 0, *d_b = 0, *d_c = 0; // device copies of a, b, c
  int size = N * sizeof(int);
  cudaEvent_t start, stop;
  bool haveStart = false, haveStop = false; // which events need destroying
  float milliseconds = 0;

  // Alloc space for device copies of a, b, c
  bool ok = cudaOk(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)") &&
            cudaOk(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)") &&
            cudaOk(cudaMalloc((void **)&d_c, size), "cudaMalloc(d_c)");

  // Alloc space for host copies of a, b, c
  a = (int *)malloc(size);
  b = (int *)malloc(size);
  c = (int *)malloc(size);
  if (!a || !b || !c) {
    std::cerr << "Host allocation failed" << std::endl;
    ok = false;
  }

  if (ok) {
    // Setup input values; a is filled before b so the rand() sequence is unchanged
    random_ints(a);
    random_ints(b);
  }

  // Copy inputs to device
  ok = ok &&
       cudaOk(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)") &&
       cudaOk(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

  // Create the timing events
  if (ok) {
    haveStart = cudaOk(cudaEventCreate(&start), "cudaEventCreate(start)");
    ok = haveStart;
  }
  if (ok) {
    haveStop = cudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    ok = haveStop;
  }

  // Start time measurement
  ok = ok && cudaOk(cudaEventRecord(start), "cudaEventRecord(start)");

  if (ok) {
    // Launch add() kernel on GPU
    add<<<N/THREADS_PER_BLOCK,THREADS_PER_BLOCK>>>(d_a, d_b, d_c);
    // A kernel launch returns nothing; launch errors are read back explicitly.
    ok = cudaOk(cudaGetLastError(), "add kernel launch");
  }

  // Stop time measurement
  ok = ok &&
       cudaOk(cudaEventRecord(stop), "cudaEventRecord(stop)") &&
       cudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)") &&
       cudaOk(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

  // Copy result back to host
  ok = ok &&
       cudaOk(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_c -> c)");

  if (ok) {
    // print results, eight sums per line
    for(int i = 0; i < N; i++){
      printf("%i + %i = %i \t",a[i], b[i], c[i]);
      if (i%8 == 7)
        printf("\n");
    }
    std::cout << "Execution time: " << milliseconds << " milliseconds" << std::endl;
  }

  // Cleanup
  if (haveStart) cudaEventDestroy(start);
  if (haveStop) cudaEventDestroy(stop);
  free(a); free(b); free(c);
  cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

  return ok ? 0 : 1;
}

Overwriting z4.cu


In [None]:
!nvcc z4.cu -o z4.out
!./z4.out

359 + 299 = 658 	966 + 908 = 1874 	105 + 694 = 799 	115 + 903 = 1018 	81 + 27 = 108 	255 + 100 = 355 	74 + 1013 = 1087 	236 + 97 = 333 	
809 + 683 = 1492 	205 + 28 = 233 	186 + 487 = 673 	939 + 656 = 1595 	498 + 91 = 589 	763 + 144 = 907 	483 + 542 = 1025 	326 + 997 = 1323 	
124 + 770 = 894 	706 + 936 = 1642 	84 + 17 = 101 	1016 + 887 = 1903 	795 + 589 = 1384 	488 + 717 = 1205 	487 + 737 = 1224 	909 + 315 = 1224 	
886 + 135 = 1021 	346 + 864 = 1210 	302 + 372 = 674 	611 + 138 = 749 	563 + 630 = 1193 	927 + 475 = 1402 	201 + 372 = 573 	922 + 929 = 1851 	
870 + 360 = 1230 	306 + 42 = 348 	13 + 808 = 821 	951 + 387 = 1338 	561 + 143 = 704 	88 + 797 = 885 	163 + 484 = 647 	346 + 826 = 1172 	
293 + 825 = 1118 	349 + 972 = 1321 	261 + 458 = 719 	791 + 916 = 1707 	88 + 92 = 180 	745 + 1000 = 1745 	94 + 889 = 983 	212 + 862 = 1074 	
427 + 913 = 1340 	178 + 906 = 1084 	205 + 726 = 931 	198 + 478 = 676 	667 + 599 = 1266 	692 + 439 = 1131 	84 + 793 = 877 	529 + 735 = 1264 	
14 + 280 = 294 	386 + 

**ZADATAK 5**: *Korištenje dijeljene memorije.*

In [None]:
%%writefile z5.cu

#include <iostream>
#include <cstdlib>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256
#define RADIUS 3
#define N (2048 * 2048)

// 1D stencil: out[i] = sum of in[i - RADIUS] .. in[i + RADIUS] for i in [0, N),
// treating input positions outside [0, N) as 0.
//
// Launch layout: 1D grid of 1D blocks with blockDim.x == BLOCK_SIZE (the
// shared tile and the right-halo offset are sized by BLOCK_SIZE, not by
// blockDim.x). Uses BLOCK_SIZE + 2 * RADIUS ints of static shared memory.
// Threads whose global index is >= N still take part in the barrier but
// neither read in[] out of range nor write out[].
__global__ void stencil_1d(int *in, int *out) {
  __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
  int gindex = threadIdx.x + blockIdx.x * blockDim.x;
  int lindex = threadIdx.x + RADIUS; // tile slot, shifted past the left halo

  // Read input elements into shared memory (zero-fill past the end of the data)
  temp[lindex] = (gindex < N) ? in[gindex] : 0;
  if (threadIdx.x < RADIUS) {
    // The first RADIUS threads of the block also load the two halos
    int left = gindex - RADIUS;
    int right = gindex + BLOCK_SIZE;
    temp[lindex - RADIUS] = (left >= 0 && left < N) ? in[left] : 0;
    temp[lindex + BLOCK_SIZE] = (right < N) ? in[right] : 0;
  }

  // Synchronize (ensure all the data is available); kept outside any
  // data-dependent branch so every thread of the block reaches it
  __syncthreads();

  // Apply the stencil
  int result = 0;
  for (int offset = -RADIUS; offset <= RADIUS; offset++)
    result += temp[lindex + offset];

  // Store the result (skip threads that fall past the end of the data)
  if (gindex < N)
    out[gindex] = result;
}


// Fills a[0] .. a[n-1] with rand() % 1000, in ascending index order.
// Leaves a untouched when n is zero or negative.
void random_ints(int *a, int n) {
    int i = 0;
    while (i < n) {
        const int value = rand() % 1000;
        a[i] = value;
        i += 1;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Runs the 1D stencil over N random integers on the GPU and prints the first
// input element next to the first output element. Returns 0 on success and 1
// if an allocation or CUDA call fails.
int main(void) {
    int *a = 0, *b = 0; // host copies of input and output arrays
    int *d_a = 0, *d_b = 0; // device copies of input and output arrays
    int size = N * sizeof(int);

    // Alloc space for device copies of input and output arrays
    bool ok = cudaOk(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)") &&
              cudaOk(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)");

    // Alloc space for host copies of input and output arrays
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    if (!a || !b) {
        std::cerr << "Host allocation failed" << std::endl;
        ok = false;
    }

    // Initialize input array with random values
    if (ok)
        random_ints(a, N);

    // Copy inputs to device
    ok = ok &&
         cudaOk(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)");

    if (ok) {
        // Launch stencil_1d kernel on GPU (block count rounded up to cover N)
        stencil_1d<<<(N + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(d_a, d_b);
        // A kernel launch returns nothing; launch errors are read back explicitly.
        ok = cudaOk(cudaGetLastError(), "stencil_1d kernel launch");
    }

    // Copy result back to host (this blocking copy also surfaces kernel faults)
    ok = ok &&
         cudaOk(cudaMemcpy(b, d_b, size, cudaMemcpyDeviceToHost), "cudaMemcpy(d_b -> b)");

    // Only print the result when it was actually computed
    if (ok)
        std::cout << a[0] << " -> " << b[0] << std::endl;

    // Cleanup
    free(a);
    free(b);
    cudaFree(d_a);
    cudaFree(d_b);

    return ok ? 0 : 1;
}


Overwriting z5.cu


In [None]:
!nvcc z5.cu -o z5.out
!./z5.out

383 -> 2961
