<a href="https://colab.research.google.com/github/markbojic/parallel-alg/blob/main/Projekat2_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pycuda

Collecting pycuda
[?25l  Downloading https://files.pythonhosted.org/packages/46/61/47d3235a4c13eec5a5f03594ddb268f4858734e02980afbcd806e6242fa5/pycuda-2020.1.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 19.4MB/s 
[?25hCollecting pytools>=2011.2
[?25l  Downloading https://files.pythonhosted.org/packages/fb/fc/9628f0d2ec698360f4475bea0a88cb767b935f5347e6687bae0ffa342aab/pytools-2021.2.tar.gz (65kB)
[K     |████████████████████████████████| 71kB 10.4MB/s 
Collecting mako
[?25l  Downloading https://files.pythonhosted.org/packages/f3/54/dbc07fbb20865d3b78fdb7cf7fa713e2cba4f87f71100074ef2dc9f9d1f7/Mako-1.1.4-py2.py3-none-any.whl (75kB)
[K     |████████████████████████████████| 81kB 11.8MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (setup.py) ... [?25l[?25hdone
  Created wheel for pycuda: filename=pycuda-2020.1-cp37-cp37m-linux_x86_64.whl size=621283 sha256=03ad3808a5636d4b4fe8acb914c82d2163c915aea2fbc2288bdcc865dbbbc4a3
 

In [3]:
!nvidia-smi

Thu Mar 18 11:31:42 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [5]:
import numpy as np
from pycuda import driver, compiler, gpuarray, tools
import gc
import pycuda.autoinit

kernel_code_template = """
// Tiled integer matrix multiplication: each thread accumulates one element of C.
//
// A is indexed with row stride MATRIX_SIZE_1, while B and C are indexed with
// row stride MATRIX_SIZE_2 (presumably the column counts of A and of B/C).
// B_SIZE is the tile edge used for all index arithmetic; the shared tiles are
// sized by the BLOCK_SIZE template placeholder instead.
// NOTE(review): presumably B_SIZE must equal BLOCK_SIZE and the launch must use
// B_SIZE x B_SIZE thread blocks -- confirm against the host code.
// The global loads and the final store are unguarded (no bounds checks), so the
// matrix dimensions are expected to be multiples of B_SIZE.
__global__ void MatrixMulKernel(int *A, int *B, int MATRIX_SIZE_1, int MATRIX_SIZE_2, int B_SIZE, int *C)
{
  int wA = MATRIX_SIZE_1;
  int wB = MATRIX_SIZE_2;

  // Block index
  int bx = blockIdx.x;
  int by = blockIdx.y;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Index of the first element of the first A tile handled by this block
  int aBegin = wA * B_SIZE * by;
  // Index of the last element in the first row of this block's strip of A;
  // used only as the loop bound below
  int aEnd = aBegin + wA - 1;
  // Step to the next A tile (one tile to the right)
  int aStep = B_SIZE;

  // Index of the first element of the first B tile handled by this block
  int bBegin = B_SIZE * bx;
  // Step to the next B tile (one tile down)
  int bStep = B_SIZE * wB;

  // Running sum for the single C element computed by this thread
  int Csub = 0;
  // Walk over every pair of A/B tiles needed by this block
  
  for (int a = aBegin, b = bBegin;a <= aEnd; a += aStep, b += bStep)
    {
      // Shared-memory tile of A
      __shared__ int As[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];
      // Shared-memory tile of B
      __shared__ int Bs[%(BLOCK_SIZE)s][%(BLOCK_SIZE)s];

      // Each thread loads one element of each tile into shared memory
      As[ty][tx] = A[a + wA * ty + tx];
      Bs[ty][tx] = B[b + wB * ty + tx];

      // Barrier: both tiles must be fully loaded before any thread reads them
      __syncthreads();

      // Partial dot product over the current tiles; one thread, one element
      for (int k = 0; k < B_SIZE; ++k)
        Csub += As[ty][k] * Bs[k][tx];

      // Barrier: all threads must finish reading the tiles before the next
      // iteration overwrites them
      __syncthreads();
    }

  // Write this thread's element into the block's tile of C
  int c = wB * B_SIZE * by + B_SIZE * bx;
  C[c + wB * ty + tx] = Csub;
}
"""

# Matrix dimensions: A is (MATRIX_SIZE_1 x MATRIX_SIZE_2), B is
# (MATRIX_SIZE_2 x MATRIX_SIZE_3), so C = A @ B is (MATRIX_SIZE_1 x MATRIX_SIZE_3).
MATRIX_SIZE_1 = 32
MATRIX_SIZE_2 = 32
MATRIX_SIZE_3 = 32
# Edge length of the thread block and of the shared-memory tile.
# It must divide every matrix dimension: the kernel performs no bounds checks.
TILE_SIZE = 16
BLOCK_SIZE = TILE_SIZE

# Fail early with a clear message instead of an illegal memory access on the GPU.
if any(size % TILE_SIZE != 0 for size in (MATRIX_SIZE_1, MATRIX_SIZE_2, MATRIX_SIZE_3)):
    raise ValueError(
        "Every matrix dimension must be a multiple of TILE_SIZE (%d); got %d, %d, %d"
        % (TILE_SIZE, MATRIX_SIZE_1, MATRIX_SIZE_2, MATRIX_SIZE_3)
    )

# Random input matrices (standard-normal samples truncated to int32).
a_cpu = np.random.randn(MATRIX_SIZE_1, MATRIX_SIZE_2).astype(np.int32)
b_cpu = np.random.randn(MATRIX_SIZE_2, MATRIX_SIZE_3).astype(np.int32)

# CPU reference result used to validate the GPU output.
c_cpu = np.matmul(a_cpu, b_cpu)

# Fill in the template: '%(BLOCK_SIZE)s' in the kernel source is replaced by
# the tile size so the shared-memory arrays get a compile-time extent.
kernel_code = kernel_code_template % {
    'BLOCK_SIZE': BLOCK_SIZE,
    }
# Compile the kernel and look up its entry point.
mod = compiler.SourceModule(kernel_code)
matrixmul = mod.get_function("MatrixMulKernel")

# Copy the inputs to the GPU and allocate the output buffer (once).
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)
c_gpu = gpuarray.empty((MATRIX_SIZE_1, MATRIX_SIZE_3), np.int32)

# Scalar kernel arguments, named after the kernel parameters they feed.
# The kernel uses them as row strides: the first is the width of A (its column
# count, MATRIX_SIZE_2), the second is the width of B and C (MATRIX_SIZE_3).
msize1 = np.int32(MATRIX_SIZE_2)
msize2 = np.int32(MATRIX_SIZE_3)
bsize = np.int32(TILE_SIZE)

try:
    matrixmul(
        # IN
        a_gpu, b_gpu, msize1, msize2, bsize,
        # OUT
        c_gpu,
        # GRID: blockIdx.x walks the columns of C, blockIdx.y walks its rows
        grid=(MATRIX_SIZE_3 // TILE_SIZE, MATRIX_SIZE_1 // TILE_SIZE),
        # BLOCK: one thread per element of a tile
        block=(TILE_SIZE, TILE_SIZE, 1),
        )

    # Copy the result back to the host once and reuse it below.
    c_result = c_gpu.get()

    # Print the operands, the GPU result and the CPU reference.
    print("Matrica A:")
    print(a_gpu)

    print("\n")
    print("Matrca B:")
    print(b_gpu)

    print("\n")
    print("Matrica C:")
    print(c_result)

    print("\n")
    print("Ocekivana matrica:")
    print(c_cpu)

    # Programmatic check: the GPU result must match the CPU reference exactly.
    np.testing.assert_array_equal(c_result, c_cpu)
finally:
    # Release device memory even if the launch or the check above failed.
    a_gpu.gpudata.free()
    b_gpu.gpudata.free()
    c_gpu.gpudata.free()

del a_gpu
del b_gpu
del c_gpu
del a_cpu
del b_cpu
# After several runs in the same runtime an "Illegal memory access"
# (Logic Error) used to appear, hence the explicit frees and deletions above.
# Forcing a garbage-collection pass lets the cell be re-run without resetting
# the runtime.
gc.collect()

Matrica A:
[[ 0  1  0 ... -1  1  0]
 [ 1  1  0 ... -1  0  0]
 [ 1  0  1 ...  0  0 -1]
 ...
 [-2  0  0 ...  0  0  0]
 [ 0  1 -1 ... -1 -1  0]
 [ 1  0  0 ...  1  0  0]]


Matrca B:
[[-1  0  0 ...  0  0 -1]
 [ 0  0  0 ...  0  0 -1]
 [-1  1  0 ...  0 -1  0]
 ...
 [ 0  0 -1 ...  0 -1  0]
 [ 0  0  0 ... -1  0  0]
 [ 0  0  0 ...  0  0  0]]


Matrica C:
[[ 3  2  0 ... -5 -3 -6]
 [ 2  2  3 ...  0  4  1]
 [-6  3 -2 ... -2 -1 -5]
 ...
 [-1 -2  0 ... -1 -1  2]
 [ 1  0  0 ...  0  2  0]
 [-3  2 -2 ...  2 -1 -2]]


Ocekivana matrica:
[[ 3  2  0 ... -5 -3 -6]
 [ 2  2  3 ...  0  4  1]
 [-6  3 -2 ... -2 -1 -5]
 ...
 [-1 -2  0 ... -1 -1  2]
 [ 1  0  0 ...  0  2  0]
 [-3  2 -2 ...  2 -1 -2]]


47