<a href="https://colab.research.google.com/github/markbojic/parallel-alg/blob/main/Projekat2_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the PyCUDA package imported by the matrix-multiplication cell below
# (shell escape via `!`, so pip runs as a shell command).
!pip install pycuda

Collecting pycuda
[?25l  Downloading https://files.pythonhosted.org/packages/5e/3f/5658c38579b41866ba21ee1b5020b8225cec86fe717e4b1c5c972de0a33c/pycuda-2019.1.2.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 2.8MB/s 
[?25hCollecting pytools>=2011.2
[?25l  Downloading https://files.pythonhosted.org/packages/00/96/00416762a3eda8876a17d007df4a946f46b2e4ee1057e0b9714926472ef8/pytools-2019.1.1.tar.gz (58kB)
[K     |████████████████████████████████| 61kB 8.7MB/s 
Collecting appdirs>=1.4.0
  Downloading https://files.pythonhosted.org/packages/56/eb/810e700ed1349edde4cbdc1b2a21e28cdf115f9faf263f6bbf8447c1abf3/appdirs-1.4.3-py2.py3-none-any.whl
Collecting mako
[?25l  Downloading https://files.pythonhosted.org/packages/b0/3c/8dcd6883d009f7cae0f3157fb53e9afb05a0d3d33b3db1268ec2e6f4a56b/Mako-1.1.0.tar.gz (463kB)
[K     |████████████████████████████████| 471kB 13.8MB/s 
Building wheels for collected packages: pycuda, pytools, mako
  Building wheel for pycuda (setup.py) ... [?

In [None]:
# Query the NVIDIA driver: prints the driver/CUDA versions and the GPU
# attached to this runtime, confirming a CUDA device is available.
!nvidia-smi

Thu Jan  9 10:08:52 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [None]:
import numpy as np
from pycuda import driver, compiler, gpuarray, tools
import pycuda.autoinit
import gc

# CUDA C source of the matrix-multiplication kernel.
# Each thread computes one element of C = A x B, where A is (m x colsA),
# B is (colsA x n) and C is (m x n); all three are indexed as flat
# row-major int arrays.
kernel_code_template = """
__global__ void MatrixMulKernel(int *A, int *B, int m, int n, int colsA,  int *C)
{
    // Global element coordinates: combine the block index with the thread
    // index so the kernel is correct for launches that use more than one
    // block (with a single block this reduces to the plain thread index).
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads in partially filled edge blocks fall outside C and do nothing.
    if ((row < m) && (col < n)){
      int sum = 0;
      for (int i = 0; i < colsA; ++i)
        sum += A[colsA*row + i] * B[i*n+col];
      C[row*n+col] = sum;
    }

}
"""

# Sizes of the matrices to multiply - both matrices share MATRIX_SIZE_2:
# A is (MATRIX_SIZE_1 x MATRIX_SIZE_2), B is (MATRIX_SIZE_2 x MATRIX_SIZE_3).
MATRIX_SIZE_1 = 5
MATRIX_SIZE_2 = 3
MATRIX_SIZE_3 = 4

# Threads per block along each axis. The launch tiles the result matrix with
# BLOCK_SIZE x BLOCK_SIZE blocks, so the matrix size is no longer capped by
# the per-block thread limit.
BLOCK_SIZE = 16

# Compile the kernel source into a SourceModule and fetch the kernel function.
# Done before any GPU allocation so a compile error leaves nothing to free.
mod = compiler.SourceModule(kernel_code_template)
matrixmul = mod.get_function("MatrixMulKernel")

# Fill the input matrices with random integers. randint is used instead of
# truncating randn samples, which would make most of the entries zero.
a_cpu = np.random.randint(-5, 6, size=(MATRIX_SIZE_1, MATRIX_SIZE_2), dtype=np.int32)
b_cpu = np.random.randint(-5, 6, size=(MATRIX_SIZE_2, MATRIX_SIZE_3), dtype=np.int32)

# Convert the dimensions to int32 so they can be passed to the kernel.
# col1 is not passed to the kernel (it equals row2); it is kept so the set of
# names defined by this cell is unchanged.
row1 = np.int32(MATRIX_SIZE_1)
col1 = np.int32(MATRIX_SIZE_2)
row2 = np.int32(MATRIX_SIZE_2)
col2 = np.int32(MATRIX_SIZE_3)

# Number of blocks needed to cover the result: ceil-divide each dimension by
# BLOCK_SIZE. x spans the columns of C and y spans its rows, matching the
# index computation in the kernel.
grid_dims = (
    (MATRIX_SIZE_3 + BLOCK_SIZE - 1) // BLOCK_SIZE,
    (MATRIX_SIZE_1 + BLOCK_SIZE - 1) // BLOCK_SIZE,
)

# Copy the input matrices to the GPU.
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)

# Uninitialised matrix with the final dimensions - holds the product.
c_gpu = gpuarray.empty((MATRIX_SIZE_1, MATRIX_SIZE_3), np.int32)

try:
    # Launch the kernel.
    matrixmul(
        # In: A, B, rows of A (m), columns of B (n), shared dimension (colsA)
        a_gpu, b_gpu, row1, col2, row2,
        # Out
        c_gpu,
        # Threads per block and number of blocks
        block=(BLOCK_SIZE, BLOCK_SIZE, 1),
        grid=grid_dims,
    )

    # Print the operands and the result of the multiplication.
    print("Matrica A:")
    print(a_gpu)

    print("\n")
    print("Matrca B:")
    print(b_gpu)

    print("\n")
    print("Matrica C:")
    print(c_gpu)

    print("\n")
    print("Ocekivana matrica:")
    c_cpu = np.matmul(a_cpu, b_cpu)
    print(c_cpu)

    # Fail loudly if the GPU result differs from the NumPy reference instead
    # of relying on comparing the two printouts by eye.
    np.testing.assert_array_equal(c_gpu.get(), c_cpu)
finally:
    # Explicitly release the GPU buffers and drop the references: repeated
    # runs were observed to end in "Illegal memory access (Logic Error)",
    # which is why the arrays are freed and deleted by hand. Doing it in
    # `finally` releases them even when the launch or the check above fails.
    a_gpu.gpudata.free()
    b_gpu.gpudata.free()
    c_gpu.gpudata.free()
    del a_gpu
    del b_gpu
    del c_gpu
    del a_cpu
    del b_cpu

# Force a garbage-collection pass, just in case.
gc.collect()

Matrica A:
[[-1  0  1]
 [ 0  0  1]
 [ 0  1  0]
 [-2 -1  0]
 [ 0 -1  0]]


Matrca B:
[[ 0  0  1  0]
 [-1 -1  0  0]
 [ 0  1  2  1]]


Matrica C:
[[ 0  1  1  1]
 [ 0  1  2  1]
 [-1 -1  0  0]
 [ 1  1 -2  0]
 [ 1  1  0  0]]


Ocekivana matrica:
[[ 0  1  1  1]
 [ 0  1  2  1]
 [-1 -1  0  0]
 [ 1  1 -2  0]
 [ 1  1  0  0]]


3163