<a href="https://colab.research.google.com/github/markbojic/parallel-alg/blob/main/Projekat2_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pycuda

Collecting pycuda
[?25l  Downloading https://files.pythonhosted.org/packages/46/61/47d3235a4c13eec5a5f03594ddb268f4858734e02980afbcd806e6242fa5/pycuda-2020.1.tar.gz (1.6MB)
[K     |████████████████████████████████| 1.6MB 18.0MB/s 
[?25hCollecting pytools>=2011.2
[?25l  Downloading https://files.pythonhosted.org/packages/b7/30/c9362a282ef89106768cba9d884f4b2e4f5dc6881d0c19b478d2a710b82b/pytools-2020.4.3.tar.gz (62kB)
[K     |████████████████████████████████| 71kB 8.8MB/s 
Collecting appdirs>=1.4.0
  Downloading https://files.pythonhosted.org/packages/3b/00/2344469e2084fb287c2e0b57b72910309874c3245463acd6cf5e3db69324/appdirs-1.4.4-py2.py3-none-any.whl
Collecting mako
[?25l  Downloading https://files.pythonhosted.org/packages/a6/37/0e706200d22172eb8fa17d68a7ae22dec7631a0a92266634fb518a88a5b2/Mako-1.1.3-py2.py3-none-any.whl (75kB)
[K     |████████████████████████████████| 81kB 11.5MB/s 
Building wheels for collected packages: pycuda, pytools
  Building wheel for pycuda (setup.py) ..

In [None]:
# Show the GPU attached to this runtime (model, driver / CUDA version, memory in use).
!nvidia-smi

Sat Jan 11 08:04:24 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [None]:
import numpy as np
from pycuda import driver, compiler, gpuarray, tools
import pycuda.autoinit
import gc
import math

# CUDA C source for an integer matrix product C = A * B; compiled at runtime.
#
# Kernel arguments:
#   A      - input matrix, m x colsA, stored row-major (indexed as A[colsA*row + i])
#   B      - input matrix, colsA x n, stored row-major (indexed as B[i*n + col])
#   m, n   - number of rows / columns of the result C
#   colsA  - shared inner dimension (columns of A, rows of B)
#   C      - output matrix, m x n, stored row-major (written as C[row*n + col])
#
# Each thread computes one element of C: the x launch dimension selects the row
# and the y launch dimension selects the column. Threads that fall outside the
# m x n result are masked out by the bounds check, so the launch grid may
# overshoot, but it must cover m along x and n along y.
# The dot product is accumulated in a plain `int`.
kernel_code_template = """
__global__ void MatrixMulKernel(int *A, int *B, int m, int n, int colsA,  int *C)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    int col = blockIdx.y * blockDim.y + threadIdx.y;
    int sum = 0;

    if ((row < m) && (col < n)){
      for (int i = 0; i < colsA; ++i)
        sum += A[colsA*row + i] * B[i*n+col];
      C[row*n+col] = sum;
    }
}
"""

# Matrix dimensions: A is (MATRIX_SIZE_1 x MATRIX_SIZE_2) and B is
# (MATRIX_SIZE_2 x MATRIX_SIZE_3); MATRIX_SIZE_2 is the shared inner dimension.
MATRIX_SIZE_1 = 100
MATRIX_SIZE_2 = 80
MATRIX_SIZE_3 = 100

# Threads per block along each of the two launch dimensions.
BLOCK_SIZE = 32

# Two random input matrices. Casting standard-normal samples to int32 truncates
# toward zero, so the entries are small integers.
a_cpu = np.random.randn(MATRIX_SIZE_1, MATRIX_SIZE_2).astype(np.int32)
b_cpu = np.random.randn(MATRIX_SIZE_2, MATRIX_SIZE_3).astype(np.int32)

# Copy the inputs to the GPU.
a_gpu = gpuarray.to_gpu(a_cpu)
b_gpu = gpuarray.to_gpu(b_cpu)

# Uninitialised device buffer that receives the product.
c_gpu = gpuarray.empty((MATRIX_SIZE_1, MATRIX_SIZE_3), np.int32)

# Compile the kernel source into a module and fetch the kernel function from it.
mod = compiler.SourceModule(kernel_code_template)
matrixmul = mod.get_function("MatrixMulKernel")

# Scalar kernel arguments are passed as explicitly sized numpy integers.
row1 = np.int32(MATRIX_SIZE_1)
col1 = np.int32(MATRIX_SIZE_2)
row2 = np.int32(MATRIX_SIZE_2)  # equals col1; not passed to the kernel
col2 = np.int32(MATRIX_SIZE_3)

# The kernel maps the x launch dimension to rows of C and the y dimension to
# columns of C, so the grid needs enough blocks to cover the row count along x
# and the *column* count along y. (Sizing both from the row count only works
# when C happens to be square.)
grid_rows = math.ceil(MATRIX_SIZE_1 / BLOCK_SIZE)
grid_cols = math.ceil(MATRIX_SIZE_3 / BLOCK_SIZE)

# Launch the kernel.
matrixmul(
    # In
    a_gpu, b_gpu, row1, col2, col1,
    # Out
    c_gpu,
    # Threads per block / blocks per grid
    block=(BLOCK_SIZE, BLOCK_SIZE, 1),
    grid=(grid_rows, grid_cols, 1),
)

# Host copy of the GPU result, taken now so it can still be checked after the
# device buffers have been released below.
c_result = c_gpu.get()

# Print the operands and the GPU result.
print("Matrica A:")
print(a_gpu)

print("\n")
print("Matrca B:")
print(b_gpu)

print("\n")
print("Matrica C:")
print(c_gpu)

# Reference result computed on the CPU.
print("\n")
print("Ocekivano:")
c_cpu = np.matmul(a_cpu, b_cpu)
print(c_cpu)
print("\n")

results_match = np.array_equal(c_result, c_cpu)

# Release device memory explicitly and drop the references: repeated runs of
# this cell were observed to fail with "Illegal memory access" (Logic Error)
# otherwise.
a_gpu.gpudata.free()
b_gpu.gpudata.free()
c_gpu.gpudata.free()
del a_gpu
del b_gpu
del c_gpu
del a_cpu
del b_cpu

# Fail loudly (after cleanup) if the kernel disagrees with the CPU reference.
assert results_match, "GPU result differs from the np.matmul reference"

# Force a garbage-collection pass, just in case.
gc.collect()

Matrica A:
[[ 0 -1  0 ...  0  0  2]
 [ 0 -3  0 ...  0  0  0]
 [-1  2  0 ...  0  0  1]
 ...
 [ 0  0  1 ...  0  0  0]
 [ 0  0 -1 ...  0 -1  0]
 [ 0  0  0 ...  0  0  0]]


Matrca B:
[[ 0  0  0 ...  0 -2  0]
 [ 0  0  0 ... -1  0  0]
 [ 0  0  0 ...  1  0  0]
 ...
 [ 2 -1  0 ...  1  0  0]
 [ 2 -2  1 ...  1  1  0]
 [ 0  0  0 ...  0  0  0]]


Matrica C:
[[-3 -4  1 ...  0 -5 -2]
 [ 6 -6 -3 ...  0 -5 -1]
 [-3  6 -2 ... -8 -2 -1]
 ...
 [-4 -5  0 ...  1 -4 -2]
 [-4  4  0 ...  6  2 -4]
 [ 1 -3 -1 ...  0 -2  3]]


Ocekivano:
[[-3 -4  1 ...  0 -5 -2]
 [ 6 -6 -3 ...  0 -5 -1]
 [-3  6 -2 ... -8 -2 -1]
 ...
 [-4 -5  0 ...  1 -4 -2]
 [-4  4  0 ...  6  2 -4]
 [ 1 -3 -1 ...  0 -2  3]]




3690