# Tutorial OpenCL 

Requisitos:
- Python (pyopencl, numpy, pandas)
- OpenCL 

In [1]:
#
# Matrix Multiplication Driver
#
# This is a driver program to test various ways of computing
# the product:
#                 C = A * B
#
# A and B are constant square matrices whose order is set by
# the constant ORDER (defined below in this cell). This is so
# we can make a quick test of the multiplication result.
#
# History:   C++ version written by Tim Mattson, August 2010 
#            Modified by Simon McIntosh-Smith, September 2011
#            Modified by Tom Deakin and Simon McIntosh-Smith, October 2012
#            Ported to Python by Tom Deakin, July 2013
#            Ported to Jupyter by Ricardo Menotti, May 2020

from time import time
import pyopencl as cl
import pandas as pd
import numpy
import os

# Order of the square matrices A, B and C
ORDER = 1024

# A elements are constant and equal to AVAL
AVAL = 3.0

# B elements are constant and equal to BVAL
BVAL = 5.0

# Tolerance used in floating point comparisons
TOL = 0.001

# Max dim for NDRange
DIM = 2

# File to read/write results
FILENAME = 'results.csv'

# Choose the platform/device through the PYOPENCL_CTX environment variable.
# setdefault keeps a value the user already exported, so the notebook can be
# pointed at another device without editing this cell; '0:1' is only the
# fallback used when nothing is set.
os.environ.setdefault("PYOPENCL_CTX", '0:1')

# Number of times to do each multiplication
COUNT = 10

# Timing table: reuse the measurements stored in FILENAME when that file
# exists, otherwise start from an empty table with the two expected columns.
df = (
    pd.read_csv(FILENAME)
    if os.path.isfile(FILENAME)
    else pd.DataFrame({"version": [], "execution_time": []})
)

#  Sequential matrix product (dot-product formulation)
def seq_mat_mul_sdot(N, A, B, C):
    """Compute C = A * B for N x N matrices stored as flat sequences.

    Element (row, col) of each matrix lives at index ``row * N + col``.
    The result is written into ``C`` in place; nothing is returned.
    """
    for row in range(N):
        row_start = row * N
        for col in range(N):
            # Dot product of row `row` of A with column `col` of B
            acc = 0.0
            for inner in range(N):
                acc += A[row_start + inner] * B[inner * N + col]
            C[row_start + col] = acc

#  Sum of squared errors of the product matrix
def error(N, C):
    """Return the sum of squared deviations of C from its expected value.

    Every element of the N x N product is compared against
    ``float(N) * AVAL * BVAL``; ``C`` is indexed flat as ``row * N + col``.
    """
    expected = float(N) * AVAL * BVAL
    total = 0.0
    for row in range(N):
        row_start = row * N
        for col in range(N):
            diff = C[row_start + col] - expected
            total += diff * diff
    return total

# Function to analyze and output results
def results(N, C, run_time):
    """Print the timing of one multiplication and report numerical errors.

    N        -- order of the square matrices
    C        -- flat product matrix, checked with error(N, C)
    run_time -- elapsed time in seconds for the multiplication

    Prints "<run_time> seconds at <mflops> MFLOPS", where the rate is
    2*N^3 floating point operations divided by (1e6 * run_time). If the
    squared error sum exceeds TOL, a second line reports it.
    """
    flop_count = 2.0 * N * N * N
    # A zero elapsed time (timer resolution) would otherwise raise
    # ZeroDivisionError; report an infinite rate instead.
    if run_time > 0:
        mflops = flop_count / (1000000.0 * run_time)
    else:
        mflops = float("inf")
    # Single-argument print(...) produces the same text on Python 2 and 3.
    print("{0} seconds at {1} MFLOPS".format(run_time, mflops))
    errsq = error(N, C)
    if errsq > TOL:
        print("Errors in multiplication: {0}".format(errsq))

# A[N][N], B[N][N], C[N][N]
N = ORDER

# Number of elements in each matrix (matrices are stored as flat arrays)
size = N * N

# A matrix: every element equals AVAL. numpy.full allocates float32 directly
# instead of creating a float64 array and converting it.
h_A = numpy.full(size, AVAL, dtype=numpy.float32)

# B matrix: every element equals BVAL
h_B = numpy.full(size, BVAL, dtype=numpy.float32)

# C matrix: left uninitialized here; it is filled before each use
h_C = numpy.empty(size, dtype=numpy.float32)

Em Python, o OpenCL parece um pouco mais limpo: por exemplo, bastam as duas primeiras linhas abaixo para criar uma fila de comandos associada a um dispositivo.

In [2]:
# Setup OpenCL: a context plus a command queue bound to it
context = cl.create_some_context()
queue = cl.CommandQueue(context)

# Reset host buffers - just to play it safe.
# The float32 arrays are allocated directly (no float64 temporary).
h_A = numpy.full(size, AVAL, dtype=numpy.float32)
h_B = numpy.full(size, BVAL, dtype=numpy.float32)
h_C = numpy.empty(size, dtype=numpy.float32)

# Create OpenCL buffers: A and B are read-only and initialized from the host
# arrays; C is write-only and sized to hold the whole result.
mem_flags = cl.mem_flags
d_a = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=h_A)
d_b = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=h_B)
d_c = cl.Buffer(context, mem_flags.WRITE_ONLY, h_C.nbytes)

Na versão mais simples, cada _work item_ (_thread_) calcula um elemento da matriz C:

In [3]:
!cat C_elem.cl


// Naive matrix multiplication: each work-item computes one element C(i,j)
// of the N x N product, with i taken from NDRange dimension 0 and j from
// dimension 1. Matrices are flat arrays indexed as row*N + col.
__kernel void mmul(
    const int N,
    __global float* A,
    __global float* B,
    __global float* C)
{
    int k;
    int i = get_global_id(0);
    int j = get_global_id(1);
    float tmp;
    // Work-items outside the matrix (padded NDRange) do nothing
    if ((i < N) && (j < N))
    {
        // Float literal: an unsuffixed 0.0 is a double constant
        tmp = 0.0f;
        for (k = 0; k < N; k++)
            tmp += A[i*N+k] * B[k*N+j];
        C[i*N+j] = tmp;
    }
}


In [4]:
#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... Naive
#--------------------------------------------------------------------------------

# Read the kernel source with a context manager so the file is closed promptly
with open("C_elem.cl") as kernel_file:
    kernelsource = kernel_file.read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None])
# Single-argument print(...) yields the same text on Python 2 and 3
print("\n===== OpenCL, matrix mult, C(i,j) per work item, order {0} ======\n".format(N))

# Do the multiplication COUNT times; only the kernel launch and
# queue.finish() are inside the timed region.
run_times = []
for trial in range(COUNT):
    h_C.fill(0.0)
    start_time = time()
    mmul(queue, (N, N), None, N, d_a, d_b, d_c)
    queue.finish()
    run_time = time() - start_time
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)
    run_times.append(run_time)

# Add all measurements in one concat: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic.
df = pd.concat([df, pd.DataFrame({"version": ["naive"] * len(run_times),
                                  "execution_time": run_times})])
df.to_csv(FILENAME, index = False, header = True)



0.373422145844 seconds at 5750.82027647 MFLOPS
0.364454030991 seconds at 5892.33062442 MFLOPS
0.363171100616 seconds at 5913.14574413 MFLOPS
0.361845016479 seconds at 5934.81615111 MFLOPS
0.362810134888 seconds at 5919.0288294 MFLOPS
0.36500120163 seconds at 5883.49747456 MFLOPS
0.364015102386 seconds at 5899.43558364 MFLOPS
0.368123054504 seconds at 5833.60270899 MFLOPS
0.367523908615 seconds at 5843.11278167 MFLOPS
0.370308876038 seconds at 5799.16871283 MFLOPS


Nesta versão, cada _work item_ calcula uma linha da matriz:

In [5]:
!cat C_row.cl


// Row-wise matrix multiplication: each work-item computes one full row i
// of the N x N product C = A * B. Matrices are flat arrays indexed as
// row*N + col.
__kernel void mmul(
    const int N,
    __global float* A,
    __global float* B,
    __global float* C)
{
    int k, j;
    int i = get_global_id(0);
    float tmp;
    // Work-items past the last row do nothing
    if (i < N) {
        for (j = 0; j < N; j++) {
            // Float literal: an unsuffixed 0.0 is a double constant
            tmp = 0.0f;
            for (k = 0; k < N; k++)
                tmp += A[i*N+k] * B[k*N+j];
            C[i*N+j] = tmp;
        }
    }
}


In [6]:
#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... C row per work item
#--------------------------------------------------------------------------------

# Read the kernel source with a context manager so the file is closed promptly
with open("C_row.cl") as kernel_file:
    kernelsource = kernel_file.read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None])
# Single-argument print(...) yields the same text on Python 2 and 3
print("\n===== OpenCL, matrix mult, C row per work item, order {0} ======\n".format(N))

# Work-group size: floor division keeps it an integer on Python 3 as well
# (ORDER/16 would be a float there).
local_size = (ORDER // 16,)

# Do the multiplication COUNT times
run_times = []
for trial in range(COUNT):
    h_C.fill(0.0)
    start_time = time()
    mmul(queue, (N,), local_size, N, d_a, d_b, d_c)
    queue.finish()
    run_time = time() - start_time
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)
    run_times.append(run_time)

# Add all measurements in one concat: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic.
df = pd.concat([df, pd.DataFrame({"version": ["C row per work item"] * len(run_times),
                                  "execution_time": run_times})])
df.to_csv(FILENAME, index = False, header = True)



0.796031951904 seconds at 2697.73549022 MFLOPS
0.793258905411 seconds at 2707.16613876 MFLOPS
0.793953895569 seconds at 2704.79641196 MFLOPS
0.807863950729 seconds at 2658.22437808 MFLOPS
0.801470994949 seconds at 2679.42777909 MFLOPS
0.797060966492 seconds at 2694.25268365 MFLOPS
0.794291973114 seconds at 2703.64515907 MFLOPS
0.791192054749 seconds at 2714.23813613 MFLOPS
0.798470020294 seconds at 2689.498157 MFLOPS
0.798018932343 seconds at 2691.01842195 MFLOPS


Agora fazemos o mesmo, mas antes cada _work item_ copia a sua linha da matriz A para a memória privada:

In [7]:
!cat C_row_priv.cl


// Row-wise matrix multiplication with a private copy of the A row:
// each work-item copies row i of A into the private array Awrk and then
// computes the full row i of C = A * B. Matrices are flat arrays indexed
// as row*N + col.
__kernel void mmul(
    const int N,
    __global float* A,
    __global float* B,
    __global float* C)
{
    int k, j;
    int i = get_global_id(0);
    float Awrk[1024];
    float tmp;
    // Awrk holds at most 1024 elements; a larger order would write past
    // its end, so refuse to run instead of corrupting memory. C is left
    // untouched in that case.
    if (N > 1024)
        return;
    if (i < N) {
        // Private copy of row i of A
        for (k = 0; k < N; k++)
            Awrk[k] = A[i*N+k];

        for (j = 0; j < N; j++) {
            tmp = 0.0f;
            for (k = 0; k < N; k++)
                tmp += Awrk[k] * B[k*N+j];
            C[i*N+j] = tmp;
        }
    }
}


In [8]:
#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... C row per work item, A row in private memory
#--------------------------------------------------------------------------------

# Read the kernel source with a context manager so the file is closed promptly
with open("C_row_priv.cl") as kernel_file:
    kernelsource = kernel_file.read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None])
# Single-argument print(...) yields the same text on Python 2 and 3
print("\n===== OpenCL, matrix mult, C row, A row in priv mem, order {0} ======\n".format(N))

# Work-group size: floor division keeps it an integer on Python 3 as well
# (ORDER/16 would be a float there).
local_size = (ORDER // 16,)

# Do the multiplication COUNT times
run_times = []
for trial in range(COUNT):
    h_C.fill(0.0)
    start_time = time()
    mmul(queue, (N,), local_size, N, d_a, d_b, d_c)
    queue.finish()
    run_time = time() - start_time
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)
    run_times.append(run_time)

# Add all measurements in one concat: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic.
df = pd.concat([df, pd.DataFrame({"version": ["C row, A row in priv mem"] * len(run_times),
                                  "execution_time": run_times})])
df.to_csv(FILENAME, index = False, header = True)



0.786987066269 seconds at 2728.74071258 MFLOPS
0.776756048203 seconds at 2764.68223578 MFLOPS
0.784663200378 seconds at 2736.82217665 MFLOPS
0.78093290329 seconds at 2749.89520733 MFLOPS
0.791730165482 seconds at 2712.39336535 MFLOPS
0.795069932938 seconds at 2700.99969705 MFLOPS
0.776432037354 seconds at 2765.83595819 MFLOPS
0.780077934265 seconds at 2752.90910519 MFLOPS
0.786521911621 seconds at 2730.35450923 MFLOPS
0.781866788864 seconds at 2746.6106485 MFLOPS


Agora vamos usar a memória local para guardar a coluna da matriz B usada no cálculo, compartilhada entre os _work items_ do mesmo grupo:

In [9]:
!cat C_row_priv_bloc.cl


// Row-wise matrix multiplication with a private A row and a local B column:
// each work-item keeps row i of A in the private array Awrk, and the
// work-items of a group cooperatively load column j of B into the local
// buffer Bwrk (N floats) before each dot product. Matrices are flat arrays
// indexed as row*N + col.
__kernel void mmul(
    const int N,
    __global float* A,
    __global float* B,
    __global float* C,
    __local float* Bwrk)
{
    int k, j;
    int i    = get_global_id(0);
    int iloc = get_local_id(0);
    int nloc = get_local_size(0);
    float Awrk[1024];
    float tmp;
    // Only work-items that own a row read A and write C; every work-item
    // still takes part in the cooperative load and in the barriers.
    int active = (i < N);

    // Awrk holds at most 1024 elements. N is the same for all work-items,
    // so this early exit is taken by the whole group and skips no barrier
    // for only part of it.
    if (N > 1024)
        return;

    if (active) {
        for (k = 0; k < N; k++)
            Awrk[k] = A[i*N+k];
    }

    for (j = 0; j < N; j++) {
        // Cooperative, strided load of column j of B into local memory
        for (k = iloc; k < N; k += nloc)
            Bwrk[k] = B[k*N+j];
        // Barriers sit outside any work-item-dependent branch so that all
        // work-items of the group reach them.
        barrier(CLK_LOCAL_MEM_FENCE);
        if (active) {
            tmp = 0.0f;
            for (k = 0; k < N; k++)
                tmp += Awrk[k] * Bwrk[k];
            C[i*N+j] = tmp;
        }
        // Finish reading Bwrk before the next iteration overwrites it. This
        // also makes a separate barrier at the top of the loop unnecessary.
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}


In [10]:
#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... C row per work item, A row private, B col local
#--------------------------------------------------------------------------------

# Read the kernel source with a context manager so the file is closed promptly
with open("C_row_priv_bloc.cl") as kernel_file:
    kernelsource = kernel_file.read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None, None])
# Single-argument print(...) yields the same text on Python 2 and 3
print("\n===== OpenCL, mat mult, C row, priv A, B cols loc, order {0} ======\n".format(N))

# Work-group size: floor division keeps it an integer on Python 3 as well
# (ORDER/16 would be a float there).
local_size = (ORDER // 16,)

# Local-memory argument holding one column of B (N floats). It does not
# change between runs, so it is created once, outside the timed region.
localmem = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize * N)

# Do the multiplication COUNT times
run_times = []
for trial in range(COUNT):
    h_C.fill(0.0)
    start_time = time()
    mmul(queue, (N,), local_size, N, d_a, d_b, d_c, localmem)
    queue.finish()
    run_time = time() - start_time
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)
    run_times.append(run_time)

# Add all measurements in one concat: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic.
df = pd.concat([df, pd.DataFrame({"version": ["C row, priv A, B cols loc"] * len(run_times),
                                  "execution_time": run_times})])
df.to_csv(FILENAME, index = False, header = True)



0.509839057922 seconds at 4212.08146891 MFLOPS
0.469090938568 seconds at 4577.96872938 MFLOPS
0.45113492012 seconds at 4760.1804964 MFLOPS
0.461159944534 seconds at 4656.70029119 MFLOPS
0.456497907639 seconds at 4704.25737351 MFLOPS
0.455199956894 seconds at 4717.67102671 MFLOPS
0.449820041656 seconds at 4774.09508054 MFLOPS
0.449256896973 seconds at 4780.07942109 MFLOPS
0.45102596283 seconds at 4761.33044432 MFLOPS
0.452090024948 seconds at 4750.12393438 MFLOPS


Por fim, temos uma versão que calcula por blocos:

In [11]:
!cat C_block_form.cl

//-------------------------------------------------------------
//
//  PROGRAM: Blocked Matrix Multiplication kernel
//
//  PURPOSE: Computes an element of the product matrix
//
//              C = A * B
//
//           Using the well known blocked algorithm.  
//
//           To derive this algorithm, start with the naive
//           triply nested loop algorithm with a dot product 
//           for each element of C.  Decompose each loop 
//           into blocks of size blcksz.  This gives you 6
//           nested loops with three loops over blocks
//           and three loops over indices inside the blocks.
// 
//           Rearrange the loops to put the 3 loops over blocks 
//           at the outermost loops of the loop nest.  You'll
//           see that the three "inner" loops are just the 
//           regular matrix product between blocks.
//
//           The algorithm is simple.  Keeping all the indices
//           straight is not.  We will use t

In [12]:
#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... blocked
#--------------------------------------------------------------------------------

# Read the kernel source with a context manager so the file is closed promptly
with open("C_block_form.cl") as kernel_file:
    kernelsource = kernel_file.read()
program = cl.Program(context, kernelsource).build()
mmul = program.mmul
mmul.set_scalar_arg_dtypes([numpy.int32, None, None, None, None, None])
print("\n==== Parallel matrix mult (blocked), order {0} on device ======\n".format(N))

# Work-group computes a block of C. This size is also set
# in a #define inside the kernel function. Note this blocksize
# must evenly divide the matrix order
blocksize = 16

# Local-memory arguments for one block of A and one block of B. They do not
# change between runs, so they are created once, outside the timed region.
block_bytes = numpy.dtype(numpy.float32).itemsize * blocksize * blocksize
A_block = cl.LocalMemory(block_bytes)
B_block = cl.LocalMemory(block_bytes)

# Do the multiplication COUNT times
run_times = []
for trial in range(COUNT):
    h_C.fill(0.0)
    start_time = time()
    mmul(queue, (N,N), (blocksize,blocksize), N,
        d_a, d_b, d_c, A_block, B_block)
    queue.finish()
    run_time = time() - start_time
    cl.enqueue_copy(queue, h_C, d_c)
    results(N, h_C, run_time)
    run_times.append(run_time)

# Add all measurements in one concat: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic.
df = pd.concat([df, pd.DataFrame({"version": ["blocked"] * len(run_times),
                                  "execution_time": run_times})])
df.to_csv(FILENAME, index = False, header = True)



0.0722081661224 seconds at 29740.1770924 MFLOPS
0.071408033371 seconds at 30073.4181663 MFLOPS
0.0721929073334 seconds at 29746.4630159 MFLOPS
0.0695888996124 seconds at 30859.5718564 MFLOPS
0.0695638656616 seconds at 30870.6772917 MFLOPS
0.0693562030792 seconds at 30963.1085996 MFLOPS
0.069286108017 seconds at 30994.4332199 MFLOPS
0.0690960884094 seconds at 31079.6703176 MFLOPS
0.0724558830261 seconds at 29638.4994332 MFLOPS
0.0698919296265 seconds at 30725.7741985 MFLOPS


In [13]:
# Plot results: mean execution time per kernel version, with the standard
# deviation of the recorded runs as error bars. Versions keep the order in
# which they first appear in the CSV (sort=False).
df = pd.read_csv(FILENAME)
by_version = df.groupby(by="version", sort=False)
avg_time = by_version.mean()
std = by_version.std()
# The plot call returns the Axes; it is named `ax` rather than `plt` to avoid
# confusion with the conventional matplotlib.pyplot alias.
ax = avg_time.plot(legend=False, kind="bar", yerr=std)
ax.set_title("Average execution time")
ax.set_ylabel("Execution time (seconds)")
ax.get_figure().savefig("results.pdf")