# Measuring performance in OpenCL applications

Understanding how well an OpenCL application performs is a vital part of the development process. There are a few options available in OpenCL, depending on the desired level of granularity. 

* Using OpenCL events to get time spent by a command in a queue
* Using a profiling program to get all information

## Event based profiling

In order to time commands sent to an OpenCL command queue we enable a profiling flag **CL_QUEUE_PROFILING_ENABLE** during command queue creation. Then time elapsed may be extracted directly from profiling events. In the code [mat_mult_profiling.cpp](mat_mult_profiling.cpp) we set the profiling flag to CL_TRUE.

```C++
    // mat_mult_profiling.cpp source

    // Do we enable profiling?
    cl_bool profiling = CL_TRUE;
```

Then from within **h_create_command_queues** in <a href="../include/cl_helper.hpp">cl_helper.hpp</a>, the profiling flag CL_QUEUE_PROFILING_ENABLE is incorporated into the command queue properties and passed to [clCreateCommandQueue](https://www.khronos.org/registry/OpenCL/sdk/3.0/docs/man/html/clCreateCommandQueue.html).

```C++
    // cl_helper.hpp source

    // Manage bit fields for the command queue properties
    if (profiling_enable == CL_TRUE) {
        queue_properties = queue_properties | CL_QUEUE_PROFILING_ENABLE;    
    }

    // Allocate memory for the command queues
    cl_command_queue *command_queues = (cl_command_queue*)calloc(num_command_queues, sizeof(cl_command_queue));

    // Fill command queues in a Round-Robin fashion
    for (cl_uint n=0; n<num_command_queues; n++) {
        command_queues[n] = clCreateCommandQueue(
            contexts[n % num_devices],
            devices[n % num_devices],
            queue_properties,
            &errcode    
        );
        h_errchk(errcode, "Creating a command queue");        
    }
```

The function [clGetEventProfilingInfo](https://www.khronos.org/registry/OpenCL/sdk/3.0/docs/man/html/clGetEventProfilingInfo.html) extracts information such as start and end walltimes (in nanoseconds) for an OpenCL event associated with a queued command. We use the helper function **h_get_event_time_ms** in <a href="../include/cl_helper.hpp">cl_helper.hpp</a> to extract the elapsed time.

```C++

// cl_helper.hpp source

/// Wait for an OpenCL event to complete and return the time its command took.
///
/// The elapsed time is the difference between the CL_PROFILING_COMMAND_END and
/// CL_PROFILING_COMMAND_START timestamps of the event, converted from
/// nanoseconds to milliseconds. The command queue that produced the event is
/// expected to have been created with CL_QUEUE_PROFILING_ENABLE.
///
/// @param event    Pointer to the event to wait on and query.
/// @param message  Label used for error checking and for the printed timing
///                 line. An empty string or NULL suppresses printing.
/// @param nbytes   Optional number of bytes moved by the command. When not
///                 NULL, the transfer rate from h_get_io_rate_MBs is printed
///                 after the elapsed time.
/// @return Elapsed time of the command in milliseconds.
cl_double h_get_event_time_ms(
        cl_event *event, 
        const char* message, 
        size_t* nbytes) {

    // Treat a missing message as an empty label so that a null pointer
    // is never inspected or printed
    const char* label = (message != NULL) ? message : "";

    // Make sure the event has finished
    h_errchk(clWaitForEvents(1, event), label);

    // Start and end times
    cl_ulong t1 = 0;
    cl_ulong t2 = 0;

    // Fetch the start and end times in nanoseconds
    h_errchk(
        clGetEventProfilingInfo(
            *event,
            CL_PROFILING_COMMAND_START,
            sizeof(cl_ulong),
            &t1,
            NULL
        ),
        "Fetching start time for event"
    );

    h_errchk(
        clGetEventProfilingInfo(
            *event,
            CL_PROFILING_COMMAND_END,
            sizeof(cl_ulong),
            &t2,
            NULL
        ),
        "Fetching end time for event"
    );

    // Convert the time into milliseconds
    cl_double elapsed = static_cast<cl_double>(t2-t1)*static_cast<cl_double>(1.0e-6);

    // Print the timing message only when a non-empty label was given
    if (label[0] != '\0') {
        std::printf("Time for event \"%s\": %.3f ms", label, elapsed);

        // Print transfer rate if nbytes is specified
        if (nbytes != NULL) {
            cl_double io_rate_MBs = h_get_io_rate_MBs(
                elapsed, 
                *nbytes
            );
            std::printf(" (%.2f MB/s)", io_rate_MBs);
        }
        std::printf("\n");
    }

    return elapsed;
}
```

Every command submitted to a command queue may have an event associated with it. We construct a **cl_event** object and use that event to collect timing information. For example, during buffer writes to device the following code goes from this

```C++
    // mat_mult.cpp source

    h_errchk(
        clEnqueueWriteBuffer(command_queue,
                            buffer_A,
                            blocking,
                            0,
                            nbytes_A,
                            array_A,
                            0,
                            NULL,
                            NULL), 
        "Writing to buffer_A from host"
    );
```

to this

```C++
    // mat_mult_profiling.cpp source

    // Event for the uploads and downloads
    cl_event io_event;
    
    h_errchk(
        clEnqueueWriteBuffer(command_queue,
                            buffer_A,
                            blocking,
                            0,
                            nbytes_A,
                            array_A,
                            0,
                            NULL,
                            &io_event), 
        "Writing to buffer_A from host"
    );

    // Time how long it takes to complete event
    cl_double upload_A_ms = h_get_event_time_ms(
        &io_event, 
        "Uploading Buffer A",
        &nbytes_A
    );
```

Similarly, the kernel enqueue code goes from this

```C++
    // mat_mult.cpp source

    // Event for the kernel
    cl_event kernel_event;

    // Now enqueue the kernel
    h_errchk(
        clEnqueueNDRangeKernel(command_queue,
                                kernel,
                                work_dim,
                                NULL,
                                global_size,
                                local_size,
                                0,
                                NULL,
                                &kernel_event), 
        "Running the kernel"
    );

    // Wait on the kernel to finish
    h_errchk(
        clWaitForEvents(1, &kernel_event),
        "Waiting on the kernel"
    );
```

to this

```C++
    // mat_mult_profiling.cpp source
    
    // Event for the kernel
    cl_event kernel_event;

    // Now enqueue the kernel
    h_errchk(
        clEnqueueNDRangeKernel(command_queue,
                                kernel,
                                work_dim,
                                NULL,
                                global_size,
                                local_size,
                                0,
                                NULL,
                                &kernel_event), 
        "Running the kernel"
    );

    // Time how long it takes to complete event
    cl_double run_kernel_ms = h_get_event_time_ms(
        &kernel_event, 
        "Running kernel",
        NULL
    );
```

In this manner we instrument the uploads, downloads, and kernel execution in the source file [mat_mult_profiling.cpp](mat_mult_profiling.cpp). Now we run the instrumented code and print out the results.

## Problem setup

In [27]:
import numpy as np

from matplotlib import pyplot as plt

%matplotlib widget

# Matrix shapes used throughout:
#   A -> (NROWS_C, NCOLS_A)
#   B -> (NCOLS_A, NCOLS_C)
#   C -> (NROWS_C, NCOLS_C)
NCOLS_A, NROWS_C, NCOLS_C = 256, 520, 1032

# Element type shared by every array
dtype = np.float32

# Random input matrices, drawn in the order A then B
A = np.random.random(size=(NROWS_C, NCOLS_A)).astype(dtype)
B = np.random.random(size=(NCOLS_A, NCOLS_C)).astype(dtype)

# Reference product computed with NumPy
C = np.matmul(A, B, dtype=dtype)

# Dump the inputs as raw binary files
for filename, array in (("array_A.dat", A), ("array_B.dat", B)):
    array.tofile(filename)

In [28]:
!make clean; make; ./mat_mult_profiling.exe

rm -r *.exe
g++ -std=c++11 -g -O2 -fopenmp -I/usr/local/cuda/include -I../include -L/usr/local/cuda/lib64 mat_mult_profiling.cpp\
	-o mat_mult_profiling.exe -lOpenCL -lomp
In file included from ../include/cl_helper.hpp:11,
                 from mat_mult_profiling.cpp:16:
 5085 |         VECTOR_CLASS<cl_int>* binaryStatus = NULL,
      |                            ^
In file included from mat_mult_profiling.cpp:16:
   17 | std::map<cl_int, const char*> error_codes {
      |                             ^
	               name: NVIDIA GeForce RTX 3060 
	 global memory size: 12636 MB
	    max buffer size: 3159 MB
	     max local size: (1024,1024,64)
	     max work-items: 1024
Time for event "Uploading Buffer A": 0.042 ms (12530.12 MB/s)
Time for event "Uploading Buffer B": 0.084 ms (12513.83 MB/s)
Time for event "Running kernel": 2.889 ms
Time for event "Downloading Buffer C": 0.520 

In [29]:
# Import axes machinery
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Load the matrix written by the OpenCL program and give it the shape of C
C_ocl = np.fromfile("array_C.dat", dtype=dtype).reshape((NROWS_C, NCOLS_C))

# One stacked panel per image, all sharing the same axes
fig, axes = plt.subplots(3, 1, figsize=(6,8), sharex=True, sharey=True)

# Images to show and the title for each of them
data = [C, C_ocl, np.abs(C-C_ocl)]
labels = ["Numpy", "OpenCL", "Absolute residual"]

for ax, title, image in zip(axes, labels, data):
    # Draw the image and reserve a thin strip on its right for the color bar
    im = ax.imshow(image)
    cax = make_axes_locatable(ax).append_axes("right", size="5%", pad=0.1)

    # Annotate the panel
    ax.set_xlabel("Dimension 1 (columns)")
    ax.set_ylabel("Dimension 0 (rows)")
    ax.set_title(title)

    # Attach the color bar to the reserved strip
    plt.colorbar(mappable=im, cax=cax)

fig.tight_layout()
plt.show()