From 45922a23f728e750189fe64491a83ceb14baed31 Mon Sep 17 00:00:00 2001
From: Peter Caday <peter.caday@intel.com>
Date: Thu, 19 Oct 2023 12:25:45 -0700
Subject: [PATCH] matrix_mul_mkl: revamp sample

---
 Libraries/oneMKL/matrix_mul_mkl/GNUmakefile   |  19 +-
 Libraries/oneMKL/matrix_mul_mkl/makefile      |  19 +-
 .../oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp  | 336 +++++++-----------
 Libraries/oneMKL/matrix_mul_mkl/utilities.hpp |  47 +++
 4 files changed, 196 insertions(+), 225 deletions(-)
 create mode 100644 Libraries/oneMKL/matrix_mul_mkl/utilities.hpp
diff --git a/Libraries/oneMKL/matrix_mul_mkl/GNUmakefile b/Libraries/oneMKL/matrix_mul_mkl/GNUmakefile
index 5ff8249e6e..06d00c54e0 100644
--- a/Libraries/oneMKL/matrix_mul_mkl/GNUmakefile
+++ b/Libraries/oneMKL/matrix_mul_mkl/GNUmakefile
@@ -1,26 +1,23 @@
 # Makefile for GNU Make
 
-default: all
+default: run
 
-all: sgemm.mkl dgemm.mkl
+all: matrix_mul_mkl
 
-run: sgemm.mkl dgemm.mkl
-	./sgemm.mkl
-	./dgemm.mkl
+run: matrix_mul_mkl
+	./matrix_mul_mkl single
+	./matrix_mul_mkl double
 
 INCLUDE_COMMON=../../../common
 MKL_COPTS = -DMKL_ILP64  -qmkl=sequential
 MKL_LIBS  = -lsycl -lOpenCL -lpthread -lm -ldl
 
-DPCPP_OPTS = -O3 $(MKL_COPTS) $(MKL_LIBS)
+DPCPP_OPTS = -O2 $(MKL_COPTS) $(MKL_LIBS)
 
-sgemm.mkl: matrix_mul_mkl.cpp
+matrix_mul_mkl: matrix_mul_mkl.cpp
 	icpx -fsycl -I$(INCLUDE_COMMON) $< -o $@ $(DPCPP_OPTS)
 
-dgemm.mkl: matrix_mul_mkl.cpp
-	icpx -fsycl -I$(INCLUDE_COMMON) $< -o $@ $(DPCPP_OPTS) -DUSE_DOUBLE
-
 clean:
-	-rm -f sgemm.mkl dgemm.mkl
+	-rm -f matrix_mul_mkl
 
 .PHONY: clean run all
diff --git a/Libraries/oneMKL/matrix_mul_mkl/makefile b/Libraries/oneMKL/matrix_mul_mkl/makefile
index d425236e20..70fda7c4d7 100644
--- a/Libraries/oneMKL/matrix_mul_mkl/makefile
+++ b/Libraries/oneMKL/matrix_mul_mkl/makefile
@@ -1,22 +1,19 @@
 # Makefile for NMAKE
 
-default: all
+default: run
 
-all: sgemm.exe dgemm.exe
+all: matrix_mul_mkl.exe
 
-run: sgemm.exe dgemm.exe
-	.\sgemm.exe
-	.\dgemm.exe
+run: matrix_mul_mkl.exe
+	.\matrix_mul_mkl.exe single
+	.\matrix_mul_mkl.exe double
 
 DPCPP_OPTS=/I"$(MKLROOT)\include" /Qmkl /EHsc -fsycl-device-code-split=per_kernel OpenCL.lib
 
-sgemm.exe: matrix_mul_mkl.cpp
-	icx-cl -fsycl matrix_mul_mkl.cpp /Fesgemm.exe $(DPCPP_OPTS)
-
-dgemm.exe: matrix_mul_mkl.cpp
-	icx-cl -fsycl matrix_mul_mkl.cpp /Fedgemm.exe $(DPCPP_OPTS) -DUSE_DOUBLE
+matrix_mul_mkl.exe: matrix_mul_mkl.cpp
+	icx-cl -fsycl matrix_mul_mkl.cpp /Fematrix_mul_mkl.exe $(DPCPP_OPTS)
 
 clean:
-	del /q sgemm.exe sgemm.exp sgemm.lib dgemm.exe dgemm.exp dgemm.lib
+	del /q matrix_mul_mkl.exe matrix_mul_mkl.exp matrix_mul_mkl.lib
 
 pseudo: clean run all
diff --git a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
index 91d0e38e60..6583384dba 100644
--- a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
+++ b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp
@@ -1,225 +1,155 @@
 //==============================================================
-// Copyright © 2020 Intel Corporation
+// Copyright © 2023 Intel Corporation
 //
 // SPDX-License-Identifier: MIT
 // =============================================================
 //
-// Matrix Multiplication is a simple program that multiplies together two
-//     large matrices and verifies the results.
-// This samples uses the oneAPI Math Kernel Library (oneMKL) to accelerate
-//     the computation.
-
-// The test is updated based on oneAPI samples oneAPI-samples/Libraries/oneMKL/matrix_mul_mkl
-#include <iostream>
-#include <iomanip>
-#include <limits>
+// Contents:
+//     A simple matrix multiplication benchmark, using the oneAPI Math Kernel
+//     Library (oneMKL).
+//
 
 #include <sycl/sycl.hpp>
-#include "oneapi/mkl.hpp"
-#include "dpc_common.hpp"
-
-#ifndef USE_DOUBLE
-#define FLOAT   float
-#else
-#define FLOAT   double
-#endif
-
-FLOAT rand_uniform();
-bool verify_result(int m, int n, int k, int ldc, FLOAT *C, FLOAT *C_reference);
-
-#define WARMUP	 10
-#define LOOPS   100
-//default matrix size 8192x8192
-#define MSIZE   8192
-#define VERIFY_RESULT   False
+#include <oneapi/mkl.hpp>
+#include <chrono>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <string>
 
+#include "utilities.hpp"
 
-using namespace std ;
+using namespace sycl;
 
-int main(int argc, char* argv[])
+template <typename T>
+void test(queue &Q, int M, int N, int K)
 {
-    try {
-
-        int msize = MSIZE;
-        int loops = LOOPS;
-        int verify = 0;
-
-        // Initialize data for GEMM. The full GEMM operation is:
-        //
-        //      C = alpha * op(A) * op(B) + beta * C
-        //
-        // where alpha, beta are scalar values, and op(...) represents
-        // optional matrix transposition.
-        //
-        // For this simple matrix multiplication, no transposition is needed.
-        // 
-        // By choosing alpha = 1, beta = 0, GEMM will calculate C = A * B.
-        //
-        // In this example, matrices are stored in row-major layout.
-
-        auto transA = oneapi::mkl::transpose::nontrans;
-        auto transB = oneapi::mkl::transpose::nontrans;
-
-        // Matrix data sizes.
-        // 
-        // A is m x k
-        // B is k x n  --> product C is m x n
-        int m = msize;
-        int k = msize;
-        int n = msize;
-
-        cout << "Problem size: "
-                  << " A (" << m << 'x' << k << ") *"
-                  << " B (" << k << 'x' << n << ")  --> "
-                  << " C (" << m << 'x' << n << ")\n";
-        
-        cout << "Benchmark interations: " << loops << endl;
-
-        // Leading dimensions of data. For row-major matrices, the leading
-        // dimension is the stride between adjacent rows.
-        int lda = k;
-        int ldb = n;
-        int ldc = n;
-
-        // Scaling factors.
-        FLOAT alpha = 1.0f;
-        FLOAT beta = 0.0f;
-
-        // Create a queue on the default device.
-        sycl::queue device_queue{sycl::default_selector_v};
-
-        std::cout << "Device: "
-                  << device_queue.get_device().get_info<sycl::info::device::name>()
-                  << std::endl;
-
-        // Allocate shared memory for matrices.
-        const size_t alignment = 4096;
-        auto a = sycl::aligned_alloc_host<FLOAT>(alignment, m * k, device_queue);
-        auto b = sycl::aligned_alloc_host<FLOAT>(alignment, k * n, device_queue);
-        auto c = sycl::aligned_alloc_host<FLOAT>(alignment, m * n, device_queue);
-
-        auto C_reference = (FLOAT *) calloc(m * n, sizeof(FLOAT));
-
-        if (!a || !b || !c || !C_reference) {
-            std::cerr << "Could not allocate memory for matrices." << std::endl;
-            exit(1);
-        }
-
-        // Initialize matrix data.
-        for (int i = 0; i < m; i++)
-            for (int j = 0; j < k; j++)
-                a[i * lda + j] = rand_uniform();
-
-        for (int i = 0; i < k; i++)
-            for (int j = 0; j < n; j++)
-                b[i * ldb + j] = rand_uniform();
-
-        auto A = sycl::aligned_alloc_device<FLOAT>(alignment, m * k, device_queue);
-        auto B = sycl::aligned_alloc_device<FLOAT>(alignment, m * n, device_queue);
-        auto C = sycl::aligned_alloc_device<FLOAT>(alignment, m * n, device_queue);
-        device_queue.wait();
-
-        device_queue.memcpy(A, &(a[0]), m * k * sizeof(FLOAT));
-        device_queue.memcpy(B, &(b[0]), k * n * sizeof(FLOAT));
-        device_queue.memcpy(C, &(c[0]), m * n * sizeof(FLOAT));
-        device_queue.wait();
-        
-
-        // Call GEMM to do matrix multiplication, asynchronously.
-        std::cerr << "Launching oneMKL GEMM calculation..." << std::endl;
-        dpc_common::TimeInterval timer;
-        double start_time = 0.0;
-
-        //warm up
-	for (int w=0; w < WARMUP; w++)
-	{
-            oneapi::mkl::blas::row_major::gemm(device_queue, transA, transB, m, n, k,
-                                            alpha, A, lda, B, ldb, beta, C, ldc);
-	}
-	device_queue.wait_and_throw();
-
-        start_time = timer.Elapsed();
-        for (int l=0; l < loops; l++)
-        {
-            oneapi::mkl::blas::row_major::gemm(device_queue, transA, transB, m, n, k,
-                                            alpha, A, lda, B, ldb, beta, C, ldc);
-        }
-        // Wait for oneMKL computation to complete.        
-        device_queue.wait_and_throw();
-
-        double stop_time = timer.Elapsed();
-        double avg_gemm_time = (stop_time - start_time)/loops;
-
-        double gflops = 2.0 * (double)m * (double)m * (double)m;
-        #ifdef USE_DOUBLE
-            cout << "DGEMM performance : " << gflops / avg_gemm_time * 1.e-9 << " GFLOPS" << endl;
-        #else
-            cout << "SGEMM performance : " << gflops / avg_gemm_time * 1.e-9 << " GFLOPS" << endl;
-        #endif
-
-        
-        if(verify)
-        {
-            // While calculation occurs, compute reference result to check accuracy.
-            std::cerr << "Performing reference calculation..." << std::endl;
-            for (int i = 0; i < m; i++)
-                for (int h = 0; h < k; h++)
-                    for (int j = 0; j < n; j++)
-                        C_reference[i * ldc + j] += a[i * lda + h] * b[h * ldb + j];        
-            // Check results for accuracy.
-           device_queue.memcpy(&(c[0]), C, m*n*sizeof(FLOAT)).wait();
-           verify_result(m, n, k, ldc, c, C_reference);
-        }
-        
-
-        // Free memory.
-        free(A, device_queue);
-        free(B, device_queue);
-        free(C, device_queue);
-        free(C_reference);        
-	free(a, device_queue);
-	free(b, device_queue);
-	free(c, device_queue);
-
-    } catch (const std::exception &e) {
-        std::cerr << "An exception occurred: "
-                  << e.what() << std::endl;
-        exit(1);
+    std::cout << "\nBenchmarking (" << M << " x " << K << ") x (" << K << " x " << N << ") matrix multiplication, " << type_string<T>() << std::endl;;
+
+    std::cout << " -> Initializing data...\n";
+
+    /* Allocate A/B/C matrices */
+    int lda = nice_ld<T>(M);
+    int ldb = nice_ld<T>(K);
+    int ldc = nice_ld<T>(M);
+
+    auto A = malloc_device<T>(lda * K, Q);
+    auto B = malloc_device<T>(ldb * N, Q);
+    auto C = malloc_device<T>(ldc * N, Q);
+
+    /* Fill A/B with random data */
+    constexpr int rd_size = 1048576;
+    auto random_data = malloc_host<T>(rd_size, Q);
+    generate_random_data(rd_size, random_data);
+
+    replicate_data(Q, A, lda * K, random_data, rd_size);
+    replicate_data(Q, B, ldb * N, random_data, rd_size);
+
+    /* Measure time for a given number of GEMM calls */
+    auto time_gemms = [=, &Q](int runs) -> double {
+        using namespace oneapi::mkl;
+        using namespace std::chrono;
+        auto start = steady_clock::now();
+        for (int i = 0; i < runs; i++)
+            blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc);
+        Q.wait_and_throw();
+        auto end = steady_clock::now();
+        return duration<double>(end - start).count();
+    };
+
+    /* Do a warmup call to initialize MKL and ensure kernels are JIT'ed if needed */
+    std::cout << " -> Warmup...\n";
+    (void) time_gemms(1);
+
+    /* Time one GEMM call, and estimate how many calls will be required to keep the
+     * GPU busy for 1s. */
+    auto tare = time_gemms(1);
+    int ncalls = std::max(4, std::min(1000, int(1. / tare)));
+
+    /* Time that many GEMMs, subtracting the first call time to remove host overhead.
+     * This gives a better idea of device performance. */
+    std::cout << " -> Timing...\n";
+    auto time = time_gemms(ncalls + 1) - tare;
+    auto avg = time / ncalls;
+
+    /* Calculate and display performance */
+    auto op_count = double(M) * double(N) * double(K) * 2;
+    auto flops = op_count / avg;
+
+    flops *= 1e-9;
+    char unit = 'G';
+    if (flops >= 1000.) {
+        flops *= 1e-3;
+        unit = 'T';
     }
+    if (flops >= 1000.) {
+        flops *= 1e-3;
+        unit = 'P';
+    }
+
+    std::cout << "\nAverage performance: " << flops << unit << 'F' << std::endl;
+
+    /* Free data */
+    free(A, Q);
+    free(B, Q);
+    free(C, Q);
+    free(random_data, Q);
 }
 
-FLOAT rand_uniform()
+void usage(const char *pname)
 {
-    return static_cast <FLOAT> (rand()) / static_cast <FLOAT> (RAND_MAX);
+    std::cerr << "Usage:\n"
+              << "  " << pname << " [type] N           benchmark (NxN) x (NxN) square matrix multiplication (default: N = 4096)\n"
+              << "  " << pname << " [type] M N K       benchmark (MxK) x (KxN) square matrix multiplication\n"
+              << "\n"
+              << "The optional [type] selects the data type:\n"
+              << "   double    [default]\n"
+              << "   single\n"
+              << "   half\n"
+              << "\n"
+              << "This benchmark uses the default DPC++ device, which can be controlled using\n"
+              << "  the ONEAPI_DEVICE_SELECTOR environment variable\n";
+    std::exit(1);
 }
 
-bool verify_result(int m, int n, int k, int ldc, FLOAT *C, FLOAT *C_reference)
+int main(int argc, char **argv)
 {
-    FLOAT tolerance = 1e-3;
-    bool ok = true;
-
-    // Compare host side results with the result buffer from device side: print
-    // fail data 5 times only.
-    int printf_count = 0;
-    for (int i = 0; i < m; i++) {
-        for (int j = 0; j < n; j++) {
-            auto idx = i * ldc + j;
-            auto abs_diff = std::abs(C[idx] - C_reference[idx]);
-
-            if (abs_diff > tolerance && printf_count++ < 5) {
-                std::cerr << "The result is incorrect for element "
-                          << '[' << i << ", " << j << ']'
-                          << ", expected: " << C_reference[idx]
-                          << ", but got: " << C[idx] << std::endl;
-                ok = false;
-            }
-        }
+    auto pname = argv[0];
+    int M = 4096, N = 4096, K = 4096;
+    std::string type = "double";
+
+    if (argc <= 1)
+        usage(pname);
+
+    if (argc > 1 && std::isalpha(argv[1][0])) {
+        type = argv[1];
+        argc--; argv++;
     }
 
-    if (ok)
-        std::cout << "Results are accurate with tolerance = " << tolerance << endl;
-    else
-        std::cout << "Results may not be accurate with tolerance = " << tolerance << endl;
+    if (argc > 1) M = N = K = std::atoi(argv[1]);
+
+    if (argc > 3) {
+        N = std::atoi(argv[2]);
+        K = std::atoi(argv[3]);
+    }
+
+    if (M <= 0 || N <= 0 || K <= 0)
+        usage(pname);
+
+    queue Q;
 
-    return ok;
+    std::cout << "oneMKL DPC++ GEMM benchmark\n"
+              << "---------------------------\n"
+              << "Device:                  " << Q.get_device().get_info<info::device::name>()                          << std::endl
+              << "Core/EU count:           " << Q.get_device().get_info<info::device::max_compute_units>()             << std::endl
+              << "Maximum clock frequency: " << Q.get_device().get_info<info::device::max_clock_frequency>() << " MHz" << std::endl;
+
+    if (type == "double")
+        test<double>(Q, M, N, K);
+    else if (type == "single" || type == "float")
+        test<float>(Q, M, N, K);
+    else if (type == "half")
+        test<half>(Q, M, N, K);
+    else
+        usage(pname);
 }
diff --git a/Libraries/oneMKL/matrix_mul_mkl/utilities.hpp b/Libraries/oneMKL/matrix_mul_mkl/utilities.hpp
new file mode 100644
index 0000000000..02d1c9fb34
--- /dev/null
+++ b/Libraries/oneMKL/matrix_mul_mkl/utilities.hpp
@@ -0,0 +1,47 @@
+//==============================================================
+// Copyright © 2023 Intel Corporation
+//
+// SPDX-License-Identifier: MIT
+// =============================================================
+
+#include <sycl/sycl.hpp>
+#include <algorithm>
+
+template <typename T> const char *type_string()     { return "unknown type"; }
+template <> const char *type_string<sycl::half>()   { return "half precision"; }
+template <> const char *type_string<float>()        { return "single precision"; }
+template <> const char *type_string<double>()       { return "double precision"; }
+
+/* Choose inter-column padding for optimal performance */
+template <typename T>
+int nice_ld(int x)
+{
+    x = std::max(x, 1);
+    x *= sizeof(T);
+    x = (x + 511) & ~511;
+    x += 256;
+    x /= sizeof(T);
+    return x;
+}
+
+/* Random number generation helpers */
+template <typename T>
+void generate_random_data(size_t elems, T *v)
+{
+#pragma omp parallel for
+    for (size_t i = 0; i < elems; i++)
+        v[i] = double(std::rand()) / RAND_MAX;
+}
+
+template <typename T>
+void replicate_data(sycl::queue &Q, T *dst, size_t dst_elems, const T *src, size_t src_elems)
+{
+    while (dst_elems > 0) {
+        auto copy_elems = std::min(dst_elems, src_elems);
+        Q.copy(src,  dst, copy_elems);
+        dst += copy_elems;
+        dst_elems -= copy_elems;
+    }
+    Q.wait();
+}
+