From 45922a23f728e750189fe64491a83ceb14baed31 Mon Sep 17 00:00:00 2001 From: Peter Caday Date: Thu, 19 Oct 2023 12:25:45 -0700 Subject: [PATCH] matrix_mul_mkl: revamp sample --- Libraries/oneMKL/matrix_mul_mkl/GNUmakefile | 19 +- Libraries/oneMKL/matrix_mul_mkl/makefile | 19 +- .../oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp | 336 +++++++----------- Libraries/oneMKL/matrix_mul_mkl/utilities.hpp | 47 +++ 4 files changed, 196 insertions(+), 225 deletions(-) create mode 100644 Libraries/oneMKL/matrix_mul_mkl/utilities.hpp diff --git a/Libraries/oneMKL/matrix_mul_mkl/GNUmakefile b/Libraries/oneMKL/matrix_mul_mkl/GNUmakefile index 5ff8249e6e..06d00c54e0 100644 --- a/Libraries/oneMKL/matrix_mul_mkl/GNUmakefile +++ b/Libraries/oneMKL/matrix_mul_mkl/GNUmakefile @@ -1,26 +1,23 @@ # Makefile for GNU Make -default: all +default: run -all: sgemm.mkl dgemm.mkl +all: matrix_mul_mkl -run: sgemm.mkl dgemm.mkl - ./sgemm.mkl - ./dgemm.mkl +run: matrix_mul_mkl + ./matrix_mul_mkl single + ./matrix_mul_mkl double INCLUDE_COMMON=../../../common MKL_COPTS = -DMKL_ILP64 -qmkl=sequential MKL_LIBS = -lsycl -lOpenCL -lpthread -lm -ldl -DPCPP_OPTS = -O3 $(MKL_COPTS) $(MKL_LIBS) +DPCPP_OPTS = -O2 $(MKL_COPTS) $(MKL_LIBS) -sgemm.mkl: matrix_mul_mkl.cpp +matrix_mul_mkl: matrix_mul_mkl.cpp icpx -fsycl -I$(INCLUDE_COMMON) $< -o $@ $(DPCPP_OPTS) -dgemm.mkl: matrix_mul_mkl.cpp - icpx -fsycl -I$(INCLUDE_COMMON) $< -o $@ $(DPCPP_OPTS) -DUSE_DOUBLE - clean: - -rm -f sgemm.mkl dgemm.mkl + -rm -f matrix_mul_mkl .PHONY: clean run all diff --git a/Libraries/oneMKL/matrix_mul_mkl/makefile b/Libraries/oneMKL/matrix_mul_mkl/makefile index d425236e20..70fda7c4d7 100644 --- a/Libraries/oneMKL/matrix_mul_mkl/makefile +++ b/Libraries/oneMKL/matrix_mul_mkl/makefile @@ -1,22 +1,19 @@ # Makefile for NMAKE -default: all +default: run -all: sgemm.exe dgemm.exe +all: matrix_mul_mkl.exe -run: sgemm.exe dgemm.exe - .\sgemm.exe - .\dgemm.exe +run: matrix_mul_mkl.exe + .\matrix_mul_mkl.exe single + .\matrix_mul_mkl.exe double DPCPP_OPTS=/I"$(MKLROOT)\include" /Qmkl /EHsc -fsycl-device-code-split=per_kernel OpenCL.lib -sgemm.exe: matrix_mul_mkl.cpp - icx-cl -fsycl matrix_mul_mkl.cpp /Fesgemm.exe $(DPCPP_OPTS) - -dgemm.exe: matrix_mul_mkl.cpp - icx-cl -fsycl matrix_mul_mkl.cpp /Fedgemm.exe $(DPCPP_OPTS) -DUSE_DOUBLE +matrix_mul_mkl.exe: matrix_mul_mkl.cpp + icx-cl -fsycl matrix_mul_mkl.cpp /Fematrix_mul_mkl.exe $(DPCPP_OPTS) clean: - del /q sgemm.exe sgemm.exp sgemm.lib dgemm.exe dgemm.exp dgemm.lib + del /q matrix_mul_mkl.exe matrix_mul_mkl.exp matrix_mul_mkl.lib pseudo: clean run all diff --git a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp index 91d0e38e60..6583384dba 100644 --- a/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp +++ b/Libraries/oneMKL/matrix_mul_mkl/matrix_mul_mkl.cpp @@ -1,225 +1,155 @@ //============================================================== -// Copyright © 2020 Intel Corporation +// Copyright © 2023 Intel Corporation // // SPDX-License-Identifier: MIT // ============================================================= // -// Matrix Multiplication is a simple program that multiplies together two -// large matrices and verifies the results. -// This samples uses the oneAPI Math Kernel Library (oneMKL) to accelerate -// the computation. - -// The test is updated based on oneAPI samples oneAPI-samples/Libraries/oneMKL/matrix_mul_mkl -#include -#include -#include +// Contents: +// A simple matrix multiplication benchmark, using the oneAPI Math Kernel +// Library (oneMKL). +// #include -#include "oneapi/mkl.hpp" -#include "dpc_common.hpp" - -#ifndef USE_DOUBLE -#define FLOAT float -#else -#define FLOAT double -#endif - -FLOAT rand_uniform(); -bool verify_result(int m, int n, int k, int ldc, FLOAT *C, FLOAT *C_reference); - -#define WARMUP 10 -#define LOOPS 100 -//default matrix size 8192x8192 -#define MSIZE 8192 -#define VERIFY_RESULT False +#include +#include +#include +#include +#include +#include +#include "utilities.hpp" -using namespace std ; +using namespace sycl; -int main(int argc, char* argv[]) +template +void test(queue &Q, int M, int N, int K) { - try { - - int msize = MSIZE; - int loops = LOOPS; - int verify = 0; - - // Initialize data for GEMM. The full GEMM operation is: - // - // C = alpha * op(A) * op(B) + beta * C - // - // where alpha, beta are scalar values, and op(...) represents - // optional matrix transposition. - // - // For this simple matrix multiplication, no transposition is needed. - // - // By choosing alpha = 1, beta = 0, GEMM will calculate C = A * B. - // - // In this example, matrices are stored in row-major layout. - - auto transA = oneapi::mkl::transpose::nontrans; - auto transB = oneapi::mkl::transpose::nontrans; - - // Matrix data sizes. - // - // A is m x k - // B is k x n --> product C is m x n - int m = msize; - int k = msize; - int n = msize; - - cout << "Problem size: " - << " A (" << m << 'x' << k << ") *" - << " B (" << k << 'x' << n << ") --> " - << " C (" << m << 'x' << n << ")\n"; - - cout << "Benchmark interations: " << loops << endl; - - // Leading dimensions of data. For row-major matrices, the leading - // dimension is the stride between adjacent rows. - int lda = k; - int ldb = n; - int ldc = n; - - // Scaling factors. - FLOAT alpha = 1.0f; - FLOAT beta = 0.0f; - - // Create a queue on the default device. - sycl::queue device_queue{sycl::default_selector_v}; - - std::cout << "Device: " - << device_queue.get_device().get_info() - << std::endl; - - // Allocate shared memory for matrices. - const size_t alignment = 4096; - auto a = sycl::aligned_alloc_host(alignment, m * k, device_queue); - auto b = sycl::aligned_alloc_host(alignment, k * n, device_queue); - auto c = sycl::aligned_alloc_host(alignment, m * n, device_queue); - - auto C_reference = (FLOAT *) calloc(m * n, sizeof(FLOAT)); - - if (!a || !b || !c || !C_reference) { - std::cerr << "Could not allocate memory for matrices." << std::endl; - exit(1); - } - - // Initialize matrix data. - for (int i = 0; i < m; i++) - for (int j = 0; j < k; j++) - a[i * lda + j] = rand_uniform(); - - for (int i = 0; i < k; i++) - for (int j = 0; j < n; j++) - b[i * ldb + j] = rand_uniform(); - - auto A = sycl::aligned_alloc_device(alignment, m * k, device_queue); - auto B = sycl::aligned_alloc_device(alignment, m * n, device_queue); - auto C = sycl::aligned_alloc_device(alignment, m * n, device_queue); - device_queue.wait(); - - device_queue.memcpy(A, &(a[0]), m * k * sizeof(FLOAT)); - device_queue.memcpy(B, &(b[0]), k * n * sizeof(FLOAT)); - device_queue.memcpy(C, &(c[0]), m * n * sizeof(FLOAT)); - device_queue.wait(); - - - // Call GEMM to do matrix multiplication, asynchronously. - std::cerr << "Launching oneMKL GEMM calculation..." << std::endl; - dpc_common::TimeInterval timer; - double start_time = 0.0; - - //warm up - for (int w=0; w < WARMUP; w++) - { - oneapi::mkl::blas::row_major::gemm(device_queue, transA, transB, m, n, k, - alpha, A, lda, B, ldb, beta, C, ldc); - } - device_queue.wait_and_throw(); - - start_time = timer.Elapsed(); - for (int l=0; l < loops; l++) - { - oneapi::mkl::blas::row_major::gemm(device_queue, transA, transB, m, n, k, - alpha, A, lda, B, ldb, beta, C, ldc); - } - // Wait for oneMKL computation to complete. - device_queue.wait_and_throw(); - - double stop_time = timer.Elapsed(); - double avg_gemm_time = (stop_time - start_time)/loops; - - double gflops = 2.0 * (double)m * (double)m * (double)m; - #ifdef USE_DOUBLE - cout << "DGEMM performance : " << gflops / avg_gemm_time * 1.e-9 << " GFLOPS" << endl; - #else - cout << "SGEMM performance : " << gflops / avg_gemm_time * 1.e-9 << " GFLOPS" << endl; - #endif - - - if(verify) - { - // While calculation occurs, compute reference result to check accuracy. - std::cerr << "Performing reference calculation..." << std::endl; - for (int i = 0; i < m; i++) - for (int h = 0; h < k; h++) - for (int j = 0; j < n; j++) - C_reference[i * ldc + j] += a[i * lda + h] * b[h * ldb + j]; - // Check results for accuracy. - device_queue.memcpy(&(c[0]), C, m*n*sizeof(FLOAT)).wait(); - verify_result(m, n, k, ldc, c, C_reference); - } - - - // Free memory. - free(A, device_queue); - free(B, device_queue); - free(C, device_queue); - free(C_reference); - free(a, device_queue); - free(b, device_queue); - free(c, device_queue); - - } catch (const std::exception &e) { - std::cerr << "An exception occurred: " - << e.what() << std::endl; - exit(1); + std::cout << "\nBenchmarking (" << M << " x " << K << ") x (" << K << " x " << N << ") matrix multiplication, " << type_string() << std::endl;; + + std::cout << " -> Initializing data...\n"; + + /* Allocate A/B/C matrices */ + int lda = nice_ld(M); + int ldb = nice_ld(K); + int ldc = nice_ld(M); + + auto A = malloc_device(lda * K, Q); + auto B = malloc_device(ldb * N, Q); + auto C = malloc_device(ldc * N, Q); + + /* Fill A/B with random data */ + constexpr int rd_size = 1048576; + auto random_data = malloc_host(rd_size, Q); + generate_random_data(rd_size, random_data); + + replicate_data(Q, A, lda * K, random_data, rd_size); + replicate_data(Q, B, ldb * N, random_data, rd_size); + + /* Measure time for a given number of GEMM calls */ + auto time_gemms = [=, &Q](int runs) -> double { + using namespace oneapi::mkl; + using namespace std::chrono; + auto start = steady_clock::now(); + for (int i = 0; i < runs; i++) + blas::gemm(Q, transpose::N, transpose::N, M, N, K, 1, A, lda, B, ldb, 0, C, ldc); + Q.wait_and_throw(); + auto end = steady_clock::now(); + return duration(end - start).count(); + }; + + /* Do a warmup call to initialize MKL and ensure kernels are JIT'ed if needed */ + std::cout << " -> Warmup...\n"; + (void) time_gemms(1); + + /* Time one GEMM call, and estimate how many calls will be required to keep the + * GPU busy for 1s. */ + auto tare = time_gemms(1); + int ncalls = std::max(4, std::min(1000, int(1. / tare))); + + /* Time that many GEMMs, subtracting the first call time to remove host overhead. + * This gives a better idea of device performance. */ + std::cout << " -> Timing...\n"; + auto time = time_gemms(ncalls + 1) - tare; + auto avg = time / ncalls; + + /* Calculate and display performance */ + auto op_count = double(M) * double(N) * double(K) * 2; + auto flops = op_count / avg; + + flops *= 1e-9; + char unit = 'G'; + if (flops >= 1000.) { + flops *= 1e-3; + unit = 'T'; } + if (flops >= 1000.) { + flops *= 1e-3; + unit = 'P'; + } + + std::cout << "\nAverage performance: " << flops << unit << 'F' << std::endl; + + /* Free data */ + free(A, Q); + free(B, Q); + free(C, Q); + free(random_data, Q); } -FLOAT rand_uniform() +void usage(const char *pname) { - return static_cast (rand()) / static_cast (RAND_MAX); + std::cerr << "Usage:\n" + << " " << pname << " [type] N benchmark (NxN) x (NxN) square matrix multiplication (default: N = 4096)\n" + << " " << pname << " [type] M N K benchmark (MxK) x (KxN) square matrix multiplication\n" + << "\n" + << "The optional [type] selects the data type:\n" + << " double [default]\n" + << " single\n" + << " half\n" + << "\n" + << "This benchmark uses the default DPC++ device, which can be controlled using\n" + << " the ONEAPI_DEVICE_SELECTOR environment variable\n"; + std::exit(1); } -bool verify_result(int m, int n, int k, int ldc, FLOAT *C, FLOAT *C_reference) +int main(int argc, char **argv) { - FLOAT tolerance = 1e-3; - bool ok = true; - - // Compare host side results with the result buffer from device side: print - // fail data 5 times only. - int printf_count = 0; - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - auto idx = i * ldc + j; - auto abs_diff = std::abs(C[idx] - C_reference[idx]); - - if (abs_diff > tolerance && printf_count++ < 5) { - std::cerr << "The result is incorrect for element " - << '[' << i << ", " << j << ']' - << ", expected: " << C_reference[idx] - << ", but got: " << C[idx] << std::endl; - ok = false; - } - } + auto pname = argv[0]; + int M = 4096, N = 4096, K = 4096; + std::string type = "double"; + + if (argc <= 1) + usage(pname); + + if (argc > 1 && std::isalpha(argv[1][0])) { + type = argv[1]; + argc--; argv++; } - if (ok) - std::cout << "Results are accurate with tolerance = " << tolerance << endl; - else - std::cout << "Results may not be accurate with tolerance = " << tolerance << endl; + if (argc > 1) M = N = K = std::atoi(argv[1]); + + if (argc > 3) { + N = std::atoi(argv[2]); + K = std::atoi(argv[3]); + } + + if (M <= 0 || N <= 0 || K <= 0) + usage(pname); + + queue Q; - return ok; + std::cout << "oneMKL DPC++ GEMM benchmark\n" + << "---------------------------\n" + << "Device: " << Q.get_device().get_info() << std::endl + << "Core/EU count: " << Q.get_device().get_info() << std::endl + << "Maximum clock frequency: " << Q.get_device().get_info() << " MHz" << std::endl; + + if (type == "double") + test(Q, M, N, K); + else if (type == "single" || type == "float") + test(Q, M, N, K); + else if (type == "half") + test(Q, M, N, K); + else + usage(pname); } diff --git a/Libraries/oneMKL/matrix_mul_mkl/utilities.hpp b/Libraries/oneMKL/matrix_mul_mkl/utilities.hpp new file mode 100644 index 0000000000..02d1c9fb34 --- /dev/null +++ b/Libraries/oneMKL/matrix_mul_mkl/utilities.hpp @@ -0,0 +1,47 @@ +//============================================================== +// Copyright © 2023 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include +#include + +template const char *type_string() { return "unknown type"; } +template <> const char *type_string() { return "half precision"; } +template <> const char *type_string() { return "single precision"; } +template <> const char *type_string() { return "double precision"; } + +/* Choose inter-column padding for optimal performance */ +template +int nice_ld(int x) +{ + x = std::max(x, 1); + x *= sizeof(T); + x = (x + 511) & ~511; + x += 256; + x /= sizeof(T); + return x; +} + +/* Random number generation helpers */ +template +void generate_random_data(size_t elems, T *v) +{ +#pragma omp parallel for + for (size_t i = 0; i < elems; i++) + v[i] = double(std::rand()) / RAND_MAX; +} + +template +void replicate_data(sycl::queue &Q, T *dst, size_t dst_elems, const T *src, size_t src_elems) +{ + while (dst_elems > 0) { + auto copy_elems = std::min(dst_elems, src_elems); + Q.copy(src, dst, copy_elems); + dst += copy_elems; + dst_elems -= copy_elems; + } + Q.wait(); +} +