diff --git a/.gitmodules b/.gitmodules index c158edf6..87b94fb7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "extlibs/googletest"] path = extlibs/googletest - url = https://github.com/google/googletest.git \ No newline at end of file + url = https://github.com/google/googletest.git +[submodule "extlibs/benchmark"] + path = extlibs/benchmark + url = https://github.com/google/benchmark.git diff --git a/.idea/vcs.xml b/.idea/vcs.xml index adc159a8..455d6bf0 100644 --- a/.idea/vcs.xml +++ b/.idea/vcs.xml @@ -2,6 +2,7 @@ + \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index e98cb975..8cfc231c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,7 @@ include(CMakePackageConfigHelpers) option(OMATH_BUILD_TESTS "Build unit tests" ${PROJECT_IS_TOP_LEVEL}) +option(OMATH_BUILD_BENCHMARK "Build benchmarks" ${PROJECT_IS_TOP_LEVEL}) option(OMATH_THREAT_WARNING_AS_ERROR "Set highest level of warnings and force compiler to treat them as errors" ON) option(OMATH_BUILD_AS_SHARED_LIBRARY "Build Omath as .so or .dll" OFF) option(OMATH_USE_AVX2 "Omath will use AVX2 to boost performance" ON) @@ -16,9 +17,10 @@ option(OMATH_SUPRESS_SAFETY_CHECKS "Supress some safety checks in release build option(OMATH_USE_UNITY_BUILD "Will enable unity build to speed up compilation" OFF) option(OMATH_ENABLE_LEGACY "Will enable legacy classes that MUST be used ONLY for backward compatibility" OFF) -message(STATUS "[${PROJECT_NAME}]: Building on ${CMAKE_HOST_SYSTEM_NAME}") +message(STATUS "[${PROJECT_NAME}]: Building on ${CMAKE_HOST_SYSTEM_NAME}, compiler ${CMAKE_CXX_COMPILER_ID}") message(STATUS "[${PROJECT_NAME}]: Warnings as errors ${OMATH_THREAT_WARNING_AS_ERROR}") message(STATUS "[${PROJECT_NAME}]: Build unit tests ${OMATH_BUILD_TESTS}") +message(STATUS "[${PROJECT_NAME}]: Build benchmark ${OMATH_BUILD_BENCHMARK}") message(STATUS "[${PROJECT_NAME}]: As dynamic library ${OMATH_BUILD_AS_SHARED_LIBRARY}") message(STATUS "[${PROJECT_NAME}]: Static C++ runtime ${OMATH_STATIC_MSVC_RUNTIME_LIBRARY}") message(STATUS "[${PROJECT_NAME}]: CMake unity build ${OMATH_USE_UNITY_BUILD}") @@ -90,19 +92,25 @@ if (OMATH_STATIC_MSVC_RUNTIME_LIBRARY) ) endif () -if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - target_compile_options(${PROJECT_NAME} PRIVATE -mavx2 -mfma) +if (OMATH_USE_AVX2 AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + target_compile_options(${PROJECT_NAME} PUBLIC -mavx2 -mavx -mfma) endif () target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_23) +if (OMATH_BUILD_TESTS OR OMATH_BUILD_BENCHMARK) + add_subdirectory(extlibs) +endif () if (OMATH_BUILD_TESTS) - add_subdirectory(extlibs) add_subdirectory(tests) target_compile_definitions(${PROJECT_NAME} PUBLIC OMATH_BUILD_TESTS) endif () +if (OMATH_BUILD_BENCHMARK) + add_subdirectory(benchmark) +endif () + if (OMATH_BUILD_EXAMPLES) add_subdirectory(examples) endif () diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 00000000..124e35a2 --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,15 @@ +project(omath_benchmark) + + +file(GLOB_RECURSE OMATH_BENCHMARK_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp") +add_executable(${PROJECT_NAME} ${OMATH_BENCHMARK_SOURCES}) + +set_target_properties(${PROJECT_NAME} PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/out/${CMAKE_BUILD_TYPE}" + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/out/${CMAKE_BUILD_TYPE}" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/out/${CMAKE_BUILD_TYPE}" + CXX_STANDARD 23 + CXX_STANDARD_REQUIRED ON) + + +target_link_libraries(${PROJECT_NAME} PRIVATE benchmark::benchmark omath) \ No newline at end of file diff --git a/benchmark/benchmark_mat.cpp b/benchmark/benchmark_mat.cpp new file mode 100644 index 00000000..f2eda9ac --- /dev/null +++ b/benchmark/benchmark_mat.cpp @@ -0,0 +1,66 @@ +// +// Created by Vlad on 9/17/2025. +// +#include + +#include +#include +using namespace omath; + + +void mat_float_multiplication_col_major(benchmark::State& state) +{ + using MatType = Mat<128, 128, float, MatStoreType::COLUMN_MAJOR>; + MatType a; + MatType b; + a.set(3.f); + b.set(7.f); + + + for (auto _ : state) + std::ignore = a * b; +} +void mat_float_multiplication_row_major(benchmark::State& state) +{ + using MatType = Mat<128, 128, float, MatStoreType::ROW_MAJOR>; + MatType a; + MatType b; + a.set(3.f); + b.set(7.f); + + + for (auto _ : state) + std::ignore = a * b; +} + +void mat_double_multiplication_row_major(benchmark::State& state) +{ + using MatType = Mat<128, 128, double, MatStoreType::ROW_MAJOR>; + MatType a; + MatType b; + a.set(3.f); + b.set(7.f); + + + for (auto _ : state) + std::ignore = a * b; +} + +void mat_double_multiplication_col_major(benchmark::State& state) +{ + using MatType = Mat<128, 128, double, MatStoreType::COLUMN_MAJOR>; + MatType a; + MatType b; + a.set(3.f); + b.set(7.f); + + + for (auto _ : state) + std::ignore = a * b; +} + +BENCHMARK(mat_float_multiplication_col_major)->Iterations(5000); +BENCHMARK(mat_float_multiplication_row_major)->Iterations(5000); + +BENCHMARK(mat_double_multiplication_col_major)->Iterations(5000); +BENCHMARK(mat_double_multiplication_row_major)->Iterations(5000); \ No newline at end of file diff --git a/benchmark/main.cpp b/benchmark/main.cpp new file mode 100644 index 00000000..790aeaea --- /dev/null +++ b/benchmark/main.cpp @@ -0,0 +1,5 @@ +// +// Created by Vlad on 9/17/2025. +// +#include +BENCHMARK_MAIN(); \ No newline at end of file diff --git a/extlibs/CMakeLists.txt b/extlibs/CMakeLists.txt index bf73c7af..54eda0ad 100644 --- a/extlibs/CMakeLists.txt +++ b/extlibs/CMakeLists.txt @@ -1 +1,2 @@ -add_subdirectory(googletest) \ No newline at end of file +add_subdirectory(googletest) +add_subdirectory(benchmark) \ No newline at end of file diff --git a/extlibs/benchmark b/extlibs/benchmark new file mode 160000 index 00000000..2948b6a2 --- /dev/null +++ b/extlibs/benchmark @@ -0,0 +1 @@ +Subproject commit 2948b6a2e61ccabecc952c24794c6960d86c9ed6 diff --git a/include/omath/linear_algebra/mat.hpp b/include/omath/linear_algebra/mat.hpp index 99f87757..d6f2d332 100644 --- a/include/omath/linear_algebra/mat.hpp +++ b/include/omath/linear_algebra/mat.hpp @@ -10,7 +10,7 @@ #include #include #include - +#include namespace omath { @@ -155,10 +155,17 @@ namespace omath constexpr Mat operator*(const Mat& other) const { +#ifdef OMATH_USE_AVX2 + if constexpr (StoreType == MatStoreType::ROW_MAJOR) + return avx_multiply_row_major(other); + if constexpr (StoreType == MatStoreType::COLUMN_MAJOR) + return avx_multiply_col_major(other); +#else if constexpr (StoreType == MatStoreType::ROW_MAJOR) return cache_friendly_multiply_row_major(other); if constexpr (StoreType == MatStoreType::COLUMN_MAJOR) return cache_friendly_multiply_col_major(other); +#endif std::unreachable(); } @@ -391,6 +398,160 @@ namespace omath } return result; } +#ifdef OMATH_USE_AVX2 + template [[nodiscard]] + constexpr Mat + avx_multiply_col_major(const Mat& other) const + { + Mat result; + + const Type* this_mat_data = this->raw_array().data(); + const Type* other_mat_data = other.raw_array().data(); + Type* result_mat_data = result.raw_array().data(); + + if constexpr (std::is_same_v) + { + // ReSharper disable once CppTooWideScopeInitStatement + constexpr std::size_t vector_size = 8; + for (std::size_t j = 0; j < OtherColumns; ++j) + { + auto* c_col = reinterpret_cast(result_mat_data + j * Rows); + for (std::size_t k = 0; k < Columns; ++k) + { + const float bkj = reinterpret_cast(other_mat_data)[k + j * Columns]; + __m256 bkjv = _mm256_set1_ps(bkj); + + const auto* a_col_k = reinterpret_cast(this_mat_data + k * Rows); + + std::size_t i = 0; + for (; i + vector_size <= Rows; i += vector_size) + { + __m256 cvec = _mm256_loadu_ps(c_col + i); + __m256 avec = _mm256_loadu_ps(a_col_k + i); +#if defined(__FMA__) + cvec = _mm256_fmadd_ps(avec, bkjv, cvec); +#else + cvec = _mm256_add_ps(cvec, _mm256_mul_ps(avec, bkjv)); +#endif + _mm256_storeu_ps(c_col + i, cvec); + } + for (; i < Rows; ++i) + c_col[i] += a_col_k[i] * bkj; + } + } + } + else if (std::is_same_v) + { // double + // ReSharper disable once CppTooWideScopeInitStatement + constexpr std::size_t vector_size = 4; + for (std::size_t j = 0; j < OtherColumns; ++j) + { + auto* c_col = reinterpret_cast(result_mat_data + j * Rows); + for (std::size_t k = 0; k < Columns; ++k) + { + const double bkj = reinterpret_cast(other_mat_data)[k + j * Columns]; + __m256d bkjv = _mm256_set1_pd(bkj); + + const auto* a_col_k = reinterpret_cast(this_mat_data + k * Rows); + + std::size_t i = 0; + for (; i + vector_size <= Rows; i += vector_size) + { + __m256d cvec = _mm256_loadu_pd(c_col + i); + __m256d avec = _mm256_loadu_pd(a_col_k + i); +#if defined(__FMA__) + cvec = _mm256_fmadd_pd(avec, bkjv, cvec); +#else + cvec = _mm256_add_pd(cvec, _mm256_mul_pd(avec, bkjv)); +#endif + _mm256_storeu_pd(c_col + i, cvec); + } + for (; i < Rows; ++i) + c_col[i] += a_col_k[i] * bkj; + } + } + } + else + std::unreachable(); + + return result; + } + + template [[nodiscard]] + constexpr Mat + avx_multiply_row_major(const Mat& other) const + { + Mat result; + + const Type* this_mat_data = this->raw_array().data(); + const Type* other_mat_data = other.raw_array().data(); + Type* result_mat_data = result.raw_array().data(); + + if constexpr (std::is_same_v) + { + // ReSharper disable once CppTooWideScopeInitStatement + constexpr std::size_t vector_size = 8; + for (std::size_t i = 0; i < Rows; ++i) + { + Type* c_row = result_mat_data + i * OtherColumns; + for (std::size_t k = 0; k < Columns; ++k) + { + const auto aik = static_cast(this_mat_data[i * Columns + k]); + __m256 aikv = _mm256_set1_ps(aik); + const auto* b_row = reinterpret_cast(other_mat_data + k * OtherColumns); + + std::size_t j = 0; + for (; j + vector_size <= OtherColumns; j += vector_size) + { + __m256 cvec = _mm256_loadu_ps(c_row + j); + __m256 bvec = _mm256_loadu_ps(b_row + j); +#if defined(__FMA__) + cvec = _mm256_fmadd_ps(bvec, aikv, cvec); +#else + cvec = _mm256_add_ps(cvec, _mm256_mul_ps(bvec, aikv)); +#endif + _mm256_storeu_ps(c_row + j, cvec); + } + for (; j < OtherColumns; ++j) + c_row[j] += aik * b_row[j]; + } + } + } + else if (std::is_same_v) + { // double + // ReSharper disable once CppTooWideScopeInitStatement + constexpr std::size_t vector_size = 4; + for (std::size_t i = 0; i < Rows; ++i) + { + Type* c_row = result_mat_data + i * OtherColumns; + for (std::size_t k = 0; k < Columns; ++k) + { + const auto aik = static_cast(this_mat_data[i * Columns + k]); + __m256d aikv = _mm256_set1_pd(aik); + const auto* b_row = reinterpret_cast(other_mat_data + k * OtherColumns); + + std::size_t j = 0; + for (; j + vector_size <= OtherColumns; j += vector_size) + { + __m256d cvec = _mm256_loadu_pd(c_row + j); + __m256d bvec = _mm256_loadu_pd(b_row + j); +#if defined(__FMA__) + cvec = _mm256_fmadd_pd(bvec, aikv, cvec); +#else + cvec = _mm256_add_pd(cvec, _mm256_mul_pd(bvec, aikv)); +#endif + _mm256_storeu_pd(c_row + j, cvec); + } + for (; j < OtherColumns; ++j) + c_row[j] += aik * b_row[j]; + } + } + } + else + std::unreachable(); + return result; + } +#endif }; template [[nodiscard]]