diff --git a/.gitmodules b/.gitmodules
index c158edf6..87b94fb7 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
[submodule "extlibs/googletest"]
path = extlibs/googletest
- url = https://github.com/google/googletest.git
\ No newline at end of file
+ url = https://github.com/google/googletest.git
+[submodule "extlibs/benchmark"]
+ path = extlibs/benchmark
+ url = https://github.com/google/benchmark.git
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
index adc159a8..455d6bf0 100644
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -2,6 +2,7 @@
+
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e98cb975..8cfc231c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,6 +6,7 @@ include(CMakePackageConfigHelpers)
option(OMATH_BUILD_TESTS "Build unit tests" ${PROJECT_IS_TOP_LEVEL})
+option(OMATH_BUILD_BENCHMARK "Build benchmarks" ${PROJECT_IS_TOP_LEVEL}) # gates add_subdirectory(benchmark); default follows PROJECT_IS_TOP_LEVEL
option(OMATH_THREAT_WARNING_AS_ERROR "Set highest level of warnings and force compiler to treat them as errors" ON)
option(OMATH_BUILD_AS_SHARED_LIBRARY "Build Omath as .so or .dll" OFF)
option(OMATH_USE_AVX2 "Omath will use AVX2 to boost performance" ON)
@@ -16,9 +17,10 @@ option(OMATH_SUPRESS_SAFETY_CHECKS "Supress some safety checks in release build
option(OMATH_USE_UNITY_BUILD "Will enable unity build to speed up compilation" OFF)
option(OMATH_ENABLE_LEGACY "Will enable legacy classes that MUST be used ONLY for backward compatibility" OFF)
-message(STATUS "[${PROJECT_NAME}]: Building on ${CMAKE_HOST_SYSTEM_NAME}")
+message(STATUS "[${PROJECT_NAME}]: Building on ${CMAKE_HOST_SYSTEM_NAME}, compiler ${CMAKE_CXX_COMPILER_ID}") # compiler ID is the value the AVX flag check compares against
message(STATUS "[${PROJECT_NAME}]: Warnings as errors ${OMATH_THREAT_WARNING_AS_ERROR}")
message(STATUS "[${PROJECT_NAME}]: Build unit tests ${OMATH_BUILD_TESTS}")
+message(STATUS "[${PROJECT_NAME}]: Build benchmark ${OMATH_BUILD_BENCHMARK}") # echoes the OMATH_BUILD_BENCHMARK option
message(STATUS "[${PROJECT_NAME}]: As dynamic library ${OMATH_BUILD_AS_SHARED_LIBRARY}")
message(STATUS "[${PROJECT_NAME}]: Static C++ runtime ${OMATH_STATIC_MSVC_RUNTIME_LIBRARY}")
message(STATUS "[${PROJECT_NAME}]: CMake unity build ${OMATH_USE_UNITY_BUILD}")
@@ -90,19 +92,28 @@ if (OMATH_STATIC_MSVC_RUNTIME_LIBRARY)
)
endif ()
-if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
- target_compile_options(${PROJECT_NAME} PRIVATE -mavx2 -mfma)
+# include/omath/linear_algebra/mat.hpp calls AVX/FMA intrinsics from a public
+# header, so the flags are PUBLIC and reach every consumer. GCC rejects those
+# intrinsics without the same flags as Clang, hence both compiler IDs are matched.
+if (OMATH_USE_AVX2 AND CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang)$")
+    target_compile_options(${PROJECT_NAME} PUBLIC -mavx2 -mavx -mfma)
endif ()
target_compile_features(${PROJECT_NAME} PUBLIC cxx_std_23)
+if (OMATH_BUILD_TESTS OR OMATH_BUILD_BENCHMARK)
+    add_subdirectory(extlibs)
+endif ()
if (OMATH_BUILD_TESTS)
- add_subdirectory(extlibs)
add_subdirectory(tests)
target_compile_definitions(${PROJECT_NAME} PUBLIC OMATH_BUILD_TESTS)
endif ()
+if (OMATH_BUILD_BENCHMARK)
+    add_subdirectory(benchmark)
+endif ()
+
if (OMATH_BUILD_EXAMPLES)
add_subdirectory(examples)
endif ()
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
new file mode 100644
index 00000000..124e35a2
--- /dev/null
+++ b/benchmark/CMakeLists.txt
@@ -0,0 +1,15 @@
+project(omath_benchmark)
+
+# Benchmark runner. Sources are listed explicitly so that adding or removing a
+# benchmark file is a visible, reviewable change to this list.
+add_executable(${PROJECT_NAME} benchmark_mat.cpp main.cpp)
+
+# Place the binary in <top-level source dir>/out/<build type>.
+set_target_properties(${PROJECT_NAME} PROPERTIES
+        ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/out/${CMAKE_BUILD_TYPE}"
+        LIBRARY_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/out/${CMAKE_BUILD_TYPE}"
+        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/out/${CMAKE_BUILD_TYPE}"
+        CXX_STANDARD 23
+        CXX_STANDARD_REQUIRED ON)
+
+target_link_libraries(${PROJECT_NAME} PRIVATE benchmark::benchmark omath)
\ No newline at end of file
diff --git a/benchmark/benchmark_mat.cpp b/benchmark/benchmark_mat.cpp
new file mode 100644
index 00000000..f2eda9ac
--- /dev/null
+++ b/benchmark/benchmark_mat.cpp
@@ -0,0 +1,66 @@
+//
+// Created by Vlad on 9/17/2025.
+//
+#include
+
+#include
+#include
+using namespace omath;
+
+
+namespace
+{
+    // Shared benchmark body: times `lhs * rhs` for two 128x128 matrices of the
+    // given element type and storage layout, initialised via set(3) and set(7).
+    template<class Type, MatStoreType Layout>
+    void run_mat_multiplication(benchmark::State& state)
+    {
+        using MatType = Mat<128, 128, Type, Layout>;
+        MatType lhs;
+        MatType rhs;
+        lhs.set(static_cast<Type>(3));
+        rhs.set(static_cast<Type>(7));
+
+        for (auto _ : state)
+        {
+            // Mark the operands as possibly modified so the loop-invariant
+            // product cannot be hoisted out of the timed loop.
+            benchmark::DoNotOptimize(lhs);
+            benchmark::DoNotOptimize(rhs);
+
+            // Keep the product observable; an unused result may otherwise be
+            // eliminated together with the multiplication being measured.
+            auto product = lhs * rhs;
+            benchmark::DoNotOptimize(product);
+        }
+    }
+} // namespace
+
+// float elements, column-major storage
+void mat_float_multiplication_col_major(benchmark::State& state)
+{
+    run_mat_multiplication<float, MatStoreType::COLUMN_MAJOR>(state);
+}
+
+// float elements, row-major storage
+void mat_float_multiplication_row_major(benchmark::State& state)
+{
+    run_mat_multiplication<float, MatStoreType::ROW_MAJOR>(state);
+}
+
+// double elements, row-major storage
+void mat_double_multiplication_row_major(benchmark::State& state)
+{
+    run_mat_multiplication<double, MatStoreType::ROW_MAJOR>(state);
+}
+
+// double elements, column-major storage
+void mat_double_multiplication_col_major(benchmark::State& state)
+{
+    run_mat_multiplication<double, MatStoreType::COLUMN_MAJOR>(state);
+}
+
+BENCHMARK(mat_float_multiplication_col_major)->Iterations(5000);
+BENCHMARK(mat_float_multiplication_row_major)->Iterations(5000);
+BENCHMARK(mat_double_multiplication_col_major)->Iterations(5000);
+BENCHMARK(mat_double_multiplication_row_major)->Iterations(5000);
\ No newline at end of file
diff --git a/benchmark/main.cpp b/benchmark/main.cpp
new file mode 100644
index 00000000..790aeaea
--- /dev/null
+++ b/benchmark/main.cpp
@@ -0,0 +1,5 @@
+//
+// Created by Vlad on 9/17/2025.
+//
+#include
+BENCHMARK_MAIN(); // NOTE(review): presumably supplies this executable's main() via Google Benchmark - confirm
\ No newline at end of file
diff --git a/extlibs/CMakeLists.txt b/extlibs/CMakeLists.txt
index bf73c7af..54eda0ad 100644
--- a/extlibs/CMakeLists.txt
+++ b/extlibs/CMakeLists.txt
@@ -1 +1,12 @@
-add_subdirectory(googletest)
\ No newline at end of file
+# Third-party dependencies. Each one is configured only when the feature that
+# uses it is enabled, so a benchmark-only build does not pull in googletest.
+if (OMATH_BUILD_TESTS)
+    add_subdirectory(googletest)
+endif ()
+
+if (OMATH_BUILD_BENCHMARK)
+    # Skip Google Benchmark's own test suite; with policy CMP0077 set to NEW its
+    # option() call keeps this value instead of resetting it.
+    set(BENCHMARK_ENABLE_TESTING OFF)
+    add_subdirectory(benchmark)
+endif ()
diff --git a/extlibs/benchmark b/extlibs/benchmark
new file mode 160000
index 00000000..2948b6a2
--- /dev/null
+++ b/extlibs/benchmark
@@ -0,0 +1 @@
+Subproject commit 2948b6a2e61ccabecc952c24794c6960d86c9ed6
diff --git a/include/omath/linear_algebra/mat.hpp b/include/omath/linear_algebra/mat.hpp
index 99f87757..d6f2d332 100644
--- a/include/omath/linear_algebra/mat.hpp
+++ b/include/omath/linear_algebra/mat.hpp
@@ -10,7 +10,7 @@
#include
#include
#include
-
+#include
namespace omath
{
@@ -155,10 +155,17 @@ namespace omath
constexpr Mat
operator*(const Mat& other) const
{
+#ifdef OMATH_USE_AVX2
+ if constexpr (StoreType == MatStoreType::ROW_MAJOR)
+ return avx_multiply_row_major(other);
+ if constexpr (StoreType == MatStoreType::COLUMN_MAJOR)
+ return avx_multiply_col_major(other);
+#else
if constexpr (StoreType == MatStoreType::ROW_MAJOR)
return cache_friendly_multiply_row_major(other);
if constexpr (StoreType == MatStoreType::COLUMN_MAJOR)
return cache_friendly_multiply_col_major(other);
+#endif
std::unreachable();
}
@@ -391,6 +398,160 @@ namespace omath
}
return result;
}
+#ifdef OMATH_USE_AVX2
+        // AVX kernel for column-major operands: returns the product `*this * other`.
+        // Element types other than float/double, and calls made during constant
+        // evaluation, are forwarded to cache_friendly_multiply_col_major instead.
+        // NOTE(review): accumulates into `result`, assuming a new Mat is zero-filled - confirm.
+        template<std::size_t OtherColumns> [[nodiscard]]
+        constexpr Mat<Rows, OtherColumns, Type, StoreType>
+        avx_multiply_col_major(const Mat<Columns, OtherColumns, Type, StoreType>& other) const
+        {
+            if constexpr (!std::is_same_v<Type, float> && !std::is_same_v<Type, double>)
+                return cache_friendly_multiply_col_major(other);
+            else
+            {
+                if (std::is_constant_evaluated()) // intrinsics are not usable in constant expressions
+                    return cache_friendly_multiply_col_major(other);
+
+                Mat<Rows, OtherColumns, Type, StoreType> result;
+                const Type* this_mat_data = this->raw_array().data();
+                const Type* other_mat_data = other.raw_array().data();
+                Type* result_mat_data = result.raw_array().data();
+
+                if constexpr (std::is_same_v<Type, float>)
+                {
+                    constexpr std::size_t vector_size = 8; // floats per 256-bit register
+                    for (std::size_t j = 0; j < OtherColumns; ++j)
+                    {
+                        float* c_col = result_mat_data + j * Rows;
+                        for (std::size_t k = 0; k < Columns; ++k)
+                        {
+                            const float bkj = other_mat_data[k + j * Columns];
+                            const __m256 bkjv = _mm256_set1_ps(bkj);
+                            const float* a_col_k = this_mat_data + k * Rows;
+                            std::size_t i = 0;
+                            for (; i + vector_size <= Rows; i += vector_size)
+                            {
+                                __m256 cvec = _mm256_loadu_ps(c_col + i);
+#if defined(__FMA__)
+                                cvec = _mm256_fmadd_ps(_mm256_loadu_ps(a_col_k + i), bkjv, cvec);
+#else
+                                cvec = _mm256_add_ps(cvec, _mm256_mul_ps(_mm256_loadu_ps(a_col_k + i), bkjv));
+#endif
+                                _mm256_storeu_ps(c_col + i, cvec);
+                            }
+                            for (; i < Rows; ++i) // scalar tail for the remaining rows
+                                c_col[i] += a_col_k[i] * bkj;
+                        }
+                    }
+                }
+                else
+                {
+                    constexpr std::size_t vector_size = 4; // doubles per 256-bit register
+                    for (std::size_t j = 0; j < OtherColumns; ++j)
+                    {
+                        double* c_col = result_mat_data + j * Rows;
+                        for (std::size_t k = 0; k < Columns; ++k)
+                        {
+                            const double bkj = other_mat_data[k + j * Columns];
+                            const __m256d bkjv = _mm256_set1_pd(bkj);
+                            const double* a_col_k = this_mat_data + k * Rows;
+                            std::size_t i = 0;
+                            for (; i + vector_size <= Rows; i += vector_size)
+                            {
+                                __m256d cvec = _mm256_loadu_pd(c_col + i);
+#if defined(__FMA__)
+                                cvec = _mm256_fmadd_pd(_mm256_loadu_pd(a_col_k + i), bkjv, cvec);
+#else
+                                cvec = _mm256_add_pd(cvec, _mm256_mul_pd(_mm256_loadu_pd(a_col_k + i), bkjv));
+#endif
+                                _mm256_storeu_pd(c_col + i, cvec);
+                            }
+                            for (; i < Rows; ++i) // scalar tail for the remaining rows
+                                c_col[i] += a_col_k[i] * bkj;
+                        }
+                    }
+                }
+                return result;
+            }
+        }
+
+        // AVX kernel for row-major operands: returns the product `*this * other`.
+        // Non float/double element types and constant evaluation are forwarded to
+        // cache_friendly_multiply_row_major. NOTE(review): assumes a new Mat is zero-filled.
+        template<std::size_t OtherColumns> [[nodiscard]]
+        constexpr Mat<Rows, OtherColumns, Type, StoreType>
+        avx_multiply_row_major(const Mat<Columns, OtherColumns, Type, StoreType>& other) const
+        {
+            if constexpr (!std::is_same_v<Type, float> && !std::is_same_v<Type, double>)
+                return cache_friendly_multiply_row_major(other);
+            else
+            {
+                if (std::is_constant_evaluated()) // intrinsics are not usable in constant expressions
+                    return cache_friendly_multiply_row_major(other);
+                Mat<Rows, OtherColumns, Type, StoreType> result;
+                const Type* this_mat_data = this->raw_array().data();
+                const Type* other_mat_data = other.raw_array().data();
+                Type* result_mat_data = result.raw_array().data();
+                if constexpr (std::is_same_v<Type, float>)
+                {
+                    constexpr std::size_t vector_size = 8; // floats per 256-bit register
+                    for (std::size_t i = 0; i < Rows; ++i)
+                    {
+                        float* c_row = result_mat_data + i * OtherColumns;
+                        for (std::size_t k = 0; k < Columns; ++k)
+                        {
+                            const float aik = this_mat_data[i * Columns + k];
+                            const __m256 aikv = _mm256_set1_ps(aik);
+                            const float* b_row = other_mat_data + k * OtherColumns;
+                            std::size_t j = 0;
+                            for (; j + vector_size <= OtherColumns; j += vector_size)
+                            {
+                                __m256 cvec = _mm256_loadu_ps(c_row + j);
+#if defined(__FMA__)
+                                cvec = _mm256_fmadd_ps(_mm256_loadu_ps(b_row + j), aikv, cvec);
+#else
+                                cvec = _mm256_add_ps(cvec, _mm256_mul_ps(_mm256_loadu_ps(b_row + j), aikv));
+#endif
+                                _mm256_storeu_ps(c_row + j, cvec);
+                            }
+                            for (; j < OtherColumns; ++j) // scalar tail for the remaining columns
+                                c_row[j] += aik * b_row[j];
+                        }
+                    }
+                }
+                else
+                {
+                    constexpr std::size_t vector_size = 4; // doubles per 256-bit register
+                    for (std::size_t i = 0; i < Rows; ++i)
+                    {
+                        double* c_row = result_mat_data + i * OtherColumns;
+                        for (std::size_t k = 0; k < Columns; ++k)
+                        {
+                            const double aik = this_mat_data[i * Columns + k];
+                            const __m256d aikv = _mm256_set1_pd(aik);
+                            const double* b_row = other_mat_data + k * OtherColumns;
+                            std::size_t j = 0;
+                            for (; j + vector_size <= OtherColumns; j += vector_size)
+                            {
+                                __m256d cvec = _mm256_loadu_pd(c_row + j);
+#if defined(__FMA__)
+                                cvec = _mm256_fmadd_pd(_mm256_loadu_pd(b_row + j), aikv, cvec);
+#else
+                                cvec = _mm256_add_pd(cvec, _mm256_mul_pd(_mm256_loadu_pd(b_row + j), aikv));
+#endif
+                                _mm256_storeu_pd(c_row + j, cvec);
+                            }
+                            for (; j < OtherColumns; ++j) // scalar tail for the remaining columns
+                                c_row[j] += aik * b_row[j];
+                        }
+                    }
+                }
+                return result;
+            }
+        }
+#endif
};
template [[nodiscard]]