From 8e532fe1257e6e55944e49e4669b79734a7736bc Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 12 Sep 2025 10:07:25 -0700 Subject: [PATCH 01/10] Add MSVC build support on windows --- lib/meson.build | 8 +++---- lib/x86simdsort.cpp | 34 ++++++++++++++++-------------- lib/x86simdsort.h | 14 +++++++++--- lib/x86simdsortcpuid.h | 48 ++++++++++++++++++++++++++++++++++++++++++ meson.build | 1 + 5 files changed, 82 insertions(+), 23 deletions(-) create mode 100644 lib/x86simdsortcpuid.h diff --git a/lib/meson.build b/lib/meson.build index 44ced53..2f9f06c 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -6,7 +6,7 @@ if cpp.has_argument('-march=haswell') 'x86simdsort-avx2.cpp', ), include_directories : [src], - cpp_args : ['-march=haswell'], + cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX2'] : ['-march=haswell'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) @@ -18,7 +18,7 @@ if cpp.has_argument('-march=skylake-avx512') 'x86simdsort-skx.cpp', ), include_directories : [src], - cpp_args : ['-march=skylake-avx512'], + cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=skylake-avx512'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) @@ -30,7 +30,7 @@ if cpp.has_argument('-march=icelake-client') 'x86simdsort-icl.cpp', ), include_directories : [src], - cpp_args : ['-march=icelake-client'], + cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=icelake-client'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) @@ -42,7 +42,7 @@ if cancompilefp16 'x86simdsort-spr.cpp', ), include_directories : [src], - cpp_args : ['-march=sapphirerapids'], + cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=sapphirerapids'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 8ef9aad..3ee12dd 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -1,6 +1,12 @@ +#if defined(_MSC_VER) +#define XSS_ATTRIBUTE_CONSTRUCTOR +#else +#define XSS_ATTRIBUTE_CONSTRUCTOR __attribute__((constructor)) +#endif #include "x86simdsort.h" #include "x86simdsort-internal.h" #include "x86simdsort-scalar.h" +#include "x86simdsortcpuid.h" #include #include #include @@ -12,23 +18,19 @@ static int check_cpu_feature_support(std::string_view cpufeature) if ((cpufeature == "avx512_spr") && (!disable_avx512)) #if defined(__FLT16_MAX__) && !defined(__INTEL_LLVM_COMPILER) \ && (!defined(__clang_major__) || __clang_major__ >= 18) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512fp16") - && __builtin_cpu_supports("avx512vbmi2"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512fp16") + && xss_cpu_supports("avx512vbmi2"); #else return 0; #endif else if ((cpufeature == "avx512_icl") && (!disable_avx512)) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512vbmi2") - && __builtin_cpu_supports("avx512bw") - && __builtin_cpu_supports("avx512vl"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512vbmi2") + && xss_cpu_supports("avx512bw") && xss_cpu_supports("avx512vl"); else if ((cpufeature == "avx512_skx") && (!disable_avx512)) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512dq") - && __builtin_cpu_supports("avx512vl"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512dq") + && xss_cpu_supports("avx512vl"); else if (cpufeature == "avx2") - return __builtin_cpu_supports("avx2"); + return xss_cpu_supports("avx2"); return 0; } @@ -121,11 +123,11 @@ constexpr bool IS_TYPE_FLOAT16() /* runtime dispatch mechanism */ #define DISPATCH(func, TYPE, ISA) \ - DECLARE_INTERNAL_##func(TYPE) static __attribute__((constructor)) void \ - CAT(CAT(resolve_, func), TYPE)(void) \ + DECLARE_INTERNAL_##func(TYPE) static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ + CAT(resolve_, func), TYPE)(void) \ { \ CAT(CAT(internal_, func), TYPE) = &xss::scalar::func; \ - __builtin_cpu_init(); \ + xss_cpu_init(); \ std::string_view preferred_cpu = find_preferred_cpu(ISA); \ if constexpr (dispatch_requested("avx512", ISA)) { \ if (preferred_cpu.find("avx512") != std::string_view::npos) { \ @@ -248,12 +250,12 @@ DISPATCH_ALL(argselect, } #define DISPATCH_KV_FUNC(func, TYPE1, TYPE2, ISA) \ - static __attribute__((constructor)) void CAT( \ + static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ CAT(CAT(CAT(resolve_, func), _), TYPE1), TYPE2)(void) \ { \ CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ = &xss::scalar::func; \ - __builtin_cpu_init(); \ + xss_cpu_init(); \ std::string_view preferred_cpu = find_preferred_cpu(ISA); \ if constexpr (dispatch_requested("avx512", ISA)) { \ if (preferred_cpu.find("avx512") != std::string_view::npos) { \ diff --git a/lib/x86simdsort.h b/lib/x86simdsort.h index 2e47b6a..f35bd44 100644 --- a/lib/x86simdsort.h +++ b/lib/x86simdsort.h @@ -6,8 +6,13 @@ #include #include +#if defined(_MSC_VER) +#define XSS_EXPORT_SYMBOL __declspec(dllexport) +#define XSS_HIDE_SYMBOL +#else #define XSS_EXPORT_SYMBOL __attribute__((visibility("default"))) #define XSS_HIDE_SYMBOL __attribute__((visibility("hidden"))) +#endif #define UNUSED(x) (void)(x) namespace x86simdsort { @@ -73,11 +78,14 @@ XSS_EXPORT_SYMBOL void keyvalue_partial_sort(T1 *key, template XSS_EXPORT_SYMBOL void object_qsort(T *arr, U arrsize, Func key_func) { - static_assert(std::is_integral::value, "arrsize must be an integral type"); + static_assert(std::is_integral::value, + "arrsize must be an integral type"); static_assert(sizeof(U) == sizeof(int32_t) || sizeof(U) == sizeof(int64_t), "arrsize must be 32 or 64 bits"); - using return_type_of = typename decltype(std::function{key_func})::result_type; - static_assert(sizeof(return_type_of) == sizeof(int32_t) || sizeof(return_type_of) == sizeof(int64_t), + using return_type_of = + typename decltype(std::function {key_func})::result_type; + static_assert(sizeof(return_type_of) == sizeof(int32_t) + || sizeof(return_type_of) == sizeof(int64_t), "key_func return type must be 32 or 64 bits"); std::vector keys(arrsize); for (U ii = 0; ii < arrsize; ++ii) { diff --git a/lib/x86simdsortcpuid.h b/lib/x86simdsortcpuid.h new file mode 100644 index 0000000..a3dcd9e --- /dev/null +++ b/lib/x86simdsortcpuid.h @@ -0,0 +1,48 @@ +#ifndef X86SIMDSORT_CPUID_H +#define X86SIMDSORT_CPUID_H + +#ifdef _MSC_VER +#include +#include +#include + +static std::unordered_map xss_cpu_features; + +inline void xss_cpu_init() +{ + int cpuInfo[4] = {0}; + // Check AVX2 + __cpuid(cpuInfo, 0); + int nIds = cpuInfo[0]; + __cpuid(cpuInfo, 1); + bool osxsave = (cpuInfo[2] & (1 << 27)) != 0; + bool avx = (cpuInfo[2] & (1 << 28)) != 0; + __cpuid(cpuInfo, 7); + bool avx2 = (cpuInfo[1] & (1 << 5)) != 0; + bool avx512f = (cpuInfo[1] & (1 << 16)) != 0; + bool avx512dq = (cpuInfo[1] & (1 << 17)) != 0; + bool avx512bw = (cpuInfo[1] & (1 << 30)) != 0; + bool avx512vl = (cpuInfo[1] & (1 << 31)) != 0; + bool avx512vbmi2 = (cpuInfo[2] & (1 << 6)) != 0; + bool avx512fp16 = (cpuInfo[3] & (1 << 23)) != 0; + // Store results + xss_cpu_features["avx2"] = avx2; + xss_cpu_features["avx512f"] = avx512f; + xss_cpu_features["avx512dq"] = avx512dq; + xss_cpu_features["avx512bw"] = avx512bw; + xss_cpu_features["avx512vl"] = avx512vl; + xss_cpu_features["avx512vbmi2"] = avx512vbmi2; + xss_cpu_features["avx512fp16"] = avx512fp16; +} + +inline bool xss_cpu_supports(const char *feature) +{ + auto it = xss_cpu_features.find(feature); + return it != xss_cpu_features.end() && it->second; +} + +#else +#define xss_cpu_init() __builtin_cpu_init() +#define xss_cpu_supports(feature) __builtin_cpu_supports(feature) +#endif // _MSC_VER +#endif // X86SIMDSORT_CPUID_H diff --git a/meson.build b/meson.build index 0b826f0..70c9fef 100644 --- a/meson.build +++ b/meson.build @@ -1,3 +1,4 @@ + project('x86-simd-sort', 'cpp', version : '7.0.x', license : 'BSD 3-clause', From 15e60237d1e043db5d9b4ebb324e1cc2fae3a3d1 Mon Sep 17 00:00:00 2001 From: Raghuveer Date: Fri, 12 Sep 2025 13:40:32 -0700 Subject: [PATCH 02/10] CI: use one ASAN build and run it without SDE to improve speed --- .github/workflows/c-cpp.yml | 44 ++----------------------------------- 1 file changed, 2 insertions(+), 42 deletions(-) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 4cc0422..b4162c8 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -135,7 +135,7 @@ jobs: - name: Run test suite on SPR run: sde -spr -- ./builddir/testexe - ADL-ASAN-clang18: + ASAN-clang18: runs-on: intel-ubuntu-24.04 @@ -170,47 +170,7 @@ jobs: ninja - name: Run test suite on SPR - run: sde -adl -- ./builddir/testexe - - SPR-ASAN-clang18: - - runs-on: intel-ubuntu-24.04 - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install dependencies - run: | - sudo apt update - sudo apt -y install clang-18 libomp-18-dev libgtest-dev meson curl git - - - name: Install Intel SDE - run: | - curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz - mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ - sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde - - - name: Build examples - env: - CXX: clang++-18 - run: | - cd examples - make all - - - name: Build - env: - CXX: clang++-18 - run: | - make clean - meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir - cd builddir - ninja - - - name: Run test suite on SPR - run: sde -spr -- ./builddir/testexe - - name: Run ICL fp16 tests - # Note: This filters for the _Float16 tests based on the number assigned to it, which could change in the future - run: sde -icx -- ./builddir/testexe --gtest_filter="*/simdsort/2*" + run: ./builddir/testexe SKX-SKL-openmp: From 7264a847ccfaeab5eaf6622d0eb899b010e5acc6 Mon Sep 17 00:00:00 2001 From: Raghuveer Date: Fri, 12 Sep 2025 13:40:58 -0700 Subject: [PATCH 03/10] clang-format --- benchmarks/bench.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/bench.h b/benchmarks/bench.h index 3d77a51..63fb2f8 100644 --- a/benchmarks/bench.h +++ b/benchmarks/bench.h @@ -5,12 +5,12 @@ #define MY_BENCHMARK_CAPTURE(func, T, test_case_name, ...) \ BENCHMARK_PRIVATE_DECLARE(func) \ = (::benchmark::internal::RegisterBenchmarkInternal( \ - std::unique_ptr( \ - new ::benchmark::internal::FunctionBenchmark( \ - #func "/" #test_case_name "/" #T, \ - [](::benchmark::State &st) { \ - func(st, __VA_ARGS__); \ - })))) + std::unique_ptr( \ + new ::benchmark::internal::FunctionBenchmark( \ + #func "/" #test_case_name "/" #T, \ + [](::benchmark::State &st) { \ + func(st, __VA_ARGS__); \ + })))) #define BENCH_SORT(func, type) \ MY_BENCHMARK_CAPTURE(func, type, random_128, 128, std::string("random")); \ From ce6c288f7ebb2ba739879e919abab5b27e3650f5 Mon Sep 17 00:00:00 2001 From: Raghuveer Date: Fri, 12 Sep 2025 13:50:14 -0700 Subject: [PATCH 04/10] Simplify compiler checks --- lib/meson.build | 63 ++++++++++++++++++++++--------------------------- meson.build | 8 +++++++ 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/lib/meson.build b/lib/meson.build index 2f9f06c..29ee139 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -1,40 +1,33 @@ libtargets = [] +libtargets += static_library('libavx', + files( + 'x86simdsort-avx2.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX2'] : ['-march=haswell'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) -if cpp.has_argument('-march=haswell') - libtargets += static_library('libavx', - files( - 'x86simdsort-avx2.cpp', - ), - include_directories : [src], - cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX2'] : ['-march=haswell'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif - -if cpp.has_argument('-march=skylake-avx512') - libtargets += static_library('libskx', - files( - 'x86simdsort-skx.cpp', - ), - include_directories : [src], - cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=skylake-avx512'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif +libtargets += static_library('libskx', + files( + 'x86simdsort-skx.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=skylake-avx512'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) -if cpp.has_argument('-march=icelake-client') - libtargets += static_library('libicl', - files( - 'x86simdsort-icl.cpp', - ), - include_directories : [src], - cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=icelake-client'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif +libtargets += static_library('libicl', + files( + 'x86simdsort-icl.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=icelake-client'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) if cancompilefp16 libtargets += static_library('libspr', @@ -42,7 +35,7 @@ if cancompilefp16 'x86simdsort-spr.cpp', ), include_directories : [src], - cpp_args : meson.get_compiler('cpp').get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=sapphirerapids'], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=sapphirerapids'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) diff --git a/meson.build b/meson.build index 70c9fef..38e84d5 100644 --- a/meson.build +++ b/meson.build @@ -11,6 +11,13 @@ bench = include_directories('benchmarks') utils = include_directories('utils') tests = include_directories('tests') +# check if compiler supports -march=haswell, -march=skylake-avx512 and -march=icelake-client and error out if not +if cpp.get_id() != 'msvc' + if not cpp.has_argument('-march=haswell') or not cpp.has_argument('-march=skylake-avx512') or not cpp.has_argument('-march=icelake-client') + error('Compiler does not support -march=haswell, -march=skylake-avx512 or -march=icelake-client. Please use a newer compiler version.') + endif +endif + # Add IPP sort to benchmarks: benchipp = false ipplink = [] @@ -38,6 +45,7 @@ if get_option('use_openmp') omp_dep = declare_dependency(dependencies: omp, compile_args: ['-DXSS_USE_OPENMP']) endif + fp16code = '''#include int main() { __m512h temp = _mm512_set1_ph(1.0f); From 4dc9609a3a19af7166aeb83ec8a176f11197da6e Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Mon, 15 Sep 2025 12:36:59 -0700 Subject: [PATCH 05/10] Add avx512 fp16 header file when build with MSVC --- lib/x86simdsort-icl.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/x86simdsort-icl.cpp b/lib/x86simdsort-icl.cpp index 6bbad2c..96456ab 100644 --- a/lib/x86simdsort-icl.cpp +++ b/lib/x86simdsort-icl.cpp @@ -1,6 +1,10 @@ // ICL specific routines: #include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" +#ifdef _MSC_VER +#include "avx512-16bit-qsort.hpp" +#endif + namespace xss { namespace avx512 { From c1994cce527d73019f4c15b92930575513fdc2fb Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Tue, 16 Sep 2025 09:38:14 -0700 Subject: [PATCH 06/10] CI: add windows msvc build --- .github/workflows/c-cpp.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index b4162c8..a76809b 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -9,6 +9,37 @@ on: permissions: read-all jobs: + build-windows-msvc: + runs-on: windows-latest + + steps: + - uses: actions/checkout@v4 + + # Set up MSVC environment + - name: Set up MSVC Developer Command Prompt + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: x64 + + # Install Python (Meson requires it) + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + # Install Meson and Ninja + - name: Install Meson + Ninja + run: | + python -m pip install --upgrade pip + pip install meson ninja + + # Configure and build with Meson (MSVC will be used automatically) + - name: Configure (Meson) + run: meson setup --warnlevel 2 --buildtype release builddir --backend=ninja + + - name: Build (Ninja) + run: ninja -C builddir + SKL-gcc9: runs-on: intel-ubuntu-24.04 From 1be8ef6e99497604e61133bf421046ae2a3d5d25 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 24 Sep 2025 12:37:38 -0700 Subject: [PATCH 07/10] Use XSS_EXPORT_SYMBOL on all template specializations --- lib/x86simdsort.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 3ee12dd..2f052ac 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -62,7 +62,7 @@ namespace x86simdsort { #define DECLARE_INTERNAL_qsort(TYPE) \ static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ template <> \ - void qsort(TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + void XSS_EXPORT_SYMBOL qsort(TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_qsort##TYPE)(arr, arrsize, hasnan, descending); \ } @@ -71,7 +71,7 @@ namespace x86simdsort { static void (*internal_qselect##TYPE)(TYPE *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void qselect( \ + void XSS_EXPORT_SYMBOL qselect( \ TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_qselect##TYPE)(arr, k, arrsize, hasnan, descending); \ @@ -82,7 +82,7 @@ namespace x86simdsort { TYPE *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void partial_qsort( \ + void XSS_EXPORT_SYMBOL partial_qsort( \ TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_partial_qsort##TYPE)(arr, k, arrsize, hasnan, descending); \ @@ -93,7 +93,7 @@ namespace x86simdsort { TYPE *, size_t, bool, bool) \ = NULL; \ template <> \ - std::vector argsort( \ + std::vector XSS_EXPORT_SYMBOL argsort( \ TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ return (*internal_argsort##TYPE)(arr, arrsize, hasnan, descending); \ @@ -104,7 +104,7 @@ namespace x86simdsort { TYPE *, size_t, size_t, bool) \ = NULL; \ template <> \ - std::vector argselect( \ + std::vector XSS_EXPORT_SYMBOL argselect( \ TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ { \ return (*internal_argselect##TYPE)(arr, k, arrsize, hasnan); \ @@ -217,7 +217,7 @@ DISPATCH_ALL(argselect, TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void keyvalue_qsort(TYPE1 *key, \ + void XSS_EXPORT_SYMBOL keyvalue_qsort(TYPE1 *key, \ TYPE2 *val, \ size_t arrsize, \ bool hasnan, \ @@ -227,7 +227,7 @@ DISPATCH_ALL(argselect, key, val, arrsize, hasnan, descending); \ } \ template <> \ - void keyvalue_select(TYPE1 *key, \ + void XSS_EXPORT_SYMBOL keyvalue_select(TYPE1 *key, \ TYPE2 *val, \ size_t k, \ size_t arrsize, \ @@ -238,7 +238,7 @@ DISPATCH_ALL(argselect, key, val, k, arrsize, hasnan, descending); \ } \ template <> \ - void keyvalue_partial_sort(TYPE1 *key, \ + void XSS_EXPORT_SYMBOL keyvalue_partial_sort(TYPE1 *key, \ TYPE2 *val, \ size_t k, \ size_t arrsize, \ From 4e209f1b9f96a6aeba816c4745df7c6540f5d540 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 24 Sep 2025 13:14:06 -0700 Subject: [PATCH 08/10] Replace intel-ubuntu-24.04 with ubuntu-24.04 --- .github/workflows/build-numpy.yml | 4 ++-- .github/workflows/c-cpp.yml | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build-numpy.yml b/.github/workflows/build-numpy.yml index 9d2246e..61fe69c 100644 --- a/.github/workflows/build-numpy.yml +++ b/.github/workflows/build-numpy.yml @@ -14,7 +14,7 @@ jobs: np-multiarray-tgl: if: github.repository == 'intel/x86-simd-sort' - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - name: Checkout x86-simd-sort @@ -80,7 +80,7 @@ jobs: np-multiarray-spr: if: github.repository == 'intel/x86-simd-sort' - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - name: Checkout x86-simd-sort diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index a76809b..145ceca 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -42,7 +42,7 @@ jobs: SKL-gcc9: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -72,7 +72,7 @@ jobs: SKX-gcc10: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -102,7 +102,7 @@ jobs: TGL-gcc11: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -131,7 +131,7 @@ jobs: SPR-gcc13: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -168,7 +168,7 @@ jobs: ASAN-clang18: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -205,7 +205,7 @@ jobs: SKX-SKL-openmp: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -237,7 +237,7 @@ jobs: SPR-gcc13-special-cases: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 From 3c007d7688109855b868ee9f6823f6ab80023dba Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 24 Sep 2025 13:21:55 -0700 Subject: [PATCH 09/10] Fix CI numpy runs --- .github/workflows/build-numpy.yml | 4 ++-- .github/workflows/c-cpp.yml | 4 ++-- .github/workflows/linting.yml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-numpy.yml b/.github/workflows/build-numpy.yml index 61fe69c..9f6c9b4 100644 --- a/.github/workflows/build-numpy.yml +++ b/.github/workflows/build-numpy.yml @@ -13,7 +13,7 @@ permissions: read-all jobs: np-multiarray-tgl: - if: github.repository == 'intel/x86-simd-sort' + if: github.repository == 'numpy/x86-simd-sort' runs-on: ubuntu-24.04 steps: @@ -79,7 +79,7 @@ jobs: np-multiarray-spr: - if: github.repository == 'intel/x86-simd-sort' + if: github.repository == 'numpy/x86-simd-sort' runs-on: ubuntu-24.04 steps: diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 145ceca..2830db3 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -272,7 +272,7 @@ jobs: manylinux-32bit: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -284,7 +284,7 @@ jobs: SPR-icpx: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 4333186..eebff04 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -11,7 +11,7 @@ permissions: read-all jobs: clang-format: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 From 4cb589ed63740e35d5b539304186ae421b2495a6 Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Wed, 24 Sep 2025 13:51:21 -0700 Subject: [PATCH 10/10] Add checks for xsave and _xgetbv in cpuid checks --- lib/x86simdsortcpuid.h | 81 ++++++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 23 deletions(-) diff --git a/lib/x86simdsortcpuid.h b/lib/x86simdsortcpuid.h index a3dcd9e..6da177d 100644 --- a/lib/x86simdsortcpuid.h +++ b/lib/x86simdsortcpuid.h @@ -8,31 +8,66 @@ static std::unordered_map xss_cpu_features; -inline void xss_cpu_init() +static bool os_supports_avx() { - int cpuInfo[4] = {0}; - // Check AVX2 - __cpuid(cpuInfo, 0); - int nIds = cpuInfo[0]; + int cpuInfo[4]; __cpuid(cpuInfo, 1); - bool osxsave = (cpuInfo[2] & (1 << 27)) != 0; - bool avx = (cpuInfo[2] & (1 << 28)) != 0; - __cpuid(cpuInfo, 7); - bool avx2 = (cpuInfo[1] & (1 << 5)) != 0; - bool avx512f = (cpuInfo[1] & (1 << 16)) != 0; - bool avx512dq = (cpuInfo[1] & (1 << 17)) != 0; - bool avx512bw = (cpuInfo[1] & (1 << 30)) != 0; - bool avx512vl = (cpuInfo[1] & (1 << 31)) != 0; - bool avx512vbmi2 = (cpuInfo[2] & (1 << 6)) != 0; - bool avx512fp16 = (cpuInfo[3] & (1 << 23)) != 0; - // Store results - xss_cpu_features["avx2"] = avx2; - xss_cpu_features["avx512f"] = avx512f; - xss_cpu_features["avx512dq"] = avx512dq; - xss_cpu_features["avx512bw"] = avx512bw; - xss_cpu_features["avx512vl"] = avx512vl; - xss_cpu_features["avx512vbmi2"] = avx512vbmi2; - xss_cpu_features["avx512fp16"] = avx512fp16; + + bool osxsaveSupported = (cpuInfo[2] & (1 << 27)) != 0; // OSXSAVE bit + bool avxSupported = (cpuInfo[2] & (1 << 28)) != 0; // AVX bit + if (!(avxSupported && osxsaveSupported)) + return false; + + // Check XCR0[2:1] (XMM and YMM state) + unsigned long long xcr0 = _xgetbv(0); + return (xcr0 & 0x6) == 0x6; +} + +static bool os_supports_avx512() +{ + if (!os_supports_avx()) + return false; + + // Need XCR0[7:5] = opmask/ZMM/YMM state enabled + unsigned long long xcr0 = _xgetbv(0); + return (xcr0 & 0xE0) == 0xE0; +} + +void xss_cpu_init() +{ + int cpuInfo[4]; + __cpuid(cpuInfo, 0); + int maxLeaf = cpuInfo[0]; + + bool hasAVX2 = false; + bool hasAVX512F = false, hasAVX512DQ = false, hasAVX512BW = false, hasAVX512VL = false; + bool hasAVX512VBMI2 = false, hasAVX512FP16 = false; + + if (maxLeaf >= 7) + { + __cpuidex(cpuInfo, 7, 0); + + // EBX bits + hasAVX2 = os_supports_avx() && (cpuInfo[1] & (1 << 5)); + hasAVX512F = os_supports_avx512() && (cpuInfo[1] & (1 << 16)); + hasAVX512DQ = os_supports_avx512() && (cpuInfo[1] & (1 << 17)); + hasAVX512BW = os_supports_avx512() && (cpuInfo[1] & (1 << 30)); + hasAVX512VL = os_supports_avx512() && (cpuInfo[1] & (1 << 31)); + + // ECX bits + hasAVX512VBMI2 = os_supports_avx512() && (cpuInfo[2] & (1 << 6)); + + // EDX bits + hasAVX512FP16 = os_supports_avx512() && (cpuInfo[3] & (1 << 23)); + } + + xss_cpu_features["avx2"] = hasAVX2; + xss_cpu_features["avx512f"] = hasAVX512F; + xss_cpu_features["avx512dq"] = hasAVX512DQ; + xss_cpu_features["avx512bw"] = hasAVX512BW; + xss_cpu_features["avx512vl"] = hasAVX512VL; + xss_cpu_features["avx512vbmi2"] = hasAVX512VBMI2; + xss_cpu_features["avx512fp16"] = hasAVX512FP16; } inline bool xss_cpu_supports(const char *feature)