diff --git a/.github/workflows/build-numpy.yml b/.github/workflows/build-numpy.yml index 9d2246e..9f6c9b4 100644 --- a/.github/workflows/build-numpy.yml +++ b/.github/workflows/build-numpy.yml @@ -13,8 +13,8 @@ permissions: read-all jobs: np-multiarray-tgl: - if: github.repository == 'intel/x86-simd-sort' - runs-on: intel-ubuntu-24.04 + if: github.repository == 'numpy/x86-simd-sort' + runs-on: ubuntu-24.04 steps: - name: Checkout x86-simd-sort @@ -79,8 +79,8 @@ jobs: np-multiarray-spr: - if: github.repository == 'intel/x86-simd-sort' - runs-on: intel-ubuntu-24.04 + if: github.repository == 'numpy/x86-simd-sort' + runs-on: ubuntu-24.04 steps: - name: Checkout x86-simd-sort diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml index 4cc0422..2830db3 100644 --- a/.github/workflows/c-cpp.yml +++ b/.github/workflows/c-cpp.yml @@ -9,9 +9,40 @@ on: permissions: read-all jobs: + build-windows-msvc: + runs-on: windows-latest + + steps: + - uses: actions/checkout@v4 + + # Set up MSVC environment + - name: Set up MSVC Developer Command Prompt + uses: ilammy/msvc-dev-cmd@v1 + with: + arch: x64 + + # Install Python (Meson requires it) + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + + # Install Meson and Ninja + - name: Install Meson + Ninja + run: | + python -m pip install --upgrade pip + pip install meson ninja + + # Configure and build with Meson (MSVC will be used automatically) + - name: Configure (Meson) + run: meson setup --warnlevel 2 --buildtype release builddir --backend=ninja + + - name: Build (Ninja) + run: ninja -C builddir + SKL-gcc9: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -41,7 +72,7 @@ jobs: SKX-gcc10: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -71,7 +102,7 @@ jobs: TGL-gcc11: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -100,7 +131,7 @@ jobs: SPR-gcc13: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -135,9 +166,9 @@ jobs: - name: Run test suite on SPR run: sde -spr -- ./builddir/testexe - ADL-ASAN-clang18: + ASAN-clang18: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -170,51 +201,11 @@ jobs: ninja - name: Run test suite on SPR - run: sde -adl -- ./builddir/testexe - - SPR-ASAN-clang18: - - runs-on: intel-ubuntu-24.04 - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install dependencies - run: | - sudo apt update - sudo apt -y install clang-18 libomp-18-dev libgtest-dev meson curl git - - - name: Install Intel SDE - run: | - curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/859732/sde-external-9.58.0-2025-06-16-lin.tar.xz - mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/ - sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde - - - name: Build examples - env: - CXX: clang++-18 - run: | - cd examples - make all - - - name: Build - env: - CXX: clang++-18 - run: | - make clean - meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Dasan_ci_dont_validate=true -Db_lundef=false --warnlevel 0 --buildtype release builddir - cd builddir - ninja - - - name: Run test suite on SPR - run: sde -spr -- ./builddir/testexe - - name: Run ICL fp16 tests - # Note: This filters for the _Float16 tests based on the number assigned to it, which could change in the future - run: sde -icx -- ./builddir/testexe --gtest_filter="*/simdsort/2*" + run: ./builddir/testexe SKX-SKL-openmp: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -246,7 +237,7 @@ jobs: SPR-gcc13-special-cases: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -281,7 +272,7 @@ jobs: manylinux-32bit: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 @@ -293,7 +284,7 @@ jobs: SPR-icpx: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 4333186..eebff04 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -11,7 +11,7 @@ permissions: read-all jobs: clang-format: - runs-on: intel-ubuntu-24.04 + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 diff --git a/benchmarks/bench.h b/benchmarks/bench.h index 3d77a51..63fb2f8 100644 --- a/benchmarks/bench.h +++ b/benchmarks/bench.h @@ -5,12 +5,12 @@ #define MY_BENCHMARK_CAPTURE(func, T, test_case_name, ...) \ BENCHMARK_PRIVATE_DECLARE(func) \ = (::benchmark::internal::RegisterBenchmarkInternal( \ - std::unique_ptr( \ - new ::benchmark::internal::FunctionBenchmark( \ - #func "/" #test_case_name "/" #T, \ - [](::benchmark::State &st) { \ - func(st, __VA_ARGS__); \ - })))) + std::unique_ptr( \ + new ::benchmark::internal::FunctionBenchmark( \ + #func "/" #test_case_name "/" #T, \ + [](::benchmark::State &st) { \ + func(st, __VA_ARGS__); \ + })))) #define BENCH_SORT(func, type) \ MY_BENCHMARK_CAPTURE(func, type, random_128, 128, std::string("random")); \ diff --git a/lib/meson.build b/lib/meson.build index 44ced53..29ee139 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -1,40 +1,33 @@ libtargets = [] +libtargets += static_library('libavx', + files( + 'x86simdsort-avx2.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX2'] : ['-march=haswell'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) -if cpp.has_argument('-march=haswell') - libtargets += static_library('libavx', - files( - 'x86simdsort-avx2.cpp', - ), - include_directories : [src], - cpp_args : ['-march=haswell'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif - -if cpp.has_argument('-march=skylake-avx512') - libtargets += static_library('libskx', - files( - 'x86simdsort-skx.cpp', - ), - include_directories : [src], - cpp_args : ['-march=skylake-avx512'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif +libtargets += static_library('libskx', + files( + 'x86simdsort-skx.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=skylake-avx512'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) -if cpp.has_argument('-march=icelake-client') - libtargets += static_library('libicl', - files( - 'x86simdsort-icl.cpp', - ), - include_directories : [src], - cpp_args : ['-march=icelake-client'], - gnu_symbol_visibility : 'inlineshidden', - dependencies: [omp_dep], - ) -endif +libtargets += static_library('libicl', + files( + 'x86simdsort-icl.cpp', + ), + include_directories : [src], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=icelake-client'], + gnu_symbol_visibility : 'inlineshidden', + dependencies: [omp_dep], + ) if cancompilefp16 libtargets += static_library('libspr', @@ -42,7 +35,7 @@ if cancompilefp16 'x86simdsort-spr.cpp', ), include_directories : [src], - cpp_args : ['-march=sapphirerapids'], + cpp_args : cpp.get_id() == 'msvc' ? ['/arch:AVX512'] : ['-march=sapphirerapids'], gnu_symbol_visibility : 'inlineshidden', dependencies: [omp_dep], ) diff --git a/lib/x86simdsort-icl.cpp b/lib/x86simdsort-icl.cpp index 6bbad2c..96456ab 100644 --- a/lib/x86simdsort-icl.cpp +++ b/lib/x86simdsort-icl.cpp @@ -1,6 +1,10 @@ // ICL specific routines: #include "x86simdsort-static-incl.h" #include "x86simdsort-internal.h" +#ifdef _MSC_VER +#include "avx512-16bit-qsort.hpp" +#endif + namespace xss { namespace avx512 { diff --git a/lib/x86simdsort.cpp b/lib/x86simdsort.cpp index 8ef9aad..2f052ac 100644 --- a/lib/x86simdsort.cpp +++ b/lib/x86simdsort.cpp @@ -1,6 +1,12 @@ +#if defined(_MSC_VER) +#define XSS_ATTRIBUTE_CONSTRUCTOR +#else +#define XSS_ATTRIBUTE_CONSTRUCTOR __attribute__((constructor)) +#endif #include "x86simdsort.h" #include "x86simdsort-internal.h" #include "x86simdsort-scalar.h" +#include "x86simdsortcpuid.h" #include #include #include @@ -12,23 +18,19 @@ static int check_cpu_feature_support(std::string_view cpufeature) if ((cpufeature == "avx512_spr") && (!disable_avx512)) #if defined(__FLT16_MAX__) && !defined(__INTEL_LLVM_COMPILER) \ && (!defined(__clang_major__) || __clang_major__ >= 18) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512fp16") - && __builtin_cpu_supports("avx512vbmi2"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512fp16") + && xss_cpu_supports("avx512vbmi2"); #else return 0; #endif else if ((cpufeature == "avx512_icl") && (!disable_avx512)) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512vbmi2") - && __builtin_cpu_supports("avx512bw") - && __builtin_cpu_supports("avx512vl"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512vbmi2") + && xss_cpu_supports("avx512bw") && xss_cpu_supports("avx512vl"); else if ((cpufeature == "avx512_skx") && (!disable_avx512)) - return __builtin_cpu_supports("avx512f") - && __builtin_cpu_supports("avx512dq") - && __builtin_cpu_supports("avx512vl"); + return xss_cpu_supports("avx512f") && xss_cpu_supports("avx512dq") + && xss_cpu_supports("avx512vl"); else if (cpufeature == "avx2") - return __builtin_cpu_supports("avx2"); + return xss_cpu_supports("avx2"); return 0; } @@ -60,7 +62,7 @@ namespace x86simdsort { #define DECLARE_INTERNAL_qsort(TYPE) \ static void (*internal_qsort##TYPE)(TYPE *, size_t, bool, bool) = NULL; \ template <> \ - void qsort(TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ + void XSS_EXPORT_SYMBOL qsort(TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_qsort##TYPE)(arr, arrsize, hasnan, descending); \ } @@ -69,7 +71,7 @@ namespace x86simdsort { static void (*internal_qselect##TYPE)(TYPE *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void qselect( \ + void XSS_EXPORT_SYMBOL qselect( \ TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_qselect##TYPE)(arr, k, arrsize, hasnan, descending); \ @@ -80,7 +82,7 @@ namespace x86simdsort { TYPE *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void partial_qsort( \ + void XSS_EXPORT_SYMBOL partial_qsort( \ TYPE *arr, size_t k, size_t arrsize, bool hasnan, bool descending) \ { \ (*internal_partial_qsort##TYPE)(arr, k, arrsize, hasnan, descending); \ @@ -91,7 +93,7 @@ namespace x86simdsort { TYPE *, size_t, bool, bool) \ = NULL; \ template <> \ - std::vector argsort( \ + std::vector XSS_EXPORT_SYMBOL argsort( \ TYPE *arr, size_t arrsize, bool hasnan, bool descending) \ { \ return (*internal_argsort##TYPE)(arr, arrsize, hasnan, descending); \ @@ -102,7 +104,7 @@ namespace x86simdsort { TYPE *, size_t, size_t, bool) \ = NULL; \ template <> \ - std::vector argselect( \ + std::vector XSS_EXPORT_SYMBOL argselect( \ TYPE *arr, size_t k, size_t arrsize, bool hasnan) \ { \ return (*internal_argselect##TYPE)(arr, k, arrsize, hasnan); \ @@ -121,11 +123,11 @@ constexpr bool IS_TYPE_FLOAT16() /* runtime dispatch mechanism */ #define DISPATCH(func, TYPE, ISA) \ - DECLARE_INTERNAL_##func(TYPE) static __attribute__((constructor)) void \ - CAT(CAT(resolve_, func), TYPE)(void) \ + DECLARE_INTERNAL_##func(TYPE) static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ + CAT(resolve_, func), TYPE)(void) \ { \ CAT(CAT(internal_, func), TYPE) = &xss::scalar::func; \ - __builtin_cpu_init(); \ + xss_cpu_init(); \ std::string_view preferred_cpu = find_preferred_cpu(ISA); \ if constexpr (dispatch_requested("avx512", ISA)) { \ if (preferred_cpu.find("avx512") != std::string_view::npos) { \ @@ -215,7 +217,7 @@ DISPATCH_ALL(argselect, TYPE1 *, TYPE2 *, size_t, size_t, bool, bool) \ = NULL; \ template <> \ - void keyvalue_qsort(TYPE1 *key, \ + void XSS_EXPORT_SYMBOL keyvalue_qsort(TYPE1 *key, \ TYPE2 *val, \ size_t arrsize, \ bool hasnan, \ @@ -225,7 +227,7 @@ DISPATCH_ALL(argselect, key, val, arrsize, hasnan, descending); \ } \ template <> \ - void keyvalue_select(TYPE1 *key, \ + void XSS_EXPORT_SYMBOL keyvalue_select(TYPE1 *key, \ TYPE2 *val, \ size_t k, \ size_t arrsize, \ @@ -236,7 +238,7 @@ DISPATCH_ALL(argselect, key, val, k, arrsize, hasnan, descending); \ } \ template <> \ - void keyvalue_partial_sort(TYPE1 *key, \ + void XSS_EXPORT_SYMBOL keyvalue_partial_sort(TYPE1 *key, \ TYPE2 *val, \ size_t k, \ size_t arrsize, \ @@ -248,12 +250,12 @@ DISPATCH_ALL(argselect, } #define DISPATCH_KV_FUNC(func, TYPE1, TYPE2, ISA) \ - static __attribute__((constructor)) void CAT( \ + static XSS_ATTRIBUTE_CONSTRUCTOR void CAT( \ CAT(CAT(CAT(resolve_, func), _), TYPE1), TYPE2)(void) \ { \ CAT(CAT(CAT(CAT(internal_, func), _), TYPE1), TYPE2) \ = &xss::scalar::func; \ - __builtin_cpu_init(); \ + xss_cpu_init(); \ std::string_view preferred_cpu = find_preferred_cpu(ISA); \ if constexpr (dispatch_requested("avx512", ISA)) { \ if (preferred_cpu.find("avx512") != std::string_view::npos) { \ diff --git a/lib/x86simdsort.h b/lib/x86simdsort.h index 2e47b6a..f35bd44 100644 --- a/lib/x86simdsort.h +++ b/lib/x86simdsort.h @@ -6,8 +6,13 @@ #include #include +#if defined(_MSC_VER) +#define XSS_EXPORT_SYMBOL __declspec(dllexport) +#define XSS_HIDE_SYMBOL +#else #define XSS_EXPORT_SYMBOL __attribute__((visibility("default"))) #define XSS_HIDE_SYMBOL __attribute__((visibility("hidden"))) +#endif #define UNUSED(x) (void)(x) namespace x86simdsort { @@ -73,11 +78,14 @@ XSS_EXPORT_SYMBOL void keyvalue_partial_sort(T1 *key, template XSS_EXPORT_SYMBOL void object_qsort(T *arr, U arrsize, Func key_func) { - static_assert(std::is_integral::value, "arrsize must be an integral type"); + static_assert(std::is_integral::value, + "arrsize must be an integral type"); static_assert(sizeof(U) == sizeof(int32_t) || sizeof(U) == sizeof(int64_t), "arrsize must be 32 or 64 bits"); - using return_type_of = typename decltype(std::function{key_func})::result_type; - static_assert(sizeof(return_type_of) == sizeof(int32_t) || sizeof(return_type_of) == sizeof(int64_t), + using return_type_of = + typename decltype(std::function {key_func})::result_type; + static_assert(sizeof(return_type_of) == sizeof(int32_t) + || sizeof(return_type_of) == sizeof(int64_t), "key_func return type must be 32 or 64 bits"); std::vector keys(arrsize); for (U ii = 0; ii < arrsize; ++ii) { diff --git a/lib/x86simdsortcpuid.h b/lib/x86simdsortcpuid.h new file mode 100644 index 0000000..6da177d --- /dev/null +++ b/lib/x86simdsortcpuid.h @@ -0,0 +1,83 @@ +#ifndef X86SIMDSORT_CPUID_H +#define X86SIMDSORT_CPUID_H + +#ifdef _MSC_VER +#include +#include +#include + +static std::unordered_map xss_cpu_features; + +static bool os_supports_avx() +{ + int cpuInfo[4]; + __cpuid(cpuInfo, 1); + + bool osxsaveSupported = (cpuInfo[2] & (1 << 27)) != 0; // OSXSAVE bit + bool avxSupported = (cpuInfo[2] & (1 << 28)) != 0; // AVX bit + if (!(avxSupported && osxsaveSupported)) + return false; + + // Check XCR0[2:1] (XMM and YMM state) + unsigned long long xcr0 = _xgetbv(0); + return (xcr0 & 0x6) == 0x6; +} + +static bool os_supports_avx512() +{ + if (!os_supports_avx()) + return false; + + // Need XCR0[7:5] = opmask/ZMM/YMM state enabled + unsigned long long xcr0 = _xgetbv(0); + return (xcr0 & 0xE0) == 0xE0; +} + +void xss_cpu_init() +{ + int cpuInfo[4]; + __cpuid(cpuInfo, 0); + int maxLeaf = cpuInfo[0]; + + bool hasAVX2 = false; + bool hasAVX512F = false, hasAVX512DQ = false, hasAVX512BW = false, hasAVX512VL = false; + bool hasAVX512VBMI2 = false, hasAVX512FP16 = false; + + if (maxLeaf >= 7) + { + __cpuidex(cpuInfo, 7, 0); + + // EBX bits + hasAVX2 = os_supports_avx() && (cpuInfo[1] & (1 << 5)); + hasAVX512F = os_supports_avx512() && (cpuInfo[1] & (1 << 16)); + hasAVX512DQ = os_supports_avx512() && (cpuInfo[1] & (1 << 17)); + hasAVX512BW = os_supports_avx512() && (cpuInfo[1] & (1 << 30)); + hasAVX512VL = os_supports_avx512() && (cpuInfo[1] & (1 << 31)); + + // ECX bits + hasAVX512VBMI2 = os_supports_avx512() && (cpuInfo[2] & (1 << 6)); + + // EDX bits + hasAVX512FP16 = os_supports_avx512() && (cpuInfo[3] & (1 << 23)); + } + + xss_cpu_features["avx2"] = hasAVX2; + xss_cpu_features["avx512f"] = hasAVX512F; + xss_cpu_features["avx512dq"] = hasAVX512DQ; + xss_cpu_features["avx512bw"] = hasAVX512BW; + xss_cpu_features["avx512vl"] = hasAVX512VL; + xss_cpu_features["avx512vbmi2"] = hasAVX512VBMI2; + xss_cpu_features["avx512fp16"] = hasAVX512FP16; +} + +inline bool xss_cpu_supports(const char *feature) +{ + auto it = xss_cpu_features.find(feature); + return it != xss_cpu_features.end() && it->second; +} + +#else +#define xss_cpu_init() __builtin_cpu_init() +#define xss_cpu_supports(feature) __builtin_cpu_supports(feature) +#endif // _MSC_VER +#endif // X86SIMDSORT_CPUID_H diff --git a/meson.build b/meson.build index 0b826f0..38e84d5 100644 --- a/meson.build +++ b/meson.build @@ -1,3 +1,4 @@ + project('x86-simd-sort', 'cpp', version : '7.0.x', license : 'BSD 3-clause', @@ -10,6 +11,13 @@ bench = include_directories('benchmarks') utils = include_directories('utils') tests = include_directories('tests') +# check if compiler supports -march=haswell, -march=skylake-avx512 and -march=icelake-client and error out if not +if cpp.get_id() != 'msvc' + if not cpp.has_argument('-march=haswell') or not cpp.has_argument('-march=skylake-avx512') or not cpp.has_argument('-march=icelake-client') + error('Compiler does not support -march=haswell, -march=skylake-avx512 or -march=icelake-client. Please use a newer compiler version.') + endif +endif + # Add IPP sort to benchmarks: benchipp = false ipplink = [] @@ -37,6 +45,7 @@ if get_option('use_openmp') omp_dep = declare_dependency(dependencies: omp, compile_args: ['-DXSS_USE_OPENMP']) endif + fp16code = '''#include int main() { __m512h temp = _mm512_set1_ph(1.0f);