Mitigated poor compressstore performance on AMD Zen 4
Zen 4's AVX-512 compressstore implementation is highly inefficient (a reciprocal throughput on the order of 50-70 cycles). Emulating it with a separate `compress` followed by a masked `storeu` is, in fact, faster than the native operation.

To choose between the native and emulated versions, an `SW_VCOMPRESS` flag can be passed to the Makefile (`SW_VCOMPRESS=1 make`).
natmaurice committed Feb 16, 2023
1 parent 7d7591c commit 41d03b2
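The workaround applied in each of the changed headers follows the same pattern; below is a minimal sketch for 32-bit lanes (illustrative names, not the exact code from the diff; assumes AVX-512F and BMI2):

#include <immintrin.h>

// Sketch of the workaround: instead of vcompressstore (slow on Zen 4),
// compress the selected lanes to the bottom of the register, then issue
// a plain masked store covering only the first popcount(mask) lanes.
static inline void emulated_compressstoreu_epi32(void *mem, __mmask16 mask, __m512i x)
{
    // Pack the active lanes into the low elements; the rest are zeroed.
    __m512i packed = _mm512_maskz_compress_epi32(mask, x);
    // Build a contiguous mask with popcount(mask) low bits set.
    __mmask16 store_mask = (__mmask16)_pext_u32(0xFFFFu, (unsigned)mask);
    // Store only those lanes; memory beyond them is left untouched.
    _mm512_mask_storeu_epi32(mem, store_mask, packed);
}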
Showing 4 changed files with 81 additions and 10 deletions.
3 changes: 2 additions & 1 deletion Makefile
@@ -11,6 +11,7 @@ GTEST_LIB = gtest
GTEST_INCLUDE = /usr/local/include
CXXFLAGS += -I$(SRCDIR) -I$(GTEST_INCLUDE) -I$(UTILS)
LD_FLAGS = -L /usr/local/lib -l $(GTEST_LIB) -l pthread
SW_VCOMPRESS ?= 0

all : test bench

@@ -21,7 +22,7 @@ test: $(TESTDIR)/main.cpp $(TESTOBJS) $(SRCS)
$(CXX) tests/main.cpp $(TESTOBJS) $(CXXFLAGS) $(LD_FLAGS) -o testexe

bench: $(BENCHDIR)/main.cpp $(SRCS)
$(CXX) $(BENCHDIR)/main.cpp $(CXXFLAGS) -march=icelake-client -O3 -o benchexe
$(CXX) $(BENCHDIR)/main.cpp $(CXXFLAGS) -march=icelake-client -O3 -DSW_VCOMPRESS=$(SW_VCOMPRESS) -o benchexe

clean:
rm -f $(TESTDIR)/*.o testexe benchexe
30 changes: 27 additions & 3 deletions src/avx512-16bit-qsort.hpp
@@ -8,6 +8,8 @@
#define AVX512_QSORT_16BIT

#include "avx512-common-qsort.h"
#include <immintrin.h>
#include <x86intrin.h>

/*
* Constants used in sorting 32 elements in a ZMM registers. Based on Bitonic
@@ -103,7 +105,15 @@ struct zmm_vector<float16> {
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
{
// AVX512_VBMI2
return _mm512_mask_compressstoreu_epi16(mem, mask, x);

#if SW_VCOMPRESS
opmask_t store_mask = _pext_u32(-1, mask);
_mm512_mask_storeu_epi16(mem, store_mask, _mm512_maskz_compress_epi16(mask, x));
#else
_mm512_mask_compressstoreu_epi16(mem, mask, x);
#endif // SW_VCOMPRESS

//return
}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
{
@@ -218,7 +228,14 @@ struct zmm_vector<int16_t> {
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
{
// AVX512_VBMI2
return _mm512_mask_compressstoreu_epi16(mem, mask, x);

#if SW_VCOMPRESS
opmask_t store_mask = _pext_u32(-1, mask);
_mm512_mask_storeu_epi16(mem, store_mask, _mm512_maskz_compress_epi16(mask, x));
#else
_mm512_mask_compressstoreu_epi16(mem, mask, x);
#endif // SW_VCOMPRESS

}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
{
@@ -315,7 +332,14 @@ struct zmm_vector<uint16_t> {
}
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
{
return _mm512_mask_compressstoreu_epi16(mem, mask, x);

#if SW_VCOMPRESS
opmask_t store_mask = _pext_u32(-1, mask);
_mm512_mask_storeu_epi16(mem, store_mask, _mm512_maskz_compress_epi16(mask, x));
#else
_mm512_mask_compressstoreu_epi16(mem, mask, x);
#endif // SW_VCOMPRESS

}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
{
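A quick worked example of how the `_pext_u32(-1, mask)` line above builds the contiguous store mask (hypothetical mask value, not taken from the commit; needs BMI2, e.g. -mbmi2):

#include <immintrin.h>
#include <cassert>

int main()
{
    // Hypothetical mask: lanes 0, 1 and 3 are selected (popcount = 3).
    unsigned mask = 0b1011u;
    // PEXT gathers the bits of the all-ones source selected by mask and
    // packs them into the low positions: one set bit per selected lane.
    unsigned store_mask = _pext_u32(~0u, mask);
    assert(store_mask == 0b0111u); // contiguous mask of popcount(mask) bits
    return 0;
}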
29 changes: 26 additions & 3 deletions src/avx512-32bit-qsort.hpp
@@ -9,6 +9,8 @@
#define AVX512_QSORT_32BIT

#include "avx512-common-qsort.h"
#include <immintrin.h>
#include <x86intrin.h>

/*
* Constants used in sorting 16 elements in a ZMM registers. Based on Bitonic
@@ -68,7 +70,14 @@ struct zmm_vector<int32_t> {
}
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
{
return _mm512_mask_compressstoreu_epi32(mem, mask, x);

#if SW_VCOMPRESS
opmask_t store_mask = _pext_u32(-1, mask);
_mm512_mask_storeu_epi32(mem, store_mask, _mm512_maskz_compress_epi32(mask, x));
#else
_mm512_mask_compressstoreu_epi32(mem, mask, x);
#endif // SW_VCOMPRESS

}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
{
@@ -174,7 +183,14 @@ struct zmm_vector<uint32_t> {
}
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
{
return _mm512_mask_compressstoreu_epi32(mem, mask, x);

#if SW_VCOMPRESS
opmask_t store_mask = _pext_u32(-1, mask);
_mm512_mask_storeu_epi32(mem, store_mask, _mm512_maskz_compress_epi32(mask, x));
#else
_mm512_mask_compressstoreu_epi32(mem, mask, x);
#endif // SW_VCOMPRESS

}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
{
@@ -277,7 +293,14 @@ struct zmm_vector<float> {
}
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
{
return _mm512_mask_compressstoreu_ps(mem, mask, x);

#if SW_VCOMPRESS
opmask_t store_mask = _pext_u32(-1, mask);
_mm512_mask_storeu_ps(mem, store_mask, _mm512_maskz_compress_ps(mask, x));
#else
_mm512_mask_compressstoreu_ps(mem, mask, x);
#endif // SW_VCOMPRESS

}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
{
29 changes: 26 additions & 3 deletions src/avx512-64bit-qsort.hpp
@@ -8,6 +8,8 @@
#define AVX512_QSORT_64BIT

#include "avx512-common-qsort.h"
#include <immintrin.h>
#include <x86intrin.h>

/*
* Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic
@@ -76,7 +78,14 @@ struct zmm_vector<int64_t> {
}
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
{
return _mm512_mask_compressstoreu_epi64(mem, mask, x);

#if SW_VCOMPRESS
opmask_t store_mask = _pext_u32(-1, mask);
_mm512_mask_storeu_epi64(mem, store_mask, _mm512_maskz_compress_epi64(mask, x));
#else
_mm512_mask_compressstoreu_epi64(mem, mask, x);
#endif // SW_VCOMPRESS

}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
{
@@ -178,7 +187,14 @@ struct zmm_vector<uint64_t> {
}
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
{
return _mm512_mask_compressstoreu_epi64(mem, mask, x);

#if SW_VCOMPRESS
opmask_t store_mask = _pext_u32(-1, mask);
_mm512_mask_storeu_epi64(mem, store_mask, _mm512_maskz_compress_epi64(mask, x));
#else
_mm512_mask_compressstoreu_epi64(mem, mask, x);
#endif // SW_VCOMPRESS

}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
{
@@ -280,7 +296,14 @@ struct zmm_vector<double> {
}
static void mask_compressstoreu(void *mem, opmask_t mask, zmm_t x)
{
return _mm512_mask_compressstoreu_pd(mem, mask, x);

#if SW_VCOMPRESS
opmask_t store_mask = _pext_u32(-1, mask);
_mm512_mask_storeu_pd(mem, store_mask, _mm512_maskz_compress_pd(mask, x));
#else
_mm512_mask_compressstoreu_pd(mem, mask, x);
#endif // SW_VCOMPRESS

}
static zmm_t mask_loadu(zmm_t x, opmask_t mask, void const *mem)
{
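A small standalone sanity check along these lines (not part of the commit; hypothetical test code that assumes an AVX-512 + BMI2 capable CPU and a matching -march flag such as the icelake-client one used in the Makefile) can confirm that the emulated path writes exactly the same bytes as the native compress-store:

#include <immintrin.h>
#include <cstdint>
#include <cstring>
#include <cstdio>

int main()
{
    int32_t src[16];
    for (int i = 0; i < 16; ++i) src[i] = i + 1;
    __m512i v = _mm512_loadu_si512(src);
    __mmask16 mask = 0b1010101010101010; // keep the odd lanes

    int32_t native[16] = {0}, emulated[16] = {0};

    // Native compress-store (the slow path on Zen 4).
    _mm512_mask_compressstoreu_epi32(native, mask, v);

    // Emulated path: compress, then a masked store of the low lanes.
    __mmask16 store_mask = (__mmask16)_pext_u32(0xFFFFu, (unsigned)mask);
    _mm512_mask_storeu_epi32(emulated, store_mask,
                             _mm512_maskz_compress_epi32(mask, v));

    std::printf("%s\n", std::memcmp(native, emulated, sizeof(native)) == 0
                            ? "match" : "mismatch");
    return 0;
}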

3 comments on commit 41d03b2


@mr-c mr-c commented on 41d03b2 Feb 17, 2023


@natmaurice This is interesting! Have you opened an issue with gcc?

@natmaurice natmaurice (Owner, Author) commented


@mr-c That's a good point.

So far, the current release of gcc (12.2) does not support Zen 4 (-march=znver4), and neither does clang.
This support should arrive for gcc 13. Unfortunately, the pre-release doesn't seem to optimize compressstoreu into a faster emulated version. I haven't found any report about the issue either.

So yes, I'm probably going to follow your advice and will file an issue with gcc.


@mr-c mr-c commented on 41d03b2 Feb 18, 2023


Yes, this is the best time to report bugs in GCC, before the first release of the new series is out.

By the way, I adapted the code here in a branch of SIMDe I'm currently working on: simd-everywhere/simde@13cc2be (frequently rebased in https://github.com/simd-everywhere/simde/tree/x86-simd-sort )

Do you know if clang supports Zen 4 yet? I can add a similar fix.
