diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index f70b3991473fa..cd43d8ec00b5d 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -332,226 +332,11 @@ void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, const int64_t num_valu } #endif // ARROW_HAVE_AVX2 -#if defined(ARROW_HAVE_AVX512) -template -void ByteStreamSplitDecodeAvx512(const uint8_t* data, int64_t num_values, int64_t stride, - uint8_t* out) { - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); - constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 3 : 2); - constexpr int64_t kBlockSize = sizeof(__m512i) * kNumStreams; - - const int64_t size = num_values * kNumStreams; - if (size < kBlockSize) // Back to AVX2 for small size - return ByteStreamSplitDecodeAvx2(data, num_values, stride, out); - const int64_t num_blocks = size / kBlockSize; - - // First handle suffix. - const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; - for (int64_t i = num_processed_elements; i < num_values; ++i) { - uint8_t gathered_byte_data[kNumStreams]; - for (int b = 0; b < kNumStreams; ++b) { - const int64_t byte_index = b * stride + i; - gathered_byte_data[b] = data[byte_index]; - } - memcpy(out + i * kNumStreams, gathered_byte_data, kNumStreams); - } - - // Processed hierarchically using the unpack, then two shuffles. - __m512i stage[kNumStreamsLog2 + 1][kNumStreams]; - __m512i shuffle[kNumStreams]; - __m512i final_result[kNumStreams]; - constexpr int kNumStreamsHalf = kNumStreams / 2U; - - for (int64_t i = 0; i < num_blocks; ++i) { - for (int j = 0; j < kNumStreams; ++j) { - stage[0][j] = _mm512_loadu_si512( - reinterpret_cast(&data[i * sizeof(__m512i) + j * stride])); - } - - for (int step = 0; step < kNumStreamsLog2; ++step) { - for (int j = 0; j < kNumStreamsHalf; ++j) { - stage[step + 1][j * 2] = - _mm512_unpacklo_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); - stage[step + 1][j * 2 + 1] = - _mm512_unpackhi_epi8(stage[step][j], stage[step][kNumStreamsHalf + j]); - } - } - - if constexpr (kNumStreams == 8) { - // path for double, 128i index: - // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C}, - // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D}, - // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E}, - // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F}, - shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0], - stage[kNumStreamsLog2][1], 0b01000100); - shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b01000100); - shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4], - stage[kNumStreamsLog2][5], 0b01000100); - shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6], - stage[kNumStreamsLog2][7], 0b01000100); - shuffle[4] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0], - stage[kNumStreamsLog2][1], 0b11101110); - shuffle[5] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b11101110); - shuffle[6] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][4], - stage[kNumStreamsLog2][5], 0b11101110); - shuffle[7] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][6], - stage[kNumStreamsLog2][7], 0b11101110); - - final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000); - final_result[1] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000); - final_result[2] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101); - final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101); - final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000); - final_result[5] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000); - final_result[6] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101); - final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101); - } else { - // path for float, 128i index: - // {0x00, 0x04, 0x08, 0x0C}, {0x01, 0x05, 0x09, 0x0D} - // {0x02, 0x06, 0x0A, 0x0E}, {0x03, 0x07, 0x0B, 0x0F}, - shuffle[0] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0], - stage[kNumStreamsLog2][1], 0b01000100); - shuffle[1] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b01000100); - shuffle[2] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][0], - stage[kNumStreamsLog2][1], 0b11101110); - shuffle[3] = _mm512_shuffle_i32x4(stage[kNumStreamsLog2][2], - stage[kNumStreamsLog2][3], 0b11101110); - - final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000); - final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101); - final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000); - final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101); - } - - for (int j = 0; j < kNumStreams; ++j) { - _mm512_storeu_si512( - reinterpret_cast<__m512i*>(out + (i * kNumStreams + j) * sizeof(__m512i)), - final_result[j]); - } - } -} - -template -void ByteStreamSplitEncodeAvx512(const uint8_t* raw_values, const int64_t num_values, - uint8_t* output_buffer_raw) { - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); - constexpr int kBlockSize = sizeof(__m512i) * kNumStreams; - - const int64_t size = num_values * kNumStreams; - - if (size < kBlockSize) // Back to AVX2 for small size - return ByteStreamSplitEncodeAvx2(raw_values, num_values, - output_buffer_raw); - - const int64_t num_blocks = size / kBlockSize; - const __m512i* raw_values_simd = reinterpret_cast(raw_values); - __m512i* output_buffer_streams[kNumStreams]; - for (int i = 0; i < kNumStreams; ++i) { - output_buffer_streams[i] = - reinterpret_cast<__m512i*>(&output_buffer_raw[num_values * i]); - } - - // First handle suffix. - const int64_t num_processed_elements = (num_blocks * kBlockSize) / kNumStreams; - for (int64_t i = num_processed_elements; i < num_values; ++i) { - for (int j = 0; j < kNumStreams; ++j) { - const uint8_t byte_in_value = raw_values[i * kNumStreams + j]; - output_buffer_raw[j * num_values + i] = byte_in_value; - } - } - - constexpr int KNumUnpack = (kNumStreams == 8) ? 2 : 3; - __m512i final_result[kNumStreams]; - __m512i unpack[KNumUnpack + 1][kNumStreams]; - __m512i permutex[kNumStreams]; - __m512i permutex_mask; - if constexpr (kNumStreams == 8) { - // use _mm512_set_epi32, no _mm512_set_epi16 for some old gcc version. - permutex_mask = _mm512_set_epi32(0x001F0017, 0x000F0007, 0x001E0016, 0x000E0006, - 0x001D0015, 0x000D0005, 0x001C0014, 0x000C0004, - 0x001B0013, 0x000B0003, 0x001A0012, 0x000A0002, - 0x00190011, 0x00090001, 0x00180010, 0x00080000); - } else { - permutex_mask = _mm512_set_epi32(0x0F, 0x0B, 0x07, 0x03, 0x0E, 0x0A, 0x06, 0x02, 0x0D, - 0x09, 0x05, 0x01, 0x0C, 0x08, 0x04, 0x00); - } - - for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { - for (int i = 0; i < kNumStreams; ++i) { - unpack[0][i] = _mm512_loadu_si512(&raw_values_simd[block_index * kNumStreams + i]); - } - - for (int unpack_lvl = 0; unpack_lvl < KNumUnpack; ++unpack_lvl) { - for (int i = 0; i < kNumStreams / 2; ++i) { - unpack[unpack_lvl + 1][i * 2] = _mm512_unpacklo_epi8( - unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]); - unpack[unpack_lvl + 1][i * 2 + 1] = _mm512_unpackhi_epi8( - unpack[unpack_lvl][i * 2], unpack[unpack_lvl][i * 2 + 1]); - } - } - - if constexpr (kNumStreams == 8) { - // path for double - // 1. unpack to epi16 block - // 2. permutexvar_epi16 to 128i block - // 3. shuffle 128i to final 512i target, index: - // {0x00, 0x04, 0x08, 0x0C}, {0x10, 0x14, 0x18, 0x1C}, - // {0x01, 0x05, 0x09, 0x0D}, {0x11, 0x15, 0x19, 0x1D}, - // {0x02, 0x06, 0x0A, 0x0E}, {0x12, 0x16, 0x1A, 0x1E}, - // {0x03, 0x07, 0x0B, 0x0F}, {0x13, 0x17, 0x1B, 0x1F}, - for (int i = 0; i < kNumStreams; ++i) - permutex[i] = _mm512_permutexvar_epi16(permutex_mask, unpack[KNumUnpack][i]); - - __m512i shuffle[kNumStreams]; - shuffle[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100); - shuffle[1] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b01000100); - shuffle[2] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110); - shuffle[3] = _mm512_shuffle_i32x4(permutex[4], permutex[6], 0b11101110); - shuffle[4] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100); - shuffle[5] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b01000100); - shuffle[6] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110); - shuffle[7] = _mm512_shuffle_i32x4(permutex[5], permutex[7], 0b11101110); - - final_result[0] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b10001000); - final_result[1] = _mm512_shuffle_i32x4(shuffle[0], shuffle[1], 0b11011101); - final_result[2] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b10001000); - final_result[3] = _mm512_shuffle_i32x4(shuffle[2], shuffle[3], 0b11011101); - final_result[4] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b10001000); - final_result[5] = _mm512_shuffle_i32x4(shuffle[4], shuffle[5], 0b11011101); - final_result[6] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b10001000); - final_result[7] = _mm512_shuffle_i32x4(shuffle[6], shuffle[7], 0b11011101); - } else { - // Path for float. - // 1. Processed hierarchically to 32i block using the unpack intrinsics. - // 2. Pack 128i block using _mm256_permutevar8x32_epi32. - // 3. Pack final 256i block with _mm256_permute2x128_si256. - for (int i = 0; i < kNumStreams; ++i) - permutex[i] = _mm512_permutexvar_epi32(permutex_mask, unpack[KNumUnpack][i]); - - final_result[0] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b01000100); - final_result[1] = _mm512_shuffle_i32x4(permutex[0], permutex[2], 0b11101110); - final_result[2] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b01000100); - final_result[3] = _mm512_shuffle_i32x4(permutex[1], permutex[3], 0b11101110); - } - - for (int i = 0; i < kNumStreams; ++i) { - _mm512_storeu_si512(&output_buffer_streams[i][block_index], final_result[i]); - } - } -} -#endif // ARROW_HAVE_AVX512 - #if defined(ARROW_HAVE_SIMD_SPLIT) template void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values, int64_t stride, uint8_t* out) { -#if defined(ARROW_HAVE_AVX512) - return ByteStreamSplitDecodeAvx512(data, num_values, stride, out); -#elif defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_AVX2) return ByteStreamSplitDecodeAvx2(data, num_values, stride, out); #elif defined(ARROW_HAVE_SSE4_2) return ByteStreamSplitDecodeSse2(data, num_values, stride, out); @@ -563,10 +348,7 @@ void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int64_t num_values, template void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, const int64_t num_values, uint8_t* output_buffer_raw) { -#if defined(ARROW_HAVE_AVX512) - return ByteStreamSplitEncodeAvx512(raw_values, num_values, - output_buffer_raw); -#elif defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_AVX2) return ByteStreamSplitEncodeAvx2(raw_values, num_values, output_buffer_raw); #elif defined(ARROW_HAVE_SSE4_2) diff --git a/cpp/src/arrow/util/byte_stream_split_test.cc b/cpp/src/arrow/util/byte_stream_split_test.cc index 71c6063179ea6..421edce4e0aa3 100644 --- a/cpp/src/arrow/util/byte_stream_split_test.cc +++ b/cpp/src/arrow/util/byte_stream_split_test.cc @@ -81,10 +81,6 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { #if defined(ARROW_HAVE_AVX2) encode_funcs_.push_back({"avx2", &ByteStreamSplitEncodeAvx2}); decode_funcs_.push_back({"avx2", &ByteStreamSplitDecodeAvx2}); -#endif -#if defined(ARROW_HAVE_AVX512) - encode_funcs_.push_back({"avx512", &ByteStreamSplitEncodeAvx512}); - decode_funcs_.push_back({"avx512", &ByteStreamSplitDecodeAvx512}); #endif } diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 76c411244b22d..dd258ab815244 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -468,33 +468,6 @@ BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Range(MIN_RANGE, MAX_RANGE); #endif -#if defined(ARROW_HAVE_AVX512) -static void BM_ByteStreamSplitDecode_Float_Avx512(benchmark::State& state) { - BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512); -} - -static void BM_ByteStreamSplitDecode_Double_Avx512(benchmark::State& state) { - BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeAvx512); -} - -static void BM_ByteStreamSplitEncode_Float_Avx512(benchmark::State& state) { - BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512); -} - -static void BM_ByteStreamSplitEncode_Double_Avx512(benchmark::State& state) { - BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx512); -} - -BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx512)->Range(MIN_RANGE, MAX_RANGE); -BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx512)->Range(MIN_RANGE, MAX_RANGE); -#endif - template static auto MakeDeltaBitPackingInputFixed(size_t length) { using T = typename DType::c_type;