Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove popcnt_bitvector #2

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 45 additions & 29 deletions cpp/src/arrow/compute/kernels/hash_aggregate.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@
#include "arrow/compute/kernel.h"
#include "arrow/compute/kernels/aggregate_internal.h"
#include "arrow/compute/kernels/common.h"
#include "arrow/exec/groupby.h"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something is wrong with my clang-format configuration.

#include "arrow/util/bit_run_reader.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/make_unique.h"
#include "arrow/visitor_inline.h"
#include "arrow/exec/groupby.h"

namespace arrow {

Expand Down Expand Up @@ -442,15 +442,15 @@ struct GrouperFastImpl : Grouper {
return true;
}

static Result<std::unique_ptr<GrouperFastImpl>> Make(const std::vector<ValueDescr>& keys,
ExecContext* ctx) {
static Result<std::unique_ptr<GrouperFastImpl>> Make(
const std::vector<ValueDescr>& keys, ExecContext* ctx) {
auto impl = ::arrow::internal::make_unique<GrouperFastImpl>();
impl->ctx_ = ctx;
impl->non_null_buffers_maybe_null_.resize(keys.size());
impl->fixedlen_buffers_.resize(keys.size());
impl->varlen_buffer_maybe_null_.resize(keys.size());
impl->key_types_.resize(keys.size());

impl->is_fixedlen_.resize(keys.size());
impl->col_widths_.resize(keys.size());
for (size_t i = 0; i < keys.size(); ++i) {
Expand All @@ -470,8 +470,9 @@ struct GrouperFastImpl : Grouper {
impl->key_types_[i] = key;
}

impl->group_map_.init(arrow::exec::util::CPUInstructionSet::avx2, ctx->memory_pool(), static_cast<uint32_t>(keys.size()),
impl->is_fixedlen_, impl->col_widths_.data());
impl->group_map_.init(arrow::exec::util::CPUInstructionSet::avx2, ctx->memory_pool(),
static_cast<uint32_t>(keys.size()), impl->is_fixedlen_,
impl->col_widths_.data());

return std::move(impl);
}
Expand All @@ -482,11 +483,12 @@ struct GrouperFastImpl : Grouper {

std::shared_ptr<arrow::Buffer> group_ids;
ARROW_ASSIGN_OR_RAISE(
group_ids,
AllocateBuffer(sizeof(uint32_t) * num_rows, ctx_->memory_pool()));
group_ids, AllocateBuffer(sizeof(uint32_t) * num_rows, ctx_->memory_pool()));

for (int i = 0; i < num_columns; ++i) {
non_null_buffers_maybe_null_[i] = batch[i].array()->buffers[0] != NULLPTR ? batch[i].array()->buffers[0]->data() : nullptr;
non_null_buffers_maybe_null_[i] = batch[i].array()->buffers[0] != NULLPTR
? batch[i].array()->buffers[0]->data()
: nullptr;
fixedlen_buffers_[i] = batch[i].array()->buffers[1]->data();
if (is_fixedlen_[i]) {
varlen_buffer_maybe_null_[i] = nullptr;
Expand All @@ -498,17 +500,17 @@ struct GrouperFastImpl : Grouper {
TypedBufferBuilder<uint32_t> group_ids_batch(ctx_->memory_pool());
RETURN_NOT_OK(group_ids_batch.Resize(batch.length));

group_map_.push_input(
static_cast<uint32_t>(num_rows),
non_null_buffers_maybe_null_.data(),
fixedlen_buffers_.data(),
varlen_buffer_maybe_null_.data(),
reinterpret_cast<uint32_t*>(group_ids->mutable_data()));
group_map_.push_input(static_cast<uint32_t>(num_rows),
non_null_buffers_maybe_null_.data(), fixedlen_buffers_.data(),
varlen_buffer_maybe_null_.data(),
reinterpret_cast<uint32_t*>(group_ids->mutable_data()));

return Datum(UInt32Array(batch.length, std::move(group_ids)));
}

uint32_t num_groups() const override { return static_cast<uint32_t>(group_map_.get_num_keys()); }
uint32_t num_groups() const override {
return static_cast<uint32_t>(group_map_.get_num_keys());
}

Result<ExecBatch> GetUniques() override {
uint64_t num_groups;
Expand Down Expand Up @@ -536,43 +538,57 @@ struct GrouperFastImpl : Grouper {
null_counts.resize(num_columns);

for (size_t i = 0; i < num_columns; ++i) {
ARROW_ASSIGN_OR_RAISE(non_null_bufs[i], AllocateBitmap(num_groups, ctx_->memory_pool()));
ARROW_ASSIGN_OR_RAISE(non_null_bufs[i],
AllocateBitmap(num_groups, ctx_->memory_pool()));
non_null_arrays[i] = non_null_bufs[i]->mutable_data();
if (is_fixedlen_[i]) {
if (col_widths_[i] == 0) {
ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i], AllocateBitmap(num_groups, ctx_->memory_pool()));
ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i],
AllocateBitmap(num_groups, ctx_->memory_pool()));
} else {
ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i], AllocateBuffer(num_groups * col_widths_[i], ctx_->memory_pool()));
ARROW_ASSIGN_OR_RAISE(
fixedlen_bufs[i],
AllocateBuffer(num_groups * col_widths_[i], ctx_->memory_pool()));
}
} else {
ARROW_ASSIGN_OR_RAISE(fixedlen_bufs[i], AllocateBuffer((num_groups + 1) * sizeof(uint32_t), ctx_->memory_pool()));
ARROW_ASSIGN_OR_RAISE(
fixedlen_bufs[i],
AllocateBuffer((num_groups + 1) * sizeof(uint32_t), ctx_->memory_pool()));
}
fixedlen_arrays[i] = fixedlen_bufs[i]->mutable_data();
}

group_map_.pull_output_fixedlen_and_nulls(non_null_arrays.data(), fixedlen_arrays.data(), varlen_sizes.data());
group_map_.pull_output_fixedlen_and_nulls(
non_null_arrays.data(), fixedlen_arrays.data(), varlen_sizes.data());

for (size_t i = 0; i < num_columns; ++i) {
null_counts[i] = static_cast<int>(num_groups) - exec::util::BitUtil::popcnt_bitvector(static_cast<int>(num_groups), non_null_arrays[i]);
auto valid_count = arrow::internal::CountSetBits(non_null_arrays[i], /*offset=*/0,
static_cast<int64_t>(num_groups));
null_counts[i] = static_cast<int>(num_groups) - valid_count;

if (!is_fixedlen_[i]) {
ARROW_ASSIGN_OR_RAISE(varlen_bufs[i], AllocateBuffer(varlen_sizes[i], ctx_->memory_pool()));
ARROW_ASSIGN_OR_RAISE(varlen_bufs[i],
AllocateBuffer(varlen_sizes[i], ctx_->memory_pool()));
varlen_arrays[i] = varlen_bufs[i]->mutable_data();
} else {
varlen_arrays[i] = nullptr;
}
}

group_map_.pull_output_varlen(non_null_arrays.data(), fixedlen_arrays.data(), varlen_arrays.data());
group_map_.pull_output_varlen(non_null_arrays.data(), fixedlen_arrays.data(),
varlen_arrays.data());

for (size_t i = 0; i < num_columns; ++i) {
if (is_fixedlen_[i]) {
out.values[i] = ArrayData::Make(
key_types_[i], num_groups, {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i])},
null_counts[i]);
key_types_[i], num_groups,
{std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i])}, null_counts[i]);
} else {
out.values[i] = ArrayData::Make(
key_types_[i], num_groups, {std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i]), std::move(varlen_bufs[i])},
null_counts[i]);
out.values[i] =
ArrayData::Make(key_types_[i], num_groups,
{std::move(non_null_bufs[i]), std::move(fixedlen_bufs[i]),
std::move(varlen_bufs[i])},
null_counts[i]);
}
}

Expand Down
7 changes: 5 additions & 2 deletions cpp/src/arrow/exec/groupby_map.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <cstdint>

#include "arrow/exec/common.h"
#include "arrow/util/bitmap_ops.h"

namespace arrow {
namespace exec {
Expand Down Expand Up @@ -315,7 +316,8 @@ Status SwissTable::map(const int num_keys, const uint32_t* hashes,
break;
}

int num_matches = util::BitUtil::popcnt_bitvector(num_keys, match_bitvector);
int64_t num_matches =
arrow::internal::CountSetBits(match_bitvector, /*offset=*/0, num_keys);

// after first pass count rows with matches and decide based on their percentage
// whether to call dense or sparse comparison function
Expand Down Expand Up @@ -343,7 +345,8 @@ Status SwissTable::map(const int num_keys, const uint32_t* hashes,

do {
bool out_of_capacity;
RETURN_NOT_OK(lookup_2(hashes, num_ids, ids, out_of_capacity, out_groupids, slot_ids));
RETURN_NOT_OK(
lookup_2(hashes, num_ids, ids, out_of_capacity, out_groupids, slot_ids));
if (out_of_capacity) {
RETURN_NOT_OK(grow_double());
// Set slot_ids for selected vectors to first slot in new initial block.
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/arrow/exec/groupby_storage_avx2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "arrow/exec/common.h"
#include "arrow/exec/groupby_storage.h"
#include "arrow/util/bit_util.h"

namespace arrow {
namespace exec {
Expand Down Expand Up @@ -205,7 +206,7 @@ void KeyCompare::compare_fixedlen_avx2(uint32_t num_rows, uint32_t length,
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(base_right) + istripe);
cmp &= (mask_last |
_mm256_movemask_epi8(_mm256_cmpeq_epi8(key_stripe_left, key_stripe_right)));
match |= (static_cast<uint64_t>(POPCNT64(cmp) >> 5) << (irow & 63));
match |= (static_cast<uint64_t>(arrow::BitUtil::PopCount(cmp) >> 5) << (irow & 63));
if ((irow & 63) == 63) {
reinterpret_cast<uint64_t*>(match_bitvector)[irow / 64] = match;
match = 0ULL;
Expand Down Expand Up @@ -725,4 +726,4 @@ void KeyTranspose::offsets_to_lengths_avx2(uint32_t num_rows, const uint32_t* of
#endif

} // namespace exec
} // namespace arrow
} // namespace arrow
20 changes: 4 additions & 16 deletions cpp/src/arrow/exec/util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,33 +17,21 @@

#include "arrow/exec/util.h"

#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_ops.h"

namespace arrow {
namespace exec {
namespace util {

// Counts the set bits among the first `num_bits` bits of the bit vector that
// starts at `bits`.
//
// \param num_bits number of leading bits to inspect; values <= 0 yield 0
// \param bits     pointer to the first byte of the bit vector
// \return         number of bits set to 1 within the inspected range
//
// Whole 64-bit words are counted directly. The trailing partial word is
// assembled only from the bytes that actually contain requested bits, so the
// function never reads past byte (num_bits + 7) / 8 - 1 of the input. The
// previous implementation loaded a full 8-byte word for the tail, which could
// read up to 7 bytes beyond the end of a tightly sized buffer.
//
// NOTE(review): the full-word loop still reads through a uint64_t pointer, as
// the original did; this presumes `bits` is suitably aligned for such loads -
// confirm against callers.
int BitUtil::popcnt_bitvector(const int num_bits, const uint8_t* bits) {
  // A negative size would otherwise produce a negative tail and an over-wide
  // shift below; treat it like an empty vector.
  if (num_bits <= 0) {
    return 0;
  }
  constexpr int unroll = 64;
  const int num_full_words = num_bits / unroll;
  int count = 0;
  for (int i = 0; i < num_full_words; ++i) {
    uint64_t word = reinterpret_cast<const uint64_t*>(bits)[i];
    count += static_cast<int>(POPCNT64(word));
  }
  const int tail = num_bits % unroll;
  if (tail) {
    const uint8_t* tail_bytes = bits + static_cast<int64_t>(num_full_words) * (unroll / 8);
    const int num_tail_bytes = (tail + 7) / 8;
    // Build the word byte by byte, lowest byte first. This matches what the
    // original 64-bit load produced on a little-endian host, without touching
    // bytes that hold no requested bits.
    uint64_t word = 0;
    for (int i = 0; i < num_tail_bytes; ++i) {
      word |= static_cast<uint64_t>(tail_bytes[i]) << (8 * i);
    }
    // Drop the bits of the last byte that lie beyond num_bits.
    word &= ~0ULL >> (64 - tail);
    count += static_cast<int>(POPCNT64(word));
  }
  return count;
}

// Appends to `indexes` one entry per set bit of `word`, visiting bits from
// least to most significant. Each entry is `base_index` plus the bit's
// position inside the word. `num_indexes` is both the write cursor into
// `indexes` and the running count; it is advanced once per entry written.
inline void BitUtil::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
int& num_indexes, uint16_t* indexes) {
  // `remaining &= remaining - 1` clears the lowest set bit, so the loop runs
  // exactly once for every 1-bit in the input word.
  for (uint64_t remaining = word; remaining != 0; remaining &= remaining - 1) {
    const uint16_t bit_position = static_cast<uint16_t>(TZCNT64(remaining));
    indexes[num_indexes] = base_index + bit_position;
    ++num_indexes;
  }
}

inline void BitUtil::bits_filter_indexes_helper(uint64_t word,
const uint16_t* input_indexes,
int& num_indexes, uint16_t* indexes) {
Expand Down
6 changes: 1 addition & 5 deletions cpp/src/arrow/exec/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,21 @@
#include <cstdint>
#include <vector>

#include "arrow/exec/common.h"
#include "arrow/memory_pool.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/util/logging.h"
#include "arrow/exec/common.h"

// Compiler-specific wrappers for 64-bit bit-manipulation primitives:
//   LZCNT64  - count leading zero bits
//   TZCNT64  - count trailing zero bits
//   POPCNT64 - count set bits
//   BYTESWAP - reverse the byte order of a 64-bit value
//   ROTL     - rotate left
// NOTE(review): there is no fallback branch; on a compiler that is neither
// clang/GCC nor MSVC these macros stay undefined.
#if defined(__clang__) || defined(__GNUC__)
// clang/GCC builtins. NOTE(review): __builtin_clzll and __builtin_ctzll have
// an undefined result for x == 0, whereas the MSVC intrinsics below are
// documented to return 64 - confirm no caller passes zero.
#define LZCNT64(x) __builtin_clzll(x)
#define TZCNT64(x) __builtin_ctzll(x)
#define POPCNT64(x) __builtin_popcountll(x)
#define BYTESWAP(x) __builtin_bswap64(x)
// Hand-written rotate using a width of 32 bits. Presumably x is an unsigned
// 32-bit value and n is in 1..31 (n == 0 would shift right by 32) - confirm
// against callers.
#define ROTL(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#elif defined(_MSC_VER)
// MSVC intrinsics; <intrin.h> declares them.
#include <intrin.h>
#define LZCNT64(x) __lzcnt64(x)
#define TZCNT64(x) _tzcnt_u64(x)
#define POPCNT64(x) __popcnt64(x)
#define BYTESWAP(x) _byteswap_uint64(x)
#define ROTL(x, n) _rotl((x), (n))
#endif
Expand Down Expand Up @@ -140,8 +138,6 @@ class TempBuffer {

class BitUtil {
public:
static int popcnt_bitvector(const int num_bits, const uint8_t* bits);

template <int bit_to_search = 1>
static void bits_to_indexes(CPUInstructionSet instruction_set, const int num_bits,
const uint8_t* bits, int& num_indexes, uint16_t* indexes);
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/arrow/exec/util_avx2.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "arrow/exec/common.h"
#include "arrow/exec/util.h"
#include "arrow/util/bit_util.h"

namespace arrow {
namespace exec {
Expand Down Expand Up @@ -52,7 +53,7 @@ void BitUtil::bits_to_indexes_avx2(const int num_bits, const uint8_t* bits,
_pext_u64(mask, _pdep_u64(word, UINT64_C(0X0101010101010101)) * 0xff) + base;
*reinterpret_cast<uint64_t*>(byte_indexes + num_indexes_loop) = byte_indexes_next;
base += incr;
num_indexes_loop += static_cast<int>(POPCNT64(word & 0xff));
num_indexes_loop += static_cast<int>(arrow::BitUtil::PopCount(word & 0xff));
word >>= 8;
}
// Unpack indexes to 16-bits and either add the base of i * 64 or shuffle input
Expand Down Expand Up @@ -122,7 +123,7 @@ void BitUtil::bits_filter_indexes_avx2(const int num_bits, const uint8_t* bits,
output, _mm256_setr_epi64x(0x0b030a0209010800ULL, 0x0f070e060d050c04ULL,
0x0b030a0209010800ULL, 0x0f070e060d050c04ULL));
_mm256_storeu_si256((__m256i*)(indexes + num_indexes), output);
num_indexes += static_cast<int>(POPCNT64(word & 0xffff));
num_indexes += static_cast<int>(arrow::BitUtil::PopCount(word & 0xffff));
word >>= 16;
++loop_id;
}
Expand Down