Skip to content

Commit

Permalink
[Enhancement] Optimize a subtle inline performance problem (StarRocks…
Browse files Browse the repository at this point in the history
…#23300)

Signed-off-by: liuyehcf <1559500551@qq.com>
  • Loading branch information
liuyehcf authored and 王锐 committed May 17, 2023
1 parent 035bbe9 commit 60f171e
Show file tree
Hide file tree
Showing 2 changed files with 123 additions and 102 deletions.
1 change: 1 addition & 0 deletions be/src/common/compiler_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
/// needs to be inlined for a specific reason or the compiler's heuristics make a bad
/// decision, e.g. not inlining a small function on a hot path.
#define ALWAYS_INLINE __attribute__((always_inline))
#define ALWAYS_NOINLINE __attribute__((noinline))

#define ALIGN_CACHE_LINE __attribute__((aligned(CACHE_LINE_SIZE)))

Expand Down
224 changes: 122 additions & 102 deletions be/src/exec/aggregate/agg_hash_map.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,9 @@ struct AggHashMapWithOneNumberKeyWithNullable

// Non Nullable
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_states_non_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_states_non_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
DCHECK(!key_columns[0]->is_nullable());
auto column = down_cast<ColumnType*>(key_columns[0].get());

Expand All @@ -196,8 +196,9 @@ struct AggHashMapWithOneNumberKeyWithNullable

// Nullable
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_states_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool, Func&& allocate_func,
Buffer<AggDataPtr>* agg_states, std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_states_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
// Assign not_founds vector when needs compute not founds.
if constexpr (compute_not_founds) {
DCHECK(not_founds);
Expand All @@ -215,7 +216,6 @@ struct AggHashMapWithOneNumberKeyWithNullable
DCHECK(key_columns[0]->is_nullable());
auto* nullable_column = down_cast<NullableColumn*>(key_columns[0].get());
auto* data_column = down_cast<ColumnType*>(nullable_column->data_column().get());
const auto& null_data = nullable_column->null_column_data();

// Shortcut: if nullable column has no nulls.
if (!nullable_column->has_null()) {
Expand All @@ -226,31 +226,17 @@ struct AggHashMapWithOneNumberKeyWithNullable
this->template compute_agg_prefetch<Func, allocate_and_compute_state, compute_not_founds>(
data_column, agg_states, std::forward<Func>(allocate_func), not_founds);
}
return;
}

for (size_t i = 0; i < chunk_size; i++) {
if (null_data[i]) {
if (UNLIKELY(null_key_data == nullptr)) {
null_key_data = allocate_func(nullptr);
}
(*agg_states)[i] = null_key_data;
} else {
if constexpr (allocate_and_compute_state) {
this->template _handle_data_key_column<Func, compute_not_founds>(
data_column, i, std::forward<Func>(allocate_func), agg_states, not_founds);
} else if constexpr (compute_not_founds) {
_handle_data_key_column_without_allocate(data_column, i, agg_states, not_founds);
}
}
} else {
this->template compute_agg_through_null_data<Func, allocate_and_compute_state, compute_not_founds>(
chunk_size, nullable_column, agg_states, std::forward<Func>(allocate_func), not_founds);
}
}
}

// The prefetch branch gives better performance with larger hash tables
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_prefetch(ColumnType* column, Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_prefetch(ColumnType* column, Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
AGG_HASH_MAP_PRECOMPUTE_HASH_VALUES(column, AGG_HASH_MAP_DEFAULT_PREFETCH_DIST);
for (size_t i = 0; i < column_size; i++) {
AGG_HASH_MAP_PREFETCH_HASH_VALUE();
Expand Down Expand Up @@ -280,8 +266,8 @@ struct AggHashMapWithOneNumberKeyWithNullable

// The no-prefetch branch gives better performance with small hash tables
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_noprefetch(ColumnType* column, Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_noprefetch(ColumnType* column, Buffer<AggDataPtr>* agg_states,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
size_t num_rows = column->size();
for (size_t i = 0; i < num_rows; i++) {
FieldType key = column->get_data()[i];
Expand All @@ -306,6 +292,29 @@ struct AggHashMapWithOneNumberKeyWithNullable
}
}

// Slow path for a nullable key column that actually contains nulls: walks the
// chunk row by row, routing null rows to the shared null-key aggregate state
// and non-null rows through the regular hash-map lookup/insert helpers.
// Kept out-of-line (ALWAYS_NOINLINE) so this cold loop does not bloat the
// callers' inlined hot paths.
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
ALWAYS_NOINLINE void compute_agg_through_null_data(size_t chunk_size, NullableColumn* nullable_column,
                                                   Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
                                                   std::vector<uint8_t>* not_founds) {
    auto* data_column = down_cast<ColumnType*>(nullable_column->data_column().get());
    const auto& null_data = nullable_column->null_column_data();
    for (size_t i = 0; i < chunk_size; i++) {
        if (null_data[i]) {
            // All null keys share one aggregate state, allocated lazily the
            // first time a null row is seen.
            if (UNLIKELY(null_key_data == nullptr)) {
                null_key_data = allocate_func(nullptr);
            }
            (*agg_states)[i] = null_key_data;
        } else {
            if constexpr (allocate_and_compute_state) {
                // Insert-or-find: may create a new state via allocate_func.
                this->template _handle_data_key_column<Func, compute_not_founds>(
                        data_column, i, std::forward<Func>(allocate_func), agg_states, not_founds);
            } else if constexpr (compute_not_founds) {
                // Lookup-only: misses are recorded in not_founds, no allocation.
                _handle_data_key_column_without_allocate(data_column, i, agg_states, not_founds);
            }
        }
    }
}

template <typename Func, bool compute_not_founds>
void _handle_data_key_column(ColumnType* data_column, size_t row, Func&& allocate_func,
Buffer<AggDataPtr>* agg_states, std::vector<uint8_t>* not_founds) {
Expand Down Expand Up @@ -381,9 +390,9 @@ struct AggHashMapWithOneStringKeyWithNullable

// Non Nullable
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_states_non_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_states_non_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
DCHECK(key_columns[0]->is_binary());
auto column = down_cast<BinaryColumn*>(key_columns[0].get());

Expand All @@ -404,8 +413,9 @@ struct AggHashMapWithOneStringKeyWithNullable

// Nullable
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_states_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool, Func&& allocate_func,
Buffer<AggDataPtr>* agg_states, std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_states_nullable(size_t chunk_size, const Columns& key_columns, MemPool* pool,
Func&& allocate_func, Buffer<AggDataPtr>* agg_states,
std::vector<uint8_t>* not_founds) {
// Assign not_founds vector when needs compute not founds.
if constexpr (compute_not_founds) {
DCHECK(not_founds);
Expand All @@ -423,7 +433,6 @@ struct AggHashMapWithOneStringKeyWithNullable
DCHECK(key_columns[0]->is_nullable());
auto* nullable_column = down_cast<NullableColumn*>(key_columns[0].get());
auto* data_column = down_cast<BinaryColumn*>(nullable_column->data_column().get());
const auto& null_data = nullable_column->null_column_data();
DCHECK(data_column->is_binary());

if (!nullable_column->has_null()) {
Expand All @@ -434,74 +443,16 @@ struct AggHashMapWithOneStringKeyWithNullable
this->template compute_agg_prefetch<Func, allocate_and_compute_state, compute_not_founds>(
data_column, agg_states, pool, std::forward<Func>(allocate_func), not_founds);
}
return;
}

for (size_t i = 0; i < chunk_size; i++) {
if (null_data[i]) {
if (UNLIKELY(null_key_data == nullptr)) {
null_key_data = allocate_func(nullptr);
}
(*agg_states)[i] = null_key_data;
} else {
if constexpr (allocate_and_compute_state) {
this->template _handle_data_key_column<Func, compute_not_founds>(
data_column, i, pool, std::forward<Func>(allocate_func), agg_states, not_founds);
} else if constexpr (compute_not_founds) {
DCHECK(not_founds);
_handle_data_key_column_without_allocate(data_column, i, agg_states, not_founds);
}
}
}
}
}

template <typename Func, bool compute_not_founds>
void _handle_data_key_column(BinaryColumn* data_column, size_t row, MemPool* pool, Func&& allocate_func,
Buffer<AggDataPtr>* agg_states, std::vector<uint8_t>* not_founds) {
auto key = data_column->get_slice(row);
auto iter = this->hash_map.lazy_emplace(key, [&](const auto& ctor) {
if constexpr (compute_not_founds) {
(*not_founds)[row] = 1;
} else {
this->template compute_agg_through_null_data<Func, allocate_and_compute_state, compute_not_founds>(
chunk_size, nullable_column, agg_states, pool, std::forward<Func>(allocate_func), not_founds);
}
uint8_t* pos = pool->allocate(key.size);
strings::memcpy_inlined(pos, key.data, key.size);
Slice pk{pos, key.size};
AggDataPtr pv = allocate_func(pk);
ctor(pk, pv);
});
(*agg_states)[row] = iter->second;
}

// Lookup-only handling of one binary key row: on a hit the existing aggregate
// state is recorded into agg_states; on a miss the row is flagged in
// not_founds and no new state is allocated.
void _handle_data_key_column_without_allocate(BinaryColumn* data_column, size_t row, Buffer<AggDataPtr>* agg_states,
                                              std::vector<uint8_t>* not_founds) {
    auto key = data_column->get_slice(row);
    if (auto iter = this->hash_map.find(key); iter != this->hash_map.end()) {
        (*agg_states)[row] = iter->second;
    } else {
        (*not_founds)[row] = 1;
    }
}

// Append the collected result keys back into the output key column.
// Nullable variant: appends into the data column nested inside the
// NullableColumn and resizes the null flags to match; non-nullable variant
// appends directly (and must never have seen a null key).
void insert_keys_to_columns(ResultVector& keys, const Columns& key_columns, size_t chunk_size) {
    if constexpr (is_nullable) {
        DCHECK(key_columns[0]->is_nullable());
        auto* nullable_column = down_cast<NullableColumn*>(key_columns[0].get());
        auto* column = down_cast<BinaryColumn*>(nullable_column->mutable_data_column());
        keys.resize(chunk_size);
        column->append_strings(keys);
        nullable_column->null_column_data().resize(chunk_size);
    } else {
        DCHECK(!null_key_data);
        auto* column = down_cast<BinaryColumn*>(key_columns[0].get());
        keys.resize(chunk_size);
        column->append_strings(keys);
    }
}

template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_prefetch(BinaryColumn* column, Buffer<AggDataPtr>* agg_states, MemPool* pool, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_prefetch(BinaryColumn* column, Buffer<AggDataPtr>* agg_states, MemPool* pool,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
AGG_HASH_MAP_PRECOMPUTE_HASH_VALUES(column, AGG_HASH_MAP_DEFAULT_PREFETCH_DIST);
for (size_t i = 0; i < column_size; i++) {
AGG_HASH_MAP_PREFETCH_HASH_VALUE();
Expand Down Expand Up @@ -531,8 +482,8 @@ struct AggHashMapWithOneStringKeyWithNullable
}

template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_noprefetch(BinaryColumn* column, Buffer<AggDataPtr>* agg_states, MemPool* pool,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_noprefetch(BinaryColumn* column, Buffer<AggDataPtr>* agg_states, MemPool* pool,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
size_t num_rows = column->size();
for (size_t i = 0; i < num_rows; i++) {
auto key = column->get_slice(i);
Expand Down Expand Up @@ -560,6 +511,73 @@ struct AggHashMapWithOneStringKeyWithNullable
}
}

// Slow path for a nullable binary key column that actually contains nulls:
// null rows share one lazily-allocated aggregate state, non-null rows go
// through the regular hash-map lookup/insert helpers. Kept out-of-line
// (ALWAYS_NOINLINE) so this cold loop does not bloat the callers' hot paths.
template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
ALWAYS_NOINLINE void compute_agg_through_null_data(size_t chunk_size, NullableColumn* nullable_column,
                                                   Buffer<AggDataPtr>* agg_states, MemPool* pool,
                                                   Func&& allocate_func, std::vector<uint8_t>* not_founds) {
    auto* data_column = down_cast<BinaryColumn*>(nullable_column->data_column().get());
    const auto& null_data = nullable_column->null_column_data();
    for (size_t i = 0; i < chunk_size; i++) {
        if (null_data[i]) {
            // All null keys share one aggregate state, allocated lazily the
            // first time a null row is seen.
            if (UNLIKELY(null_key_data == nullptr)) {
                null_key_data = allocate_func(nullptr);
            }
            (*agg_states)[i] = null_key_data;
        } else {
            if constexpr (allocate_and_compute_state) {
                // Insert-or-find: may create a new state via allocate_func.
                this->template _handle_data_key_column<Func, compute_not_founds>(
                        data_column, i, pool, std::forward<Func>(allocate_func), agg_states, not_founds);
            } else if constexpr (compute_not_founds) {
                // Lookup-only: misses are recorded in not_founds, no allocation.
                DCHECK(not_founds);
                _handle_data_key_column_without_allocate(data_column, i, agg_states, not_founds);
            }
        }
    }
}

// Insert-or-find for one binary key row. The lazy_emplace ctor lambda runs
// only when the key is absent: it copies the key bytes into `pool` so the
// stored Slice points at memory that outlives the input chunk, allocates a
// fresh aggregate state, and (when compute_not_founds) flags the row as a
// first-time key. Either way the row's state pointer ends up in agg_states.
template <typename Func, bool compute_not_founds>
void _handle_data_key_column(BinaryColumn* data_column, size_t row, MemPool* pool, Func&& allocate_func,
                             Buffer<AggDataPtr>* agg_states, std::vector<uint8_t>* not_founds) {
    auto key = data_column->get_slice(row);
    auto iter = this->hash_map.lazy_emplace(key, [&](const auto& ctor) {
        // Reached only on a miss (key not yet in the map).
        if constexpr (compute_not_founds) {
            (*not_founds)[row] = 1;
        }
        // Copy the key into pool-owned storage before inserting it.
        uint8_t* pos = pool->allocate(key.size);
        strings::memcpy_inlined(pos, key.data, key.size);
        Slice pk{pos, key.size};
        AggDataPtr pv = allocate_func(pk);
        ctor(pk, pv);
    });
    (*agg_states)[row] = iter->second;
}

// Lookup-only handling of one binary key row: on a hit, record the existing
// aggregate state for this row; on a miss, flag the row in not_founds and
// allocate nothing.
void _handle_data_key_column_without_allocate(BinaryColumn* data_column, size_t row, Buffer<AggDataPtr>* agg_states,
                                              std::vector<uint8_t>* not_founds) {
    const auto slice_key = data_column->get_slice(row);
    const auto found = this->hash_map.find(slice_key);
    if (found == this->hash_map.end()) {
        (*not_founds)[row] = 1;
        return;
    }
    (*agg_states)[row] = found->second;
}

// Append the collected result keys back into the output key column.
// Non-nullable variant appends straight into the BinaryColumn (and must never
// have produced a null-key state); nullable variant appends into the data
// column nested inside the NullableColumn and sizes the null flags to match.
void insert_keys_to_columns(ResultVector& keys, const Columns& key_columns, size_t chunk_size) {
    keys.resize(chunk_size);
    if constexpr (!is_nullable) {
        DCHECK(!null_key_data);
        down_cast<BinaryColumn*>(key_columns[0].get())->append_strings(keys);
    } else {
        DCHECK(key_columns[0]->is_nullable());
        auto* nullable_column = down_cast<NullableColumn*>(key_columns[0].get());
        down_cast<BinaryColumn*>(nullable_column->mutable_data_column())->append_strings(keys);
        nullable_column->null_column_data().resize(chunk_size);
    }
}

static constexpr bool has_single_null_key = is_nullable;

AggDataPtr null_key_data = nullptr;
Expand Down Expand Up @@ -712,8 +730,9 @@ struct AggHashMapWithSerializedKeyFixedSize
AggDataPtr get_null_key_data() { return nullptr; }

template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_prefetch(size_t chunk_size, const Columns& key_columns, Buffer<AggDataPtr>* agg_states,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_prefetch(size_t chunk_size, const Columns& key_columns,
Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
auto* buffer = reinterpret_cast<uint8_t*>(caches.data());
for (const auto& key_column : key_columns) {
key_column->serialize_batch(buffer, slice_sizes, chunk_size, max_fixed_size);
Expand Down Expand Up @@ -755,8 +774,9 @@ struct AggHashMapWithSerializedKeyFixedSize
}

template <typename Func, bool allocate_and_compute_state, bool compute_not_founds>
void compute_agg_noprefetch(size_t chunk_size, const Columns& key_columns, Buffer<AggDataPtr>* agg_states,
Func&& allocate_func, std::vector<uint8_t>* not_founds) {
ALWAYS_NOINLINE void compute_agg_noprefetch(size_t chunk_size, const Columns& key_columns,
Buffer<AggDataPtr>* agg_states, Func&& allocate_func,
std::vector<uint8_t>* not_founds) {
constexpr int key_size = sizeof(FixedSizeSliceKey);
auto* buffer = reinterpret_cast<uint8_t*>(caches.data());
for (const auto& key_column : key_columns) {
Expand Down

0 comments on commit 60f171e

Please sign in to comment.