From d8a96024b1096d342bfa28a6ffc72fcc33e4be6e Mon Sep 17 00:00:00 2001 From: Wenju He Date: Mon, 13 Oct 2025 10:54:44 +0200 Subject: [PATCH 1/2] [libclc] Refine __clc_get_sub_group_size with fast full sub-group path Add a fast path for the common case where the total work-group size is a multiple of the max sub-group size, avoiding the need to calculate the number of sub-groups. --- .../generic/workitem/clc_get_sub_group_size.cl | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl b/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl index 8ab4afe1ae05f..89b8913474fd2 100644 --- a/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl +++ b/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl @@ -13,14 +13,11 @@ #include _CLC_OVERLOAD _CLC_DEF uint __clc_get_sub_group_size() { - if (__clc_get_sub_group_id() != __clc_get_num_sub_groups() - 1) { - return __clc_get_max_sub_group_size(); - } - size_t size_x = __clc_get_local_size(0); - size_t size_y = __clc_get_local_size(1); - size_t size_z = __clc_get_local_size(2); - size_t linear_size = size_z * size_y * size_x; - size_t uniform_groups = __clc_get_num_sub_groups() - 1; - size_t uniform_size = __clc_get_max_sub_group_size() * uniform_groups; - return linear_size - uniform_size; + uint linear_size = __clc_get_local_size(0) * __clc_get_local_size(1) * + __clc_get_local_size(2); + uint remainder = linear_size % __clc_get_max_sub_group_size(); + bool full_sub_group = (remainder == 0) || (__clc_get_sub_group_id() < + __clc_get_num_sub_groups() - 1); + + return full_sub_group ? 
__clc_get_max_sub_group_size() : remainder; } From 954a9e20e6dfb4f3b154c153b4cb31a3eb71f8e7 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Mon, 13 Oct 2025 11:10:12 +0200 Subject: [PATCH 2/2] uint -> size_t --- libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl b/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl index 89b8913474fd2..70f357c015b4a 100644 --- a/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl +++ b/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl @@ -13,8 +13,8 @@ #include _CLC_OVERLOAD _CLC_DEF uint __clc_get_sub_group_size() { - uint linear_size = __clc_get_local_size(0) * __clc_get_local_size(1) * - __clc_get_local_size(2); + size_t linear_size = __clc_get_local_size(0) * __clc_get_local_size(1) * + __clc_get_local_size(2); uint remainder = linear_size % __clc_get_max_sub_group_size(); bool full_sub_group = (remainder == 0) || (__clc_get_sub_group_id() < __clc_get_num_sub_groups() - 1);