diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 9a07d26546bbc..aaa68121db105 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -1707,10 +1707,9 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, // Get ExecMode ExecModeVal = KernDescVal.Mode; DP("ExecModeVal %d\n", ExecModeVal); - if (KernDescVal.WG_Size == 0) { - KernDescVal.WG_Size = RTLDeviceInfoTy::Default_WG_Size; - DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WG_Size); - } + // A KernDescVal.WG_Size of 0 means no work-group size was + // specified; the flat_max_work_group_size clamp is then skipped in + // getLaunchVals. WGSizeVal = KernDescVal.WG_Size; DP("WGSizeVal %d\n", WGSizeVal); check("Loading KernDesc computation property", err); @@ -1920,7 +1919,7 @@ void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, } } // check flat_max_work_group_size attr here - if (threadsPerGroup > ConstWGSize) { + if (ConstWGSize > 0 && threadsPerGroup > ConstWGSize) { threadsPerGroup = ConstWGSize; DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n", threadsPerGroup);