@@ -251,11 +251,12 @@ bb2:
251251 ret void
252252}
253253
254- define amdgpu_kernel void @scalar_alloca_vector_gep_i8 (ptr %buffer , float %data , i32 %index ) {
255- ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8 (
256- ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i32 [[INDEX :%.*]]) {
254+ define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4 (ptr %buffer , float %data , i1 %idx_sel ) {
255+ ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4 (
256+ ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL :%.*]]) {
257257; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
258258; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
259+ ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4
259260; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
260261; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
261262; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
@@ -264,6 +265,67 @@ define amdgpu_kernel void @scalar_alloca_vector_gep_i8(ptr %buffer, float %data,
264265 %alloca = alloca <3 x float >, align 16 , addrspace (5 )
265266 %vec = load <3 x float >, ptr %buffer
266267 store <3 x float > %vec , ptr addrspace (5 ) %alloca
268+ %index = select i1 %idx_sel , i32 0 , i32 4
269+ %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %alloca , i32 %index
270+ store float %data , ptr addrspace (5 ) %elt , align 4
271+ %updated = load <3 x float >, ptr addrspace (5 ) %alloca , align 16
272+ store <3 x float > %updated , ptr %buffer , align 16
273+ ret void
274+ }
275+
; Test: a float is stored into a <3 x float> addrspace(5) alloca through an
; i8-typed GEP whose byte offset is picked at run time (4 or 8) by a select on
; %idx_sel; the whole vector is then reloaded and written back to %buffer.
; The CHECK lines expect no alloca in the output: the vector starts as
; `freeze <3 x float> poison`, the selected byte offset is shifted right by 2
; to form an element index, and the store becomes an insertelement into the
; vector loaded from %buffer.
276+ define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8 (ptr %buffer , float %data , i1 %idx_sel ) {
277+ ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(
278+ ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
279+ ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison
280+ ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
281+ ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8
282+ ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2
283+ ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]]
284+ ; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16
285+ ; CHECK-NEXT: ret void
286+ ;
287+ %alloca = alloca <3 x float >, align 16 , addrspace (5 )
288+ %vec = load <3 x float >, ptr %buffer
289+ store <3 x float > %vec , ptr addrspace (5 ) %alloca
; Both candidate byte offsets (4 and 8) are multiples of 4; the sibling
; *_4_or_5_no_promote test uses 4/5 instead and its CHECK lines keep the GEP.
; NOTE(review): presumably the multiple-of-element-size property is what
; allows the rewrite to an element index here -- confirm against the pass.
290+ %index = select i1 %idx_sel , i32 4 , i32 8
291+ %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %alloca , i32 %index
292+ store float %data , ptr addrspace (5 ) %elt , align 4
293+ %updated = load <3 x float >, ptr addrspace (5 ) %alloca , align 16
294+ store <3 x float > %updated , ptr %buffer , align 16
295+ ret void
296+ }
297+
298+ define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote (ptr %buffer , float %data , i1 %idx_sel ) {
299+ ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(
300+ ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) {
301+ ; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
302+ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1
303+ ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]]
304+ ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2
305+ ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]]
306+ ; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
307+ ; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
308+ ; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
309+ ; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
310+ ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
311+ ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
312+ ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
313+ ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
314+ ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
315+ ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x <3 x float>], ptr addrspace(3) @scalar_alloca_vector_gep_i8_4_or_5_no_promote.alloca, i32 0, i32 [[TMP14]]
316+ ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16
317+ ; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(3) [[TMP15]], align 16
318+ ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5
319+ ; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP15]], i32 [[INDEX]]
320+ ; CHECK-NEXT: store float [[DATA]], ptr addrspace(3) [[ELT]], align 4
321+ ; CHECK-NEXT: [[UPDATED:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP15]], align 16
322+ ; CHECK-NEXT: store <3 x float> [[UPDATED]], ptr [[BUFFER]], align 16
323+ ; CHECK-NEXT: ret void
324+ ;
325+ %alloca = alloca <3 x float >, align 16 , addrspace (5 )
326+ %vec = load <3 x float >, ptr %buffer
327+ store <3 x float > %vec , ptr addrspace (5 ) %alloca
328+ %index = select i1 %idx_sel , i32 4 , i32 5
267329 %elt = getelementptr inbounds nuw i8 , ptr addrspace (5 ) %alloca , i32 %index
268330 store float %data , ptr addrspace (5 ) %elt , align 4
269331 %updated = load <3 x float >, ptr addrspace (5 ) %alloca , align 16
0 commit comments