diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 15a4b6796880f..a771b421e77a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4168,7 +4168,7 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, auto ShiftAmt = B.buildConstant(S32, 32u - NumBits); auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u); - auto Shift = B.buildLShr(S32, {Extend}, ShiftAmt); + auto Shift = B.buildShl(S32, Extend, ShiftAmt); auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift}); B.buildTrunc(Dst, Ctlz); MI.eraseFromParent(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir index 7748b481cf5b7..85cfb9b320f15 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir @@ -82,8 +82,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) - ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[SHL]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]] ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) @@ -147,10 +147,10 @@ body: | ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32) - ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32) - ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32) - ; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[BITCAST]], [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[SHL]](s32) + ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LSHR]], [[C]](s32) + ; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[SHL2]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]] ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]] @@ -175,8 +175,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 25 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32) - ; CHECK-NEXT: [[FFBH:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32) + ; CHECK-NEXT: [[FFBH:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[SHL]](s32) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 127 ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FFBH]], [[C1]] ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32) diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index d94a27e8c0200..756b819099682 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -377,7 +377,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 24 +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 24 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3] @@ -452,7 +452,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 16 +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 16 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3] @@ -655,7 +655,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc @@ -760,7 +761,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1 -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 ; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc @@ -1167,7 +1169,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1705,8 +1708,9 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noa ; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v0 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 +; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa s[2:3], v0, v1 src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, -1, s[2:3] ; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[0:1] ; GFX9-GISEL-NEXT: s_endpgm @@ -2186,7 +2190,7 @@ define i7 @v_ctlz_zero_undef_i7(i7 %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 25, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true) @@ -2278,7 +2282,7 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, ; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 14 +; GFX9-GISEL-NEXT: s_lshl_b32 s0, s4, 14 ; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0 ; GFX9-GISEL-NEXT: s_and_b32 s0, s0, 0x3ffff ; GFX9-GISEL-NEXT: s_lshr_b32 s1, s0, 16 @@ -2317,7 +2321,7 @@ define i18 @v_ctlz_zero_undef_i18(i18 %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 14, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) @@ -2355,8 +2359,8 @@ define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 14, v0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 14, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 14, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 14, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] @@ -2394,10 +2398,13 @@ define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0 +; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true) ret <2 x i16> %ctlz @@ -2439,11 +2446,15 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0 +; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true) ret <3 x i16> %ctlz @@ -2492,13 +2503,20 @@ define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 -; GFX9-GISEL-NEXT: s_flbit_i32_b32 s4, 0 +; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v3, v3 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, s4, 16, v0 -; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, s4, 16, v1 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true) ret <4 x i16> %ctlz @@ -2536,8 +2554,10 @@ define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v1 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 +; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true) ret <2 x i8> %ctlz @@ -2579,8 +2599,8 @@ define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) { ; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7: ; GFX9-GISEL: ; %bb.0: ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v0, 25, v0 -; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v1, 25, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 25, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 25, v1 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0 ; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]