diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 96cb7cdf2d531..9de1a643f1000 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -647,6 +647,12 @@ class CombinerHelper { bool matchRotateOutOfRange(MachineInstr &MI) const; void applyRotateOutOfRange(MachineInstr &MI) const; + bool matchCombineBuildUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI, + Register &UnmergeSrc) const; + void applyCombineBuildUnmerge(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, + Register &UnmergeSrc) const; + bool matchUseVectorTruncate(MachineInstr &MI, Register &MatchInfo) const; void applyUseVectorTruncate(MachineInstr &MI, Register &MatchInfo) const; diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 119695e53c3cb..0ab2d9487a295 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -921,6 +921,15 @@ def merge_of_x_and_zero : GICombineRule < [{ return Helper.matchMergeXAndZero(*${MI}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${MI}, ${matchinfo}); }])>; +// Transform build_vector(unmerge(src, 0), ... unmerge(src, n), undef, ..., undef) +// => concat_vectors(src, undef) +def combine_build_unmerge : GICombineRule< + (defs root:$root, register_matchinfo:$unmergeSrc), + (match (G_BUILD_VECTOR $dst, GIVariadic<>:$unused):$root, + [{ return Helper.matchCombineBuildUnmerge(*${root}, MRI, ${unmergeSrc}); }]), + (apply [{ Helper.applyCombineBuildUnmerge(*${root}, MRI, B, ${unmergeSrc}); }]) +>; + def merge_combines: GICombineGroup<[ unmerge_anyext_build_vector, unmerge_merge, @@ -930,7 +939,8 @@ def merge_combines: GICombineGroup<[ unmerge_dead_to_trunc, unmerge_zext_to_zext, merge_of_x_and_undef, - merge_of_x_and_zero + merge_of_x_and_zero, + combine_build_unmerge ]>; // Under certain conditions, transform: diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index ec4d13f1cd1b3..45a08347b1ec2 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -3463,6 +3463,88 @@ static bool isConstValidTrue(const TargetLowering &TLI, unsigned ScalarSizeBits, isConstTrueVal(TLI, Cst, IsVector, IsFP); } +// This pattern aims to match the following shape to avoid extra mov +// instructions +// G_BUILD_VECTOR( +// G_UNMERGE_VALUES(src, 0) +// G_UNMERGE_VALUES(src, 1) +// G_IMPLICIT_DEF +// G_IMPLICIT_DEF +// ) +// -> +// G_CONCAT_VECTORS( +// src, +// undef +// ) +bool CombinerHelper::matchCombineBuildUnmerge(MachineInstr &MI, + MachineRegisterInfo &MRI, + Register &UnmergeSrc) const { + auto &BV = cast(MI); + + unsigned BuildUseCount = BV.getNumSources(); + if (BuildUseCount % 2 != 0) + return false; + + unsigned NumUnmerge = BuildUseCount / 2; + + auto *Unmerge = getOpcodeDef(BV.getSourceReg(0), MRI); + + // Check the first operand is an unmerge and has the correct number of + // operands + if (!Unmerge || Unmerge->getNumDefs() != NumUnmerge) + return false; + + UnmergeSrc = Unmerge->getSourceReg(); + + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + LLT UnmergeSrcTy = MRI.getType(UnmergeSrc); + + // Ensure we only generate legal instructions post-legalizer + if (!IsPreLegalize && + !isLegal({TargetOpcode::G_CONCAT_VECTORS, {DstTy, UnmergeSrcTy}})) + return false; + + // Check that all of the operands before the midpoint come from the same + // unmerge and are in the same order as they are used in the build_vector + for (unsigned I = 0; I < NumUnmerge; ++I) { + auto MaybeUnmergeReg = BV.getSourceReg(I); + auto *LoopUnmerge = getOpcodeDef(MaybeUnmergeReg, MRI); + + if (!LoopUnmerge || LoopUnmerge != Unmerge) + return false; + + if (LoopUnmerge->getOperand(I).getReg() != MaybeUnmergeReg) + return false; + } + + // Check that all of the unmerged values are used + if (Unmerge->getNumDefs() != NumUnmerge) + return false; + + // Check that all of the operands after the mid point are undefs. + for (unsigned I = NumUnmerge; I < BuildUseCount; ++I) { + auto *Undef = getDefIgnoringCopies(BV.getSourceReg(I), MRI); + + if (Undef->getOpcode() != TargetOpcode::G_IMPLICIT_DEF) + return false; + } + + return true; +} + +void CombinerHelper::applyCombineBuildUnmerge(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + Register &UnmergeSrc) const { + assert(UnmergeSrc && "Expected there to be one matching G_UNMERGE_VALUES"); + B.setInstrAndDebugLoc(MI); + + Register UndefVec = B.buildUndef(MRI.getType(UnmergeSrc)).getReg(0); + B.buildConcatVectors(MI.getOperand(0), {UnmergeSrc, UndefVec}); + + MI.eraseFromParent(); +} + // This combine tries to reduce the number of scalarised G_TRUNC instructions by // using vector truncates instead // @@ -8426,4 +8508,4 @@ bool CombinerHelper::matchSuboCarryOut(const MachineInstr &MI, } return false; -} +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll index da19991d56259..ae86129286ddc 100644 --- a/llvm/test/CodeGen/AArch64/fptrunc.ll +++ b/llvm/test/CodeGen/AArch64/fptrunc.ll @@ -345,19 +345,11 @@ entry: } define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) { -; CHECK-SD-LABEL: fptrunc_v2f32_v2f16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: fcvtn v0.4h, v0.4s -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fptrunc_v2f32_v2f16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fptrunc_v2f32_v2f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fcvtn v0.4h, v0.4s +; CHECK-NEXT: ret entry: %c = fptrunc <2 x float> %a to <2 x half> ret <2 x half> %c diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index fce4f8e69f14d..e526a9f7bc0f6 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -5763,18 +5763,14 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: scvtf v0.2d, v0.2d ; CHECK-NOFP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: stofp_v2i64_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: scvtf v0.2d, v0.2d ; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = sitofp <2 x i64> %a to <2 x half> @@ -5808,18 +5804,14 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NOFP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: utofp_v2i64_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: ucvtf v0.2d, v0.2d ; CHECK-FP16-GI-NEXT: fcvtn v0.2s, v0.2d -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = uitofp <2 x i64> %a to <2 x half> @@ -6232,17 +6224,13 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-NOFP16-GI-LABEL: stofp_v2i32_v2f16: ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: stofp_v2i32_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = sitofp <2 x i32> %a to <2 x half> @@ -6267,17 +6255,13 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-NOFP16-GI-LABEL: utofp_v2i32_v2f16: ; CHECK-NOFP16-GI: // %bb.0: // %entry ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: utofp_v2i32_v2f16: ; CHECK-FP16-GI: // %bb.0: // %entry ; CHECK-FP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-FP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-FP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-FP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-FP16-GI-NEXT: ret entry: %c = uitofp <2 x i32> %a to <2 x half> @@ -6480,9 +6464,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-NOFP16-GI-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-NOFP16-GI-NEXT: sshr v0.2s, v0.2s, #16 ; CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret entry: %c = sitofp <2 x i16> %a to <2 x half> @@ -6509,9 +6491,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-NOFP16-GI-NEXT: movi d1, #0x00ffff0000ffff ; CHECK-NOFP16-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret entry: %c = uitofp <2 x i16> %a to <2 x half> @@ -6766,9 +6746,7 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-NOFP16-GI-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NOFP16-GI-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-NOFP16-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: stofp_v2i8_v2f16: @@ -6817,9 +6795,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-NOFP16-GI-NEXT: movi d1, #0x0000ff000000ff ; CHECK-NOFP16-GI-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NOFP16-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-NOFP16-GI-NEXT: mov v1.s[0], v0.s[0] -; CHECK-NOFP16-GI-NEXT: mov v1.s[1], v0.s[1] -; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v1.4s +; CHECK-NOFP16-GI-NEXT: fcvtn v0.4h, v0.4s ; CHECK-NOFP16-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: utofp_v2i8_v2f16: diff --git a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll index c1b8bc6031b18..f7dbcd137e742 100644 --- a/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll @@ -762,25 +762,13 @@ define void @undef_hi3_v4f16(half %arg0) { } define void @undef_hi2_v4i16(<2 x i16> %arg0) { -; GFX8-SDAG-LABEL: undef_hi2_v4i16: -; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: ;;#ASMSTART -; GFX8-SDAG-NEXT: ; use v[0:1] -; GFX8-SDAG-NEXT: ;;#ASMEND -; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-GISEL-LABEL: undef_hi2_v4i16: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-GISEL-NEXT: ;;#ASMSTART -; GFX8-GISEL-NEXT: ; use v[0:1] -; GFX8-GISEL-NEXT: ;;#ASMEND -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: undef_hi2_v4i16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: ;;#ASMSTART +; GFX8-NEXT: ; use v[0:1] +; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: undef_hi2_v4i16: ; GFX9: ; %bb.0: @@ -803,25 +791,13 @@ define void @undef_hi2_v4i16(<2 x i16> %arg0) { } define void @undef_hi2_v4f16(<2 x half> %arg0) { -; GFX8-SDAG-LABEL: undef_hi2_v4f16: -; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-SDAG-NEXT: ;;#ASMSTART -; GFX8-SDAG-NEXT: ; use v[0:1] -; GFX8-SDAG-NEXT: ;;#ASMEND -; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-GISEL-LABEL: undef_hi2_v4f16: -; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-GISEL-NEXT: ;;#ASMSTART -; GFX8-GISEL-NEXT: ; use v[0:1] -; GFX8-GISEL-NEXT: ;;#ASMEND -; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX8-LABEL: undef_hi2_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: ;;#ASMSTART +; GFX8-NEXT: ; use v[0:1] +; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: undef_hi2_v4f16: ; GFX9: ; %bb.0: @@ -842,5 +818,3 @@ define void @undef_hi2_v4f16(<2 x half> %arg0) { call void asm sideeffect "; use $0", "v"(<4 x half> %undef.hi); ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX8: {{.*}}