diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index b6946936f8acc..0177d9d423be1 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -631,6 +631,12 @@ class CombinerHelper { /// addressing mode usage. bool matchReassocPtrAdd(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Try to reassociate the operands of a commutative binop. + bool tryReassocBinOp(unsigned Opc, Register DstReg, Register Op0, + Register Op1, BuildFnTy &MatchInfo); + /// Reassociate commutative binary operations like G_ADD. + bool matchReassocCommBinOp(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Do constant folding when opportunities are exposed after MIR building. bool matchConstantFold(MachineInstr &MI, APInt &MatchInfo); diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index d0b9da0382ddf..0aa4fa2123369 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -32,6 +32,7 @@ #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/LowLevelTypeUtils.h" #include "llvm/CodeGen/MachineValueType.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" @@ -3584,6 +3585,17 @@ class TargetLowering : public TargetLoweringBase { return N0.hasOneUse(); } + // Lets the target control the following reassociation of operands: (op (op + // x, c1), y) -> (op (op x, y), c1), where N0 is (op x, c1) and N1 is y. By + // default, any case where N0 has a single use is considered profitable; this + // mirrors the condition that this target hook call replaces in the combiner. + // A target can implement its own heuristic to further restrict the common + // combiner. + virtual bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, + Register N1) const { + return MRI.hasOneNonDBGUse(N0); + } + virtual bool isSDNodeAlwaysUniform(const SDNode * N) const { return false; } diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index ed4d3f3c0f8f9..5bd92c856999e 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -862,7 +862,13 @@ def reassoc_ptradd : GICombineRule< [{ return Helper.matchReassocPtrAdd(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFnNoErase(*${root}, ${matchinfo}); }])>; -def reassocs : GICombineGroup<[reassoc_ptradd]>; +def reassoc_comm_binops : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_ADD $root, $src1, $src2):$root, + [{ return Helper.matchReassocCommBinOp(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + +def reassocs : GICombineGroup<[reassoc_ptradd, reassoc_comm_binops]>; // Constant fold operations.
def constant_fold : GICombineRule< diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index d958c9c1dda04..94a756089f98a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -4501,6 +4501,58 @@ bool CombinerHelper::matchReassocPtrAdd(MachineInstr &MI, return false; } +bool CombinerHelper::tryReassocBinOp(unsigned Opc, Register DstReg, + Register OpLHS, Register OpRHS, + BuildFnTy &MatchInfo) { + LLT OpRHSTy = MRI.getType(OpRHS); + MachineInstr *OpLHSDef = MRI.getVRegDef(OpLHS); + + if (OpLHSDef->getOpcode() != Opc) + return false; + + MachineInstr *OpRHSDef = MRI.getVRegDef(OpRHS); + Register OpLHSLHS = OpLHSDef->getOperand(1).getReg(); + Register OpLHSRHS = OpLHSDef->getOperand(2).getReg(); + + if (isConstantOrConstantSplatVector(*MRI.getVRegDef(OpLHSRHS), MRI)) { + if (isConstantOrConstantSplatVector(*OpRHSDef, MRI)) { + // (Opc (Opc X, C1), C2) -> (Opc X, (Opc C1, C2)) + MatchInfo = [=](MachineIRBuilder &B) { + auto NewCst = B.buildInstr(Opc, {OpRHSTy}, {OpLHSRHS, OpRHS}); + B.buildInstr(Opc, {DstReg}, {OpLHSLHS, NewCst}); + }; + return true; + } + if (getTargetLowering().isReassocProfitable(MRI, OpLHS, OpRHS) && + MRI.hasOneNonDBGUse(OpLHSLHS)) { + // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1) + // iff (op x, c1) has one use + MatchInfo = [=](MachineIRBuilder &B) { + auto NewLHSLHS = B.buildInstr(Opc, {OpRHSTy}, {OpLHSLHS, OpRHS}); + B.buildInstr(Opc, {DstReg}, {NewLHSLHS, OpLHSRHS}); + }; + return true; + } + } + + return false; +} + +bool CombinerHelper::matchReassocCommBinOp(MachineInstr &MI, + BuildFnTy &MatchInfo) { + // We don't check if the reassociation will break a legal addressing mode + // here since pointer arithmetic is handled by G_PTR_ADD. + unsigned Opc = MI.getOpcode(); + Register DstReg = MI.getOperand(0).getReg(); + Register LHSReg = MI.getOperand(1).getReg(); + Register RHSReg = MI.getOperand(2).getReg(); + + if (tryReassocBinOp(Opc, DstReg, LHSReg, RHSReg, MatchInfo)) + return true; + if (tryReassocBinOp(Opc, DstReg, RHSReg, LHSReg, MatchInfo)) + return true; + return false; +} bool CombinerHelper::matchConstantFold(MachineInstr &MI, APInt &MatchInfo) { Register Op1 = MI.getOperand(1).getReg(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 38a4bd9260913..09d9430aea49a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -5158,6 +5158,11 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, } } +bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, + Register N0, Register N1) const { + return true; // FIXME: handle regbanks +} + TargetLowering::AtomicExpansionKind AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { switch (RMW->getOperation()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index bc35c4ea0b3ea..c2d6a225a2618 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -292,6 +292,9 @@ class AMDGPUTargetLowering : public TargetLowering { bool SNaN = false, unsigned Depth = 0) const override; + bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, + Register N1) const override; + /// Helper function that adds Reg to the LiveIn list of the DAG's /// MachineFunction. 
/// diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f090b0c4c3278..8c24789d53c8b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13572,6 +13572,11 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, hasMemSDNodeUser(*N0->use_begin())); } +bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI, + Register N0, Register N1) const { + return true; // FIXME: handle regbanks +} + MachineMemOperand::Flags SITargetLowering::getTargetMMOFlags(const Instruction &I) const { // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load. diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 4a18c349e8779..5dcb79d4d7ae7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -495,6 +495,9 @@ class SITargetLowering final : public AMDGPUTargetLowering { bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override; + bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, + Register N1) const override; + bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; bool isCanonicalized(Register Reg, MachineFunction &MF, diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-binop-reassoc.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-binop-reassoc.mir new file mode 100644 index 0000000000000..f1676e48e5113 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-binop-reassoc.mir @@ -0,0 +1,124 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s + +# Combines: (Opc (Opc X, C1), C2) -> (Opc X, (Opc C1, C2)) +--- +name: test1_add_move_inner_cst_to_fold +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +body: | + bb.1: + liveins: $x0 + + ; CHECK-LABEL: name: test1_add_move_inner_cst_to_fold + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 88 + ; CHECK-NEXT: %add_outer:_(s64) = G_ADD %x, [[C]] + ; CHECK-NEXT: $x0 = COPY %add_outer(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %C1:_(s64) = G_CONSTANT i64 64 + %C2:_(s64) = G_CONSTANT i64 24 + %add_inner:_(s64) = G_ADD %x, %C1 + %add_outer:_(s64) = G_ADD %add_inner, %C2 + $x0 = COPY %add_outer + RET_ReallyLR implicit $x0 + +... + +# (op (op x, c1), y) -> (op (op x, y), c1) +--- +name: test2_add_move_inner_cst_to_rhs +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +body: | + bb.1: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test2_add_move_inner_cst_to_rhs + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %C1:_(s64) = G_CONSTANT i64 64 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD %x, %y + ; CHECK-NEXT: %add_outer:_(s64) = G_ADD [[ADD]], %C1 + ; CHECK-NEXT: $x0 = COPY %add_outer(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %C1:_(s64) = G_CONSTANT i64 64 + %y:_(s64) = COPY $x1 + %add_inner:_(s64) = G_ADD %x, %C1 + %add_outer:_(s64) = G_ADD %add_inner, %y + $x0 = COPY %add_outer + RET_ReallyLR implicit $x0 + +...
+--- +name: test2_add_move_inner_cst_to_rhs_multiuse +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +body: | + bb.1: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test2_add_move_inner_cst_to_rhs_multiuse + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: %C1:_(s64) = G_CONSTANT i64 64 + ; CHECK-NEXT: %y:_(s64) = COPY $x1 + ; CHECK-NEXT: %add_inner:_(s64) = G_ADD %x, %C1 + ; CHECK-NEXT: %add_outer:_(s64) = G_ADD %add_inner, %y + ; CHECK-NEXT: $x0 = COPY %add_outer(s64) + ; CHECK-NEXT: $x1 = COPY %add_inner(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %C1:_(s64) = G_CONSTANT i64 64 + %y:_(s64) = COPY $x1 + %add_inner:_(s64) = G_ADD %x, %C1 + %add_outer:_(s64) = G_ADD %add_inner, %y + $x0 = COPY %add_outer + $x1 = COPY %add_inner + RET_ReallyLR implicit $x0 + +... +--- +name: test2_add_move_inner_cst_to_rhs_vector +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x0' } +body: | + bb.1: + liveins: $q0, $q1 + + ; CHECK-LABEL: name: test2_add_move_inner_cst_to_rhs_vector + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: %C1:_(s64) = G_CONSTANT i64 64 + ; CHECK-NEXT: %VEC_C1:_(<2 x s64>) = G_BUILD_VECTOR %C1(s64), %C1(s64) + ; CHECK-NEXT: %y:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(<2 x s64>) = G_ADD %x, %y + ; CHECK-NEXT: %add_outer:_(<2 x s64>) = G_ADD [[ADD]], %VEC_C1 + ; CHECK-NEXT: $q0 = COPY %add_outer(<2 x s64>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %x:_(<2 x s64>) = COPY $q0 + %C1:_(s64) = G_CONSTANT i64 64 + %VEC_C1:_(<2 x s64>) = G_BUILD_VECTOR %C1, %C1 + %y:_(<2 x s64>) = COPY $q1 + %add_inner:_(<2 x s64>) = G_ADD %x, %VEC_C1 + %add_outer:_(<2 x s64>) = G_ADD %add_inner, %y + $q0 = COPY %add_outer + RET_ReallyLR implicit $q0 + +... 
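Both AMDGPU overrides of the new GlobalISel isReassocProfitable hook above return true unconditionally and leave register banks as a FIXME. As a rough sketch of what a bank-aware heuristic might look like once that FIXME is addressed (hypothetical code, not part of this patch; the helper name isReassocProfitableSameBank is made up), a target could keep the default one-use check and additionally require both operands to sit on the same register bank:

#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterBank.h"

using namespace llvm;

// Hypothetical heuristic, for illustration only: permit the reassociation when
// the inner result has a single non-debug use and both operands are either
// still unassigned or already mapped to the same register bank.
static bool isReassocProfitableSameBank(const MachineRegisterInfo &MRI,
                                        Register N0, Register N1) {
  if (!MRI.hasOneNonDBGUse(N0))
    return false;
  const RegisterBank *Bank0 = MRI.getRegBankOrNull(N0);
  const RegisterBank *Bank1 = MRI.getRegBankOrNull(N1);
  return Bank0 == Bank1;
}

Whether a same-bank requirement is the right policy for AMDGPU is exactly what the FIXME leaves open; the sketch only shows where such a check would plug in.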
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll index 404931cb76615..9d0c6b0cb1d49 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -5016,9 +5016,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 - ; GFX6-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr @@ -5032,9 +5033,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 - ; GFX7-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX8-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr @@ -5048,9 +5050,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 - ; GFX8-NEXT: 
[[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[COPY5]], [[S_MOV_B32_]], implicit-def $scc - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[S_ADD_I32_]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY6]], [[COPY4]], 0, implicit $exec + ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, 1024 @@ -5071,10 +5074,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr ; GFX6-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 - ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX6-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX6-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX7-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr @@ -5088,10 +5091,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr ; GFX7-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 - ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX7-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX7-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; 
GFX8-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr @@ -5105,10 +5108,10 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr ; GFX8-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 - ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024 - ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] + ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64 = V_ADD_CO_U32_e64 [[COPY4]], [[COPY6]], 0, implicit $exec - ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) + ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GFX8-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_ADD_CO_U32_e64_]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 1024, 0, 0, implicit $exec :: (dereferenceable invariant load (s32)) ; GFX8-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, 1024 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll index 4b82c99e39732..8c24a28211b70 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll @@ -3057,12 +3057,14 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; CHECK-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) - ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] - ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_imm_sgpr_vgpr @@ -3076,12 +3078,14 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_sgpr_vgpr(<4 x i32> inr ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR 
[[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 + ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; GREEDY-NEXT: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[COPY5]], [[C]] - ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[ADD]](s32) - ; GREEDY-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY6]], [[COPY4]] - ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[COPY4]], [[ADD]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GREEDY-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] + ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.s, 1024 @@ -3102,13 +3106,14 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; CHECK-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] - ; CHECK-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GREEDY-LABEL: name: s_buffer_load_f32_offset_add_imm_vgpr_sgpr @@ -3122,13 +3127,14 @@ define amdgpu_ps float @s_buffer_load_f32_offset_add_imm_vgpr_sgpr(<4 x i32> inr ; GREEDY-NEXT: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; GREEDY-NEXT: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; GREEDY-NEXT: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 - ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 - ; GREEDY-NEXT: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) + ; GREEDY-NEXT: 
[[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; GREEDY-NEXT: [[ADD:%[0-9]+]]:vgpr(s32) = G_ADD [[COPY4]], [[COPY6]] - ; GREEDY-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) + ; GREEDY-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1024 + ; GREEDY-NEXT: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32) ; GREEDY-NEXT: [[ADD1:%[0-9]+]]:vgpr(s32) = G_ADD [[ADD]], [[COPY7]] - ; GREEDY-NEXT: [[C1:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 - ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C1]](s32), [[ADD]], [[COPY5]], 0, 0, 0 :: (dereferenceable invariant load (s32)) + ; GREEDY-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[C2:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 0 + ; GREEDY-NEXT: [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:vgpr(s32) = G_AMDGPU_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[C2]](s32), [[ADD]], [[C1]], 1024, 0, 0 :: (dereferenceable invariant load (s32)) ; GREEDY-NEXT: $vgpr0 = COPY [[AMDGPU_BUFFER_LOAD]](s32) ; GREEDY-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %offset.base = add i32 %offset.v, 1024
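The AMDGPU test churn above is the intended effect of the new combine: once the constant 1024 is reassociated to the outer add, the buffer-load selection can consume it as the instruction's immediate offset instead of materializing it through the soffset register operand. The following standalone snippet (plain C++, illustration only; the helper names are made up and it is unrelated to the LLVM build) just spells out that both rewrites performed by tryReassocBinOp preserve the computed value, using the constants that appear in the tests:

#include <cassert>
#include <cstdint>

// (Opc (Opc X, C1), C2) -> (Opc X, (Opc C1, C2)): the two constants fold.
static int64_t foldInnerConstant(int64_t X, int64_t C1, int64_t C2) {
  return X + (C1 + C2); // same value as (X + C1) + C2
}

// (op (op x, c1), y) -> (op (op x, y), c1): the constant moves outward.
static int64_t hoistInnerConstant(int64_t X, int64_t C1, int64_t Y) {
  return (X + Y) + C1; // same value as (X + C1) + Y
}

int main() {
  // test1_add_move_inner_cst_to_fold: 64 + 24 becomes the single constant 88.
  assert(foldInnerConstant(3, 64, 24) == (3 + 64) + 24);
  // s_buffer_load_f32_offset_add_imm_*: 1024 ends up as the outermost addend,
  // where it can be encoded as the buffer instruction's immediate offset.
  assert(hoistInnerConstant(3, 1024, 7) == (3 + 1024) + 7);
  return 0;
}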