diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index 4b136f18ba98dd..315ef15a029606 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -174,6 +174,7 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
   default:
     break;
   case TargetOpcode::G_ADD:
+  case TargetOpcode::G_PTR_ADD:
   case TargetOpcode::G_AND:
   case TargetOpcode::G_ASHR:
   case TargetOpcode::G_LSHR:
@@ -193,7 +194,13 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
     // Try to constant fold these.
     assert(SrcOps.size() == 2 && "Invalid sources");
     assert(DstOps.size() == 1 && "Invalid dsts");
-    if (SrcOps[0].getLLTTy(*getMRI()).isVector()) {
+    LLT SrcTy = SrcOps[0].getLLTTy(*getMRI());
+
+    if (Opc == TargetOpcode::G_PTR_ADD &&
+        getDataLayout().isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
+      break;
+
+    if (SrcTy.isVector()) {
       // Try to constant fold vector constants.
       Register VecCst = ConstantFoldVectorBinop(
           Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI(), *this);
@@ -201,6 +208,7 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
         return buildCopy(DstOps[0], VecCst);
       break;
     }
+
     if (Optional<APInt> Cst = ConstantFoldBinOp(Opc, SrcOps[0].getReg(),
                                                 SrcOps[1].getReg(), *getMRI()))
       return buildConstant(DstOps[0], *Cst);
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index ea47190e923280..71916b2300c638 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -500,6 +500,7 @@ Optional<APInt> llvm::ConstantFoldBinOp(unsigned Opcode, const Register Op1,
   default:
     break;
   case TargetOpcode::G_ADD:
+  case TargetOpcode::G_PTR_ADD:
     return C1 + C2;
   case TargetOpcode::G_AND:
     return C1 & C2;
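
For context only, not part of the patch: the guard added above condenses to the small predicate sketched below. `canFoldPtrAdd` is a hypothetical helper name used purely for illustration; the real check lives inline in `CSEMIRBuilder::buildInstr`, and the query is the stock `DataLayout::isNonIntegralAddressSpace`.

// Illustrative sketch only; assumes LLVM's DataLayout and LLT APIs.
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/LowLevelTypeImpl.h"

// A G_PTR_ADD with two constant operands may be pre-folded into a plain
// G_CONSTANT only if the pointer is in an integral address space. In a
// non-integral address space (an "ni" entry in the datalayout, e.g. AMDGPU's
// buffer fat pointers in address space 7), pointer values may not be
// reinterpreted as integers, so the explicit G_PTR_ADD must be preserved.
static bool canFoldPtrAdd(llvm::LLT PtrTy, const llvm::DataLayout &DL) {
  return !DL.isNonIntegralAddressSpace(PtrTy.getAddressSpace());
}
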
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
index aceb983e1e0a3d..add7816778eb66 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
@@ -342,12 +342,9 @@ body: |
     ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %ten:_(p3) = G_CONSTANT i32 10
-    ; CHECK-NEXT: %twenty:_(p3) = G_CONSTANT i32 20
-    ; CHECK-NEXT: %thirty:_(s32) = G_CONSTANT i32 30
-    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD %ten, %thirty(s32)
-    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD %twenty, %thirty(s32)
-    ; CHECK-NEXT: %ptr_add:_(p3) = G_SELECT %cond(s1), [[PTR_ADD]], [[PTR_ADD1]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 40
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 50
+    ; CHECK-NEXT: %ptr_add:_(p3) = G_SELECT %cond(s1), [[C]], [[C1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit %ptr_add(p3)
     %reg:_(s32) = COPY $vgpr0
     %zero:_(s32) = G_CONSTANT i32 0
@@ -372,12 +369,9 @@ body: |
     ; CHECK-NEXT: %reg:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: %zero:_(s32) = G_CONSTANT i32 0
     ; CHECK-NEXT: %cond:_(s1) = G_ICMP intpred(eq), %reg(s32), %zero
-    ; CHECK-NEXT: %ten:_(s32) = G_CONSTANT i32 10
-    ; CHECK-NEXT: %twenty:_(s32) = G_CONSTANT i32 20
-    ; CHECK-NEXT: %thirty:_(p3) = G_CONSTANT i32 30
-    ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD %thirty, %ten(s32)
-    ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD %thirty, %twenty(s32)
-    ; CHECK-NEXT: %ptr_add:_(p3) = G_SELECT %cond(s1), [[PTR_ADD]], [[PTR_ADD1]]
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(p3) = G_CONSTANT i32 40
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(p3) = G_CONSTANT i32 50
+    ; CHECK-NEXT: %ptr_add:_(p3) = G_SELECT %cond(s1), [[C]], [[C1]]
     ; CHECK-NEXT: S_ENDPGM 0, implicit %ptr_add(p3)
     %reg:_(s32) = COPY $vgpr0
     %zero:_(s32) = G_CONSTANT i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 27a2e8d4483aea..4588b84e664aab 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -2550,13 +2550,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    s_cselect_b32 s7, s16, s15
 ; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
-; GFX9-NEXT:    s_mov_b64 s[0:1], 16
+; GFX9-NEXT:    v_mov_b32_e32 v4, 16
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_s_v16i16_s_s:
@@ -2685,7 +2685,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    s_cmp_eq_u32 s7, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0
-; GFX10-NEXT:    v_mov_b32_e32 v10, 0
+; GFX10-NEXT:    v_mov_b32_e32 v10, 16
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cselect_b32 s0, s9, s8
 ; GFX10-NEXT:    s_cmp_eq_u32 s7, 2
@@ -2731,9 +2732,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s6
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s7
-; GFX10-NEXT:    s_mov_b64 s[0:1], 16
 ; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -2783,10 +2783,10 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v10, s[10:11]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
-; GFX9-NEXT:    s_mov_b64 s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 16
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i16_s_s:
@@ -2908,6 +2908,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, s3
 ; GFX10-NEXT:    s_lshl_b32 s3, s9, s3
 ; GFX10-NEXT:    s_not_b32 s8, s8
+; GFX10-NEXT:    v_mov_b32_e32 v12, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
@@ -2918,19 +2919,18 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s5
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s2
-; GFX10-NEXT:    v_and_or_b32 v12, v0, s8, s3
+; GFX10-NEXT:    v_and_or_b32 v14, v0, s8, s3
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, s7, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v12, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v12, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v12, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v12, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v7, v12, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v12, s6
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v9, v12, s2
-; GFX10-NEXT:    s_mov_b64 s[0:1], 16
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v14, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v14, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v14, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v14, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v7, v14, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v14, s6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v9, v14, s2
 ; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v13, v[4:7], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v[12:13], v[4:7], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -2992,10 +2992,10 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 16
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    s_mov_b64 s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    global_store_dwordx4 v0, v[4:7], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_s_v16i16_v_s:
@@ -3124,6 +3124,7 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    s_cmp_eq_u32 s0, 1
 ; GFX10-NEXT:    v_and_b32_e32 v8, s3, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v10, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_cselect_b32 s1, s9, s8
@@ -3151,27 +3152,26 @@ define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, s2
 ; GFX10-NEXT:    s_andn2_b32 s1, s1, s3
-; GFX10-NEXT:    v_lshl_or_b32 v10, v8, s2, s1
+; GFX10-NEXT:    v_lshl_or_b32 v12, v8, s2, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 2
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 3
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 4
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 5
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 6
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v12, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 7
-; GFX10-NEXT:    s_mov_b64 s[0:1], 16
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v12, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v11, v[4:7], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -3234,10 +3234,10 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 16
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    s_mov_b64 s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    global_store_dwordx4 v0, v[4:7], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_s_v16i16_s_v:
@@ -3365,22 +3365,21 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX10-LABEL: insertelement_s_v16i16_s_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 1, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 1, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    s_mov_b32 s5, 0xffff
-; GFX10-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX10-NEXT:    s_and_b32 s6, s4, s5
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v10
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v10
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v10
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v10
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v10
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s5
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v10
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v8, v0, s6
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v10
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v12
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v2
@@ -3390,29 +3389,30 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s12, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s13, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s14, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v1, s15, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v1, s15, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s11
-; GFX10-NEXT:    v_and_or_b32 v11, v11, v9, v8
+; GFX10-NEXT:    v_and_or_b32 v13, v10, v9, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s12
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s13
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s14
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s6
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v11, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s5
-; GFX10-NEXT:    s_mov_b64 s[0:1], 16
+; GFX10-NEXT:    v_mov_b32_e32 v10, 16
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v13, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s1
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v13, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s5
 ; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -3474,10 +3474,10 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 16
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    s_mov_b64 s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v0, 0
-; GFX9-NEXT:    global_store_dwordx4 v0, v[4:7], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_s_v16i16_v_v:
@@ -3604,20 +3604,19 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX10-LABEL: insertelement_s_v16i16_v_v:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 1, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 1, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX10-NEXT:    s_mov_b32 s4, 0xffff
-; GFX10-NEXT:    v_mov_b32_e32 v12, 0
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v10
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v10
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v10
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v10
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v10
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v10
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v12
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v12
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s4
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v10
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v12
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3628,29 +3627,30 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s12, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s13, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s14, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v2, s15, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v2, s15, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s10
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s11
-; GFX10-NEXT:    v_and_or_b32 v11, v11, v9, v8
+; GFX10-NEXT:    v_and_or_b32 v13, v10, v9, v8
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s12
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s13
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s14
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX10-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s6
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v11, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s5
-; GFX10-NEXT:    s_mov_b64 s[0:1], 16
+; GFX10-NEXT:    v_mov_b32_e32 v10, 16
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v13, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v13, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v13, s1
+; GFX10-NEXT:    v_mov_b32_e32 v11, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v13, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v13, s5
 ; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -3696,13 +3696,13 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 16
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i16_s_v:
@@ -3810,7 +3810,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0xffff
-; GFX10-NEXT:    v_mov_b32_e32 v14, 0
+; GFX10-NEXT:    v_mov_b32_e32 v13, 16
 ; GFX10-NEXT:    s_and_b32 s6, s2, s5
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
@@ -3819,6 +3819,7 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 5, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 6, v0
+; GFX10-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v11, v2, s5
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, s6
@@ -3833,20 +3834,19 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s5
-; GFX10-NEXT:    v_and_or_b32 v13, v1, v11, v2
+; GFX10-NEXT:    v_and_or_b32 v15, v1, v11, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v12, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v13, s6
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v13, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v13, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v7, v13, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v13, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v13, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v13, s5
-; GFX10-NEXT:    s_mov_b64 s[0:1], 16
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v15, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v15, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v15, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v7, v15, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v15, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v15, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v15, s5
 ; GFX10-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v[13:14], v[4:7], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -3891,13 +3891,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 16
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i16_v_s:
@@ -4019,6 +4019,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    s_not_b32 s7, s7
 ; GFX10-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v12, 0
+; GFX10-NEXT:    v_mov_b32_e32 v13, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v14, 0
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
@@ -4029,18 +4030,17 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v10, s5
-; GFX10-NEXT:    v_and_or_b32 v13, v0, s7, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v13, s6
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v13, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v13, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v7, v13, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v13, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v13, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v13, s5
-; GFX10-NEXT:    s_mov_b64 s[0:1], 16
+; GFX10-NEXT:    v_and_or_b32 v15, v0, s7, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v15, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v15, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v15, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v15, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v7, v15, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v15, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v15, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v15, s5
 ; GFX10-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v[13:14], v[4:7], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
@@ -4085,13 +4085,13 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v12, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
-; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
-; GFX9-NEXT:    s_mov_b64 s[0:1], 16
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 16
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0
 ; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
-; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
+; GFX9-NEXT:    global_store_dwordx4 v[10:11], v[4:7], off
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX8-LABEL: insertelement_v_v16i16_v_v:
@@ -4198,6 +4198,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX10-NEXT:    s_mov_b32 s4, 0xffff
+; GFX10-NEXT:    v_mov_b32_e32 v14, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v15, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
@@ -4222,18 +4223,17 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v11, s5
-; GFX10-NEXT:    v_and_or_b32 v14, v1, v3, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v14, s6
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v14, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v14, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v14, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v14, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v14, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v11, v14, s5
-; GFX10-NEXT:    s_mov_b64 s[0:1], 16
+; GFX10-NEXT:    v_and_or_b32 v16, v1, v3, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v16, s6
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v16, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v16, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v16, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v16, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v16, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v16, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v11, v16, s5
 ; GFX10-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off
-; GFX10-NEXT:    global_store_dwordx4 v15, v[4:7], s[0:1]
+; GFX10-NEXT:    global_store_dwordx4 v[14:15], v[4:7], off
 ; GFX10-NEXT:    s_endpgm
   %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
   %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index 697cee38c90f06..f9a83e6f3ab3e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -5184,22 +5184,22 @@ define amdgpu_ps void @amdgpu_ps_call_default_cc() {
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p4) = COPY [[DEF]](p4)
   ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(p4) = G_CONSTANT i64 0
   ; CHECK-NEXT:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
-  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[C1]], [[C2]](s64)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(p4) = COPY [[C1]](p4)
   ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
   ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32)
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32)
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32)
-  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
-  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY4]](<4 x s32>)
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY [[DEF2]](s32)
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg
+  ; CHECK-NEXT:   $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY5]](<4 x s32>)
   ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY [[DEF]](p4)
   ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY [[COPY]](p4)
-  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4)
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY [[COPY1]](p4)
   ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY [[DEF1]](s64)
   ; CHECK-NEXT:   $sgpr12 = COPY [[DEF2]](s32)
-  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY1]](s32)
-  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY2]](s32)
-  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY3]](s32)
+  ; CHECK-NEXT:   $sgpr13 = COPY [[COPY2]](s32)
+  ; CHECK-NEXT:   $sgpr14 = COPY [[COPY3]](s32)
+  ; CHECK-NEXT:   $vgpr31 = COPY [[COPY4]](s32)
   ; CHECK-NEXT:   $sgpr30_sgpr31 = G_SI_CALL [[C]](p0), 0, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31
   ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def $scc
   ; CHECK-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
index f29e6456d090fb..a5bc34a9fac78c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll
@@ -1947,11 +1947,11 @@ define void @byval_a3i32_align128_byval_i16_align64([3 x i32] addrspace(5)* byva
   ; CHECK-NEXT:   [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[COPY1]](p5) :: (dereferenceable load (s16) from %ir.arg1, addrspace 5)
   ; CHECK-NEXT:   G_STORE [[LOAD]](s32), [[C]](p1) :: (store (s32) into `[3 x i32] addrspace(1)* null`, addrspace 1)
   ; CHECK-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-  ; CHECK-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[C]], [[C3]](s64)
-  ; CHECK-NEXT:   G_STORE [[LOAD1]](s32), [[PTR_ADD2]](p1) :: (store (s32) into `[3 x i32] addrspace(1)* null` + 4, addrspace 1)
-  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-  ; CHECK-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[C]], [[C4]](s64)
-  ; CHECK-NEXT:   G_STORE [[LOAD2]](s32), [[PTR_ADD3]](p1) :: (store (s32) into `[3 x i32] addrspace(1)* null` + 8, addrspace 1)
+  ; CHECK-NEXT:   [[C4:%[0-9]+]]:_(p1) = G_CONSTANT i64 4
+  ; CHECK-NEXT:   G_STORE [[LOAD1]](s32), [[C4]](p1) :: (store (s32) into `[3 x i32] addrspace(1)* null` + 4, addrspace 1)
+  ; CHECK-NEXT:   [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+  ; CHECK-NEXT:   [[C6:%[0-9]+]]:_(p1) = G_CONSTANT i64 8
+  ; CHECK-NEXT:   G_STORE [[LOAD2]](s32), [[C6]](p1) :: (store (s32) into `[3 x i32] addrspace(1)* null` + 8, addrspace 1)
   ; CHECK-NEXT:   G_STORE [[LOAD3]](s16), [[COPY3]](p1) :: (store (s16) into `i16 addrspace(1)* null`, addrspace 1)
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]]
   ; CHECK-NEXT:   S_SETPC_B64_return [[COPY4]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces.ll
new file mode 100644
index 00000000000000..88eab83433a8e4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - -stop-after=irtranslator %s | FileCheck %s
+
+; Check that the CSEMIRBuilder doesn't fold away the getelementptr during IRTranslator
+define i8 addrspace(7)* @no_auto_constfold_gep() {
+  ; CHECK-LABEL: name: no_auto_constfold_gep
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(p7) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 123
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p7) = G_PTR_ADD [[C]], [[C1]](s64)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTR_ADD]](p7)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY1]], implicit $vgpr0, implicit $vgpr1
+  %gep = getelementptr i8, i8 addrspace(7)* null, i64 123
+  ret i8 addrspace(7)* %gep
+}
+
+; Check that the CSEMIRBuilder doesn't fold away the getelementptr during IRTranslator
+define <2 x i8 addrspace(7)*> @no_auto_constfold_gep_vector() {
+  ; CHECK-LABEL: name: no_auto_constfold_gep_vector
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr30_sgpr31
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(p7) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p7>) = G_BUILD_VECTOR [[C]](p7), [[C]](p7)
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 123
+  ; CHECK-NEXT:   [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C1]](s64), [[C1]](s64)
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(<2 x p7>) = G_PTR_ADD [[BUILD_VECTOR]], [[BUILD_VECTOR1]](<2 x s64>)
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:_(<2 x p7>) = COPY [[PTR_ADD]](<2 x p7>)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x p7>)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_SETPC_B64_return [[COPY2]], implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+  %gep = getelementptr i8, <2 x i8 addrspace(7)*> zeroinitializer, <2 x i64> <i64 123, i64 123>
+  ret <2 x i8 addrspace(7)*> %gep
+}
diff --git a/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll b/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
index a6c70a9d7c4360..4e37fd8500e4ab 100644
--- a/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
+++ b/llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll
@@ -8,16 +8,16 @@ define amdgpu_kernel void @test_long_add4(<4 x i64> %arg) #0 {
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s5
-; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off
+; CHECK-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s5
 ; CHECK-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off offset:16
-; CHECK-NEXT:    ; kill: def $vgpr7_vgpr8_vgpr9_vgpr10 killed $vgpr7_vgpr8_vgpr9_vgpr10 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 killed $exec
+; CHECK-NEXT:    ; kill: def $vgpr6_vgpr7_vgpr8_vgpr9 killed $vgpr6_vgpr7_vgpr8_vgpr9 def $vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13 killed $exec
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v14, v3
-; CHECK-NEXT:    v_mov_b32_e32 v13, v2
-; CHECK-NEXT:    v_mov_b32_e32 v12, v1
-; CHECK-NEXT:    v_mov_b32_e32 v11, v0
+; CHECK-NEXT:    v_mov_b32_e32 v13, v3
+; CHECK-NEXT:    v_mov_b32_e32 v12, v2
+; CHECK-NEXT:    v_mov_b32_e32 v11, v1
+; CHECK-NEXT:    v_mov_b32_e32 v10, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s4
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s5
 ; CHECK-NEXT:    global_load_dwordx4 v[18:21], v[0:1], off
@@ -30,66 +30,67 @@ define amdgpu_kernel void @test_long_add4(<4 x i64> %arg) #0 {
 ; CHECK-NEXT:    v_mov_b32_e32 v24, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v23, v1
 ; CHECK-NEXT:    v_mov_b32_e32 v22, v0
-; CHECK-NEXT:    v_mov_b32_e32 v2, v7
-; CHECK-NEXT:    v_mov_b32_e32 v3, v8
-; CHECK-NEXT:    v_mov_b32_e32 v0, v9
-; CHECK-NEXT:    v_mov_b32_e32 v1, v10
-; CHECK-NEXT:    v_mov_b32_e32 v5, v11
-; CHECK-NEXT:    v_mov_b32_e32 v6, v12
-; CHECK-NEXT:    v_mov_b32_e32 v12, v13
-; CHECK-NEXT:    v_mov_b32_e32 v13, v14
-; CHECK-NEXT:    v_mov_b32_e32 v8, v18
-; CHECK-NEXT:    v_mov_b32_e32 v9, v19
-; CHECK-NEXT:    v_mov_b32_e32 v16, v20
-; CHECK-NEXT:    v_mov_b32_e32 v17, v21
-; CHECK-NEXT:    v_mov_b32_e32 v14, v22
-; CHECK-NEXT:    v_mov_b32_e32 v15, v23
+; CHECK-NEXT:    v_mov_b32_e32 v4, v6
+; CHECK-NEXT:    v_mov_b32_e32 v5, v7
+; CHECK-NEXT:    v_mov_b32_e32 v2, v8
+; CHECK-NEXT:    v_mov_b32_e32 v3, v9
+; CHECK-NEXT:    v_mov_b32_e32 v0, v10
+; CHECK-NEXT:    v_mov_b32_e32 v1, v11
+; CHECK-NEXT:    v_mov_b32_e32 v8, v12
+; CHECK-NEXT:    v_mov_b32_e32 v9, v13
+; CHECK-NEXT:    v_mov_b32_e32 v16, v18
+; CHECK-NEXT:    v_mov_b32_e32 v17, v19
+; CHECK-NEXT:    v_mov_b32_e32 v14, v20
+; CHECK-NEXT:    v_mov_b32_e32 v15, v21
+; CHECK-NEXT:    v_mov_b32_e32 v12, v22
+; CHECK-NEXT:    v_mov_b32_e32 v13, v23
 ; CHECK-NEXT:    v_mov_b32_e32 v10, v24
 ; CHECK-NEXT:    v_mov_b32_e32 v11, v25
+; CHECK-NEXT:    v_mov_b32_e32 v6, v4
+; CHECK-NEXT:    v_mov_b32_e32 v4, v5
+; CHECK-NEXT:    v_mov_b32_e32 v7, v16
+; CHECK-NEXT:    v_mov_b32_e32 v5, v17
+; CHECK-NEXT:    v_add_co_u32 v6, s6, v6, v7
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v4, s6, v4, v5, s6
+; CHECK-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v7, v4
 ; CHECK-NEXT:    v_mov_b32_e32 v4, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v3
-; CHECK-NEXT:    v_mov_b32_e32 v7, v8
-; CHECK-NEXT:    v_mov_b32_e32 v3, v9
-; CHECK-NEXT:    v_add_co_u32 v7, s6, v4, v7
+; CHECK-NEXT:    v_mov_b32_e32 v5, v14
+; CHECK-NEXT:    v_mov_b32_e32 v3, v15
+; CHECK-NEXT:    v_add_co_u32 v4, s6, v4, v5
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v2, s6, v2, v3, s6
-; CHECK-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
-; CHECK-NEXT:    v_mov_b32_e32 v8, v2
+; CHECK-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v5, v2
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, v1
-; CHECK-NEXT:    v_mov_b32_e32 v3, v16
-; CHECK-NEXT:    v_mov_b32_e32 v1, v17
-; CHECK-NEXT:    v_add_co_u32 v3, s6, v2, v3
+; CHECK-NEXT:    v_mov_b32_e32 v3, v12
+; CHECK-NEXT:    v_mov_b32_e32 v1, v13
+; CHECK-NEXT:    v_add_co_u32 v2, s6, v2, v3
 ; CHECK-NEXT:    v_add_co_ci_u32_e64 v0, s6, v0, v1, s6
-; CHECK-NEXT:    ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
-; CHECK-NEXT:    v_mov_b32_e32 v4, v0
-; CHECK-NEXT:    v_mov_b32_e32 v1, v5
-; CHECK-NEXT:    v_mov_b32_e32 v0, v6
-; CHECK-NEXT:    v_mov_b32_e32 v5, v14
-; CHECK-NEXT:    v_mov_b32_e32 v2, v15
-; CHECK-NEXT:    v_add_co_u32 v1, s6, v1, v5
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v0, s6, v0, v2, s6
-; CHECK-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
-; CHECK-NEXT:    v_mov_b32_e32 v2, v0
-; CHECK-NEXT:    v_mov_b32_e32 v5, v12
-; CHECK-NEXT:    v_mov_b32_e32 v0, v13
+; CHECK-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v3, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, v8
+; CHECK-NEXT:    v_mov_b32_e32 v1, v9
 ; CHECK-NEXT:    v_mov_b32_e32 v9, v10
-; CHECK-NEXT:    v_mov_b32_e32 v6, v11
-; CHECK-NEXT:    v_add_co_u32 v5, s6, v5, v9
-; CHECK-NEXT:    v_add_co_ci_u32_e64 v0, s6, v0, v6, s6
-; CHECK-NEXT:    ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
-; CHECK-NEXT:    v_mov_b32_e32 v6, v0
-; CHECK-NEXT:    ; kill: def $vgpr7_vgpr8 killed $vgpr7_vgpr8 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec
-; CHECK-NEXT:    v_mov_b32_e32 v10, v4
-; CHECK-NEXT:    v_mov_b32_e32 v9, v3
-; CHECK-NEXT:    ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
-; CHECK-NEXT:    v_mov_b32_e32 v3, v5
-; CHECK-NEXT:    v_mov_b32_e32 v4, v6
-; CHECK-NEXT:    v_mov_b32_e32 v6, s5
-; CHECK-NEXT:    v_mov_b32_e32 v5, s4
-; CHECK-NEXT:    global_store_dwordx4 v[5:6], v[7:10], off
+; CHECK-NEXT:    v_mov_b32_e32 v8, v11
+; CHECK-NEXT:    v_add_co_u32 v0, s6, v0, v9
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v8, s6, v1, v8, s6
+; CHECK-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v1, v8
+; CHECK-NEXT:    ; kill: def $vgpr6_vgpr7 killed $vgpr6_vgpr7 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v9, v5
+; CHECK-NEXT:    v_mov_b32_e32 v8, v4
+; CHECK-NEXT:    ; kill: def $vgpr2_vgpr3 killed $vgpr2_vgpr3 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec
+; CHECK-NEXT:    v_mov_b32_e32 v5, v1
+; CHECK-NEXT:    v_mov_b32_e32 v4, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
 ; CHECK-NEXT:    s_mov_b64 s[4:5], 16
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    global_store_dwordx4 v0, v[1:4], s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_endpgm
 entry:
   %load0 = load <4 x i64>, <4 x i64> addrspace(1)* null, align 32