From 67cfbec7461312ce6c1a8f38ad4f45b7886d51ef Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 14 Feb 2020 21:23:07 -0500 Subject: [PATCH] AMDGPU/GlobalISel: Insert readfirstlane on SGPR returns In case the source value ends up in a VGPR, insert a readfirstlane to avoid producing an illegal copy later. If it turns out to be unnecessary, it can be folded out. --- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 12 + llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll | 51 +- .../GlobalISel/irtranslator-amdgpu_ps.ll | 71 +++ .../GlobalISel/irtranslator-amdgpu_vs.ll | 96 ++- .../GlobalISel/llvm.amdgcn.s.buffer.load.ll | 600 +++++++++++++----- .../regbankselect-amdgcn.s.buffer.load.ll | 120 +++- 6 files changed, 693 insertions(+), 257 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index f32f9ec0e6dc1..7d30178b57ed5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -59,6 +59,18 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { } else ExtReg = extendRegister(ValVReg, VA); + // If this is a scalar return, insert a readfirstlane just in case the value + // ends up in a VGPR. + // FIXME: Assert this is a shader return. + const SIRegisterInfo *TRI + = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); + if (TRI->isSGPRReg(MRI, PhysReg)) { + auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, + {MRI.getType(ExtReg)}, false) + .addReg(ExtReg); + ExtReg = ToSGPR.getReg(0); + } + MIRBuilder.buildCopy(PhysReg, ExtReg); MIB.addUse(PhysReg, RegState::Implicit); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll index f446f35b0e0c4..a6c2524f1962b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll @@ -29,8 +29,7 @@ define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) { ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %bswap = call i32 @llvm.bswap.i32(i32 %src) - %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap) - ret i32 %to.sgpr + ret i32 %bswap } define i32 @v_bswap_i32(i32 %src) { @@ -96,13 +95,7 @@ define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) { ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src) - %bswap.0 = extractelement <2 x i32> %bswap, i32 0 - %bswap.1 = extractelement <2 x i32> %bswap, i32 1 - %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0) - %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1) - %ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0 - %ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1 - ret <2 x i32> %ins.1 + ret <2 x i32> %bswap } define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) { @@ -137,7 +130,7 @@ define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) { ret <2 x i32> %bswap } -define amdgpu_ps <2 x i32> @s_bswap_i64(i64 inreg %src) { +define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) { ; GFX7-LABEL: s_bswap_i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8 @@ -173,14 +166,7 @@ define amdgpu_ps <2 x i32> @s_bswap_i64(i64 inreg %src) { ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog %bswap = call i64 @llvm.bswap.i64(i64 %src) - %cast = bitcast i64 %bswap to <2 x i32> - %elt0 = extractelement <2 x i32> %cast, i32 0 - %elt1 = extractelement <2 x i32> %cast, i32 1 - %to.sgpr0 = 
call i32 @llvm.amdgcn.readfirstlane(i32 %elt0) - %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1) - %ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0 - %ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1 - ret <2 x i32> %ins.1 + ret i64 %bswap } define i64 @v_bswap_i64(i64 %src) { @@ -218,7 +204,7 @@ define i64 @v_bswap_i64(i64 %src) { ret i64 %bswap } -define amdgpu_ps <4 x i32> @s_bswap_v2i64(<2 x i64> inreg %src) { +define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) { ; GFX7-LABEL: s_bswap_v2i64: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8 @@ -274,20 +260,7 @@ define amdgpu_ps <4 x i32> @s_bswap_v2i64(<2 x i64> inreg %src) { ; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src) - %cast = bitcast <2 x i64> %bswap to <4 x i32> - %bswap.0 = extractelement <4 x i32> %cast, i32 0 - %bswap.1 = extractelement <4 x i32> %cast, i32 1 - %bswap.2 = extractelement <4 x i32> %cast, i32 2 - %bswap.3 = extractelement <4 x i32> %cast, i32 3 - %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0) - %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1) - %to.sgpr2 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.2) - %to.sgpr3 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.3) - %ins.0 = insertelement <4 x i32> undef, i32 %to.sgpr0, i32 0 - %ins.1 = insertelement <4 x i32> %ins.0, i32 %to.sgpr1, i32 1 - %ins.2 = insertelement <4 x i32> %ins.1, i32 %to.sgpr2, i32 2 - %ins.3 = insertelement <4 x i32> %ins.2, i32 %to.sgpr3, i32 3 - ret <4 x i32> %ins.3 + ret <2 x i64> %bswap } define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) { @@ -345,7 +318,6 @@ define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) { ; GFX7-NEXT: s_and_b32 s0, s0, 0xffff ; GFX7-NEXT: s_lshr_b32 s0, s0, 8 ; GFX7-NEXT: s_or_b32 s0, s0, s1 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_bswap_i16: @@ -364,10 +336,7 @@ define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) { ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %bswap = call i16 @llvm.bswap.i16(i16 %src) - %zext = zext i16 %bswap to i32 - %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) - %trunc = trunc i32 %to.sgpr to i16 - ret i16 %trunc + ret i16 %bswap } define i16 @v_bswap_i16(i16 %src) { @@ -431,9 +400,8 @@ define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) { ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: ; return to shader part epilog %bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src) - %cast0 = bitcast <2 x i16> %bswap to i32 - %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0) - ret i32 %to.sgpr + %cast = bitcast <2 x i16> %bswap to i32 + ret i32 %cast } define i32 @v_bswap_i16_zext_to_i32(i16 %src) { @@ -574,7 +542,6 @@ define i64 @v_bswap_i48(i64 %src) { ret i64 %zext } -declare i32 @llvm.amdgcn.readfirstlane(i32) #0 declare i16 @llvm.bswap.i16(i16) #1 declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) #1 declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) #1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll index 39fb8b0ed15e1..e7ef3bd455e4b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll @@ -31,6 +31,77 @@ main_body: ret void } +define amdgpu_ps float @vgpr_return(i32 %vgpr) { + ; CHECK-LABEL: name: vgpr_return + ; CHECK: 
bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: $vgpr0 = COPY [[COPY]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 + %cast = bitcast i32 %vgpr to float + ret float %cast +} + +define amdgpu_ps i32 @sgpr_return_i32(i32 %vgpr) { + ; CHECK-LABEL: name: sgpr_return_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 + ret i32 %vgpr +} + +define amdgpu_ps i64 @sgpr_return_i64(i64 %vgpr) { + ; CHECK-LABEL: name: sgpr_return_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK: $sgpr1 = COPY [[INT1]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ret i64 %vgpr +} + +define amdgpu_ps <2 x i32> @sgpr_return_v2i32(<2 x i32> %vgpr) { + ; CHECK-LABEL: name: sgpr_return_v2i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32) + ; CHECK: $sgpr1 = COPY [[INT1]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + ret <2 x i32> %vgpr +} + +define amdgpu_ps { i32, i32 } @sgpr_struct_return_i32_i32(i32 %vgpr0, i32 %vgpr1) { + ; CHECK-LABEL: name: sgpr_struct_return_i32_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: $sgpr1 = COPY [[INT1]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 + %insertvalue0 = insertvalue { i32, i32 } undef, i32 %vgpr0, 0 + %value = insertvalue { i32, i32 } %insertvalue0, i32 %vgpr1, 1 + ret { i32, i32 } %value +} + declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0 attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll index e93d39289a447..6a32581f7a9c5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll @@ -1,73 +1,99 @@ -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=irtranslator -global-isel %s -o - | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=irtranslator %s -o - | FileCheck %s -; CHECK-LABEL: name: test_f32_inreg -; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2 -; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[S0]] define amdgpu_vs void @test_f32_inreg(float inreg %arg0) { + ; CHECK-LABEL: name: test_f32_inreg + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), 0, 0 + ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0 ret void } -; CHECK-LABEL: name: test_f32 -; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0 -; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[V0]] define amdgpu_vs void @test_f32(float %arg0) { + ; CHECK-LABEL: name: test_f32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), 0, 0 + ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0 ret void } -; CHECK-LABEL: name: test_ptr2_inreg -; CHECK: [[S2:%[0-9]+]]:_(s32) = COPY $sgpr2 -; CHECK: [[S3:%[0-9]+]]:_(s32) = COPY $sgpr3 -; CHECK: [[PTR:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[S2]](s32), [[S3]](s32) -; CHECK: G_LOAD [[PTR]] define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) { + ; CHECK-LABEL: name: test_ptr2_inreg + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; CHECK: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (volatile load 4 from %ir.arg0, addrspace 4) + ; CHECK: S_ENDPGM 0 %tmp0 = load volatile i32, i32 addrspace(4)* %arg0 ret void } -; CHECK-LABEL: name: test_sgpr_alignment0 -; CHECK: [[S2:%[0-9]+]]:_(s32) = COPY $sgpr2 -; CHECK: [[S3:%[0-9]+]]:_(s32) = COPY $sgpr3 -; CHECK: [[S4:%[0-9]+]]:_(s32) = COPY $sgpr4 -; CHECK: [[S34:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[S3]](s32), [[S4]](s32) -; CHECK: G_LOAD [[S34]] -; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[S2]](s32) define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* inreg %arg1) { + ; CHECK-LABEL: name: test_sgpr_alignment0 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4 + ; CHECK: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (volatile load 4 from %ir.arg1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS 
intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), 0, 0 + ; CHECK: S_ENDPGM 0 %tmp0 = load volatile i32, i32 addrspace(4)* %arg1 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0 ret void } -; CHECK-LABEL: name: test_order -; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2 -; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3 -; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0 -; CHECK: [[V1:%[0-9]+]]:_(s32) = COPY $vgpr1 -; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[V0]](s32), [[S0]](s32), [[V1]](s32), [[S1]](s32) define amdgpu_vs void @test_order(float inreg %arg0, float inreg %arg1, float %arg2, float %arg3) { + ; CHECK-LABEL: name: test_order + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY2]](s32), [[COPY]](s32), [[COPY3]](s32), [[COPY1]](s32), 0, 0 + ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg2, float %arg0, float %arg3, float %arg1, i1 false, i1 false) #0 ret void } -; CHECK-LABEL: name: ret_struct -; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2 -; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3 -; CHECK: $sgpr0 = COPY [[S0]] -; CHECK: $sgpr1 = COPY [[S1]] -; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 define amdgpu_vs <{ i32, i32 }> @ret_struct(i32 inreg %arg0, i32 inreg %arg1) { + ; CHECK-LABEL: name: ret_struct + ; CHECK: bb.1.main_body: + ; CHECK: liveins: $sgpr2, $sgpr3 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3 + ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32) + ; CHECK: $sgpr1 = COPY [[INT1]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 main_body: %tmp0 = insertvalue <{ i32, i32 }> undef, i32 %arg0, 0 %tmp1 = insertvalue <{ i32, i32 }> %tmp0, i32 %arg1, 1 ret <{ i32, i32 }> %tmp1 } -; CHECK_LABEL: name: non_void_ret -; CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 -; CHECK: $sgpr0 = COPY [[ZERO]] -; SI_RETURN_TO_EPILOG $sgpr0 define amdgpu_vs i32 @non_void_ret() { + ; CHECK-LABEL: name: non_void_ret + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32) + ; CHECK: $sgpr0 = COPY [[INT]](s32) + ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0 ret i32 0 } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll index 176d54bd5b238..db24f4431197f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.s.buffer.load.ll @@ -17,7 +17,9 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse ; GFX6: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], 
%subreg.sub3 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 4) - ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0 ; GFX7-LABEL: name: s_buffer_load_i32 ; GFX7: bb.1 (%ir-block.0): @@ -29,7 +31,9 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse ; GFX7: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 4) - ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0 ; GFX8-LABEL: name: s_buffer_load_i32 ; GFX8: bb.1 (%ir-block.0): @@ -41,7 +45,9 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse ; GFX8: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 4) - ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX8: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret i32 %val @@ -58,7 +64,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so ; GFX6: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1, 0 :: (dereferenceable invariant load 4) - ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX6: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0 ; GFX7-LABEL: name: s_buffer_load_i32_glc ; GFX7: bb.1 (%ir-block.0): @@ -70,7 +78,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so ; GFX7: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], 
[[COPY4]], 1, 0 :: (dereferenceable invariant load 4) - ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX7: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0 ; GFX8-LABEL: name: s_buffer_load_i32_glc ; GFX8: bb.1 (%ir-block.0): @@ -82,7 +92,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_glc(<4 x i32> inreg %rsrc, i32 inreg %so ; GFX8: [[COPY4:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[COPY4]], 1, 0 :: (dereferenceable invariant load 4) - ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX8: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec + ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 1) ret i32 %val @@ -101,8 +113,12 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX6: [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 8, align 4) ; GFX6: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0 ; GFX6: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1 - ; GFX6: $sgpr0 = COPY [[COPY5]] - ; GFX6: $sgpr1 = COPY [[COPY6]] + ; GFX6: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX6: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX6: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; GFX7-LABEL: name: s_buffer_load_v2i32 ; GFX7: bb.1 (%ir-block.0): @@ -116,8 +132,12 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX7: [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 8, align 4) ; GFX7: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0 ; GFX7: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1 - ; GFX7: $sgpr0 = COPY [[COPY5]] - ; GFX7: $sgpr1 = COPY [[COPY6]] + ; GFX7: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX7: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX7: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 ; GFX8-LABEL: name: s_buffer_load_v2i32 ; GFX8: bb.1 (%ir-block.0): @@ -131,8 +151,12 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX8: [[S_BUFFER_LOAD_DWORDX2_SGPR:%[0-9]+]]:sreg_64_xexec = 
S_BUFFER_LOAD_DWORDX2_SGPR [[REG_SEQUENCE]], [[COPY4]], 0, 0 :: (dereferenceable invariant load 8, align 4) ; GFX8: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub0 ; GFX8: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX2_SGPR]].sub1 - ; GFX8: $sgpr0 = COPY [[COPY5]] - ; GFX8: $sgpr1 = COPY [[COPY6]] + ; GFX8: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX8: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; GFX8: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1 %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <2 x i32> %val @@ -153,9 +177,15 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX6: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 ; GFX6: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 ; GFX6: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2 - ; GFX6: $sgpr0 = COPY [[COPY6]] - ; GFX6: $sgpr1 = COPY [[COPY7]] - ; GFX6: $sgpr2 = COPY [[COPY8]] + ; GFX6: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX6: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX6: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX6: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX6: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX6: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 ; GFX7-LABEL: name: s_buffer_load_v3i32 ; GFX7: bb.1 (%ir-block.0): @@ -171,9 +201,15 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX7: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 ; GFX7: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 ; GFX7: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2 - ; GFX7: $sgpr0 = COPY [[COPY6]] - ; GFX7: $sgpr1 = COPY [[COPY7]] - ; GFX7: $sgpr2 = COPY [[COPY8]] + ; GFX7: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX7: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX7: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX7: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX7: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX7: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 ; GFX8-LABEL: name: s_buffer_load_v3i32 ; GFX8: bb.1 (%ir-block.0): @@ -189,9 +225,15 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX8: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub0 ; GFX8: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub1 ; GFX8: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY5]].sub2 - ; GFX8: $sgpr0 = COPY [[COPY6]] - ; GFX8: $sgpr1 = COPY [[COPY7]] - ; GFX8: $sgpr2 = COPY [[COPY8]] + ; GFX8: 
[[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX8: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY10]], implicit $exec + ; GFX8: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX8: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX8: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY11]], implicit $exec + ; GFX8: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2 %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <3 x i32> %val @@ -216,14 +258,30 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX6: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub5 ; GFX6: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub6 ; GFX6: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub7 - ; GFX6: $sgpr0 = COPY [[COPY5]] - ; GFX6: $sgpr1 = COPY [[COPY6]] - ; GFX6: $sgpr2 = COPY [[COPY7]] - ; GFX6: $sgpr3 = COPY [[COPY8]] - ; GFX6: $sgpr4 = COPY [[COPY9]] - ; GFX6: $sgpr5 = COPY [[COPY10]] - ; GFX6: $sgpr6 = COPY [[COPY11]] - ; GFX6: $sgpr7 = COPY [[COPY12]] + ; GFX6: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX6: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; GFX6: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX6: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX6: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; GFX6: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX6: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX6: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX6: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX6: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX6: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX6: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] + ; GFX6: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; GFX6: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX6: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX6: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; GFX6: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX6: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] + ; GFX6: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; GFX6: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX6: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 ; GFX7-LABEL: name: s_buffer_load_v8i32 ; GFX7: bb.1 (%ir-block.0): @@ -243,14 +301,30 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX7: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub5 ; GFX7: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub6 ; GFX7: 
[[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub7 - ; GFX7: $sgpr0 = COPY [[COPY5]] - ; GFX7: $sgpr1 = COPY [[COPY6]] - ; GFX7: $sgpr2 = COPY [[COPY7]] - ; GFX7: $sgpr3 = COPY [[COPY8]] - ; GFX7: $sgpr4 = COPY [[COPY9]] - ; GFX7: $sgpr5 = COPY [[COPY10]] - ; GFX7: $sgpr6 = COPY [[COPY11]] - ; GFX7: $sgpr7 = COPY [[COPY12]] + ; GFX7: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX7: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; GFX7: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX7: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX7: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; GFX7: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX7: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX7: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX7: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX7: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX7: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX7: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] + ; GFX7: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; GFX7: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX7: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX7: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; GFX7: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX7: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] + ; GFX7: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; GFX7: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX7: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 ; GFX8-LABEL: name: s_buffer_load_v8i32 ; GFX8: bb.1 (%ir-block.0): @@ -270,14 +344,30 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg ; GFX8: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub5 ; GFX8: [[COPY11:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub6 ; GFX8: [[COPY12:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX8_SGPR]].sub7 - ; GFX8: $sgpr0 = COPY [[COPY5]] - ; GFX8: $sgpr1 = COPY [[COPY6]] - ; GFX8: $sgpr2 = COPY [[COPY7]] - ; GFX8: $sgpr3 = COPY [[COPY8]] - ; GFX8: $sgpr4 = COPY [[COPY9]] - ; GFX8: $sgpr5 = COPY [[COPY10]] - ; GFX8: $sgpr6 = COPY [[COPY11]] - ; GFX8: $sgpr7 = COPY [[COPY12]] + ; GFX8: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY13]], implicit $exec + ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX8: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY14]], implicit $exec + ; GFX8: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX8: [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX8: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY15]], implicit $exec + ; GFX8: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX8: [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX8: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = 
V_READFIRSTLANE_B32 [[COPY16]], implicit $exec + ; GFX8: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX8: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX8: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY17]], implicit $exec + ; GFX8: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] + ; GFX8: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; GFX8: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY18]], implicit $exec + ; GFX8: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX8: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; GFX8: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY19]], implicit $exec + ; GFX8: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] + ; GFX8: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; GFX8: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY20]], implicit $exec + ; GFX8: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7 %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <8 x i32> %val @@ -310,22 +400,54 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr ; GFX6: [[COPY18:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub13 ; GFX6: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub14 ; GFX6: [[COPY20:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub15 - ; GFX6: $sgpr0 = COPY [[COPY5]] - ; GFX6: $sgpr1 = COPY [[COPY6]] - ; GFX6: $sgpr2 = COPY [[COPY7]] - ; GFX6: $sgpr3 = COPY [[COPY8]] - ; GFX6: $sgpr4 = COPY [[COPY9]] - ; GFX6: $sgpr5 = COPY [[COPY10]] - ; GFX6: $sgpr6 = COPY [[COPY11]] - ; GFX6: $sgpr7 = COPY [[COPY12]] - ; GFX6: $sgpr8 = COPY [[COPY13]] - ; GFX6: $sgpr9 = COPY [[COPY14]] - ; GFX6: $sgpr10 = COPY [[COPY15]] - ; GFX6: $sgpr11 = COPY [[COPY16]] - ; GFX6: $sgpr12 = COPY [[COPY17]] - ; GFX6: $sgpr13 = COPY [[COPY18]] - ; GFX6: $sgpr14 = COPY [[COPY19]] - ; GFX6: $sgpr15 = COPY [[COPY20]] + ; GFX6: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX6: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX6: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec + ; GFX6: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX6: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX6: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec + ; GFX6: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX6: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX6: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX6: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX6: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX6: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX6: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] + ; GFX6: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; GFX6: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec + ; GFX6: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX6: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; GFX6: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec + ; GFX6: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] + ; GFX6: 
[[COPY28:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; GFX6: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec + ; GFX6: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] + ; GFX6: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] + ; GFX6: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec + ; GFX6: $sgpr8 = COPY [[V_READFIRSTLANE_B32_8]] + ; GFX6: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] + ; GFX6: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec + ; GFX6: $sgpr9 = COPY [[V_READFIRSTLANE_B32_9]] + ; GFX6: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY15]] + ; GFX6: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec + ; GFX6: $sgpr10 = COPY [[V_READFIRSTLANE_B32_10]] + ; GFX6: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[COPY16]] + ; GFX6: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; GFX6: $sgpr11 = COPY [[V_READFIRSTLANE_B32_11]] + ; GFX6: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] + ; GFX6: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec + ; GFX6: $sgpr12 = COPY [[V_READFIRSTLANE_B32_12]] + ; GFX6: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[COPY18]] + ; GFX6: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec + ; GFX6: $sgpr13 = COPY [[V_READFIRSTLANE_B32_13]] + ; GFX6: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[COPY19]] + ; GFX6: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec + ; GFX6: $sgpr14 = COPY [[V_READFIRSTLANE_B32_14]] + ; GFX6: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[COPY20]] + ; GFX6: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec + ; GFX6: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]] ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 ; GFX7-LABEL: name: s_buffer_load_v16i32 ; GFX7: bb.1 (%ir-block.0): @@ -353,22 +475,54 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr ; GFX7: [[COPY18:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub13 ; GFX7: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub14 ; GFX7: [[COPY20:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub15 - ; GFX7: $sgpr0 = COPY [[COPY5]] - ; GFX7: $sgpr1 = COPY [[COPY6]] - ; GFX7: $sgpr2 = COPY [[COPY7]] - ; GFX7: $sgpr3 = COPY [[COPY8]] - ; GFX7: $sgpr4 = COPY [[COPY9]] - ; GFX7: $sgpr5 = COPY [[COPY10]] - ; GFX7: $sgpr6 = COPY [[COPY11]] - ; GFX7: $sgpr7 = COPY [[COPY12]] - ; GFX7: $sgpr8 = COPY [[COPY13]] - ; GFX7: $sgpr9 = COPY [[COPY14]] - ; GFX7: $sgpr10 = COPY [[COPY15]] - ; GFX7: $sgpr11 = COPY [[COPY16]] - ; GFX7: $sgpr12 = COPY [[COPY17]] - ; GFX7: $sgpr13 = COPY [[COPY18]] - ; GFX7: $sgpr14 = COPY [[COPY19]] - ; GFX7: $sgpr15 = COPY [[COPY20]] + ; GFX7: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX7: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX7: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec + ; GFX7: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; 
GFX7: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX7: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec + ; GFX7: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX7: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX7: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX7: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX7: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX7: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX7: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] + ; GFX7: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; GFX7: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec + ; GFX7: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX7: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; GFX7: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec + ; GFX7: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] + ; GFX7: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; GFX7: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec + ; GFX7: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] + ; GFX7: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] + ; GFX7: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec + ; GFX7: $sgpr8 = COPY [[V_READFIRSTLANE_B32_8]] + ; GFX7: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] + ; GFX7: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec + ; GFX7: $sgpr9 = COPY [[V_READFIRSTLANE_B32_9]] + ; GFX7: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY15]] + ; GFX7: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec + ; GFX7: $sgpr10 = COPY [[V_READFIRSTLANE_B32_10]] + ; GFX7: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[COPY16]] + ; GFX7: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; GFX7: $sgpr11 = COPY [[V_READFIRSTLANE_B32_11]] + ; GFX7: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] + ; GFX7: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec + ; GFX7: $sgpr12 = COPY [[V_READFIRSTLANE_B32_12]] + ; GFX7: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[COPY18]] + ; GFX7: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec + ; GFX7: $sgpr13 = COPY [[V_READFIRSTLANE_B32_13]] + ; GFX7: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[COPY19]] + ; GFX7: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec + ; GFX7: $sgpr14 = COPY [[V_READFIRSTLANE_B32_14]] + ; GFX7: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[COPY20]] + ; GFX7: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec + ; GFX7: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]] ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 ; GFX8-LABEL: name: s_buffer_load_v16i32 ; GFX8: bb.1 (%ir-block.0): @@ -396,22 +550,54 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr ; GFX8: [[COPY18:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub13 ; GFX8: [[COPY19:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub14 ; GFX8: 
[[COPY20:%[0-9]+]]:sreg_32 = COPY [[S_BUFFER_LOAD_DWORDX16_SGPR]].sub15 - ; GFX8: $sgpr0 = COPY [[COPY5]] - ; GFX8: $sgpr1 = COPY [[COPY6]] - ; GFX8: $sgpr2 = COPY [[COPY7]] - ; GFX8: $sgpr3 = COPY [[COPY8]] - ; GFX8: $sgpr4 = COPY [[COPY9]] - ; GFX8: $sgpr5 = COPY [[COPY10]] - ; GFX8: $sgpr6 = COPY [[COPY11]] - ; GFX8: $sgpr7 = COPY [[COPY12]] - ; GFX8: $sgpr8 = COPY [[COPY13]] - ; GFX8: $sgpr9 = COPY [[COPY14]] - ; GFX8: $sgpr10 = COPY [[COPY15]] - ; GFX8: $sgpr11 = COPY [[COPY16]] - ; GFX8: $sgpr12 = COPY [[COPY17]] - ; GFX8: $sgpr13 = COPY [[COPY18]] - ; GFX8: $sgpr14 = COPY [[COPY19]] - ; GFX8: $sgpr15 = COPY [[COPY20]] + ; GFX8: [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY21]], implicit $exec + ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; GFX8: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; GFX8: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY22]], implicit $exec + ; GFX8: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; GFX8: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[COPY7]] + ; GFX8: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY23]], implicit $exec + ; GFX8: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] + ; GFX8: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[COPY8]] + ; GFX8: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY24]], implicit $exec + ; GFX8: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] + ; GFX8: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[COPY9]] + ; GFX8: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY25]], implicit $exec + ; GFX8: $sgpr4 = COPY [[V_READFIRSTLANE_B32_4]] + ; GFX8: [[COPY26:%[0-9]+]]:vgpr_32 = COPY [[COPY10]] + ; GFX8: [[V_READFIRSTLANE_B32_5:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY26]], implicit $exec + ; GFX8: $sgpr5 = COPY [[V_READFIRSTLANE_B32_5]] + ; GFX8: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[COPY11]] + ; GFX8: [[V_READFIRSTLANE_B32_6:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY27]], implicit $exec + ; GFX8: $sgpr6 = COPY [[V_READFIRSTLANE_B32_6]] + ; GFX8: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[COPY12]] + ; GFX8: [[V_READFIRSTLANE_B32_7:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY28]], implicit $exec + ; GFX8: $sgpr7 = COPY [[V_READFIRSTLANE_B32_7]] + ; GFX8: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[COPY13]] + ; GFX8: [[V_READFIRSTLANE_B32_8:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY29]], implicit $exec + ; GFX8: $sgpr8 = COPY [[V_READFIRSTLANE_B32_8]] + ; GFX8: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[COPY14]] + ; GFX8: [[V_READFIRSTLANE_B32_9:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY30]], implicit $exec + ; GFX8: $sgpr9 = COPY [[V_READFIRSTLANE_B32_9]] + ; GFX8: [[COPY31:%[0-9]+]]:vgpr_32 = COPY [[COPY15]] + ; GFX8: [[V_READFIRSTLANE_B32_10:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY31]], implicit $exec + ; GFX8: $sgpr10 = COPY [[V_READFIRSTLANE_B32_10]] + ; GFX8: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[COPY16]] + ; GFX8: [[V_READFIRSTLANE_B32_11:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY32]], implicit $exec + ; GFX8: $sgpr11 = COPY [[V_READFIRSTLANE_B32_11]] + ; GFX8: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[COPY17]] + ; GFX8: [[V_READFIRSTLANE_B32_12:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY33]], implicit $exec + ; GFX8: $sgpr12 = COPY [[V_READFIRSTLANE_B32_12]] + ; GFX8: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[COPY18]] + ; GFX8: [[V_READFIRSTLANE_B32_13:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY34]], implicit $exec + ; GFX8: $sgpr13 = COPY [[V_READFIRSTLANE_B32_13]] + ; GFX8: 
[[COPY35:%[0-9]+]]:vgpr_32 = COPY [[COPY19]] + ; GFX8: [[V_READFIRSTLANE_B32_14:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY35]], implicit $exec + ; GFX8: $sgpr14 = COPY [[V_READFIRSTLANE_B32_14]] + ; GFX8: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[COPY20]] + ; GFX8: [[V_READFIRSTLANE_B32_15:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY36]], implicit $exec + ; GFX8: $sgpr15 = COPY [[V_READFIRSTLANE_B32_15]] ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15 %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0) ret <16 x i32> %val @@ -428,7 +614,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1(<4 x i32> inreg %rsrc) { ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) - ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0 ; GFX7-LABEL: name: s_buffer_load_i32_offset_1 ; GFX7: bb.1 (%ir-block.0): @@ -440,7 +628,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1(<4 x i32> inreg %rsrc) { ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1 ; GFX7: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4) - ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]] + ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0 ; GFX8-LABEL: name: s_buffer_load_i32_offset_1 ; GFX8: bb.1 (%ir-block.0): @@ -451,7 +641,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1(<4 x i32> inreg %rsrc) { ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; GFX8: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 0, 0 :: (dereferenceable invariant load 4) - ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]] + ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec + ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0 %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1, i32 0) ret i32 %val @@ -467,7 +659,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_4(<4 x i32> inreg %rsrc) { ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5 ; 
GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 1, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_glc_4
   ; GFX7: bb.1 (%ir-block.0):
@@ -478,7 +672,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_4(<4 x i32> inreg %rsrc) {
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1, 1, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_glc_4
   ; GFX8: bb.1 (%ir-block.0):
@@ -489,7 +685,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_4(<4 x i32> inreg %rsrc) {
   ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 4, 1, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 4, i32 1)
   ret i32 %val
@@ -506,7 +704,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_255(<4 x i32> inreg %rsrc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_255
   ; GFX7: bb.1 (%ir-block.0):
@@ -518,7 +718,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_255(<4 x i32> inreg %rsrc) {
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 255
   ; GFX7: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_255
   ; GFX8: bb.1 (%ir-block.0):
@@ -529,7 +731,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_255(<4 x i32> inreg %rsrc) {
   ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 255, i32 0)
   ret i32 %val
@@ -545,7 +749,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_256(<4 x i32> inreg %rsrc) {
   ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 64, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_256
   ; GFX7: bb.1 (%ir-block.0):
@@ -556,7 +762,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_256(<4 x i32> inreg %rsrc) {
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 64, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_256
   ; GFX8: bb.1 (%ir-block.0):
@@ -567,7 +775,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_256(<4 x i32> inreg %rsrc) {
   ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 256, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 256, i32 0)
   ret i32 %val
@@ -583,7 +793,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1020(<4 x i32> inreg %rsrc) {
   ; GFX6: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_1020
   ; GFX7: bb.1 (%ir-block.0):
@@ -594,7 +806,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1020(<4 x i32> inreg %rsrc) {
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 255, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_1020
   ; GFX8: bb.1 (%ir-block.0):
@@ -605,7 +819,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1020(<4 x i32> inreg %rsrc) {
   ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1020, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1020, i32 0)
   ret i32 %val
@@ -622,7 +838,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1023(<4 x i32> inreg %rsrc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_1023
   ; GFX7: bb.1 (%ir-block.0):
@@ -634,7 +852,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1023(<4 x i32> inreg %rsrc) {
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1023
   ; GFX7: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_1023
   ; GFX8: bb.1 (%ir-block.0):
@@ -645,7 +865,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1023(<4 x i32> inreg %rsrc) {
   ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1023, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1023, i32 0)
   ret i32 %val
@@ -662,7 +884,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1024(<4 x i32> inreg %rsrc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1024
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_1024
   ; GFX7: bb.1 (%ir-block.0):
@@ -673,7 +897,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1024(<4 x i32> inreg %rsrc) {
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 256, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_1024
   ; GFX8: bb.1 (%ir-block.0):
@@ -684,7 +910,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1024(<4 x i32> inreg %rsrc) {
   ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1024, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1024, i32 0)
   ret i32 %val
@@ -701,7 +929,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1025(<4 x i32> inreg %rsrc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1025
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_1025
   ; GFX7: bb.1 (%ir-block.0):
@@ -713,7 +943,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1025(<4 x i32> inreg %rsrc) {
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1025
   ; GFX7: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_1025
   ; GFX8: bb.1 (%ir-block.0):
@@ -724,7 +956,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_1025(<4 x i32> inreg %rsrc) {
   ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 1025, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 1025, i32 0)
   ret i32 %val
@@ -741,7 +975,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_neg1
   ; GFX7: bb.1 (%ir-block.0):
@@ -753,7 +989,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) {
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
   ; GFX7: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_neg1
   ; GFX8: bb.1 (%ir-block.0):
@@ -765,7 +1003,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg1(<4 x i32> inreg %desc) {
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967295
   ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1, i32 0)
   ret i32 %load
@@ -782,7 +1022,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_neg4
   ; GFX7: bb.1 (%ir-block.0):
@@ -793,7 +1035,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) {
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073741823, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_neg4
   ; GFX8: bb.1 (%ir-block.0):
@@ -805,7 +1049,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg4(<4 x i32> inreg %desc) {
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967292
   ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -4, i32 0)
   ret i32 %load
@@ -822,7 +1068,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967288
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_neg8
   ; GFX7: bb.1 (%ir-block.0):
@@ -833,7 +1081,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) {
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073741822, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_neg8
   ; GFX8: bb.1 (%ir-block.0):
@@ -845,7 +1095,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg8(<4 x i32> inreg %desc) {
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294967288
   ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -8, i32 0)
   ret i32 %load
@@ -862,7 +1114,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_bit31
   ; GFX7: bb.1 (%ir-block.0):
@@ -873,7 +1127,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) {
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 536870912, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_bit31
   ; GFX8: bb.1 (%ir-block.0):
@@ -885,7 +1141,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit31(<4 x i32> inreg %desc) {
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
   ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -2147483648, i32 0)
   ret i32 %load
@@ -902,7 +1160,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_bit30(<4 x i32> inreg %desc)
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741824
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 1, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_glc_bit30
   ; GFX7: bb.1 (%ir-block.0):
@@ -913,7 +1173,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_bit30(<4 x i32> inreg %desc)
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 268435456, 1, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_glc_bit30
   ; GFX8: bb.1 (%ir-block.0):
@@ -925,7 +1187,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_glc_bit30(<4 x i32> inreg %desc)
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1073741824
   ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 1, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1073741824, i32 1)
   ret i32 %load
@@ -942,7 +1206,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit29(<4 x i32> inreg %desc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 536870912
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_bit29
   ; GFX7: bb.1 (%ir-block.0):
@@ -953,7 +1219,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit29(<4 x i32> inreg %desc) {
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 134217728, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_bit29
   ; GFX8: bb.1 (%ir-block.0):
@@ -965,7 +1233,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit29(<4 x i32> inreg %desc) {
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 536870912
   ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 536870912, i32 0)
   ret i32 %load
@@ -982,7 +1252,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit21(<4 x i32> inreg %desc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2097152
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_bit21
   ; GFX7: bb.1 (%ir-block.0):
@@ -993,7 +1265,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit21(<4 x i32> inreg %desc) {
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 524288, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_bit21
   ; GFX8: bb.1 (%ir-block.0):
@@ -1005,7 +1279,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit21(<4 x i32> inreg %desc) {
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2097152
   ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 2097152, i32 0)
   ret i32 %load
@@ -1022,7 +1298,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit20(<4 x i32> inreg %desc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_bit20
   ; GFX7: bb.1 (%ir-block.0):
@@ -1033,7 +1311,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit20(<4 x i32> inreg %desc) {
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 262144, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_bit20
   ; GFX8: bb.1 (%ir-block.0):
@@ -1045,7 +1325,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit20(<4 x i32> inreg %desc) {
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1048576
   ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 1048576, i32 0)
   ret i32 %load
@@ -1062,7 +1344,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc)
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4293918720
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_neg_bit20
   ; GFX7: bb.1 (%ir-block.0):
@@ -1073,7 +1357,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc)
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073479680, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_neg_bit20
   ; GFX8: bb.1 (%ir-block.0):
@@ -1085,7 +1371,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit20(<4 x i32> inreg %desc)
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4293918720
   ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -1048576, i32 0)
   ret i32 %load
@@ -1102,7 +1390,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit19(<4 x i32> inreg %desc) {
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 524288
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_bit19
   ; GFX7: bb.1 (%ir-block.0):
@@ -1113,7 +1403,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit19(<4 x i32> inreg %desc) {
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 131072, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_bit19
   ; GFX8: bb.1 (%ir-block.0):
@@ -1124,7 +1416,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_bit19(<4 x i32> inreg %desc) {
   ; GFX8: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[REG_SEQUENCE]], 524288, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 524288, i32 0)
   ret i32 %load
@@ -1141,7 +1435,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc)
   ; GFX6: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008
   ; GFX6: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX6: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX6: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX6: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX6: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX7-LABEL: name: s_buffer_load_i32_offset_neg_bit19
   ; GFX7: bb.1 (%ir-block.0):
@@ -1152,7 +1448,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc)
   ; GFX7: [[COPY3:%[0-9]+]]:sreg_32 = COPY $sgpr5
   ; GFX7: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX7: [[S_BUFFER_LOAD_DWORD_IMM_ci:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM_ci [[REG_SEQUENCE]], 1073610752, 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX7: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_IMM_ci]]
+  ; GFX7: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX7: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX7: SI_RETURN_TO_EPILOG implicit $sgpr0
   ; GFX8-LABEL: name: s_buffer_load_i32_offset_neg_bit19
   ; GFX8: bb.1 (%ir-block.0):
@@ -1164,7 +1462,9 @@ define amdgpu_ps i32 @s_buffer_load_i32_offset_neg_bit19(<4 x i32> inreg %desc)
   ; GFX8: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; GFX8: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4294443008
   ; GFX8: [[S_BUFFER_LOAD_DWORD_SGPR:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0 :: (dereferenceable invariant load 4)
-  ; GFX8: $sgpr0 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_BUFFER_LOAD_DWORD_SGPR]]
+  ; GFX8: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY4]], implicit $exec
+  ; GFX8: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]]
   ; GFX8: SI_RETURN_TO_EPILOG implicit $sgpr0
   %load = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 -524288, i32 0)
   ret i32 %load
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
index 6b0262e29f2ea..2d1ac5b126463 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.s.buffer.load.ll
@@ -14,7 +14,9 @@ define amdgpu_ps i32 @s_buffer_load_i32(<4 x i32> inreg %rsrc, i32 inreg %soffse
   ; CHECK: [[COPY4:%[0-9]+]]:sgpr(s32) = COPY $sgpr6
   ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(s32) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 4)
-  ; CHECK: $sgpr0 = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
+  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[AMDGPU_S_BUFFER_LOAD]](s32)
+  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+  ; CHECK: $sgpr0 = COPY [[INT]](s32)
   ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0
   %val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
   ret i32 %val
@@ -32,8 +34,12 @@ define amdgpu_ps <2 x i32> @s_buffer_load_v2i32(<4 x i32> inreg %rsrc, i32 inreg
   ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<2 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 8, align 4)
   ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<2 x s32>)
-  ; CHECK: $sgpr0 = COPY [[UV]](s32)
-  ; CHECK: $sgpr1 = COPY [[UV1]](s32)
+  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
+  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+  ; CHECK: $sgpr0 = COPY [[INT]](s32)
+  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
+  ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+  ; CHECK: $sgpr1 = COPY [[INT1]](s32)
   ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
   %val = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
   ret <2 x i32> %val
@@ -52,9 +58,15 @@ define amdgpu_ps <3 x i32> @s_buffer_load_v3i32(<4 x i32> inreg %rsrc, i32 inreg
   ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<4 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 12, align 4)
   ; CHECK: [[EXTRACT:%[0-9]+]]:sgpr(<3 x s32>) = G_EXTRACT [[AMDGPU_S_BUFFER_LOAD]](<4 x s32>), 0
   ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[EXTRACT]](<3 x s32>)
-  ; CHECK: $sgpr0 = COPY [[UV]](s32)
-  ; CHECK: $sgpr1 = COPY [[UV1]](s32)
-  ; CHECK: $sgpr2 = COPY [[UV2]](s32)
+  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
+  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+  ; CHECK: $sgpr0 = COPY [[INT]](s32)
+  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
+  ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+  ; CHECK: $sgpr1 = COPY [[INT1]](s32)
+  ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
+  ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
+  ; CHECK: $sgpr2 = COPY [[INT2]](s32)
   ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2
   %val = call <3 x i32> @llvm.amdgcn.s.buffer.load.v3i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
   ret <3 x i32> %val
@@ -72,14 +84,30 @@ define amdgpu_ps <8 x i32> @s_buffer_load_v8i32(<4 x i32> inreg %rsrc, i32 inreg
   ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 32, align 4)
   ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<8 x s32>)
-  ; CHECK: $sgpr0 = COPY [[UV]](s32)
-  ; CHECK: $sgpr1 = COPY [[UV1]](s32)
-  ; CHECK: $sgpr2 = COPY [[UV2]](s32)
-  ; CHECK: $sgpr3 = COPY [[UV3]](s32)
-  ; CHECK: $sgpr4 = COPY [[UV4]](s32)
-  ; CHECK: $sgpr5 = COPY [[UV5]](s32)
-  ; CHECK: $sgpr6 = COPY [[UV6]](s32)
-  ; CHECK: $sgpr7 = COPY [[UV7]](s32)
+  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
+  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+  ; CHECK: $sgpr0 = COPY [[INT]](s32)
+  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
+  ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+  ; CHECK: $sgpr1 = COPY [[INT1]](s32)
+  ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
+  ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
+  ; CHECK: $sgpr2 = COPY [[INT2]](s32)
+  ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
+  ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
+  ; CHECK: $sgpr3 = COPY [[INT3]](s32)
+  ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
+  ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
+  ; CHECK: $sgpr4 = COPY [[INT4]](s32)
+  ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
+  ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
+  ; CHECK: $sgpr5 = COPY [[INT5]](s32)
+  ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
+  ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
+  ; CHECK: $sgpr6 = COPY [[INT6]](s32)
+  ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
+  ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
+  ; CHECK: $sgpr7 = COPY [[INT7]](s32)
   ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7
   %val = call <8 x i32> @llvm.amdgcn.s.buffer.load.v8i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
   ret <8 x i32> %val
@@ -97,22 +125,54 @@ define amdgpu_ps <16 x i32> @s_buffer_load_v16i32(<4 x i32> inreg %rsrc, i32 inr
   ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32)
   ; CHECK: [[AMDGPU_S_BUFFER_LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_AMDGPU_S_BUFFER_LOAD [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), 0 :: (dereferenceable invariant load 64, align 4)
   ; CHECK: [[UV:%[0-9]+]]:sgpr(s32), [[UV1:%[0-9]+]]:sgpr(s32), [[UV2:%[0-9]+]]:sgpr(s32), [[UV3:%[0-9]+]]:sgpr(s32), [[UV4:%[0-9]+]]:sgpr(s32), [[UV5:%[0-9]+]]:sgpr(s32), [[UV6:%[0-9]+]]:sgpr(s32), [[UV7:%[0-9]+]]:sgpr(s32), [[UV8:%[0-9]+]]:sgpr(s32), [[UV9:%[0-9]+]]:sgpr(s32), [[UV10:%[0-9]+]]:sgpr(s32), [[UV11:%[0-9]+]]:sgpr(s32), [[UV12:%[0-9]+]]:sgpr(s32), [[UV13:%[0-9]+]]:sgpr(s32), [[UV14:%[0-9]+]]:sgpr(s32), [[UV15:%[0-9]+]]:sgpr(s32) = G_UNMERGE_VALUES [[AMDGPU_S_BUFFER_LOAD]](<16 x s32>)
-  ; CHECK: $sgpr0 = COPY [[UV]](s32)
-  ; CHECK: $sgpr1 = COPY [[UV1]](s32)
-  ; CHECK: $sgpr2 = COPY [[UV2]](s32)
-  ; CHECK: $sgpr3 = COPY [[UV3]](s32)
-  ; CHECK: $sgpr4 = COPY [[UV4]](s32)
-  ; CHECK: $sgpr5 = COPY [[UV5]](s32)
-  ; CHECK: $sgpr6 = COPY [[UV6]](s32)
-  ; CHECK: $sgpr7 = COPY [[UV7]](s32)
-  ; CHECK: $sgpr8 = COPY [[UV8]](s32)
-  ; CHECK: $sgpr9 = COPY [[UV9]](s32)
-  ; CHECK: $sgpr10 = COPY [[UV10]](s32)
-  ; CHECK: $sgpr11 = COPY [[UV11]](s32)
-  ; CHECK: $sgpr12 = COPY [[UV12]](s32)
-  ; CHECK: $sgpr13 = COPY [[UV13]](s32)
-  ; CHECK: $sgpr14 = COPY [[UV14]](s32)
-  ; CHECK: $sgpr15 = COPY [[UV15]](s32)
+  ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY [[UV]](s32)
+  ; CHECK: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY5]](s32)
+  ; CHECK: $sgpr0 = COPY [[INT]](s32)
+  ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[UV1]](s32)
+  ; CHECK: [[INT1:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY6]](s32)
+  ; CHECK: $sgpr1 = COPY [[INT1]](s32)
+  ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[UV2]](s32)
+  ; CHECK: [[INT2:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY7]](s32)
+  ; CHECK: $sgpr2 = COPY [[INT2]](s32)
+  ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[UV3]](s32)
+  ; CHECK: [[INT3:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY8]](s32)
+  ; CHECK: $sgpr3 = COPY [[INT3]](s32)
+  ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[UV4]](s32)
+  ; CHECK: [[INT4:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY9]](s32)
+  ; CHECK: $sgpr4 = COPY [[INT4]](s32)
+  ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[UV5]](s32)
+  ; CHECK: [[INT5:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY10]](s32)
+  ; CHECK: $sgpr5 = COPY [[INT5]](s32)
+  ; CHECK: [[COPY11:%[0-9]+]]:vgpr(s32) = COPY [[UV6]](s32)
+  ; CHECK: [[INT6:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY11]](s32)
+  ; CHECK: $sgpr6 = COPY [[INT6]](s32)
+  ; CHECK: [[COPY12:%[0-9]+]]:vgpr(s32) = COPY [[UV7]](s32)
+  ; CHECK: [[INT7:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY12]](s32)
+  ; CHECK: $sgpr7 = COPY [[INT7]](s32)
+  ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[UV8]](s32)
+  ; CHECK: [[INT8:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY13]](s32)
+  ; CHECK: $sgpr8 = COPY [[INT8]](s32)
+  ; CHECK: [[COPY14:%[0-9]+]]:vgpr(s32) = COPY [[UV9]](s32)
+  ; CHECK: [[INT9:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY14]](s32)
+  ; CHECK: $sgpr9 = COPY [[INT9]](s32)
+  ; CHECK: [[COPY15:%[0-9]+]]:vgpr(s32) = COPY [[UV10]](s32)
+  ; CHECK: [[INT10:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY15]](s32)
+  ; CHECK: $sgpr10 = COPY [[INT10]](s32)
+  ; CHECK: [[COPY16:%[0-9]+]]:vgpr(s32) = COPY [[UV11]](s32)
+  ; CHECK: [[INT11:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY16]](s32)
+  ; CHECK: $sgpr11 = COPY [[INT11]](s32)
+  ; CHECK: [[COPY17:%[0-9]+]]:vgpr(s32) = COPY [[UV12]](s32)
+  ; CHECK: [[INT12:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY17]](s32)
+  ; CHECK: $sgpr12 = COPY [[INT12]](s32)
+  ; CHECK: [[COPY18:%[0-9]+]]:vgpr(s32) = COPY [[UV13]](s32)
+  ; CHECK: [[INT13:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY18]](s32)
+  ; CHECK: $sgpr13 = COPY [[INT13]](s32)
+  ; CHECK: [[COPY19:%[0-9]+]]:vgpr(s32) = COPY [[UV14]](s32)
+  ; CHECK: [[INT14:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY19]](s32)
+  ; CHECK: $sgpr14 = COPY [[INT14]](s32)
+  ; CHECK: [[COPY20:%[0-9]+]]:vgpr(s32) = COPY [[UV15]](s32)
+  ; CHECK: [[INT15:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY20]](s32)
+  ; CHECK: $sgpr15 = COPY [[INT15]](s32)
   ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15
   %val = call <16 x i32> @llvm.amdgcn.s.buffer.load.v16i32(<4 x i32> %rsrc, i32 %soffset, i32 0)
   ret <16 x i32> %val