AMDGPU/GlobalISel: Insert readfirstlane on SGPR returns
In case the source value ends up in a VGPR, insert a readfirstlane to
avoid producing an illegal copy later. If it turns out to be
unnecessary, it can be folded out.
arsenm committed Mar 10, 2020
1 parent a314050 commit 67cfbec
Showing 6 changed files with 693 additions and 257 deletions.
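For orientation, here is the effect in miniature, condensed from the CHECK lines of the new irtranslator test added below (sgpr_return_i32 in irtranslator-amdgpu_ps.ll). A shader return that the calling convention assigns to an SGPR now gets a readfirstlane between the returned value and the physical-register copy:

; Input IR: the ABI assigns this i32 return to $sgpr0, while %vgpr arrives in a VGPR.
define amdgpu_ps i32 @sgpr_return_i32(i32 %vgpr) {
  ret i32 %vgpr
}

; MIR emitted by the IRTranslator after this change:
;   %0:_(s32) = COPY $vgpr0
;   %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), %0(s32)
;   $sgpr0 = COPY %1(s32)
;   SI_RETURN_TO_EPILOG implicit $sgpr0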
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -59,6 +59,18 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
    } else
      ExtReg = extendRegister(ValVReg, VA);

+    // If this is a scalar return, insert a readfirstlane just in case the value
+    // ends up in a VGPR.
+    // FIXME: Assert this is a shader return.
+    const SIRegisterInfo *TRI
+      = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+    if (TRI->isSGPRReg(MRI, PhysReg)) {
+      auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
+                                              {MRI.getType(ExtReg)}, false)
+                        .addReg(ExtReg);
+      ExtReg = ToSGPR.getReg(0);
+    }
+
    MIRBuilder.buildCopy(PhysReg, ExtReg);
    MIB.addUse(PhysReg, RegState::Implicit);
  }
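Note that the handler guards only on the destination being an SGPR; it does not check whether the source is already uniform. The non_void_ret test below shows even a constant return being wrapped at this stage:

; From non_void_ret in irtranslator-amdgpu_vs.ll:
;   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
;   [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32)
;   $sgpr0 = COPY [[INT]](s32)

Per the commit message, a readfirstlane that turns out to be redundant is left for later passes to fold out rather than avoided up front.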
51 changes: 9 additions & 42 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/bswap.ll
@@ -29,8 +29,7 @@ define amdgpu_ps i32 @s_bswap_i32(i32 inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
%bswap = call i32 @llvm.bswap.i32(i32 %src)
- %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap)
- ret i32 %to.sgpr
+ ret i32 %bswap
}

define i32 @v_bswap_i32(i32 %src) {
@@ -96,13 +95,7 @@ define amdgpu_ps <2 x i32> @s_bswap_v2i32(<2 x i32> inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: ; return to shader part epilog
%bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %src)
- %bswap.0 = extractelement <2 x i32> %bswap, i32 0
- %bswap.1 = extractelement <2 x i32> %bswap, i32 1
- %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0)
- %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1)
- %ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0
- %ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1
- ret <2 x i32> %ins.1
+ ret <2 x i32> %bswap
}

define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
@@ -137,7 +130,7 @@ define <2 x i32> @v_bswap_v2i32(<2 x i32> %src) {
ret <2 x i32> %bswap
}

- define amdgpu_ps <2 x i32> @s_bswap_i64(i64 inreg %src) {
+ define amdgpu_ps i64 @s_bswap_i64(i64 inreg %src) {
; GFX7-LABEL: s_bswap_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8
@@ -173,14 +166,7 @@ define amdgpu_ps <2 x i32> @s_bswap_i64(i64 inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: ; return to shader part epilog
%bswap = call i64 @llvm.bswap.i64(i64 %src)
- %cast = bitcast i64 %bswap to <2 x i32>
- %elt0 = extractelement <2 x i32> %cast, i32 0
- %elt1 = extractelement <2 x i32> %cast, i32 1
- %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0)
- %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1)
- %ins.0 = insertelement <2 x i32> undef, i32 %to.sgpr0, i32 0
- %ins.1 = insertelement <2 x i32> %ins.0, i32 %to.sgpr1, i32 1
- ret <2 x i32> %ins.1
+ ret i64 %bswap
}

define i64 @v_bswap_i64(i64 %src) {
@@ -218,7 +204,7 @@ define i64 @v_bswap_i64(i64 %src) {
ret i64 %bswap
}

- define amdgpu_ps <4 x i32> @s_bswap_v2i64(<2 x i64> inreg %src) {
+ define amdgpu_ps <2 x i64> @s_bswap_v2i64(<2 x i64> inreg %src) {
; GFX7-LABEL: s_bswap_v2i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_alignbit_b32 v0, s1, s1, 8
@@ -274,20 +260,7 @@ define amdgpu_ps <4 x i32> @s_bswap_v2i64(<2 x i64> inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s3, v3
; GFX9-NEXT: ; return to shader part epilog
%bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %src)
- %cast = bitcast <2 x i64> %bswap to <4 x i32>
- %bswap.0 = extractelement <4 x i32> %cast, i32 0
- %bswap.1 = extractelement <4 x i32> %cast, i32 1
- %bswap.2 = extractelement <4 x i32> %cast, i32 2
- %bswap.3 = extractelement <4 x i32> %cast, i32 3
- %to.sgpr0 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.0)
- %to.sgpr1 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.1)
- %to.sgpr2 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.2)
- %to.sgpr3 = call i32 @llvm.amdgcn.readfirstlane(i32 %bswap.3)
- %ins.0 = insertelement <4 x i32> undef, i32 %to.sgpr0, i32 0
- %ins.1 = insertelement <4 x i32> %ins.0, i32 %to.sgpr1, i32 1
- %ins.2 = insertelement <4 x i32> %ins.1, i32 %to.sgpr2, i32 2
- %ins.3 = insertelement <4 x i32> %ins.2, i32 %to.sgpr3, i32 3
- ret <4 x i32> %ins.3
+ ret <2 x i64> %bswap
}

define <2 x i64> @v_bswap_v2i64(<2 x i64> %src) {
@@ -345,7 +318,6 @@ define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: s_lshr_b32 s0, s0, 8
; GFX7-NEXT: s_or_b32 s0, s0, s1
- ; GFX7-NEXT: s_bfe_u32 s0, s0, 0x100000
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_bswap_i16:
@@ -364,10 +336,7 @@ define amdgpu_ps i16 @s_bswap_i16(i16 inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
%bswap = call i16 @llvm.bswap.i16(i16 %src)
- %zext = zext i16 %bswap to i32
- %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %zext)
- %trunc = trunc i32 %to.sgpr to i16
- ret i16 %trunc
+ ret i16 %bswap
}

define i16 @v_bswap_i16(i16 %src) {
@@ -431,9 +400,8 @@ define amdgpu_ps i32 @s_bswap_v2i16(<2 x i16> inreg %src) {
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
%bswap = call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %src)
- %cast0 = bitcast <2 x i16> %bswap to i32
- %to.sgpr = call i32 @llvm.amdgcn.readfirstlane(i32 %cast0)
- ret i32 %to.sgpr
+ %cast = bitcast <2 x i16> %bswap to i32
+ ret i32 %cast
}

define i32 @v_bswap_i16_zext_to_i32(i16 %src) {
@@ -574,7 +542,6 @@ define i64 @v_bswap_i48(i64 %src) {
ret i64 %zext
}

- declare i32 @llvm.amdgcn.readfirstlane(i32) #0
declare i16 @llvm.bswap.i16(i16) #1
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>) #1
declare <3 x i16> @llvm.bswap.v3i16(<3 x i16>) #1
71 changes: 71 additions & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_ps.ll
@@ -31,6 +31,77 @@ main_body:
ret void
}

define amdgpu_ps float @vgpr_return(i32 %vgpr) {
; CHECK-LABEL: name: vgpr_return
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: $vgpr0 = COPY [[COPY]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0
%cast = bitcast i32 %vgpr to float
ret float %cast
}

define amdgpu_ps i32 @sgpr_return_i32(i32 %vgpr) {
; CHECK-LABEL: name: sgpr_return_i32
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0
ret i32 %vgpr
}

define amdgpu_ps i64 @sgpr_return_i64(i64 %vgpr) {
; CHECK-LABEL: name: sgpr_return_i64
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[MV]](s64)
; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
; CHECK: $sgpr1 = COPY [[INT1]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
ret i64 %vgpr
}

define amdgpu_ps <2 x i32> @sgpr_return_v2i32(<2 x i32> %vgpr) {
; CHECK-LABEL: name: sgpr_return_v2i32
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32)
; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BUILD_VECTOR]](<2 x s32>)
; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[UV1]](s32)
; CHECK: $sgpr1 = COPY [[INT1]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
ret <2 x i32> %vgpr
}

define amdgpu_ps { i32, i32 } @sgpr_struct_return_i32_i32(i32 %vgpr0, i32 %vgpr1) {
; CHECK-LABEL: name: sgpr_struct_return_i32_i32
; CHECK: bb.1 (%ir-block.0):
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
; CHECK: $sgpr0 = COPY [[INT]](s32)
; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
; CHECK: $sgpr1 = COPY [[INT1]](s32)
; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
%insertvalue0 = insertvalue { i32, i32 } undef, i32 %vgpr0, 0
%value = insertvalue { i32, i32 } %insertvalue0, i32 %vgpr1, 1
ret { i32, i32 } %value
}

declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #0

attributes #0 = { nounwind }
96 changes: 61 additions & 35 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_vs.ll
@@ -1,73 +1,99 @@
- ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=irtranslator -global-isel %s -o - | FileCheck %s
+ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+ ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=irtranslator %s -o - | FileCheck %s

- ; CHECK-LABEL: name: test_f32_inreg
- ; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[S0]]
define amdgpu_vs void @test_f32_inreg(float inreg %arg0) {
+ ; CHECK-LABEL: name: test_f32_inreg
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), 0, 0
+ ; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0
ret void
}

- ; CHECK-LABEL: name: test_f32
- ; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[V0]]
define amdgpu_vs void @test_f32(float %arg0) {
+ ; CHECK-LABEL: name: test_f32
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $vgpr0
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), 0, 0
+ ; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0
ret void
}

- ; CHECK-LABEL: name: test_ptr2_inreg
- ; CHECK: [[S2:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; CHECK: [[S3:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; CHECK: [[PTR:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[S2]](s32), [[S3]](s32)
- ; CHECK: G_LOAD [[PTR]]
define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) {
+ ; CHECK-LABEL: name: test_ptr2_inreg
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; CHECK: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (volatile load 4 from %ir.arg0, addrspace 4)
+ ; CHECK: S_ENDPGM 0
%tmp0 = load volatile i32, i32 addrspace(4)* %arg0
ret void
}

- ; CHECK-LABEL: name: test_sgpr_alignment0
- ; CHECK: [[S2:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; CHECK: [[S3:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; CHECK: [[S4:%[0-9]+]]:_(s32) = COPY $sgpr4
- ; CHECK: [[S34:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[S3]](s32), [[S4]](s32)
- ; CHECK: G_LOAD [[S34]]
- ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[S2]](s32)
define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* inreg %arg1) {
+ ; CHECK-LABEL: name: test_sgpr_alignment0
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $sgpr4
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $sgpr4
+ ; CHECK: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+ ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (volatile load 4 from %ir.arg1, addrspace 4)
+ ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY]](s32), [[DEF]](s32), [[DEF]](s32), [[DEF]](s32), 0, 0
+ ; CHECK: S_ENDPGM 0
%tmp0 = load volatile i32, i32 addrspace(4)* %arg1
call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0
ret void
}

- ; CHECK-LABEL: name: test_order
- ; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; CHECK: [[V0:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK: [[V1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[V0]](s32), [[S0]](s32), [[V1]](s32), [[S1]](s32)
define amdgpu_vs void @test_order(float inreg %arg0, float inreg %arg1, float %arg2, float %arg3) {
+ ; CHECK-LABEL: name: test_order
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: liveins: $sgpr2, $sgpr3, $vgpr0, $vgpr1
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr1
+ ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), 32, 15, [[COPY2]](s32), [[COPY]](s32), [[COPY3]](s32), [[COPY1]](s32), 0, 0
+ ; CHECK: S_ENDPGM 0
call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg2, float %arg0, float %arg3, float %arg1, i1 false, i1 false) #0
ret void
}

- ; CHECK-LABEL: name: ret_struct
- ; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr2
- ; CHECK: [[S1:%[0-9]+]]:_(s32) = COPY $sgpr3
- ; CHECK: $sgpr0 = COPY [[S0]]
- ; CHECK: $sgpr1 = COPY [[S1]]
- ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
define amdgpu_vs <{ i32, i32 }> @ret_struct(i32 inreg %arg0, i32 inreg %arg1) {
+ ; CHECK-LABEL: name: ret_struct
+ ; CHECK: bb.1.main_body:
+ ; CHECK: liveins: $sgpr2, $sgpr3
+ ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr2
+ ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr3
+ ; CHECK: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY]](s32)
+ ; CHECK: $sgpr0 = COPY [[INT]](s32)
+ ; CHECK: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[COPY1]](s32)
+ ; CHECK: $sgpr1 = COPY [[INT1]](s32)
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1
main_body:
%tmp0 = insertvalue <{ i32, i32 }> undef, i32 %arg0, 0
%tmp1 = insertvalue <{ i32, i32 }> %tmp0, i32 %arg1, 1
ret <{ i32, i32 }> %tmp1
}

- ; CHECK_LABEL: name: non_void_ret
- ; CHECK: [[ZERO:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK: $sgpr0 = COPY [[ZERO]]
- ; SI_RETURN_TO_EPILOG $sgpr0
define amdgpu_vs i32 @non_void_ret() {
+ ; CHECK-LABEL: name: non_void_ret
+ ; CHECK: bb.1 (%ir-block.0):
+ ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readfirstlane), [[C]](s32)
+ ; CHECK: $sgpr0 = COPY [[INT]](s32)
+ ; CHECK: SI_RETURN_TO_EPILOG implicit $sgpr0
ret i32 0
}
