[AMDGPU] Add inreg support for SGPR arguments (#67182)

Function parameters marked with inreg are supposed to be allocated to SGPRs. However, for compute functions, this is ignored and function parameters are allocated to VGPRs. This fix modifies CC_AMDGPU_Func in AMDGPUCallingConv.td to use SGPRs if input arg is marked inreg. --------- Co-authored-by: Jun Wang <jun.wang7@amd.com>
llvm · Nov 8, 2023 · 5447017 · 5447017
1 parent c99951d
commit 5447017
Show file tree

Hide file tree

Showing 7 changed files with 1,847 additions and 13 deletions.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -187,6 +187,11 @@ def CC_AMDGPU_Func : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
   CCIfType<[i1], CCPromoteToType<i32>>,
   CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+    !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
+  >>>,
+
   CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2665,6 +2665,11 @@ SDValue SITargetLowering::LowerFormalArguments(
 
   if (!IsKernel) {
     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
+    if (!IsGraphics && !Subtarget->enableFlatScratch()) {
+      CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
+                                                  AMDGPU::SGPR2, AMDGPU::SGPR3},
+                              4);
+    }
     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
   }
 

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2529,8 +2529,8 @@ bool isArgPassedInSGPR(const Argument *A) {
     return A->hasAttribute(Attribute::InReg) ||
            A->hasAttribute(Attribute::ByVal);
   default:
-    // TODO: Should calls support inreg for SGPR inputs?
-    return false;
+    // TODO: treat i1 as divergent?
+    return A->hasAttribute(Attribute::InReg);
   }
 }
 
@@ -2556,8 +2556,7 @@ bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo) {
     return CB->paramHasAttr(ArgNo, Attribute::InReg) ||
            CB->paramHasAttr(ArgNo, Attribute::ByVal);
   default:
-    // TODO: Should calls support inreg for SGPR inputs?
-    return false;
+    return CB->paramHasAttr(ArgNo, Attribute::InReg);
   }
 }
 

diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll
@@ -39,6 +39,15 @@ define i32 @asm_sgpr(i32 %divergent) {
   ret i32 %sgpr
 }
 
+; SGPR asm outputs are uniform regardless of the input operands.
+; Argument not divergent if marked inreg.
+; CHECK-LABEL: for function 'asm_sgpr_inreg_arg':
+; CHECK-NOT: DIVERGENT
+define i32 @asm_sgpr_inreg_arg(i32 inreg %divergent) {
+  %sgpr = call i32 asm "; def $0, $1","=s,v"(i32 %divergent)
+  ret i32 %sgpr
+}
+
 ; CHECK-LABEL: for function 'asm_mixed_sgpr_vgpr':
 ; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1, $2", "=s,=v,v"(i32 %divergent)
 ; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 0
@@ -58,6 +67,18 @@ define void @single_lane_func_arguments(i32 %i32, i1 %i1) #2 {
  ret void
 }
 
+; CHECK-LABEL: for function 'divergent_args':
+; CHECK: DIVERGENT ARGUMENTS
+define void @divergent_args(i32 %i32, i1 %i1) {
+ ret void
+}
+
+; CHECK-LABEL: for function 'no_divergent_args_if_inreg':
+; CHECK-NOT: DIVERGENT
+define void @no_divergent_args_if_inreg(i32 inreg %i32, i1 inreg %i1) {
+ ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.readfirstlane(i32) #0
 declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #1

diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/kernel-args.ll
@@ -30,8 +30,6 @@ define amdgpu_kernel void @test_amdgpu_kernel(ptr addrspace(4) byref([4 x <16 x
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:
 ; CHECK: DIVERGENT:
-; CHECK: DIVERGENT:
-; CHECK: DIVERGENT:
 define void @test_c(ptr addrspace(5) byval([4 x <16 x i8>]) %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 {
   ret void
 }