Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU] Fix moveToValue for copy to phys SGPR #76715

Merged
merged 2 commits into from
Jan 2, 2024

Conversation

Pierre-vh
Copy link
Contributor

Fixes #76031

@llvmbot
Copy link
Collaborator

llvmbot commented Jan 2, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes

Fixes #76031


Full diff: https://github.com/llvm/llvm-project/pull/76715.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+11)
  • (modified) llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll (+13-12)
  • (added) llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir (+44)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2fb3957a1ca9dc..b08c579895aaae 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7198,6 +7198,17 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     Register DstReg = Inst.getOperand(0).getReg();
     const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
 
+    // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
+    // hope for the best.
+    if (Inst.isCopy() && DstReg.isPhysical() &&
+        RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
+      BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+              get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
+          .add(Inst.getOperand(1));
+      Inst.eraseFromParent();
+      return;
+    }
+
     if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
         NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
       // Instead of creating a copy where src and dst are the same register
diff --git a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
index ac196635b363a4..5c1a7093720428 100644
--- a/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/bug-cselect-b64.ll
@@ -4,19 +4,20 @@
 define amdgpu_cs <2 x i32> @f() {
 ; CHECK-LABEL: f:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_mov_b32 s0, 0
-; CHECK-NEXT:    s_mov_b32 s1, s0
-; CHECK-NEXT:    s_mov_b32 s2, s0
-; CHECK-NEXT:    s_mov_b32 s3, s0
-; CHECK-NEXT:    s_mov_b32 s4, s0
-; CHECK-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; CHECK-NEXT:    s_mov_b32 s5, s0
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    s_mov_b32 s4, 0
+; CHECK-NEXT:    s_mov_b32 s5, s4
+; CHECK-NEXT:    s_mov_b32 s6, s4
+; CHECK-NEXT:    s_mov_b32 s7, s4
+; CHECK-NEXT:    s_mov_b32 s0, s4
+; CHECK-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; CHECK-NEXT:    s_mov_b32 s1, s4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1]
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; CHECK-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
+; CHECK-NEXT:    v_cmp_ne_u64_e32 vcc_lo, s[0:1], v[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v1, s4
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
+; CHECK-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CHECK-NEXT:    ; return to shader part epilog
 bb:
   %i = call <2 x i32> @llvm.amdgcn.raw.buffer.load.v2i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir
new file mode 100644
index 00000000000000..4292e76f370962
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies-copy-to-sgpr.mir
@@ -0,0 +1,44 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -march=amdgcn -mcpu=tonga -run-pass=si-fix-sgpr-copies --verify-machineinstrs -o - %s | FileCheck %s
+
+# Copy to $sgpr0 is disconnected and becomes an IMPLICIT_DEF
+# Inserted V_AND_B32 defines virtual register after use.
+
+---
+name:            si_fix_sgpr_copies_breaks_function
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; CHECK-LABEL: name: si_fix_sgpr_copies_breaks_function
+    ; CHECK: liveins: $sgpr0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16
+    ; CHECK-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed [[S_LSHR_B32_]]
+    ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -32768
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_1]]
+    ; CHECK-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 killed [[COPY1]], [[COPY2]], implicit $exec
+    ; CHECK-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; CHECK-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_2]], [[V_XOR_B32_e64_]], implicit $exec
+    ; CHECK-NEXT: $sgpr0 = V_READFIRSTLANE_B32 [[V_AND_B32_e64_]], implicit $exec
+    ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0
+    %0:sgpr_32 = COPY $sgpr0
+    %2:sreg_32 = S_MOV_B32 16
+    %3:sreg_32 = S_LSHR_B32 %0, killed %2, implicit-def dead $scc
+    %4:sreg_32 = COPY killed %3
+    %5:sreg_32 = S_MOV_B32 -32768
+    %7:vgpr_32 = COPY killed %5
+    %6:vgpr_32 = V_XOR_B32_e64 killed %4, %7, implicit $exec
+    %8:sreg_32 = S_MOV_B32 65535
+    %10:sreg_32 = COPY %6
+    %9:sreg_32 = S_AND_B32 killed %8, killed %10, implicit-def dead $scc
+    $sgpr0 = COPY %9
+    SI_RETURN_TO_EPILOG $sgpr0
+
+...

if (Inst.isCopy() && DstReg.isPhysical() &&
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is only going to work for 32-bit registers but still an improvement

; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: s_mov_b32 s1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; CHECK-NEXT: v_readfirstlane_b32 s0, v0
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks odd, not sure it actually works. s0 isn't guaranteed to be v0, is it? What if the first active lane gets the 1 from cndmask ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The whole function looks trivially uniform to me, so it doesn't matter. However in a more realistic example, you have a return to epilog of a divergent value which you can't really do anything with

@Pierre-vh Pierre-vh merged commit 3356575 into llvm:main Jan 2, 2024
3 of 4 checks passed
@Pierre-vh Pierre-vh deleted the fix-move-to-valu branch January 2, 2024 13:45
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

[AMDGPU] SIFixSGPRCopies breaks functions with VALU copy to SGPR
3 participants