Conversation

@arsenm
Contributor

@arsenm arsenm commented Mar 4, 2025

This fixes a miscompile where a 64-bit materialize is incorrectly folded into
a sub1 use operand.

We currently do not see many subregister use operands; incidentally, there
are also SIFoldOperands bugs that prevent this fold from appearing here
today. This pre-emptively fixes the folding of 32-bit subregister uses of
64-bit materializes, in preparation for future patches.

The existing APIs are awkward, since they expect a fully formed instruction
whose operands can be used directly, rather than a new operand that still
needs to be created.
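
For context, the rule the fix enforces is that a 32-bit subregister use of a 64-bit materialized immediate may only see its own half of the value; folding the wrong (or full) 64-bit immediate into a .sub1 use is the miscompile described above. Below is a minimal, self-contained sketch of that extraction rule. It is a simplified stand-in for the SIInstrInfo::extractSubregFromImm helper referenced in the diff, not its in-tree implementation, and the enum and example value are hypothetical.

```cpp
// Sketch only: simplified stand-ins for the AMDGPU subregister indices and
// the extraction helper. The real helper is SIInstrInfo::extractSubregFromImm
// (see the diff below) and may differ in detail.
#include <cassert>
#include <cstdint>
#include <optional>

enum SimpleSubReg { NoSubRegister, Sub0, Sub1 }; // stand-ins, not real indices

std::optional<int64_t> extractSubregHalf(int64_t Imm, SimpleSubReg SubReg) {
  switch (SubReg) {
  case NoSubRegister:
    return Imm;                                                    // full 64-bit use
  case Sub0:
    return static_cast<uint32_t>(Imm);                             // low 32 bits
  case Sub1:
    return static_cast<uint32_t>(static_cast<uint64_t>(Imm) >> 32); // high 32 bits
  }
  return std::nullopt;                                             // unhandled index
}

int main() {
  // Hypothetical 64-bit materialized value, just to illustrate the rule:
  // a .sub1 use must see only the high half, never the whole immediate.
  const int64_t Imm = 0x1122334455667788;
  assert(*extractSubregHalf(Imm, Sub0) == 0x55667788);
  assert(*extractSubregHalf(Imm, Sub1) == 0x11223344);
  return 0;
}
```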


@llvmbot
Member

llvmbot commented Mar 4, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

This fixes a miscompile where a 64-bit materialize is incorrectly folded into
a sub1 use operand.

We currently do not see many subregister use operands; incidentally, there
are also SIFoldOperands bugs that prevent this fold from appearing here
today. This pre-emptively fixes the folding of 32-bit subregister uses of
64-bit materializes, in preparation for future patches.

The existing APIs are awkward, since they expect a fully formed instruction
whose operands can be used directly, rather than a new operand that still
needs to be created.


Full diff: https://github.com/llvm/llvm-project/pull/129664.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+52-11)
  • (modified) llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir (+4-4)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index eb9aabf8b6317..ee9a8ec3bb245 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -50,6 +50,11 @@ struct FoldCandidate {
     }
   }
 
+  FoldCandidate(MachineInstr *MI, unsigned OpNo, int64_t FoldImm,
+                bool Commuted_ = false, int ShrinkOp = -1)
+      : UseMI(MI), ImmToFold(FoldImm), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
+        Kind(MachineOperand::MO_Immediate), Commuted(Commuted_) {}
+
   bool isFI() const {
     return Kind == MachineOperand::MO_FrameIndex;
   }
@@ -578,16 +583,29 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
 }
 
 static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
-                                MachineInstr *MI, unsigned OpNo,
-                                MachineOperand *FoldOp, bool Commuted = false,
-                                int ShrinkOp = -1) {
+                                FoldCandidate &&Entry) {
   // Skip additional folding on the same operand.
   for (FoldCandidate &Fold : FoldList)
-    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
+    if (Fold.UseMI == Entry.UseMI && Fold.UseOpNo == Entry.UseOpNo)
       return;
-  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
-                    << " operand " << OpNo << "\n  " << *MI);
-  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
+  LLVM_DEBUG(dbgs() << "Append " << (Entry.Commuted ? "commuted" : "normal")
+                    << " operand " << Entry.UseOpNo << "\n  " << *Entry.UseMI);
+  FoldList.push_back(Entry);
+}
+
+static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
+                                MachineInstr *MI, unsigned OpNo,
+                                MachineOperand *FoldOp, bool Commuted = false,
+                                int ShrinkOp = -1) {
+  appendFoldCandidate(FoldList,
+                      FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
+}
+
+static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
+                                MachineInstr *MI, unsigned OpNo, int64_t ImmVal,
+                                bool Commuted = false, int ShrinkOp = -1) {
+  appendFoldCandidate(FoldList,
+                      FoldCandidate(MI, OpNo, ImmVal, Commuted, ShrinkOp));
 }
 
 bool SIFoldOperandsImpl::tryAddToFoldList(
@@ -847,11 +865,35 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
     return false;
 
   uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
-  if (OpToFold.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
-    appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold);
-    return true;
+  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+  if (OpToFold.isImm()) {
+    if (unsigned UseSubReg = UseOp.getSubReg()) {
+      std::optional<int64_t> SubImm =
+          SIInstrInfo::extractSubregFromImm(OpToFold.getImm(), UseSubReg);
+      if (!SubImm)
+        return false;
+
+      // TODO: Avoid the temporary MachineOperand
+      MachineOperand TmpOp = MachineOperand::CreateImm(*SubImm);
+      if (TII->isOperandLegal(*UseMI, UseOpIdx, &TmpOp)) {
+        appendFoldCandidate(FoldList, UseMI, UseOpIdx, *SubImm);
+        return true;
+      }
+
+      return false;
+    }
+
+    if (TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
+      appendFoldCandidate(FoldList, UseMI, UseOpIdx, &OpToFold);
+      return true;
+    }
   }
 
+  // TODO: Verify the following code handles subregisters correctly.
+  // TODO: Handle extract of global reference
+  if (UseOp.getSubReg())
+    return false;
+
   if (!OpToFold.isReg())
     return false;
 
@@ -861,7 +903,6 @@ bool SIFoldOperandsImpl::tryToFoldACImm(
 
   // Maybe it is just a COPY of an immediate itself.
   MachineInstr *Def = MRI->getVRegDef(UseReg);
-  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
   if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
     MachineOperand &DefOp = Def->getOperand(1);
     if (DefOp.isImm() && TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
diff --git a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
index 591bda2b22f12..2389477bb72a8 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fold-operands-subreg-imm.mir
@@ -17,7 +17,7 @@ body:             |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[COPY]].sub1, %subreg.sub1
     ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[REG_SEQUENCE]].sub0, 8, implicit-def $scc
-    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[REG_SEQUENCE]].sub1, 8, implicit-def $scc, implicit $scc
+    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[REG_SEQUENCE]].sub1, 0, implicit-def $scc, implicit $scc
     ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE1]]
     %0:sgpr_64 = COPY $sgpr8_sgpr9
@@ -42,8 +42,8 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr8_vgpr9
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]].sub0, %subreg.sub0, [[COPY]].sub1, %subreg.sub1
-    ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, 30064771075, 0, implicit $exec
-    ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[REG_SEQUENCE]].sub1, 30064771075, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+    ; CHECK-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[REG_SEQUENCE]].sub0, 3, 0, implicit $exec
+    ; CHECK-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[REG_SEQUENCE]].sub1, 7, [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
     ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[REG_SEQUENCE1]]
     %0:vreg_64 = COPY $vgpr8_vgpr9
@@ -116,7 +116,7 @@ body:             |
     ; CHECK-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 8
     ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B64_]].sub1, %subreg.sub0, [[S_MOV_B64_]].sub0, %subreg.sub1
     ; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]].sub0, 8, implicit-def $scc
-    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY]].sub1, 8, implicit-def $scc, implicit $scc
+    ; CHECK-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY]].sub1, 0, implicit-def $scc, implicit $scc
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[S_ADD_U32_]], implicit [[S_ADDC_U32_]]
     %0:sgpr_64 = COPY $sgpr8_sgpr9
     %1:sreg_64 = S_MOV_B64 8
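
To connect the CHECK-line updates to the extraction rule: 30064771075 is 0x700000003, so its .sub0 use folds 3 and its .sub1 use folds 7; for S_MOV_B64 8, the .sub1 use folds 0 rather than 8, which is what the updated expectations check. A quick standalone sanity check of that arithmetic (values taken from the test diff above; plain C++, not LLVM code):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t WideImm = 30064771075ull;            // 0x700000003 from the test
  assert(static_cast<uint32_t>(WideImm) == 3);        // folds into the .sub0 use
  assert(static_cast<uint32_t>(WideImm >> 32) == 7);  // folds into the .sub1 use

  const uint64_t MovImm = 8;                          // S_MOV_B64 8 from the test
  assert(static_cast<uint32_t>(MovImm >> 32) == 0);   // .sub1 use now folds 0, not 8
  return 0;
}
```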

Base automatically changed from users/arsenm/amdgpu-add-tests-subreg-with-imm-si-fold-operands to main March 4, 2025 16:12
@arsenm arsenm force-pushed the users/arsenm/amdgpu/fix-fold-operands-immediate-handling-subreg-uses branch from 5b08cec to c216acd on March 4, 2025 16:13
@arsenm arsenm merged commit c8f4c35 into main Mar 4, 2025
11 checks passed
@arsenm arsenm deleted the users/arsenm/amdgpu/fix-fold-operands-immediate-handling-subreg-uses branch March 4, 2025 18:06