
[PowerPC] Implement lock free __atomic_fetch_min/max for i128 #69573

Open · wants to merge 3 commits into base: main
Conversation

@bzEq (Collaborator) commented Oct 19, 2023

Fixes #68390.
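
For context, a minimal user-level trigger (a hypothetical reproducer, grounded only in the PR and issue titles, not taken from the issue itself) is a 128-bit atomic max through the GNU __atomic builtins, which previously caused an internal compiler error on powerpc64le:

  // Hypothetical reproducer: lowered to an `atomicrmw max` on i128,
  // for which the PowerPC backend had no lock-free expansion.
  __int128 fetch_max(__int128 *p, __int128 v) {
    return __atomic_fetch_max(p, v, __ATOMIC_SEQ_CST);
  }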

@llvmbot (Collaborator) commented Oct 19, 2023

@llvm/pr-subscribers-llvm-ir

Author: Kai Luo (bzEq)

Changes

Fixes #68390.


Patch is 43.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/69573.diff

5 Files Affected:

  • (modified) llvm/include/llvm/IR/IntrinsicsPowerPC.td (+5)
  • (modified) llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp (+145)
  • (modified) llvm/lib/Target/PowerPC/PPCISelLowering.cpp (+12)
  • (modified) llvm/lib/Target/PowerPC/PPCInstr64Bit.td (+9)
  • (modified) llvm/test/CodeGen/PowerPC/atomics-i128.ll (+964)
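
The core of the change is the post-RA expansion in PPCExpandAtomicPseudoInsts.cpp below: a lqarx/stqcx. loop in which the 128-bit comparison is decomposed into a compare of the high doublewords (signed or unsigned, per the operation), with the unsigned low doublewords breaking ties. A minimal sketch of that branch cascade in plain C++ (not the actual MIR expansion; names are illustrative), shown for the signed-max case:

  #include <cstdint>

  // Returns true when the loaded value is already the maximum, i.e. the
  // expansion branches to ExitMBB without storing; false means the
  // operand wins and is stored back via stqcx.
  static bool keepOldForMax(int64_t OldHi, uint64_t OldLo,
                            int64_t OpHi, uint64_t OpLo) {
    if (OldHi > OpHi)
      return true;         // cmpd high parts: bgt ExitMBB
    if (OldHi < OpHi)
      return false;        // blt StoreMBB
    // High doublewords equal: the unsigned low doublewords decide.
    return OldLo >= OpLo;  // cmpld low parts: bgt/beq ExitMBB
  }

For min and the unsigned variants, only the high-part compare opcode (cmpd vs. cmpld) and the branch predicates flip, as the switch in expandAtomicRMWMinMax128 shows; the low-part compare is always unsigned.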
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index 3ede2a3736bf30d..ebf0f5df061cfc9 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1927,6 +1927,11 @@ let TargetPrefix = "ppc" in {
   def int_ppc_atomicrmw_or_i128   : AtomicRMW128Intrinsic;
   def int_ppc_atomicrmw_xor_i128  : AtomicRMW128Intrinsic;
   def int_ppc_atomicrmw_nand_i128 : AtomicRMW128Intrinsic;
+  def int_ppc_atomicrmw_max_i128  : AtomicRMW128Intrinsic;
+  def int_ppc_atomicrmw_umax_i128 : AtomicRMW128Intrinsic;
+  def int_ppc_atomicrmw_min_i128  : AtomicRMW128Intrinsic;
+  def int_ppc_atomicrmw_umin_i128 : AtomicRMW128Intrinsic;
+
   def int_ppc_cmpxchg_i128 : Intrinsic<[llvm_i64_ty, llvm_i64_ty],
                                        [llvm_ptr_ty,
                                         llvm_i64_ty, llvm_i64_ty,
diff --git a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
index aee57a5075ff719..c8554ce2eb55fac 100644
--- a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp
@@ -46,6 +46,8 @@ class PPCExpandAtomicPseudo : public MachineFunctionPass {
                           MachineBasicBlock::iterator &NMBBI);
   bool expandAtomicCmpSwap128(MachineBasicBlock &MBB, MachineInstr &MI,
                               MachineBasicBlock::iterator &NMBBI);
+  bool expandAtomicRMWMinMax128(MachineBasicBlock &MBB, MachineInstr &MI,
+                                MachineBasicBlock::iterator &NMBBI);
 };
 
 static void PairedCopy(const PPCInstrInfo *TII, MachineBasicBlock &MBB,
@@ -111,6 +113,11 @@ bool PPCExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB, MachineInstr &MI,
     MI.eraseFromParent();
     return true;
   }
+  case PPC::ATOMIC_LOAD_MIN_I128:
+  case PPC::ATOMIC_LOAD_UMIN_I128:
+  case PPC::ATOMIC_LOAD_MAX_I128:
+  case PPC::ATOMIC_LOAD_UMAX_I128:
+    return expandAtomicRMWMinMax128(MBB, MI, NMBBI);
   default:
     return false;
   }
@@ -294,6 +301,144 @@ bool PPCExpandAtomicPseudo::expandAtomicCmpSwap128(
   return true;
 }
 
+bool PPCExpandAtomicPseudo::expandAtomicRMWMinMax128(
+    MachineBasicBlock &MBB, MachineInstr &MI,
+    MachineBasicBlock::iterator &NMBBI) {
+  const MCInstrDesc &LL = TII->get(PPC::LQARX);
+  const MCInstrDesc &SC = TII->get(PPC::STQCX);
+  DebugLoc DL = MI.getDebugLoc();
+  MachineFunction *MF = MBB.getParent();
+  // For min/max operations, the control flow is as follows:
+  // MBB:
+  //   ...
+  // LoopMBB:
+  //   lqarx old, ptr
+  //   cmpd old.hi, op.hi
+  //   bgt ExitMBB
+  // CmpHiFailMBB:
+  //   blt StoreMBB
+  // CmpLoMBB:
+  //   cmpld old.lo, op.lo
+  //   bgt ExitMBB
+  // CmpLoFailMBB:
+  //   beq ExitMBB
+  // StoreMBB:
+  //   BUILD_QUADWORD tmp, op.lo, op.hi
+  //   stqcx. tmp, ptr
+  //   bne LoopMBB
+  // ExitMBB:
+  //   ...
+  const BasicBlock *BB = MBB.getBasicBlock();
+  // Create layout of control flow.
+  MachineFunction::iterator MFI = ++MBB.getIterator();
+  MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *CmpHiFailMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *CmpLoMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *CmpLoFailMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *StoreMBB = MF->CreateMachineBasicBlock(BB);
+  MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(BB);
+  MF->insert(MFI, LoopMBB);
+  MF->insert(MFI, CmpHiFailMBB);
+  MF->insert(MFI, CmpLoMBB);
+  MF->insert(MFI, CmpLoFailMBB);
+  MF->insert(MFI, StoreMBB);
+  MF->insert(MFI, ExitMBB);
+  ExitMBB->splice(ExitMBB->begin(), &MBB, std::next(MI.getIterator()),
+                  MBB.end());
+  ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+  MBB.addSuccessor(LoopMBB);
+  LoopMBB->addSuccessor(ExitMBB);
+  LoopMBB->addSuccessor(CmpHiFailMBB);
+  CmpHiFailMBB->addSuccessor(CmpLoMBB);
+  CmpHiFailMBB->addSuccessor(StoreMBB);
+  CmpLoMBB->addSuccessor(ExitMBB);
+  CmpLoMBB->addSuccessor(CmpLoFailMBB);
+  CmpLoFailMBB->addSuccessor(ExitMBB);
+  CmpLoFailMBB->addSuccessor(StoreMBB);
+  StoreMBB->addSuccessor(LoopMBB);
+  StoreMBB->addSuccessor(ExitMBB);
+  Register Old = MI.getOperand(0).getReg();
+  Register OldHi = TRI->getSubReg(Old, PPC::sub_gp8_x0);
+  Register OldLo = TRI->getSubReg(Old, PPC::sub_gp8_x1);
+  Register Scratch = MI.getOperand(1).getReg();
+  Register ScratchHi = TRI->getSubReg(Scratch, PPC::sub_gp8_x0);
+  Register ScratchLo = TRI->getSubReg(Scratch, PPC::sub_gp8_x1);
+  Register RA = MI.getOperand(2).getReg();
+  Register RB = MI.getOperand(3).getReg();
+  Register OpLo = MI.getOperand(4).getReg();
+  Register OpHi = MI.getOperand(5).getReg();
+  MachineBasicBlock *CurrentMBB = LoopMBB;
+  unsigned CmpOp, CmpFailPred, CmpSuccPred;
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("Unhandled atomic min/max operation");
+  case PPC::ATOMIC_LOAD_MAX_I128:
+    CmpOp = PPC::CMPD;
+    CmpSuccPred = PPC::PRED_GT;
+    CmpFailPred = PPC::PRED_LT;
+    break;
+  case PPC::ATOMIC_LOAD_UMAX_I128:
+    CmpOp = PPC::CMPLD;
+    CmpSuccPred = PPC::PRED_GT;
+    CmpFailPred = PPC::PRED_LT;
+    break;
+  case PPC::ATOMIC_LOAD_MIN_I128:
+    CmpOp = PPC::CMPD;
+    CmpSuccPred = PPC::PRED_LT;
+    CmpFailPred = PPC::PRED_GT;
+    break;
+  case PPC::ATOMIC_LOAD_UMIN_I128:
+    CmpOp = PPC::CMPLD;
+    CmpSuccPred = PPC::PRED_LT;
+    CmpFailPred = PPC::PRED_GT;
+    break;
+  }
+
+  BuildMI(CurrentMBB, DL, LL, Old).addReg(RA).addReg(RB);
+  BuildMI(CurrentMBB, DL, TII->get(CmpOp), PPC::CR0)
+      .addReg(OldHi)
+      .addReg(OpHi);
+  BuildMI(CurrentMBB, DL, TII->get(PPC::BCC))
+      .addImm(CmpSuccPred)
+      .addReg(PPC::CR0)
+      .addMBB(ExitMBB);
+  CurrentMBB = CmpHiFailMBB;
+  BuildMI(CurrentMBB, DL, TII->get(PPC::BCC))
+      .addImm(CmpFailPred)
+      .addReg(PPC::CR0)
+      .addMBB(StoreMBB);
+  CurrentMBB = CmpLoMBB;
+  BuildMI(CurrentMBB, DL, TII->get(PPC::CMPLD), PPC::CR0)
+      .addReg(OldLo)
+      .addReg(OpLo);
+  BuildMI(CurrentMBB, DL, TII->get(PPC::BCC))
+      .addImm(CmpSuccPred)
+      .addReg(PPC::CR0)
+      .addMBB(ExitMBB);
+  CurrentMBB = CmpLoFailMBB;
+  BuildMI(CurrentMBB, DL, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_EQ)
+      .addReg(PPC::CR0)
+      .addMBB(ExitMBB);
+  CurrentMBB = StoreMBB;
+  PairedCopy(TII, *CurrentMBB, CurrentMBB->end(), DL, ScratchHi, ScratchLo,
+             OpHi, OpLo);
+  BuildMI(CurrentMBB, DL, SC).addReg(Scratch).addReg(RA).addReg(RB);
+  BuildMI(CurrentMBB, DL, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE)
+      .addReg(PPC::CR0)
+      .addMBB(LoopMBB);
+  recomputeLiveIns(*LoopMBB);
+  recomputeLiveIns(*CmpHiFailMBB);
+  recomputeLiveIns(*CmpLoMBB);
+  recomputeLiveIns(*CmpLoFailMBB);
+  recomputeLiveIns(*StoreMBB);
+  recomputeLiveIns(*ExitMBB);
+  NMBBI = MBB.end();
+  MI.eraseFromParent();
+  return true;
+}
+
 } // namespace
 
 INITIALIZE_PASS(PPCExpandAtomicPseudo, DEBUG_TYPE, "PowerPC Expand Atomic",
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 5e0c2d62f5a9cb5..786e8eca993197c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -16989,6 +16989,10 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::ppc_atomicrmw_or_i128:
   case Intrinsic::ppc_atomicrmw_xor_i128:
   case Intrinsic::ppc_cmpxchg_i128:
+  case Intrinsic::ppc_atomicrmw_max_i128:
+  case Intrinsic::ppc_atomicrmw_umax_i128:
+  case Intrinsic::ppc_atomicrmw_min_i128:
+  case Intrinsic::ppc_atomicrmw_umin_i128:
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::i128;
     Info.ptrVal = I.getArgOperand(0);
@@ -18593,6 +18597,14 @@ getIntrinsicForAtomicRMWBinOp128(AtomicRMWInst::BinOp BinOp) {
     return Intrinsic::ppc_atomicrmw_xor_i128;
   case AtomicRMWInst::Nand:
     return Intrinsic::ppc_atomicrmw_nand_i128;
+  case AtomicRMWInst::Max:
+    return Intrinsic::ppc_atomicrmw_max_i128;
+  case AtomicRMWInst::UMax:
+    return Intrinsic::ppc_atomicrmw_umax_i128;
+  case AtomicRMWInst::Min:
+    return Intrinsic::ppc_atomicrmw_min_i128;
+  case AtomicRMWInst::UMin:
+    return Intrinsic::ppc_atomicrmw_umin_i128;
   }
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 0322bb37b1fdf8f..49996bcbcb11de1 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -393,6 +393,11 @@ def ATOMIC_CMP_SWAP_I128 : PPCPostRAExpPseudo<
                               (ins memrr:$ptr, g8rc:$cmp_lo, g8rc:$cmp_hi,
                                    g8rc:$new_lo, g8rc:$new_hi),
                               "#ATOMIC_CMP_SWAP_I128", []>;
+
+def ATOMIC_LOAD_MAX_I128  : AtomicRMW128<"#ATOMIC_LOAD_MAX_I128">;
+def ATOMIC_LOAD_UMAX_I128 : AtomicRMW128<"#ATOMIC_LOAD_UMAX_I128">;
+def ATOMIC_LOAD_MIN_I128  : AtomicRMW128<"#ATOMIC_LOAD_MIN_I128">;
+def ATOMIC_LOAD_UMIN_I128 : AtomicRMW128<"#ATOMIC_LOAD_UMIN_I128">;
 }
 
 class PatAtomicRMWI128<SDPatternOperator OpNode, AtomicRMW128 Inst> :
@@ -410,6 +415,10 @@ def : PatAtomicRMWI128<int_ppc_atomicrmw_and_i128,  ATOMIC_LOAD_AND_I128>;
 def : PatAtomicRMWI128<int_ppc_atomicrmw_nand_i128, ATOMIC_LOAD_NAND_I128>;
 def : PatAtomicRMWI128<int_ppc_atomicrmw_or_i128,   ATOMIC_LOAD_OR_I128>;
 def : PatAtomicRMWI128<int_ppc_atomicrmw_xchg_i128, ATOMIC_SWAP_I128>;
+def : PatAtomicRMWI128<int_ppc_atomicrmw_max_i128,  ATOMIC_LOAD_MAX_I128>;
+def : PatAtomicRMWI128<int_ppc_atomicrmw_umax_i128, ATOMIC_LOAD_UMAX_I128>;
+def : PatAtomicRMWI128<int_ppc_atomicrmw_min_i128,  ATOMIC_LOAD_MIN_I128>;
+def : PatAtomicRMWI128<int_ppc_atomicrmw_umin_i128, ATOMIC_LOAD_UMIN_I128>;
 def : Pat<(int_ppc_cmpxchg_i128 ForceXForm:$ptr,
                                 i64:$cmp_lo,
                                 i64:$cmp_hi,
diff --git a/llvm/test/CodeGen/PowerPC/atomics-i128.ll b/llvm/test/CodeGen/PowerPC/atomics-i128.ll
index f5422a9b7b54280..55e8e01335c3310 100644
--- a/llvm/test/CodeGen/PowerPC/atomics-i128.ll
+++ b/llvm/test/CodeGen/PowerPC/atomics-i128.ll
@@ -1735,3 +1735,967 @@ entry:
   %1 = extractvalue { i128, i1 } %0, 1
   ret i1 %1
 }
+
+define i128 @max(ptr %p, i128 %v) {
+; CHECK-LABEL: max:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sync
+; CHECK-NEXT:  .LBB13_1: # %entry
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lqarx r6, 0, r3
+; CHECK-NEXT:    cmpd r6, r4
+; CHECK-NEXT:    bgt cr0, .LBB13_6
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    #
+; CHECK-NEXT:    blt cr0, .LBB13_5
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    #
+; CHECK-NEXT:    cmpld r7, r5
+; CHECK-NEXT:    bgt cr0, .LBB13_6
+; CHECK-NEXT:  # %bb.4: # %entry
+; CHECK-NEXT:    #
+; CHECK-NEXT:    beq cr0, .LBB13_6
+; CHECK-NEXT:  .LBB13_5: # %entry
+; CHECK-NEXT:    #
+; CHECK-NEXT:    mr r9, r5
+; CHECK-NEXT:    mr r8, r4
+; CHECK-NEXT:    stqcx. r8, 0, r3
+; CHECK-NEXT:    bne cr0, .LBB13_1
+; CHECK-NEXT:  .LBB13_6: # %entry
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    mr r3, r6
+; CHECK-NEXT:    mr r4, r7
+; CHECK-NEXT:    blr
+;
+; PWR7-LABEL: max:
+; PWR7:       # %bb.0: # %entry
+; PWR7-NEXT:    mflr r0
+; PWR7-NEXT:    stdu r1, -176(r1)
+; PWR7-NEXT:    std r0, 192(r1)
+; PWR7-NEXT:    .cfi_def_cfa_offset 176
+; PWR7-NEXT:    .cfi_offset lr, 16
+; PWR7-NEXT:    .cfi_offset r27, -40
+; PWR7-NEXT:    .cfi_offset r28, -32
+; PWR7-NEXT:    .cfi_offset r29, -24
+; PWR7-NEXT:    .cfi_offset r30, -16
+; PWR7-NEXT:    std r28, 144(r1) # 8-byte Folded Spill
+; PWR7-NEXT:    std r29, 152(r1) # 8-byte Folded Spill
+; PWR7-NEXT:    mr r29, r4
+; PWR7-NEXT:    mr r28, r3
+; PWR7-NEXT:    ld r4, 8(r3)
+; PWR7-NEXT:    ld r3, 0(r3)
+; PWR7-NEXT:    std r27, 136(r1) # 8-byte Folded Spill
+; PWR7-NEXT:    addi r27, r1, 120
+; PWR7-NEXT:    std r30, 160(r1) # 8-byte Folded Spill
+; PWR7-NEXT:    mr r30, r5
+; PWR7-NEXT:    .p2align 4
+; PWR7-NEXT:  .LBB13_1: # %atomicrmw.start
+; PWR7-NEXT:    #
+; PWR7-NEXT:    cmpld r3, r29
+; PWR7-NEXT:    cmpd cr1, r3, r29
+; PWR7-NEXT:    li r7, 5
+; PWR7-NEXT:    li r8, 5
+; PWR7-NEXT:    std r3, 120(r1)
+; PWR7-NEXT:    crandc 4*cr5+lt, 4*cr1+gt, eq
+; PWR7-NEXT:    cmpld cr1, r4, r30
+; PWR7-NEXT:    crand 4*cr5+gt, eq, 4*cr1+gt
+; PWR7-NEXT:    std r4, 128(r1)
+; PWR7-NEXT:    cror 4*cr5+lt, 4*cr5+gt, 4*cr5+lt
+; PWR7-NEXT:    isel r5, r3, r29, 4*cr5+lt
+; PWR7-NEXT:    isel r6, r4, r30, 4*cr5+lt
+; PWR7-NEXT:    mr r3, r28
+; PWR7-NEXT:    mr r4, r27
+; PWR7-NEXT:    bl __atomic_compare_exchange_16
+; PWR7-NEXT:    nop
+; PWR7-NEXT:    mr r5, r3
+; PWR7-NEXT:    ld r4, 128(r1)
+; PWR7-NEXT:    ld r3, 120(r1)
+; PWR7-NEXT:    cmpldi r5, 0
+; PWR7-NEXT:    beq cr0, .LBB13_1
+; PWR7-NEXT:  # %bb.2: # %atomicrmw.end
+; PWR7-NEXT:    ld r30, 160(r1) # 8-byte Folded Reload
+; PWR7-NEXT:    ld r29, 152(r1) # 8-byte Folded Reload
+; PWR7-NEXT:    ld r28, 144(r1) # 8-byte Folded Reload
+; PWR7-NEXT:    ld r27, 136(r1) # 8-byte Folded Reload
+; PWR7-NEXT:    addi r1, r1, 176
+; PWR7-NEXT:    ld r0, 16(r1)
+; PWR7-NEXT:    mtlr r0
+; PWR7-NEXT:    blr
+;
+; LE-PWR8-LABEL: max:
+; LE-PWR8:       # %bb.0: # %entry
+; LE-PWR8-NEXT:    sync
+; LE-PWR8-NEXT:  .LBB13_1: # %entry
+; LE-PWR8-NEXT:    #
+; LE-PWR8-NEXT:    lqarx r6, 0, r3
+; LE-PWR8-NEXT:    cmpd r6, r5
+; LE-PWR8-NEXT:    bgt cr0, .LBB13_6
+; LE-PWR8-NEXT:  # %bb.2: # %entry
+; LE-PWR8-NEXT:    #
+; LE-PWR8-NEXT:    blt cr0, .LBB13_5
+; LE-PWR8-NEXT:  # %bb.3: # %entry
+; LE-PWR8-NEXT:    #
+; LE-PWR8-NEXT:    cmpld r7, r4
+; LE-PWR8-NEXT:    bgt cr0, .LBB13_6
+; LE-PWR8-NEXT:  # %bb.4: # %entry
+; LE-PWR8-NEXT:    #
+; LE-PWR8-NEXT:    beq cr0, .LBB13_6
+; LE-PWR8-NEXT:  .LBB13_5: # %entry
+; LE-PWR8-NEXT:    #
+; LE-PWR8-NEXT:    mr r9, r4
+; LE-PWR8-NEXT:    mr r8, r5
+; LE-PWR8-NEXT:    stqcx. r8, 0, r3
+; LE-PWR8-NEXT:    bne cr0, .LBB13_1
+; LE-PWR8-NEXT:  .LBB13_6: # %entry
+; LE-PWR8-NEXT:    lwsync
+; LE-PWR8-NEXT:    mr r3, r7
+; LE-PWR8-NEXT:    mr r4, r6
+; LE-PWR8-NEXT:    blr
+;
+; AIX64-PWR8-LABEL: max:
+; AIX64-PWR8:       # %bb.0: # %entry
+; AIX64-PWR8-NEXT:    sync
+; AIX64-PWR8-NEXT:  L..BB13_1: # %entry
+; AIX64-PWR8-NEXT:    #
+; AIX64-PWR8-NEXT:    lqarx r6, 0, r3
+; AIX64-PWR8-NEXT:    cmpd r6, r4
+; AIX64-PWR8-NEXT:    bgt cr0, L..BB13_6
+; AIX64-PWR8-NEXT:  # %bb.2: # %entry
+; AIX64-PWR8-NEXT:    #
+; AIX64-PWR8-NEXT:    blt cr0, L..BB13_5
+; AIX64-PWR8-NEXT:  # %bb.3: # %entry
+; AIX64-PWR8-NEXT:    #
+; AIX64-PWR8-NEXT:    cmpld r7, r5
+; AIX64-PWR8-NEXT:    bgt cr0, L..BB13_6
+; AIX64-PWR8-NEXT:  # %bb.4: # %entry
+; AIX64-PWR8-NEXT:    #
+; AIX64-PWR8-NEXT:    beq cr0, L..BB13_6
+; AIX64-PWR8-NEXT:  L..BB13_5: # %entry
+; AIX64-PWR8-NEXT:    #
+; AIX64-PWR8-NEXT:    mr r9, r5
+; AIX64-PWR8-NEXT:    mr r8, r4
+; AIX64-PWR8-NEXT:    stqcx. r8, 0, r3
+; AIX64-PWR8-NEXT:    bne cr0, L..BB13_1
+; AIX64-PWR8-NEXT:  L..BB13_6: # %entry
+; AIX64-PWR8-NEXT:    lwsync
+; AIX64-PWR8-NEXT:    mr r3, r6
+; AIX64-PWR8-NEXT:    mr r4, r7
+; AIX64-PWR8-NEXT:    blr
+;
+; PPC-PWR8-LABEL: max:
+; PPC-PWR8:       # %bb.0: # %entry
+; PPC-PWR8-NEXT:    mflr r0
+; PPC-PWR8-NEXT:    stwu r1, -80(r1)
+; PPC-PWR8-NEXT:    stw r0, 84(r1)
+; PPC-PWR8-NEXT:    .cfi_def_cfa_offset 80
+; PPC-PWR8-NEXT:    .cfi_offset lr, 4
+; PPC-PWR8-NEXT:    .cfi_offset r24, -32
+; PPC-PWR8-NEXT:    .cfi_offset r25, -28
+; PPC-PWR8-NEXT:    .cfi_offset r26, -24
+; PPC-PWR8-NEXT:    .cfi_offset r27, -20
+; PPC-PWR8-NEXT:    .cfi_offset r28, -16
+; PPC-PWR8-NEXT:    .cfi_offset r29, -12
+; PPC-PWR8-NEXT:    .cfi_offset r30, -8
+; PPC-PWR8-NEXT:    stw r26, 56(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT:    stw r27, 60(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT:    mr r27, r5
+; PPC-PWR8-NEXT:    mr r26, r3
+; PPC-PWR8-NEXT:    stw r28, 64(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT:    mr r28, r6
+; PPC-PWR8-NEXT:    lwz r6, 12(r3)
+; PPC-PWR8-NEXT:    stw r24, 48(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT:    lwz r5, 8(r3)
+; PPC-PWR8-NEXT:    lwz r4, 4(r3)
+; PPC-PWR8-NEXT:    stw r25, 52(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT:    addi r25, r1, 32
+; PPC-PWR8-NEXT:    lwz r3, 0(r3)
+; PPC-PWR8-NEXT:    stw r29, 68(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT:    mr r29, r7
+; PPC-PWR8-NEXT:    addi r24, r1, 16
+; PPC-PWR8-NEXT:    stw r30, 72(r1) # 4-byte Folded Spill
+; PPC-PWR8-NEXT:    mr r30, r8
+; PPC-PWR8-NEXT:    .p2align 4
+; PPC-PWR8-NEXT:  .LBB13_1: # %atomicrmw.start
+; PPC-PWR8-NEXT:    #
+; PPC-PWR8-NEXT:    cmplw r3, r27
+; PPC-PWR8-NEXT:    cmpw cr1, r3, r27
+; PPC-PWR8-NEXT:    stw r3, 32(r1)
+; PPC-PWR8-NEXT:    stw r4, 36(r1)
+; PPC-PWR8-NEXT:    xor r7, r3, r27
+; PPC-PWR8-NEXT:    xor r8, r4, r28
+; PPC-PWR8-NEXT:    stw r5, 40(r1)
+; PPC-PWR8-NEXT:    stw r6, 44(r1)
+; PPC-PWR8-NEXT:    cmplw cr5, r4, r28
+; PPC-PWR8-NEXT:    cmplw cr6, r5, r29
+; PPC-PWR8-NEXT:    crandc 4*cr5+lt, 4*cr1+gt, eq
+; PPC-PWR8-NEXT:    cmplw cr7, r6, r30
+; PPC-PWR8-NEXT:    crand 4*cr5+gt, eq, 4*cr5+gt
+; PPC-PWR8-NEXT:    or r7, r8, r7
+; PPC-PWR8-NEXT:    cmplwi cr1, r7, 0
+; PPC-PWR8-NEXT:    crand 4*cr5+eq, 4*cr6+eq, 4*cr7+gt
+; PPC-PWR8-NEXT:    crandc 4*cr5+un, 4*cr6+gt, 4*cr6+eq
+; PPC-PWR8-NEXT:    li r7, 5
+; PPC-PWR8-NEXT:    li r8, 5
+; PPC-PWR8-NEXT:    cror 4*cr5+lt, 4*cr5+gt, 4*cr5+lt
+; PPC-PWR8-NEXT:    cror 4*cr5+gt, 4*cr5+eq, 4*cr5+un
+; PPC-PWR8-NEXT:    crandc 4*cr5+lt, 4*cr5+lt, 4*cr1+eq
+; PPC-PWR8-NEXT:    crand 4*cr5+gt, 4*cr1+eq, 4*cr5+gt
+; PPC-PWR8-NEXT:    cror 4*cr5+lt, 4*cr5+gt, 4*cr5+lt
+; PPC-PWR8-NEXT:    isel r3, r3, r27, 4*cr5+lt
+; PPC-PWR8-NEXT:    isel r5, r5, r29, 4*cr5+lt
+; PPC-PWR8-NEXT:    isel r6, r6, r30, 4*cr5+lt
+; PPC-PWR8-NEXT:    isel r4, r4, r28, 4*cr5+lt
+; PPC-PWR8-NEXT:    stw r6, 28(r1)
+; PPC-PWR8-NEXT:    stw r5, 24(r1)
+; PPC-PWR8-NEXT:    stw r4, 20(r1)
+; PPC-PWR8-NEXT:    stw r3, 16(r1)
+; PPC-PWR8-NEXT:    li r3, 16
+; PPC-PWR8-NEXT:    mr r4, r26
+; PPC-PWR8-NEXT:    mr r5, r25
+; PPC-PWR8-NEXT:    mr r6, r24
+; PPC-PWR8-NEXT:    bl __atomic_compare_exchange
+; PPC-PWR8-NEXT:    mr r7, r3
+; PPC-PWR8-NEXT:    lwz r6, 44(r1)
+; PPC-PWR8-NEXT:    lwz r5, 40(r1)
+; PPC-PWR8-NEXT:    lwz r4, 36(r1)
+; PPC-PWR8-NEXT:    lwz r3, 32(r1)
+; PPC-PWR8-NEXT:    cmplwi r7, 0
+; PPC-PWR8-NEXT:    beq cr0, .LBB13_1
+; PPC-PWR8-NEXT:  # %bb.2: # %atomicrmw.end
+; PPC-PWR8-NEXT:    lwz r30, 72(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT:    lwz r29, 68(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT:    lwz r28, 64(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT:    lwz r27, 60(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT:    lwz r26, 56(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT:    lwz r25, 52(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT:    lwz r24, 48(r1) # 4-byte Folded Reload
+; PPC-PWR8-NEXT:    lwz r0, 84(r1)
+; PPC-PWR8-NEXT:    addi r1, r1, 80
+; PPC-PWR8-NEXT:    mtlr r0
+; PPC-PWR8-NEXT:    blr
+entry:
+  %0 = atomicrmw max ptr %p, i128 %v seq_cst, align 16
+  ret i128 %0
+}
+
+define i128 @umax(ptr %p, i128 %v) {
+; CHECK-LABEL: umax:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    sync
+; CHECK-NEXT:  .LBB14_1: # %entry
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lqarx r6, 0, r3
+; CHECK-NEXT:    cmpld r6, r4
+; CHECK-NEXT:    bgt cr0, .LBB14_6
+; CHECK-NEXT:  # %bb.2: # %entry
+; CHECK-NEXT:    #
+; CHECK-NEXT:    blt cr0, .LBB14_5
+; CHECK-NEXT:  # %bb.3: # %entry
+; CHECK-NEXT:    #
+; CHECK-NEXT:    cmpld r7, r5
+; CHECK-NEXT:    bgt cr0, .LBB14_6
+; CHECK-NEXT:  # %bb.4: # %entry
+; CHECK-NEXT:    #
+; CHECK-NEXT:    beq cr0, .LBB14_6
+; CHECK-NEXT:  .LBB14_5: # %entry
+; CHECK-NEXT:    #
+; CHECK-NEXT:    mr r9, r5
+; CHECK-NEXT:    mr r8, r4
+; CHECK-NEXT:    stqcx. r8, 0, r3
+; CHECK-NEXT:    bne cr0, .LBB14_1
+; CHECK-NEXT:  .LBB14_6: # %entry
+; CHECK-NEXT:    lwsync
+; CHECK-NEXT:    mr r3, r6
+; CHECK-NEXT:    mr r4, r7
+; CHECK-NEXT:    blr
+;
+; PWR7-LABEL: umax:
+; PWR7:       # %bb.0: # %entry
+; PWR7-NEXT:    mflr r0
+; PWR7-NEXT:    stdu r1, -176(r1)
+; PWR7-NEXT:    std r0, 192(r1)
+; PWR7-NEXT:    .cfi_def_cfa_offset 176
+; PWR7-NEXT:    .cfi_offset lr, 16
+; PWR7-NEXT:    .cfi_offset r27, -40
+; PWR7-NEXT:    .cfi_offset r28, -32
+; PWR7-NEXT:    .cfi_offset r29, -24
+; PWR7-NEXT:    .cfi_offset r30, -16
+; PWR7-NEXT:    std r28, 144(r1) # 8-byte Folded Spill
+; PWR7-NEXT:    std r29, 152(r1) # 8-byte Folded Spill
+; PWR7-NEXT:    mr r29, r4
+; PWR7-NEXT:    mr r28, r3
+; PWR7-NEXT:    ld r4, 8(r3)
+; PWR7-NEXT:    ld r3, 0(r3)...
[truncated]
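
On subtargets without quadword lqarx/stqcx. (the PWR7 and 32-bit PPC-PWR8 runs above), the operation instead lowers to a compare-exchange loop that calls into libatomic (__atomic_compare_exchange_16 / __atomic_compare_exchange). A minimal C++ sketch of that fallback's shape, shown for the unsigned variant (illustrative only; the actual codegen selects the new value with isel, as the check lines show, and may require linking libatomic):

  #include <atomic>

  unsigned __int128 fetch_umax(std::atomic<unsigned __int128> &obj,
                               unsigned __int128 v) {
    unsigned __int128 old = obj.load(std::memory_order_relaxed);
    // Retry until the CAS succeeds; on failure `old` is refreshed with
    // the value currently in memory, mirroring the beq back to the loop
    // header in the test output.
    while (!obj.compare_exchange_weak(old, old > v ? old : v,
                                      std::memory_order_seq_cst)) {
    }
    return old;
  }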

@github-actions bot commented Oct 19, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

@bzEq bzEq changed the title [PowerPC] Implement __atomic_fetch_min/max for i128 [PowerPC] Implement lock free __atomic_fetch_min/max for i128 Oct 19, 2023

Successfully merging this pull request may close these issues.

powerpc64le-linux-gnu 128-bit atomic max causes ICE