Introduce codegen for the Signal Processing Engine

Summary: The Signal Processing Engine (SPE) is found on NXP/Freescale e500v1, e500v2, and several e200 cores. This adds support targeting the e500v2, as this is more common than the e500v1, and is in SoCs still on the market. This patch is very intrusive because the SPE is binary incompatible with the traditional FPU. After discussing with others, the cleanest solution was to make both SPE and FPU features on top of a base PowerPC subset, so all FPU instructions are now wrapped with HasFPU predicates. This patch supports: * Code generation following the SPE ABI at the LLVM IR level (calling conventions) * Single- and double-precision math at the level supported by the APU. Still to do: * Vector operations * SPE intrinsics. As this changes the callee-saved register list order, one test, which tests the precise generated code, was updated to account for the new register order. Reviewed by: nemanjai Differential Revision: https://reviews.llvm.org/D44830 llvm-svn: 337347
llvm · Jul 18, 2018 · d52990c · d52990c
1 parent 4fa4fa6
commit d52990c
Show file tree

Hide file tree

Showing 23 changed files with 1,922 additions and 624 deletions.
diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -83,6 +83,16 @@ static const MCPhysReg FRegs[32] = {
   PPC::F24, PPC::F25, PPC::F26, PPC::F27,
   PPC::F28, PPC::F29, PPC::F30, PPC::F31
 };
+static const MCPhysReg SPERegs[32] = { // SPE registers S0-S31, in index order.
+  PPC::S0,  PPC::S1,  PPC::S2,  PPC::S3,
+  PPC::S4,  PPC::S5,  PPC::S6,  PPC::S7,
+  PPC::S8,  PPC::S9,  PPC::S10, PPC::S11,
+  PPC::S12, PPC::S13, PPC::S14, PPC::S15,
+  PPC::S16, PPC::S17, PPC::S18, PPC::S19,
+  PPC::S20, PPC::S21, PPC::S22, PPC::S23,
+  PPC::S24, PPC::S25, PPC::S26, PPC::S27,
+  PPC::S28, PPC::S29, PPC::S30, PPC::S31
+};
 static const MCPhysReg VFRegs[32] = {
   PPC::VF0,  PPC::VF1,  PPC::VF2,  PPC::VF3,
   PPC::VF4,  PPC::VF5,  PPC::VF6,  PPC::VF7,
@@ -648,6 +658,16 @@ struct PPCOperand : public MCParsedAsmOperand {
     Inst.addOperand(MCOperand::createReg(QFRegs[getReg()]));
   }
 
+  void addRegSPE4RCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!"); // Exactly one operand.
+    Inst.addOperand(MCOperand::createReg(RRegs[getReg()])); // Uses RRegs.
+  }
+
+  void addRegSPERCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!"); // Exactly one operand.
+    Inst.addOperand(MCOperand::createReg(SPERegs[getReg()])); // Uses SPERegs.
+  }
+
   void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::createReg(CRBITRegs[getCRBit()]));

diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -226,6 +226,17 @@ static const unsigned QFRegs[] = {
   PPC::QF28, PPC::QF29, PPC::QF30, PPC::QF31
 };
 
+static const unsigned SPERegs[] = { // SPE registers S0-S31, in index order.
+  PPC::S0, PPC::S1, PPC::S2, PPC::S3,
+  PPC::S4, PPC::S5, PPC::S6, PPC::S7,
+  PPC::S8, PPC::S9, PPC::S10, PPC::S11,
+  PPC::S12, PPC::S13, PPC::S14, PPC::S15,
+  PPC::S16, PPC::S17, PPC::S18, PPC::S19,
+  PPC::S20, PPC::S21, PPC::S22, PPC::S23,
+  PPC::S24, PPC::S25, PPC::S26, PPC::S27,
+  PPC::S28, PPC::S29, PPC::S30, PPC::S31
+};
+
 template <std::size_t N>
 static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
                                         const unsigned (&Regs)[N]) {
@@ -327,6 +338,18 @@ static DecodeStatus DecodeQFRCRegisterClass(MCInst &Inst, uint64_t RegNo,
   return decodeRegisterClass(Inst, RegNo, QFRegs);
 }
 
+static DecodeStatus DecodeSPE4RCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address, // Unused.
+                                            const void *Decoder) { // Unused.
+  return decodeRegisterClass(Inst, RegNo, GPRegs); // Looks RegNo up in GPRegs.
+}
+
+static DecodeStatus DecodeSPERCRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                            uint64_t Address, // Unused.
+                                            const void *Decoder) { // Unused.
+  return decodeRegisterClass(Inst, RegNo, SPERegs); // Looks RegNo up in SPERegs.
+}
+
 #define DecodeQSRCRegisterClass DecodeQFRCRegisterClass
 #define DecodeQBRCRegisterClass DecodeQFRCRegisterClass
 

diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -50,6 +50,9 @@ namespace PPC {
     PRED_UN_PLUS  = (3 << 5) | 15,
     PRED_NU_PLUS  = (3 << 5) |  7,
 
+    // SPE scalar compare instructions always set the GT bit.
+    PRED_SPE      = PRED_GT,
+
     // When dealing with individual condition-register bits, we have simple set
     // and unset predicates.
     PRED_BIT_SET =   1024,

diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
@@ -61,46 +61,49 @@ def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
                               "Enable 64-bit registers usage for ppc32 [beta]">;
 def FeatureCRBits    : SubtargetFeature<"crbits", "UseCRBits", "true",
                               "Use condition-register bits individually">;
+def FeatureFPU       : SubtargetFeature<"fpu","HasFPU","true",
+                                        "Enable classic FPU instructions",
+                                        [FeatureHardFloat]>;
 def FeatureAltivec   : SubtargetFeature<"altivec","HasAltivec", "true",
                                         "Enable Altivec instructions",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureSPE       : SubtargetFeature<"spe","HasSPE", "true",
                                         "Enable SPE instructions",
                                         [FeatureHardFloat]>;
 def FeatureMFOCRF    : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
                                         "Enable the MFOCRF instruction">;
 def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
                                         "Enable the fsqrt instruction",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureFCPSGN    : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true",
                                         "Enable the fcpsgn instruction",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureFRE       : SubtargetFeature<"fre", "HasFRE", "true",
                                         "Enable the fre instruction",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureFRES      : SubtargetFeature<"fres", "HasFRES", "true",
                                         "Enable the fres instruction",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureFRSQRTE   : SubtargetFeature<"frsqrte", "HasFRSQRTE", "true",
                                         "Enable the frsqrte instruction",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureFRSQRTES  : SubtargetFeature<"frsqrtes", "HasFRSQRTES", "true",
                                         "Enable the frsqrtes instruction",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureRecipPrec : SubtargetFeature<"recipprec", "HasRecipPrec", "true",
                               "Assume higher precision reciprocal estimates">;
 def FeatureSTFIWX    : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
                                         "Enable the stfiwx instruction",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureLFIWAX    : SubtargetFeature<"lfiwax","HasLFIWAX", "true",
                                         "Enable the lfiwax instruction",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureFPRND     : SubtargetFeature<"fprnd", "HasFPRND", "true",
                                         "Enable the fri[mnpz] instructions",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureFPCVT     : SubtargetFeature<"fpcvt", "HasFPCVT", "true",
   "Enable fc[ft]* (unsigned and single-precision) and lfiwzx instructions",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureISEL      : SubtargetFeature<"isel","HasISEL", "true",
                                         "Enable the isel instruction">;
 def FeatureBPERMD    : SubtargetFeature<"bpermd", "HasBPERMD", "true",
@@ -129,7 +132,7 @@ def FeaturePPC6xx    : SubtargetFeature<"ppc6xx", "IsPPC6xx", "true",
                                         "Enable PPC 6xx instructions">;
 def FeatureQPX       : SubtargetFeature<"qpx","HasQPX", "true",
                                         "Enable QPX instructions",
-                                        [FeatureHardFloat]>;
+                                        [FeatureFPU]>;
 def FeatureVSX       : SubtargetFeature<"vsx","HasVSX", "true",
                                         "Enable VSX instructions",
                                         [FeatureAltivec]>;
@@ -308,8 +311,8 @@ def : ProcessorModel<"450", PPC440Model, [Directive440, FeatureISEL,
                                           FeatureFRES, FeatureFRSQRTE,
                                           FeatureICBT, FeatureBookE, 
                                           FeatureMSYNC, FeatureMFTB]>;
-def : Processor<"601", G3Itineraries, [Directive601, FeatureHardFloat]>;
-def : Processor<"602", G3Itineraries, [Directive602, FeatureHardFloat,
+def : Processor<"601", G3Itineraries, [Directive601, FeatureFPU]>;
+def : Processor<"602", G3Itineraries, [Directive602, FeatureFPU,
                                        FeatureMFTB]>;
 def : Processor<"603", G3Itineraries, [Directive603,
                                        FeatureFRES, FeatureFRSQRTE,

diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -510,6 +510,32 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   const Module *M = MF->getFunction().getParent();
   PICLevel::Level PL = M->getPICLevel();
 
+#ifndef NDEBUG
+  // Validate that SPE and FPU are mutually exclusive in codegen
+  if (!MI->isInlineAsm()) {
+    for (const MachineOperand &MO: MI->operands()) {
+      if (MO.isReg()) {
+        unsigned Reg = MO.getReg();
+        if (Subtarget->hasSPE()) {
+          if (PPC::F4RCRegClass.contains(Reg) ||
+              PPC::F8RCRegClass.contains(Reg) ||
+              PPC::QBRCRegClass.contains(Reg) ||
+              PPC::QFRCRegClass.contains(Reg) ||
+              PPC::QSRCRegClass.contains(Reg) ||
+              PPC::VFRCRegClass.contains(Reg) ||
+              PPC::VRRCRegClass.contains(Reg) ||
+              PPC::VSFRCRegClass.contains(Reg) ||
+              PPC::VSSRCRegClass.contains(Reg)
+              )
+            llvm_unreachable("SPE targets cannot have FPRegs!");
+        } else {
+          if (PPC::SPERCRegClass.contains(Reg))
+            llvm_unreachable("SPE register found in FPU-targeted code!");
+        }
+      }
+    }
+  }
+#endif
   // Lower multi-instruction pseudo operations.
   switch (MI->getOpcode()) {
   default: break;

diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td
@@ -83,8 +83,14 @@ def RetCC_PPC : CallingConv<[
 
   // Floating point types returned as "direct" go into F1 .. F8; note that
   // only the ELFv2 ABI fully utilizes all these registers.
-  CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
-  CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfNotSubtarget<"hasSPE()",
+       CCIfType<[f32], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>,
+  CCIfNotSubtarget<"hasSPE()",
+       CCIfType<[f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>,
+  CCIfSubtarget<"hasSPE()",
+       CCIfType<[f32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>,
+  CCIfSubtarget<"hasSPE()",
+       CCIfType<[f64], CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>,
 
   // For P9, f128 are passed in vector registers.
   CCIfType<[f128],
@@ -188,7 +194,15 @@ def CC_PPC32_SVR4_Common : CallingConv<[
   CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC32_SVR4_Custom_AlignFPArgRegs">>>,
 
   // FP values are passed in F1 - F8.
-  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+  CCIfType<[f32, f64],
+           CCIfNotSubtarget<"hasSPE()",
+                            CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>>,
+  CCIfType<[f64],
+           CCIfSubtarget<"hasSPE()",
+                         CCAssignToReg<[S3, S4, S5, S6, S7, S8, S9, S10]>>>,
+  CCIfType<[f32],
+           CCIfSubtarget<"hasSPE()",
+                         CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>,
 
   // Split arguments have an alignment of 8 bytes on the stack.
   CCIfType<[i32], CCIfSplit<CCAssignToStack<4, 8>>>,
@@ -197,7 +211,11 @@ def CC_PPC32_SVR4_Common : CallingConv<[
 
   // Floats are stored in double precision format, thus they have the same
   // alignment and size as doubles.
-  CCIfType<[f32,f64], CCAssignToStack<8, 8>>,  
+  // With SPE, floats are stored in single-precision format, so they have the
+  // alignment and size of an int.
+  CCIfType<[f32,f64], CCIfNotSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>,
+  CCIfType<[f32], CCIfSubtarget<"hasSPE()", CCAssignToStack<4, 4>>>,
+  CCIfType<[f64], CCIfSubtarget<"hasSPE()", CCAssignToStack<8, 8>>>,
 
   // QPX vectors that are stored in double precision need 32-byte alignment.
   CCIfType<[v4f64, v4i1], CCAssignToStack<32, 32>>,
@@ -265,15 +283,23 @@ def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
 
 def CSR_Darwin32_Altivec : CalleeSavedRegs<(add CSR_Darwin32, CSR_Altivec)>;
 
-def CSR_SVR432   : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
-                                        R21, R22, R23, R24, R25, R26, R27, R28,
-                                        R29, R30, R31, F14, F15, F16, F17, F18,
+// SPE does not use FPRs, so break out the common register set as base.
+def CSR_SVR432_COMM : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
+                                          R21, R22, R23, R24, R25, R26, R27,
+                                          R28, R29, R30, R31, CR2, CR3, CR4
+                                      )>;
+def CSR_SVR432 :  CalleeSavedRegs<(add CSR_SVR432_COMM, F14, F15, F16, F17, F18,
                                         F19, F20, F21, F22, F23, F24, F25, F26,
-                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                        F27, F28, F29, F30, F31
                                    )>;
+def CSR_SPE : CalleeSavedRegs<(add S14, S15, S16, S17, S18, S19, S20, S21, S22,
+                                   S23, S24, S25, S26, S27, S28, S29, S30, S31
+                              )>;
 
 def CSR_SVR432_Altivec : CalleeSavedRegs<(add CSR_SVR432, CSR_Altivec)>;
 
+def CSR_SVR432_SPE : CalleeSavedRegs<(add CSR_SVR432_COMM, CSR_SPE)>;
+
 def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20,
                                         X21, X22, X23, X24, X25, X26, X27, X28,
                                         X29, X30, X31, F14, F15, F16, F17, F18,