[AArch64][SVE] Remove chains of unnecessary SVE reinterpret intrinsics

This commit extends SVEIntrinsicOpts::optimizeConvertFromSVBool to identify and remove longer chains of redundant SVE reinterpret intrinsics. For example, the following chain of SVE reinterprets is now recognised as redundant:

  %a = <vscale x 2 x i1>
  %1 = <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool(<vscale x 2 x i1> %a)
  %2 = <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(<vscale x 16 x i1> %1)
  %3 = <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool(<vscale x 4 x i1> %2)
  %4 = <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool(<vscale x 16 x i1> %3)
  %5 = <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool(<vscale x 4 x i1> %4)
  %6 = <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool(<vscale x 16 x i1> %5)
  ret <vscale x 2 x i1> %6

and will be replaced with:

  ret <vscale x 2 x i1> %a

Eliminating these can sometimes mean emitting fewer unnecessary loads/stores when lowering to assembly.

Differential Revision: https://reviews.llvm.org/D94074
llvm · Jan 13, 2021 · 3122c66 · 3122c66
1 parent 4cd4853
commit 3122c66
Show file tree

Hide file tree

Showing 2 changed files with 95 additions and 11 deletions.
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -177,22 +177,50 @@ bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
   if (isa<PHINode>(I->getArgOperand(0)))
     return processPhiNode(I);
 
-  // If we have a reinterpret intrinsic I of type A which is converting from
-  // another reinterpret Y of type B, and the source type of Y is A, then we can
-  // elide away both reinterprets if there are no other users of Y.
-  auto *Y = isReinterpretToSVBool(I->getArgOperand(0));
-  if (!Y)
-    return false;
+  SmallVector<Instruction *, 32> CandidatesForRemoval;
+  Value *Cursor = I->getOperand(0), *EarliestReplacement = nullptr;
+
+  const auto *IVTy = cast<VectorType>(I->getType());
+
+  // Walk the chain of conversions.
+  while (Cursor) {
+    // If the type of the cursor has fewer lanes than the final result, zeroing
+    // must take place, which breaks the equivalence chain.
+    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
+    if (CursorVTy->getElementCount().getKnownMinValue() <
+        IVTy->getElementCount().getKnownMinValue())
+      break;
+
+    // If the cursor has the same type as I, it is a viable replacement.
+    if (Cursor->getType() == IVTy)
+      EarliestReplacement = Cursor;
 
-  Value *SourceVal = Y->getArgOperand(0);
-  if (I->getType() != SourceVal->getType())
+    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
+
+    // If this is not an SVE conversion intrinsic, this is the end of the chain.
+    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
+                                  Intrinsic::aarch64_sve_convert_to_svbool ||
+                              IntrinsicCursor->getIntrinsicID() ==
+                                  Intrinsic::aarch64_sve_convert_from_svbool))
+      break;
+
+    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
+    Cursor = IntrinsicCursor->getOperand(0);
+  }
+
+  // If no viable replacement in the conversion chain was found, there is
+  // nothing to do.
+  if (!EarliestReplacement)
     return false;
 
-  I->replaceAllUsesWith(SourceVal);
+  I->replaceAllUsesWith(EarliestReplacement);
   I->eraseFromParent();
-  if (Y->use_empty())
-    Y->eraseFromParent();
 
+  while (!CandidatesForRemoval.empty()) {
+    Instruction *Candidate = CandidatesForRemoval.pop_back_val();
+    if (Candidate->use_empty())
+      Candidate->eraseFromParent();
+  }
   return true;
 }
 

diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll
@@ -67,6 +67,62 @@ define <vscale x 16 x i1> @reinterpret_test_d_rev(<vscale x 16 x i1> %a) {
   ret <vscale x 16 x i1> %2
 }
 
+define <vscale x 2 x i1> @reinterpret_test_full_chain(<vscale x 2 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_full_chain(
+; OPT: ret <vscale x 2 x i1> %a
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
+  %4 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %3)
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %4)
+  %6 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %5)
+  ret <vscale x 2 x i1> %6
+}
+
+; The last two reinterprets are not necessary, since they are doing the same
+; work as the first two.
+define <vscale x 4 x i1> @reinterpret_test_partial_chain(<vscale x 2 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_partial_chain(
+; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+; OPT-NEXT: ret <vscale x 4 x i1> %2
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
+  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
+  %4 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %3)
+  ret <vscale x 4 x i1> %4
+}
+
+; The chain cannot be reduced because of the second reinterpret, which causes
+; zeroing.
+define <vscale x 8 x i1> @reinterpret_test_irreducible_chain(<vscale x 8 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_irreducible_chain(
+; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+; OPT-NEXT: %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
+; OPT-NEXT: %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+; OPT-NEXT: ret <vscale x 8 x i1> %4
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
+  %4 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %3)
+  ret <vscale x 8 x i1> %4
+}
+
+; Here, the candidate list is larger than the number of instructions that we
+; end up removing.
+define <vscale x 4 x i1> @reinterpret_test_keep_some_candidates(<vscale x 8 x i1> %a) {
+; OPT-LABEL: @reinterpret_test_keep_some_candidates(
+; OPT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+; OPT-NEXT: %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+; OPT-NEXT: ret <vscale x 4 x i1> %2
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
+  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+  %3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %2)
+  %4 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %3)
+  ret <vscale x 4 x i1> %4
+}
+
 define <vscale x 2 x i1> @reinterpret_reductions(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
 ; OPT-LABEL: reinterpret_reductions
 ; OPT-NOT: convert