3 changes: 3 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3232,6 +3232,9 @@ class LLVM_ABI TargetLoweringBase {
/// Default to be the minimum interleave factor: 2.
virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }

/// Return true if the target can lower interleave factors larger than the
/// maximum supported factor by combining the native ldN/stN (or
/// gather/scatter) instructions with additional shuffles.
virtual bool hasInterleaveWithGatherScatter() const { return false; }

/// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
13 changes: 11 additions & 2 deletions llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
unsigned MaxFactor) {
unsigned MaxFactor,
bool InterleaveWithShuffles) {
unsigned NumElts = SVI->getShuffleMask().size();
if (NumElts < 4)
return false;
@@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
return true;
}

if (InterleaveWithShuffles) {
for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
Factor = i * MaxFactor;
if (SVI->isInterleave(Factor))
return true;
}
}
return false;
}
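To make the new probing loop concrete: with AArch64's MaxFactor of 4 it tries factors 4, 8, and 16 and accepts the first whose interleave pattern matches. Below is a minimal self-contained sketch of that check, a plain-C++ re-implementation for illustration only (the pass itself calls ShuffleVectorInst::isInterleave):

#include <cstdio>
#include <vector>

// Does Mask re-interleave `Factor` lanes of length Mask.size() / Factor,
// i.e. <0, LaneLen, 2*LaneLen, ..., 1, LaneLen+1, ...>?
static bool matchesInterleave(const std::vector<int> &Mask, unsigned Factor) {
  unsigned NumElts = Mask.size();
  if (Factor == 0 || NumElts % Factor != 0)
    return false;
  unsigned LaneLen = NumElts / Factor;
  for (unsigned I = 0; I < NumElts; ++I)
    if (Mask[I] != (int)((I % Factor) * LaneLen + I / Factor))
      return false;
  return true;
}

// Probe MaxFactor, 2*MaxFactor, ... up to 16, as the extended pass does.
static bool findReInterleaveFactor(const std::vector<int> &Mask,
                                   unsigned MaxFactor, unsigned &Factor) {
  for (unsigned I = 1; MaxFactor * I <= 16; I *= 2)
    if (matchesInterleave(Mask, MaxFactor * I)) {
      Factor = MaxFactor * I;
      return true;
    }
  return false;
}

int main() {
  // Factor-8 interleave of lanes of length 2: <0,2,4,...,14,1,3,...,15>.
  std::vector<int> Mask = {0, 2, 4, 6, 8, 10, 12, 14,
                           1, 3, 5, 7, 9, 11, 13, 15};
  unsigned Factor = 0;
  if (findReInterleaveFactor(Mask, /*MaxFactor=*/4, Factor))
    std::printf("re-interleave factor %u\n", Factor); // prints 8
}

For the factor-8 mask in main, the factor-4 pattern already fails at element 1 (2 != 4), so the probe settles on 8.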

@@ -528,7 +536,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
cast<FixedVectorType>(SVI->getType())->getNumElements();
// Check if the shufflevector is RE-interleave shuffle.
unsigned Factor;
if (!isReInterleaveMask(SVI, Factor, MaxFactor))
if (!isReInterleaveMask(SVI, Factor, MaxFactor,
TLI->hasInterleaveWithGatherScatter()))
return false;
assert(NumStoredElements % Factor == 0 &&
"number of stored element should be a multiple of Factor");
163 changes: 160 additions & 3 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -97,6 +97,7 @@
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <deque>
#include <iterator>
#include <limits>
#include <optional>
@@ -18144,12 +18145,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
ShuffleVectorInst *SVI,
unsigned Factor,
const APInt &GapMask) const {

assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(Factor >= 2 && "Invalid interleave factor");
auto *SI = dyn_cast<StoreInst>(Store);
if (!SI)
return false;

if (Factor > getMaxSupportedInterleaveFactor())
return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);

assert(!LaneMask && GapMask.popcount() == Factor &&
"Unexpected mask on store");

@@ -18295,6 +18298,160 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
return true;
}

/// If the interleave factor is greater than the supported MaxFactor, the data
/// can be interleaved with additional shuffles and then stored with the
/// natively supported stN instructions.
///
/// Consider the following example with an interleave factor of 8, where the
/// data is shuffled before being stored with stN instructions. The data must
/// land in memory in this order:
/// [v0, v1, v2, v3, v4, v5, v6, v7]
///
/// v0 v4 v2 v6 v1 v5 v3 v7
/// | | | | | | | |
/// \ / \ / \ / \ /
/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4
/// | | | |
/// \ / \ /
/// \ / \ /
/// \ / \ /
/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2
///
/// For stN = 4, the lower halves of the four zip results are stored with one
/// st4 instruction (covering the first half of the memory region) and the
/// upper halves with another st4.
///
/// For stN = 2, the tree is zipped one level further and the results are
/// stored pairwise with st2, for a total of 4 st2 instructions.
bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();

auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");

unsigned LaneLen = VecTy->getNumElements() / Factor;
Type *EltTy = VecTy->getElementType();
auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);

const DataLayout &DL = SI->getModule()->getDataLayout();
bool UseScalable;

// Skip if we do not have NEON, and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long
// as the vector size is a multiple of 128 bits.
if (!Subtarget->hasNEON() ||
!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
return false;

if (UseScalable)
return false;

std::deque<Value *> Shuffles;
// If the root shuffle has only one live operand.
if (isa<PoisonValue>(SVI->getOperand(1)) &&
SVI->getType() == SVI->getOperand(0)->getType()) {
Value *Op0 = SVI->getOperand(0);
Shuffles.push_back(Op0);
} else
Shuffles.push_back(SVI);
unsigned ConcatLevel = Factor;
unsigned ConcatElt = Factor * LaneLen;
// Collect all the interleaved operands by walking the concat-shuffle tree.
while (ConcatLevel > 1) {
unsigned InterleavedOperands = Shuffles.size();
for (unsigned Ops = 0; Ops < InterleavedOperands; Ops++) {
auto *V = Shuffles.front();
Shuffles.pop_front();
if (isa<ConstantAggregateZero, PoisonValue>(V)) {
VectorType *Ty = cast<VectorType>(V->getType());
auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
Value *SplitValue = nullptr;
if (isa<ConstantAggregateZero>(V))
SplitValue = ConstantAggregateZero::get(HalfTy);
else
SplitValue = PoisonValue::get(HalfTy);

Shuffles.push_back(SplitValue);
Shuffles.push_back(SplitValue);
continue;
}
if (V->getType() == SubVecTy) {
Shuffles.push_back(V);
continue;
}
ShuffleVectorInst *SFL = dyn_cast<ShuffleVectorInst>(V);
if (!SFL)
return false;
if (SVI != SFL && !SFL->isIdentityMask(SFL->getShuffleMask(), ConcatElt))
return false;

Value *Op0 = SFL->getOperand(0);
Value *Op1 = SFL->getOperand(1);

Shuffles.push_back(Op0);
Shuffles.push_back(Op1);
}
ConcatLevel >>= 1;
ConcatElt >>= 1;
}

IRBuilder<> Builder(SI);
auto Mask = createInterleaveMask(LaneLen, 2);
SmallVector<int, 16> UpperHalfMask(LaneLen), LowerHalfMask(LaneLen);
for (unsigned Idx = 0; Idx < LaneLen; Idx++) {
LowerHalfMask[Idx] = Mask[Idx];
UpperHalfMask[Idx] = Mask[Idx + LaneLen];
}

unsigned InterleaveFactor = Factor >> 1;
while (InterleaveFactor >= MaxSupportedFactor) {
std::deque<Value *> ShufflesIntermediate;
ShufflesIntermediate.resize(Factor);
for (unsigned Idx = 0; Idx < Factor; Idx += (InterleaveFactor * 2)) {
for (unsigned GroupIdx = 0; GroupIdx < InterleaveFactor; GroupIdx++) {
assert(Shuffles[Idx + GroupIdx]->getType() == SubVecTy &&
Shuffles[Idx + GroupIdx + InterleaveFactor]->getType() ==
SubVecTy &&
"Type of interleaving candidates are not matching\n");
auto *Shuffle = Builder.CreateShuffleVector(
Shuffles[Idx + GroupIdx],
Shuffles[Idx + GroupIdx + InterleaveFactor], LowerHalfMask);
ShufflesIntermediate[Idx + GroupIdx] = Shuffle;
Shuffle = Builder.CreateShuffleVector(
Shuffles[Idx + GroupIdx],
Shuffles[Idx + GroupIdx + InterleaveFactor], UpperHalfMask);
ShufflesIntermediate[Idx + GroupIdx + InterleaveFactor] = Shuffle;
}
}
Shuffles = ShufflesIntermediate;
InterleaveFactor >>= 1;
}

Type *PtrTy = SI->getPointerOperandType();
auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);

Value *BaseAddr = SI->getPointerOperand();
Function *StNFunc = getStructuredStoreFunction(
SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
for (unsigned N = 0; N < (Factor / MaxSupportedFactor); N++) {
SmallVector<Value *, 5> Ops;
for (unsigned OpIdx = 0; OpIdx < MaxSupportedFactor; OpIdx++)
Ops.push_back(Shuffles[N * MaxSupportedFactor + OpIdx]);

if (N > 0) {
// We will compute the pointer operand of each store from the original
// base address using GEPs. Cast the base address to a pointer to the
// scalar element type.
BaseAddr = Builder.CreateConstGEP1_32(
SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
}
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
Builder.CreateCall(StNFunc, Ops);
}
return true;
}
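The zip tree in the comment above can be sanity-checked with a small standalone simulation over element indices, assuming Factor = 8, MaxSupportedFactor = 4 (NEON st4), and an illustrative lane length of 2. This mirrors the lower/upper-half zips and the st4 grouping; it is a sketch of the dataflow, not the actual IR lowering:

#include <cassert>
#include <cstdio>
#include <utility>
#include <vector>

using Elt = std::pair<int, int>; // (source vector, lane)
using Vec = std::vector<Elt>;

int main() {
  const unsigned Factor = 8, MaxSupportedFactor = 4, LaneLen = 2;

  // Eight sub-vectors v0..v7, each LaneLen elements wide.
  std::vector<Vec> Shuffles(Factor);
  for (unsigned K = 0; K < Factor; ++K)
    for (unsigned L = 0; L < LaneLen; ++L)
      Shuffles[K].push_back({(int)K, (int)L});

  // One zip level per halving of the factor, mirroring the pass:
  // Next[G] gets the lower half of zip(v_G, v_{G+IF}), Next[G+IF] the upper.
  for (unsigned IF = Factor / 2; IF >= MaxSupportedFactor; IF /= 2) {
    std::vector<Vec> Next(Factor);
    for (unsigned Idx = 0; Idx < Factor; Idx += 2 * IF)
      for (unsigned G = 0; G < IF; ++G) {
        Vec Zip; // full two-way interleave of the pair
        for (unsigned L = 0; L < LaneLen; ++L) {
          Zip.push_back(Shuffles[Idx + G][L]);
          Zip.push_back(Shuffles[Idx + G + IF][L]);
        }
        Next[Idx + G] = Vec(Zip.begin(), Zip.begin() + LaneLen);
        Next[Idx + G + IF] = Vec(Zip.begin() + LaneLen, Zip.end());
      }
    Shuffles = Next;
  }

  // Emulate Factor / MaxSupportedFactor = 2 st4 instructions: the N-th st4
  // interleaves Shuffles[4N..4N+3] into the next 4 * LaneLen memory slots.
  Vec Memory;
  for (unsigned N = 0; N < Factor / MaxSupportedFactor; ++N)
    for (unsigned J = 0; J < MaxSupportedFactor * LaneLen; ++J)
      Memory.push_back(
          Shuffles[N * MaxSupportedFactor + J % MaxSupportedFactor]
                  [J / MaxSupportedFactor]);

  // The result must be the full 8-way interleave: v0[0], v1[0], ..., v7[0], ...
  for (unsigned P = 0; P < Memory.size(); ++P)
    assert(Memory[P] == Elt((int)(P % Factor), (int)(P / Factor)));
  std::printf("factor-8 store via 2 x st4 verified\n");
}

After the single zip level, Shuffles[4N..4N+3] feed the N-th st4, and the emulated memory comes out as v0[0], v1[0], ..., v7[0], v0[1], and so on: exactly the 8-way interleave requested by the original shuffle.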

bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
5 changes: 5 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,6 +229,8 @@ class AArch64TargetLowering : public TargetLowering {

bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;

bool hasInterleaveWithGatherScatter() const override { return true; }

unsigned getMaxSupportedInterleaveFactor() const override { return 4; }

bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +241,9 @@
ShuffleVectorInst *SVI, unsigned Factor,
const APInt &GapMask) const override;

bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const;

bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
IntrinsicInst *DI) const override;

30 changes: 25 additions & 5 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4932,19 +4932,39 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
return InstructionCost::getInvalid();

if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
unsigned NumLoadStores = 1;
InstructionCost ShuffleCost = 0;
bool IsInterleaveWithShuffle = false;
unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();

auto *SubVecTy =
VectorType::get(VecVTy->getElementType(),
VecVTy->getElementCount().divideCoefficientBy(Factor));

if (TLI->hasInterleaveWithGatherScatter() && Opcode == Instruction::Store &&
(0 == Factor % MaxSupportedFactor) && Factor > MaxSupportedFactor) {
IsInterleaveWithShuffle = true;

NumLoadStores = Factor / MaxSupportedFactor;
ShuffleCost =
(Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
{}, CostKind, 0, SubVecTy));
}

if (!UseMaskForGaps &&
(Factor <= MaxSupportedFactor || IsInterleaveWithShuffle)) {
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
auto *SubVecTy =
VectorType::get(VecVTy->getElementType(),
VecVTy->getElementCount().divideCoefficientBy(Factor));

// ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
// matched to more than one ldN/stN instruction.
bool UseScalable;
if (MinElts % Factor == 0 &&
TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
return (Factor *
TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
NumLoadStores) +
ShuffleCost;
}

return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
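In rough numbers, assuming a factor-8 NEON store with MaxSupportedFactor = 4, getNumInterleavedAccesses = 1, and a unit splice cost: NumLoadStores = 2 groups of st4 plus 8 splice shuffles, so the new return value is 8 * 1 * 2 + 8 = 24. A compact sketch of that arithmetic (hypothetical helper with made-up unit costs, not part of the patch):

#include <cstdio>

// Restates the cost formula above; the real unit costs come from TLI/TTI
// queries, the values passed in here are illustrative only.
unsigned interleavedStoreCost(unsigned Factor, unsigned MaxSupportedFactor,
                              unsigned NumAccesses, unsigned SpliceCost) {
  unsigned NumLoadStores = 1, ShuffleCost = 0;
  if (Factor > MaxSupportedFactor && Factor % MaxSupportedFactor == 0) {
    NumLoadStores = Factor / MaxSupportedFactor; // stN groups needed
    ShuffleCost = Factor * SpliceCost;           // zip-tree shuffles
  }
  return Factor * NumAccesses * NumLoadStores + ShuffleCost;
}

int main() {
  // Factor-8 NEON store, st4 max, one access per stN, splice cost 1:
  // 8 * 1 * 2 + 8 = 24.
  std::printf("%u\n", interleavedStoreCost(8, 4, 1, 1));
}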