diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4e6b3a224b79b..31529d0addfcb 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -23056,12 +23056,18 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                   {VTy, SI->getPointerOperandType(), XLenTy});
 
   SmallVector<Value *, 10> Ops;
+  SmallVector<int, 16> NewShuffleMask;
 
   for (unsigned i = 0; i < Factor; i++) {
+    // Collect shuffle mask for this lane.
+    for (unsigned j = 0; j < VTy->getNumElements(); j++)
+      NewShuffleMask.push_back(Mask[i + Factor * j]);
+
     Value *Shuffle = Builder.CreateShuffleVector(
-        SVI->getOperand(0), SVI->getOperand(1),
-        createSequentialMask(Mask[i], VTy->getNumElements(), 0));
+        SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask);
     Ops.push_back(Shuffle);
+
+    NewShuffleMask.clear();
   }
 
   // This VL should be OK (should be executable in one vsseg instruction,
   // potentially under larger LMULs) because we checked that the fixed vector
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 4200837227899..7cc8c0c3f2d89 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1394,16 +1394,12 @@ define void @store_factor4_one_active_fullwidth(ptr %ptr, <16 x i32> %v) {
   ret void
 }
 
-; TODO: This could be a vslidedown followed by a strided store
 define void @store_factor4_one_active_slidedown(ptr %ptr, <4 x i32> %v) {
 ; CHECK-LABEL: store_factor4_one_active_slidedown:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vslideup.vi v10, v8, 1
-; CHECK-NEXT:    vmv.v.v v11, v10
-; CHECK-NEXT:    vmv.v.v v12, v10
-; CHECK-NEXT:    vsseg4e32.v v9, (a0)
+; CHECK-NEXT:    vslidedown.vi v8, v8, 1
+; CHECK-NEXT:    vsseg4e32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
   store <16 x i32> %v0, ptr %ptr