// Patch summary (descriptive preamble placed before the first diff header; not part of any hunk).
//
// Files touched (five paths under llvm/):
//   * lib/Target/LoongArch/LoongArchSubtarget.h
//       Adds a `uint8_t MaxInterleaveFactor` member initialised to 2 (the in-patch TODO
//       describes the default as empirical and conservative) and a const accessor
//       `getMaxInterleaveFactor()` that returns the member as `unsigned`.
//   * lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp and .h
//       Declares and defines `LoongArchTTIImpl::getMaxInterleaveFactor(ElementCount VF)`,
//       which returns `ST->getMaxInterleaveFactor()`; the `VF` argument is not read.
//   * test/Transforms/LoopVectorize/LoongArch/defaults.ll
//       Updates the CHECK lines for `vector.body`: it now contains two `<4 x i64>`
//       load/add/store sequences and INDEX_NEXT advances by 8 instead of 4.
//   * test/Transforms/LoopVectorize/LoongArch/loongarch-interleaved.ll (new, 39 lines per its hunk header)
//       Runs `opt --passes=loop-vectorize,dce,instcombine --mtriple loongarch64` on @foo
//       and checks that the output contains an `add` with the constant 2.
//
// NOTE(review): this text appears whitespace-collapsed (many diff lines share one physical
//   line); presumably the original line breaks must be restored before a patch tool can apply it.
// NOTE(review): the hunk context shows `BasicTTIImplBase` without a template argument -
//   possibly lost during extraction; confirm against the tree.
// NOTE(review): the new test declares `REQUIRES: asserts` and redirects stderr with `2>&1`,
//   but the visible RUN line passes no debug-only option - confirm the requirement is needed.
// NOTE(review): @foo references attribute group #0, and no definition of that group is visible here.
diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h index b87ea6e2ec32b9..a8752c8070aa66 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h @@ -37,6 +37,10 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { #include "LoongArchGenSubtargetInfo.inc" unsigned GRLen = 32; + // TODO: The default value is empirical and conservative. Override the + // default in initializeProperties once we support optimizing for more + // uarches. + uint8_t MaxInterleaveFactor = 2; MVT GRLenVT = MVT::i32; LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown; LoongArchFrameLowering FrameLowering; @@ -99,6 +103,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { Align getPrefFunctionAlignment() const { return PrefFunctionAlignment; } Align getPrefLoopAlignment() const { return PrefLoopAlignment; } unsigned getMaxBytesForAlignment() const { return MaxBytesForAlignment; } + unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } bool enableMachineScheduler() const override { return true; } }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp index add1c60d89d21c..710650acba3046 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp @@ -69,6 +69,10 @@ unsigned LoongArchTTIImpl::getRegisterClassForType(bool Vector, return LoongArchRegisterClass::GPRRC; } +unsigned LoongArchTTIImpl::getMaxInterleaveFactor(ElementCount VF) { + return ST->getMaxInterleaveFactor(); +} + const char *LoongArchTTIImpl::getRegisterClassName(unsigned ClassID) const { switch (ClassID) { case LoongArchRegisterClass::GPRRC: diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h index 
34c18163bbdb6e..06a03d29931d1e 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h @@ -43,6 +43,7 @@ class LoongArchTTIImpl : public BasicTTIImplBase { TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const; unsigned getNumberOfRegisters(unsigned ClassID) const; unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const; + unsigned getMaxInterleaveFactor(ElementCount VF); const char *getRegisterClassName(unsigned ClassID) const; // TODO: Implement more hooks to provide TTI machinery for LoongArch. diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll index 7172f0907e77ec..28c1eef84e2575 100644 --- a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll +++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll @@ -22,14 +22,20 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; 
CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr [[TMP4]], align 8 +; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP5]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/loongarch-interleaved.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/loongarch-interleaved.ll new file mode 100644 index 00000000000000..be9b170491b9ca --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/LoongArch/loongarch-interleaved.ll @@ -0,0 +1,39 @@ +; REQUIRES: asserts +; RUN: opt --passes=loop-vectorize,dce,instcombine --mtriple loongarch64 \ +; RUN: -S < %s 2>&1 | FileCheck %s + +; CHECK-LABEL: foo +; CHECK: %{{.*}} = add {{.*}}, 2 + +; Function Attrs: nofree norecurse nosync nounwind writeonly +define dso_local void @foo(i32 signext %n, ptr nocapture %A) local_unnamed_addr #0 { +entry: + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, 
%for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + %0 = trunc i64 %indvars.iv to i32 + store i32 %0, ptr %arrayidx, align 4, !tbaa !4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !8 +} + +!4 = !{!5, !5, i64 0} +!5 = !{!"int", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9} +!9 = !{!"llvm.loop.mustprogress"}