diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 52b416fdd755b..27837e260ebc1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -33214,6 +33214,37 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
       Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
 }
 
+bool AArch64TargetLowering::canCombineStoreAndExtract(Type *VectorTy,
+                                                      Value *Idx,
+                                                      unsigned &Cost) const {
+  // If we do not have NEON, fixed-width vector types are not natively
+  // supported.
+  if (!Subtarget->hasNEON())
+    return false;
+
+  // Floating point values and vector values map to the same register file.
+  // Therefore, although we could do a store extract of a vector type, this is
+  // better to leave at float as we have more freedom in the addressing mode for
+  // those.
+  if (VectorTy->isFPOrFPVectorTy() || VectorTy->isScalableTy())
+    return false;
+
+  // If the index is unknown at compile time, this is very expensive to lower
+  // and it is not possible to combine the store with the extract.
+  if (!isa<ConstantInt>(Idx))
+    return false;
+
+  assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
+  unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
+  // We can do a store + vector extract on any vector that fits perfectly in a D
+  // or Q register.
+  if (BitWidth == 64 || BitWidth == 128) {
+    Cost = 0;
+    return true;
+  }
+  return false;
+}
+
 bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
     bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 58efdd3e18fc0..50d4541405d36 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -894,6 +894,9 @@ class AArch64TargetLowering : public TargetLowering {
   bool shouldLocalize(const MachineInstr &MI,
                       const TargetTransformInfo *TTI) const override;
 
+  bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
+                                 unsigned &Cost) const override;
+
   bool SimplifyDemandedBitsForTargetNode(SDValue Op,
                                          const APInt &OriginalDemandedBits,
                                          const APInt &OriginalDemandedElts,
diff --git a/llvm/test/CodeGen/AArch64/vector-promotion.ll b/llvm/test/CodeGen/AArch64/vector-promotion.ll
new file mode 100644
index 0000000000000..610f8ba0224e2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vector-promotion.ll
@@ -0,0 +1,69 @@
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=aarch64 %s -o - -S | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-NORMAL %s
+; RUN: opt -passes='require<profile-summary>,function(codegenprepare)' -mtriple=aarch64 %s -o - -S -stress-cgp-store-extract | FileCheck --check-prefix=IR-BOTH --check-prefix=IR-STRESS %s
+; RUN: llc -mtriple=aarch64 %s -o - | FileCheck --check-prefix=ASM %s
+
+; IR-BOTH-LABEL: @simpleOneInstructionPromotion
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]],
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR]], i32 1
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], ptr %dest
+; IR-BOTH-NEXT: ret
+; ASM-LABEL: simpleOneInstructionPromotion:
+; ASM-NOT: umov
+define void @simpleOneInstructionPromotion(ptr %addr1, ptr %dest) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 1
+  %out = or i32 %extract, 1
+  store i32 %out, ptr %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @unsupportedInstructionForPromotion
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 0
+; IR-BOTH-NEXT: [[CMP:%[a-zA-Z_0-9-]+]] = icmp eq i32 [[EXTRACT]], %in2
+; IR-BOTH-NEXT: store i1 [[CMP]], ptr %dest
+; IR-BOTH-NEXT: ret
+define void @unsupportedInstructionForPromotion(ptr %addr1, i32 %in2, ptr %dest) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out = icmp eq i32 %extract, %in2
+  store i1 %out, ptr %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @chainOfInstructionsToPromote
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; IR-BOTH-NEXT: [[VECTOR_OR1:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]],
+; IR-BOTH-NEXT: [[VECTOR_OR2:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR1]],
+; IR-BOTH-NEXT: [[VECTOR_OR3:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[VECTOR_OR2]],
+; IR-BOTH-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[VECTOR_OR3]], i32 0
+; IR-BOTH-NEXT: store i32 [[EXTRACT]], ptr %dest
+; IR-BOTH-NEXT: ret
+define void @chainOfInstructionsToPromote(ptr %addr1, ptr %dest) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 0
+  %out1 = or i32 %extract, 1
+  %out2 = or i32 %out1, 1
+  %out3 = or i32 %out2, 1
+  store i32 %out3, ptr %dest, align 4
+  ret void
+}
+
+; IR-BOTH-LABEL: @simpleOneInstructionPromotionVariableIdx
+; IR-BOTH: [[LOAD:%[a-zA-Z_0-9-]+]] = load <2 x i32>, ptr %addr1
+; Scalar version:
+; IR-NORMAL-NEXT: [[EXTRACT:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[LOAD]], i32 %idx
+; IR-NORMAL-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = or i32 [[EXTRACT]], 1
+; Vector version:
+; IR-STRESS-NEXT: [[OR:%[a-zA-Z_0-9-]+]] = or <2 x i32> [[LOAD]], splat (i32 1)
+; IR-STRESS-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = extractelement <2 x i32> [[OR]], i32 %idx
+; IR-BOTH-NEXT: store i32 [[RES]], ptr %dest
+; IR-BOTH-NEXT: ret
+define void @simpleOneInstructionPromotionVariableIdx(ptr %addr1, ptr %dest, i32 %idx) {
+  %in1 = load <2 x i32>, ptr %addr1, align 8
+  %extract = extractelement <2 x i32> %in1, i32 %idx
+  %out = or i32 %extract, 1
+  store i32 %out, ptr %dest, align 4
+  ret void
+}