Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20467,6 +20467,69 @@ performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
}
}

// Given an extract(load) or extract(extend(load)), produce a scalar load
// instead to avoid the cross-register-bank copies.
if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
VT.isInteger() && isa<ConstantSDNode>(N1)) {
SDValue LoadN0 = N0;
// Look through sext/zext/anyext and extract_subvector / insert_subvector if
// required.
if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
N0.getOpcode() == ISD::SIGN_EXTEND ||
N0.getOpcode() == ISD::ANY_EXTEND) &&
N0.getOperand(0).hasOneUse())
LoadN0 = N0.getOperand(0);
unsigned OffsetElts = 0;
if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
OffsetElts = LoadN0.getConstantOperandVal(1);
LoadN0 = LoadN0.getOperand(0);
}
if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
LoadN0.getOperand(0).isUndef() &&
isNullConstant(LoadN0.getOperand(2)) &&
LoadN0.getOperand(1).hasOneUse())
LoadN0 = LoadN0.getOperand(1);

// Check all the uses are valid and can be scalarized. We check that all the
// uses are extracts and those extracts are not re-inserted into an
// operation best treated as a vector register.
auto Load = dyn_cast<LoadSDNode>(LoadN0);
if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
Load->getMemoryVT().isByteSized() &&
all_of(N0->uses(), [&](const SDUse &U) {
return U.getResNo() != N0.getResNo() ||
(U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
!any_of(U.getUser()->uses(), [](const SDUse &U2) {
return U2.getUser()->getOpcode() ==
ISD::INSERT_VECTOR_ELT ||
U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
}));
})) {

SDLoc DL(Load);

// Generate a new scalar load.
unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
Load->getValueType(0).getScalarSizeInBits() / 8;
SDValue BasePtr = DAG.getObjectPtrOffset(
DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
ISD::LoadExtType ExtType =
N0.getOpcode() == ISD::ZERO_EXTEND
? ISD::ZEXTLOAD
: (N0.getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD
: ISD::EXTLOAD);
SDValue ScalarLoad =
DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr,
Load->getPointerInfo().getWithOffset(Offset),
Load->getValueType(0).getScalarType(),
commonAlignment(Load->getAlign(), Offset),
Load->getMemOperand()->getFlags(), Load->getAAInfo());
DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
return ScalarLoad;
}
}

return SDValue();
}

Expand Down
17 changes: 7 additions & 10 deletions llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -53,18 +53,15 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) {
define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) {
; CHECK-LABEL: uitofp_v4i64_to_v4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q2, [x0]
; CHECK-NEXT: mov x8, v0.d[1]
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: ucvtf s1, x9
; CHECK-NEXT: mov x9, v2.d[1]
; CHECK-NEXT: ucvtf s0, x8
; CHECK-NEXT: fmov x8, d2
; CHECK-NEXT: ucvtf s2, x8
; CHECK-NEXT: ldp x8, x9, [x0]
; CHECK-NEXT: movi v2.4s, #127, msl #8
; CHECK-NEXT: ucvtf s0, x9
; CHECK-NEXT: ucvtf s1, x8
; CHECK-NEXT: ldp x8, x9, [x0, #16]
; CHECK-NEXT: mov v1.s[1], v0.s[0]
; CHECK-NEXT: ucvtf s0, x8
; CHECK-NEXT: mov v1.s[2], v0.s[0]
; CHECK-NEXT: ucvtf s0, x9
; CHECK-NEXT: mov v1.s[2], v2.s[0]
; CHECK-NEXT: movi v2.4s, #127, msl #8
; CHECK-NEXT: mov v1.s[3], v0.s[0]
; CHECK-NEXT: movi v0.4s, #1
; CHECK-NEXT: ushr v3.4s, v1.4s, #16
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@
define i32 @foo(ptr %__a) nounwind {
; CHECK-LABEL: foo:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: umov.h w8, v0[0]
; CHECK-NEXT: umov.h w9, v0[0]
; CHECK-NEXT: add w0, w9, w8, uxth #1
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: add w0, w8, w8, lsl #1
; CHECK-NEXT: ret
%tmp18 = load <4 x i16>, ptr %__a, align 8
%vget_lane = extractelement <4 x i16> %tmp18, i32 0
Expand Down
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,13 @@ define i32 @ldr_int_volatile(ptr %a) nounwind {
; CHECK: Cluster ld/st SU(1) - SU(3)
; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDRQui
; CHECK: SU(3): %{{[0-9]+}}:fpr128 = LDRQui
define <2 x i64> @ldq_cluster(ptr %p) {
%tmp1 = load <2 x i64>, < 2 x i64>* %p, align 8
define <4 x i32> @ldq_cluster(ptr %p) {
%tmp1 = load <4 x i32>, ptr %p, align 8
%add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2
%tmp2 = add nsw <2 x i64> %tmp1, %tmp1
%tmp3 = load <2 x i64>, ptr %add.ptr2, align 8
%res = mul nsw <2 x i64> %tmp2, %tmp3
ret <2 x i64> %res
%tmp2 = add nsw <4 x i32> %tmp1, %tmp1
%tmp3 = load <4 x i32>, ptr %add.ptr2, align 8
%res = mul nsw <4 x i32> %tmp2, %tmp3
ret <4 x i32> %res
}

; CHECK: ********** MI Scheduling **********
Expand Down Expand Up @@ -215,7 +215,7 @@ exit:
; CHECK: ********** MI Scheduling **********
; CHECK: LDURXi_LDRXui:%bb.0 entry
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi
; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi
; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRXui
;
define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
Expand Down
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/AArch64/complex-int-to-fp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
define void @autogen_SD19655(ptr %addr, ptr %addrfloat) {
; CHECK-LABEL: autogen_SD19655:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr q0, [x0]
; CHECK-NEXT: mov.d x8, v0[1]
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: scvtf s1, x9
; CHECK-NEXT: scvtf s0, x8
; CHECK-NEXT: ldp x8, x9, [x0]
; CHECK-NEXT: scvtf s0, x9
; CHECK-NEXT: scvtf s1, x8
; CHECK-NEXT: mov.s v1[1], v0[0]
; CHECK-NEXT: str d1, [x1]
; CHECK-NEXT: ret
Expand Down
14 changes: 4 additions & 10 deletions llvm/test/CodeGen/AArch64/extract-vector-elt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1114,16 +1114,10 @@ entry:
}

define ptr @v3ext(<3 x ptr> %a, <3 x ptr> %b, <3 x ptr> %x) {
; CHECK-SD-LABEL: v3ext:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldr d0, [sp]
; CHECK-SD-NEXT: fmov x0, d0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v3ext:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr x0, [sp]
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v3ext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldr x0, [sp]
; CHECK-NEXT: ret
entry:
%c = extractelement <3 x ptr> %x, i32 2
ret ptr %c
Expand Down
Loading