[X86] Separate the memory size of vzext_load/vextract_store from the element size of the result type. Use them to improve the codegen of v2f32 loads/stores with SSE1 only.

Summary:
SSE1 only supports the v4f32 type, but it does have instructions like movlps/movhps that load/store 64 bits of memory.
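
For illustration, movlps/movhps are exposed through the SSE1 intrinsics; each one moves exactly 64 bits (two floats) between memory and one half of an xmm register. A minimal standalone example (not part of this patch):

    #include <xmmintrin.h> // SSE1 intrinsics

    // movlps load: replace the low 64 bits (two floats) of v with *p.
    __m128 load_low(__m128 v, const float *p) {
      return _mm_loadl_pi(v, reinterpret_cast<const __m64 *>(p));
    }

    // movhps load: replace the high 64 bits of v with *p.
    __m128 load_high(__m128 v, const float *p) {
      return _mm_loadh_pi(v, reinterpret_cast<const __m64 *>(p));
    }

    // movlps store: write the low 64 bits of v to *p.
    void store_low(float *p, __m128 v) {
      _mm_storel_pi(reinterpret_cast<__m64 *>(p), v);
    }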

This patch breaks the connection between the node VT of the vzext_load/vextract_store patterns and the memory VT, enabling a v4f32 node with a 64-bit memory VT. I've used i64 as the memory VT here, and I've written the PatFrag predicate to check just the store size, not the specific VT; I think the VT will only matter for CSE purposes. We could use v2f32, but if we want to start using these operations in more places, a simple integer type might make the most sense.
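
The selection-side idea, as a hedged sketch (the actual predicates live in the X86 .td files and their exact wording differs; MemIntrinsicSDNode and getMemoryVT() are the real SelectionDAG APIs): the PatFrag predicate asks only how many bytes the node touches, so a v4f32-result vzext_load with an i64 memory VT still matches the 64-bit load patterns.

    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;

    // Hypothetical helper: accept any vzext_load/vextract_store-style memory
    // intrinsic that loads or stores exactly 8 bytes, regardless of the
    // node's result VT (v4f32 here, but v2f64, v2i64, ... would also pass).
    static bool accessesExactly64Bits(const SDNode *N) {
      return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
    }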

I'd like to eventually use this same approach for SSE2 and later as well, but that will need more work in EltsFromConsecutiveLoads to avoid regressing lit tests. Now that the types are disconnected, I'd also like to combine bitcasts with these load/store nodes, and to consider canonicalizing (scalar_to_vector + load) to vzext_load.

If you want, I can split the mechanical TableGen changes (adding the 32/64 variants) off from the SSE1 change.

Reviewers: spatel, RKSimon

Reviewed By: RKSimon

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D64528

llvm-svn: 366034
topperc committed Jul 15, 2019
1 parent: 4066978 · commit: 635d103
Showing 6 changed files with 178 additions and 197 deletions.
llvm/lib/Target/X86/X86ISelLowering.cpp (73 changes: 47 additions & 26 deletions)
@@ -794,6 +794,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
     setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+
+    setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
+    setOperationAction(ISD::STORE, MVT::v2f32, Custom);
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -971,11 +974,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     // We want to legalize this to an f64 load rather than an i64 load on
     // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
     // store.
-    setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
     setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
     setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
     setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
-    setOperationAction(ISD::STORE, MVT::v2f32, Custom);
     setOperationAction(ISD::STORE, MVT::v2i32, Custom);
     setOperationAction(ISD::STORE, MVT::v4i16, Custom);
     setOperationAction(ISD::STORE, MVT::v8i8, Custom);
@@ -21267,21 +21268,29 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
           TargetLowering::TypeWidenVector)
     return SDValue();
 
-  // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
-  // and store it.
   MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
                                 StoreVT.getVectorNumElements() * 2);
   StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
                           DAG.getUNDEF(StoreVT));
-  MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
-  MVT CastVT = MVT::getVectorVT(StVT, 2);
-  StoredVal = DAG.getBitcast(CastVT, StoredVal);
-  StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
-                          DAG.getIntPtrConstant(0, dl));
-
-  return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
-                      St->getPointerInfo(), St->getAlignment(),
-                      St->getMemOperand()->getFlags());
+
+  if (Subtarget.hasSSE2()) {
+    // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
+    // and store it.
+    MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
+    MVT CastVT = MVT::getVectorVT(StVT, 2);
+    StoredVal = DAG.getBitcast(CastVT, StoredVal);
+    StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
+                            DAG.getIntPtrConstant(0, dl));
+
+    return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+                        St->getPointerInfo(), St->getAlignment(),
+                        St->getMemOperand()->getFlags());
+  }
+  assert(Subtarget.hasSSE1() && "Expected SSE");
+  SDVTList Tys = DAG.getVTList(MVT::Other);
+  SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
+  return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
+                                 St->getMemOperand());
 }
 
 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
@@ -28155,19 +28164,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     if (!ISD::isNON_EXTLoad(N))
       return;
     auto *Ld = cast<LoadSDNode>(N);
-    MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
-    SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
-                              Ld->getPointerInfo(),
-                              Ld->getAlignment(),
-                              Ld->getMemOperand()->getFlags());
-    SDValue Chain = Res.getValue(1);
-    MVT WideVT = MVT::getVectorVT(LdVT, 2);
-    Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
-    MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
-                                  VT.getVectorNumElements() * 2);
-    Res = DAG.getBitcast(CastVT, Res);
+    if (Subtarget.hasSSE2()) {
+      MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
+      SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
+                                Ld->getPointerInfo(), Ld->getAlignment(),
+                                Ld->getMemOperand()->getFlags());
+      SDValue Chain = Res.getValue(1);
+      MVT WideVT = MVT::getVectorVT(LdVT, 2);
+      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
+      MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
+                                    VT.getVectorNumElements() * 2);
+      Res = DAG.getBitcast(CastVT, Res);
+      Results.push_back(Res);
+      Results.push_back(Chain);
+      return;
+    }
+    assert(Subtarget.hasSSE1() && "Expected SSE");
+    SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
+    SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
+    SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
+                                          MVT::i64, Ld->getMemOperand());
     Results.push_back(Res);
-    Results.push_back(Chain);
+    Results.push_back(Res.getValue(1));
     return;
   }
   }
@@ -32016,8 +32034,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // directly if we don't shuffle the lower element and we shuffle the upper
   // (zero) elements within themselves.
   if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
-      (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
-    unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
+      (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
+       MaskEltSizeInBits) == 0) {
+    unsigned Scale =
+        cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
+        MaskEltSizeInBits;
     ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
     if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
         isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
