69 changes: 41 additions & 28 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16717,38 +16717,51 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   }
 
   // fold (conv (load x)) -> (load (conv*)x)
+  // fold (conv (freeze (load x))) -> (freeze (load (conv*)x))
   // If the resultant load doesn't need a higher alignment than the original!
-  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
-      // Do not remove the cast if the types differ in endian layout.
-      TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
-          TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
-      // If the load is volatile, we only want to change the load type if the
-      // resulting load is legal. Otherwise we might increase the number of
-      // memory accesses. We don't care if the original type was legal or not
-      // as we assume software couldn't rely on the number of accesses of an
-      // illegal type.
-      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
-       TLI.isOperationLegal(ISD::LOAD, VT))) {
-    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+  auto CastLoad = [this, &VT](SDValue N0, const SDLoc &DL) {
+    auto *LN0 = dyn_cast<LoadSDNode>(N0);
+    if (!LN0 || !ISD::isNormalLoad(LN0) || !N0.hasOneUse())
+      return SDValue();
 
-    if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
-                                    *LN0->getMemOperand())) {
-      // If the range metadata type does not match the new memory
-      // operation type, remove the range metadata.
-      if (const MDNode *MD = LN0->getRanges()) {
-        ConstantInt *Lower = mdconst::extract<ConstantInt>(MD->getOperand(0));
-        if (Lower->getBitWidth() != VT.getScalarSizeInBits() ||
-            !VT.isInteger()) {
-          LN0->getMemOperand()->clearRanges();
-        }
+    // Do not remove the cast if the types differ in endian layout.
+    if (TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) !=
+        TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()))
+      return SDValue();
+
+    // If the load is volatile, we only want to change the load type if the
+    // resulting load is legal. Otherwise we might increase the number of
+    // memory accesses. We don't care if the original type was legal or not
+    // as we assume software couldn't rely on the number of accesses of an
+    // illegal type.
+    if (((LegalOperations || !LN0->isSimple()) &&
+         !TLI.isOperationLegal(ISD::LOAD, VT)))
+      return SDValue();
+
+    if (!TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
+                                     *LN0->getMemOperand()))
+      return SDValue();
+
+    // If the range metadata type does not match the new memory
+    // operation type, remove the range metadata.
+    if (const MDNode *MD = LN0->getRanges()) {
+      ConstantInt *Lower = mdconst::extract<ConstantInt>(MD->getOperand(0));
+      if (Lower->getBitWidth() != VT.getScalarSizeInBits() || !VT.isInteger()) {
+        LN0->getMemOperand()->clearRanges();
       }
-      SDValue Load =
-          DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
-                      LN0->getMemOperand());
-      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
-      return Load;
     }
-  }
+    SDValue Load = DAG.getLoad(VT, DL, LN0->getChain(), LN0->getBasePtr(),
+                               LN0->getMemOperand());
+    DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+    return Load;
+  };
+
+  if (SDValue NewLd = CastLoad(N0, SDLoc(N)))
+    return NewLd;
+
+  if (N0.getOpcode() == ISD::FREEZE && N0.hasOneUse())
+    if (SDValue NewLd = CastLoad(N0.getOperand(0), SDLoc(N)))
+      return DAG.getFreeze(NewLd);
 
   if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
     return V;
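A note for readers skimming the DAGCombiner hunk above: the structural change is that the old inline fold body now lives in an early-exit lambda (CastLoad) so it can be tried twice, once on the bitcast's own operand and once looking through a single-use FREEZE. Below is a minimal standalone sketch of that dispatch shape, using a hypothetical Node type and a placeholder fold body rather than LLVM's SDValue API:

#include <optional>

// Hypothetical stand-ins for the SelectionDAG types; illustration only.
enum Opcode { Load, Freeze, Other };
struct Node {
  Opcode Op;
  Node *Operand = nullptr;
  bool OneUse = true;
};

std::optional<Node *> visitBitcastSketch(Node *N0) {
  // Shared fold body: bail out unless the operand is a single-use load,
  // otherwise "build" the re-typed load (here just returned unchanged).
  auto CastLoad = [](Node *N) -> std::optional<Node *> {
    if (!N || N->Op != Load || !N->OneUse)
      return std::nullopt;
    return N;
  };

  // fold (conv (load x)) -> (load (conv*)x)
  if (auto Ld = CastLoad(N0))
    return Ld;

  // fold (conv (freeze (load x))) -> (freeze (load (conv*)x));
  // the real combiner re-wraps the result in a freeze node.
  if (N0->Op == Freeze && N0->OneUse)
    return CastLoad(N0->Operand);

  return std::nullopt;
}

Keeping a single fold body means the freeze path inherits every legality and profitability bail-out unchanged; only the final getFreeze wrap differs.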
6 changes: 6 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3454,6 +3454,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
       isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
     return true;
 
+  // If we have a large vector type (even if illegal), don't bitcast to large
+  // (illegal) scalar types. Better to load fewer vectors and extract.
+  if (LoadVT.isVector() && !BitcastVT.isVector() && LoadVT.isInteger() &&
+      BitcastVT.isInteger() && (LoadVT.getSizeInBits() % 128) == 0)
+    return false;
+
   return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
 }
 
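For concreteness, the new X86 early-out amounts to a predicate over the load's total bit width. Here is a hedged restatement in plain C++ with invented names and example sizes (the real EVT queries are not used):

#include <cassert>

// Mirrors the new check: integer-vector loads whose total size is a whole
// number of 128-bit blocks should not be bitcast to one wide scalar.
bool rejectsVectorToWideScalar(bool LoadIsVec, bool CastIsVec, bool LoadIsInt,
                               bool CastIsInt, unsigned LoadBits) {
  return LoadIsVec && !CastIsVec && LoadIsInt && CastIsInt &&
         (LoadBits % 128) == 0;
}

int main() {
  // v8i64 (512 bits) bitcast to i512: rejected, keep the vector loads
  // and extract instead.
  assert(rejectsVectorToWideScalar(true, false, true, true, 512));
  // v4i8 (32 bits) bitcast to i32: not a 128-bit multiple, so the decision
  // falls through to TargetLowering::isLoadBitCastBeneficial.
  assert(!rejectsVectorToWideScalar(true, false, true, true, 32));
  return 0;
}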
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll
@@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512:
 ; X86: # %bb.0:
-; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2]
 ; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08]
 ; X86-NEXT: vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1]
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/avx10_2bf16-arith.ll
@@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src,
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256:
 ; X86: # %bb.0:
-; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2]
 ; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08]
 ; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x29,0x5c,0xc1]
@@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8
 ;
 ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128:
 ; X86: # %bb.0:
-; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
+; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
 ; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2]
 ; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08]
 ; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1]