-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] X86FixupVectorConstants - load+zero vector constants that can be stored in a truncated form #80428
Conversation
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) Changes: Further develops the vsextload support added in #79815 — reduces the size of the vector constant by storing it in the constant pool in a truncated form, and zero-extending it as part of the load. Patch is 224.00 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/80428.diff 68 Files Affected:
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 619328af12719..e8a044b82eb80 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -1318,7 +1318,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXBW, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_PMOVZX(PMOVZXBW, m)
+ CASE_MASK_PMOVZX(PMOVZXBW, m)
+ CASE_MASKZ_PMOVZX(PMOVZXBW, m)
DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -1327,7 +1328,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXBD, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_PMOVZX(PMOVZXBD, m)
+ CASE_MASK_PMOVZX(PMOVZXBD, m)
+ CASE_MASKZ_PMOVZX(PMOVZXBD, m)
DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -1336,7 +1338,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXBQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_PMOVZX(PMOVZXBQ, m)
+ CASE_MASK_PMOVZX(PMOVZXBQ, m)
+ CASE_MASKZ_PMOVZX(PMOVZXBQ, m)
DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -1345,7 +1348,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXWD, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_PMOVZX(PMOVZXWD, m)
+ CASE_MASK_PMOVZX(PMOVZXWD, m)
+ CASE_MASKZ_PMOVZX(PMOVZXWD, m)
DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -1354,7 +1358,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXWQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_PMOVZX(PMOVZXWQ, m)
+ CASE_MASK_PMOVZX(PMOVZXWQ, m)
+ CASE_MASKZ_PMOVZX(PMOVZXWQ, m)
DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -1363,7 +1368,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXDQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_PMOVZX(PMOVZXDQ, m)
+ CASE_MASK_PMOVZX(PMOVZXDQ, m)
+ CASE_MASKZ_PMOVZX(PMOVZXDQ, m)
DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
index 5917c1497d80e..f65fa5a2298ac 100644
--- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -299,6 +299,10 @@ static Constant *rebuildSExtCst(const Constant *C, unsigned NumElts,
unsigned SrcEltBitWidth) {
return rebuildExtCst(C, true, NumElts, SrcEltBitWidth);
}
+static Constant *rebuildZExtCst(const Constant *C, unsigned NumElts,
+ unsigned SrcEltBitWidth) {
+ return rebuildExtCst(C, false, NumElts, SrcEltBitWidth);
+}
bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
MachineBasicBlock &MBB,
@@ -416,13 +420,19 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
case X86::MOVDQUrm: {
FixupEntry Fixups[] = {
{HasSSE41 ? X86::PMOVSXBQrm : 0, 2, 8, rebuildSExtCst},
+ {HasSSE41 ? X86::PMOVZXBQrm : 0, 2, 8, rebuildZExtCst},
{X86::MOVDI2PDIrm, 1, 32, rebuildZeroUpperCst},
{HasSSE41 ? X86::PMOVSXBDrm : 0, 4, 8, rebuildSExtCst},
+ {HasSSE41 ? X86::PMOVZXBDrm : 0, 4, 8, rebuildZExtCst},
{HasSSE41 ? X86::PMOVSXWQrm : 0, 2, 16, rebuildSExtCst},
+ {HasSSE41 ? X86::PMOVZXWQrm : 0, 2, 16, rebuildZExtCst},
{X86::MOVQI2PQIrm, 1, 64, rebuildZeroUpperCst},
{HasSSE41 ? X86::PMOVSXBWrm : 0, 8, 8, rebuildSExtCst},
+ {HasSSE41 ? X86::PMOVZXBWrm : 0, 8, 8, rebuildZExtCst},
{HasSSE41 ? X86::PMOVSXWDrm : 0, 4, 16, rebuildSExtCst},
- {HasSSE41 ? X86::PMOVSXDQrm : 0, 2, 32, rebuildSExtCst}};
+ {HasSSE41 ? X86::PMOVZXWDrm : 0, 4, 16, rebuildZExtCst},
+ {HasSSE41 ? X86::PMOVSXDQrm : 0, 2, 32, rebuildSExtCst},
+ {HasSSE41 ? X86::PMOVZXDQrm : 0, 2, 32, rebuildZExtCst}};
return FixupConstant(Fixups, 1);
}
case X86::VMOVDQArm:
@@ -431,17 +441,23 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{HasAVX2 ? X86::VPBROADCASTBrm : 0, 1, 8, rebuildSplatCst},
{HasAVX2 ? X86::VPBROADCASTWrm : 0, 1, 16, rebuildSplatCst},
{X86::VPMOVSXBQrm, 2, 8, rebuildSExtCst},
+ {X86::VPMOVZXBQrm, 2, 8, rebuildZExtCst},
{X86::VMOVDI2PDIrm, 1, 32, rebuildZeroUpperCst},
{HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm, 1, 32,
rebuildSplatCst},
{X86::VPMOVSXBDrm, 4, 8, rebuildSExtCst},
+ {X86::VPMOVZXBDrm, 4, 8, rebuildZExtCst},
{X86::VPMOVSXWQrm, 2, 16, rebuildSExtCst},
+ {X86::VPMOVZXWQrm, 2, 16, rebuildZExtCst},
{X86::VMOVQI2PQIrm, 1, 64, rebuildZeroUpperCst},
{HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm, 1, 64,
rebuildSplatCst},
{X86::VPMOVSXBWrm, 8, 8, rebuildSExtCst},
+ {X86::VPMOVZXBWrm, 8, 8, rebuildZExtCst},
{X86::VPMOVSXWDrm, 4, 16, rebuildSExtCst},
- {X86::VPMOVSXDQrm, 2, 32, rebuildSExtCst}};
+ {X86::VPMOVZXWDrm, 4, 16, rebuildZExtCst},
+ {X86::VPMOVSXDQrm, 2, 32, rebuildSExtCst},
+ {X86::VPMOVZXDQrm, 2, 32, rebuildZExtCst}};
return FixupConstant(Fixups, 1);
}
case X86::VMOVDQAYrm:
@@ -452,15 +468,21 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{HasAVX2 ? X86::VPBROADCASTDYrm : X86::VBROADCASTSSYrm, 1, 32,
rebuildSplatCst},
{HasAVX2 ? X86::VPMOVSXBQYrm : 0, 4, 8, rebuildSExtCst},
+ {HasAVX2 ? X86::VPMOVZXBQYrm : 0, 4, 8, rebuildZExtCst},
{HasAVX2 ? X86::VPBROADCASTQYrm : X86::VBROADCASTSDYrm, 1, 64,
rebuildSplatCst},
{HasAVX2 ? X86::VPMOVSXBDYrm : 0, 8, 8, rebuildSExtCst},
+ {HasAVX2 ? X86::VPMOVZXBDYrm : 0, 8, 8, rebuildZExtCst},
{HasAVX2 ? X86::VPMOVSXWQYrm : 0, 4, 16, rebuildSExtCst},
+ {HasAVX2 ? X86::VPMOVZXWQYrm : 0, 4, 16, rebuildZExtCst},
{HasAVX2 ? X86::VBROADCASTI128rm : X86::VBROADCASTF128rm, 1, 128,
rebuildSplatCst},
{HasAVX2 ? X86::VPMOVSXBWYrm : 0, 16, 8, rebuildSExtCst},
+ {HasAVX2 ? X86::VPMOVZXBWYrm : 0, 16, 8, rebuildZExtCst},
{HasAVX2 ? X86::VPMOVSXWDYrm : 0, 8, 16, rebuildSExtCst},
- {HasAVX2 ? X86::VPMOVSXDQYrm : 0, 4, 32, rebuildSExtCst}};
+ {HasAVX2 ? X86::VPMOVZXWDYrm : 0, 8, 16, rebuildZExtCst},
+ {HasAVX2 ? X86::VPMOVSXDQYrm : 0, 4, 32, rebuildSExtCst},
+ {HasAVX2 ? X86::VPMOVZXDQYrm : 0, 4, 32, rebuildZExtCst}};
return FixupConstant(Fixups, 1);
}
case X86::VMOVDQA32Z128rm:
@@ -471,15 +493,21 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1, 8, rebuildSplatCst},
{HasBWI ? X86::VPBROADCASTWZ128rm : 0, 1, 16, rebuildSplatCst},
{X86::VPMOVSXBQZ128rm, 2, 8, rebuildSExtCst},
+ {X86::VPMOVZXBQZ128rm, 2, 8, rebuildZExtCst},
{X86::VMOVDI2PDIZrm, 1, 32, rebuildZeroUpperCst},
{X86::VPBROADCASTDZ128rm, 1, 32, rebuildSplatCst},
{X86::VPMOVSXBDZ128rm, 4, 8, rebuildSExtCst},
+ {X86::VPMOVZXBDZ128rm, 4, 8, rebuildZExtCst},
{X86::VPMOVSXWQZ128rm, 2, 16, rebuildSExtCst},
+ {X86::VPMOVZXWQZ128rm, 2, 16, rebuildZExtCst},
{X86::VMOVQI2PQIZrm, 1, 64, rebuildZeroUpperCst},
{X86::VPBROADCASTQZ128rm, 1, 64, rebuildSplatCst},
{HasBWI ? X86::VPMOVSXBWZ128rm : 0, 8, 8, rebuildSExtCst},
+ {HasBWI ? X86::VPMOVZXBWZ128rm : 0, 8, 8, rebuildZExtCst},
{X86::VPMOVSXWDZ128rm, 4, 16, rebuildSExtCst},
- {X86::VPMOVSXDQZ128rm, 2, 32, rebuildSExtCst}};
+ {X86::VPMOVZXWDZ128rm, 4, 16, rebuildZExtCst},
+ {X86::VPMOVSXDQZ128rm, 2, 32, rebuildSExtCst},
+ {X86::VPMOVZXDQZ128rm, 2, 32, rebuildZExtCst}};
return FixupConstant(Fixups, 1);
}
case X86::VMOVDQA32Z256rm:
@@ -491,13 +519,19 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{HasBWI ? X86::VPBROADCASTWZ256rm : 0, 1, 16, rebuildSplatCst},
{X86::VPBROADCASTDZ256rm, 1, 32, rebuildSplatCst},
{X86::VPMOVSXBQZ256rm, 4, 8, rebuildSExtCst},
+ {X86::VPMOVZXBQZ256rm, 4, 8, rebuildZExtCst},
{X86::VPBROADCASTQZ256rm, 1, 64, rebuildSplatCst},
{X86::VPMOVSXBDZ256rm, 8, 8, rebuildSExtCst},
+ {X86::VPMOVZXBDZ256rm, 8, 8, rebuildZExtCst},
{X86::VPMOVSXWQZ256rm, 4, 16, rebuildSExtCst},
+ {X86::VPMOVZXWQZ256rm, 4, 16, rebuildZExtCst},
{X86::VBROADCASTI32X4Z256rm, 1, 128, rebuildSplatCst},
{HasBWI ? X86::VPMOVSXBWZ256rm : 0, 16, 8, rebuildSExtCst},
+ {HasBWI ? X86::VPMOVZXBWZ256rm : 0, 16, 8, rebuildZExtCst},
{X86::VPMOVSXWDZ256rm, 8, 16, rebuildSExtCst},
- {X86::VPMOVSXDQZ256rm, 4, 32, rebuildSExtCst}};
+ {X86::VPMOVZXWDZ256rm, 8, 16, rebuildZExtCst},
+ {X86::VPMOVSXDQZ256rm, 4, 32, rebuildSExtCst},
+ {X86::VPMOVZXDQZ256rm, 4, 32, rebuildZExtCst}};
return FixupConstant(Fixups, 1);
}
case X86::VMOVDQA32Zrm:
@@ -510,13 +544,19 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
{X86::VPBROADCASTDZrm, 1, 32, rebuildSplatCst},
{X86::VPBROADCASTQZrm, 1, 64, rebuildSplatCst},
{X86::VPMOVSXBQZrm, 8, 8, rebuildSExtCst},
+ {X86::VPMOVZXBQZrm, 8, 8, rebuildZExtCst},
{X86::VBROADCASTI32X4rm, 1, 128, rebuildSplatCst},
{X86::VPMOVSXBDZrm, 16, 8, rebuildSExtCst},
+ {X86::VPMOVZXBDZrm, 16, 8, rebuildZExtCst},
{X86::VPMOVSXWQZrm, 8, 16, rebuildSExtCst},
+ {X86::VPMOVZXWQZrm, 8, 16, rebuildZExtCst},
{X86::VBROADCASTI64X4rm, 1, 256, rebuildSplatCst},
{HasBWI ? X86::VPMOVSXBWZrm : 0, 32, 8, rebuildSExtCst},
+ {HasBWI ? X86::VPMOVZXBWZrm : 0, 32, 8, rebuildZExtCst},
{X86::VPMOVSXWDZrm, 16, 16, rebuildSExtCst},
- {X86::VPMOVSXDQZrm, 8, 32, rebuildSExtCst}};
+ {X86::VPMOVZXWDZrm, 16, 16, rebuildZExtCst},
+ {X86::VPMOVSXDQZrm, 8, 32, rebuildSExtCst},
+ {X86::VPMOVZXDQZrm, 8, 32, rebuildZExtCst}};
return FixupConstant(Fixups, 1);
}
}
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 1a26489460882..45dd98cde6fa1 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -1388,6 +1388,18 @@ PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
return MBBI;
}
+static unsigned getRegisterWidth(const MCOperandInfo &Info) {
+ if (Info.RegClass == X86::VR128RegClassID ||
+ Info.RegClass == X86::VR128XRegClassID)
+ return 128;
+ if (Info.RegClass == X86::VR256RegClassID ||
+ Info.RegClass == X86::VR256XRegClassID)
+ return 256;
+ if (Info.RegClass == X86::VR512RegClassID)
+ return 512;
+ llvm_unreachable("Unknown register class!");
+}
+
static std::string getShuffleComment(const MachineInstr *MI, unsigned SrcOp1Idx,
unsigned SrcOp2Idx, ArrayRef<int> Mask) {
std::string Comment;
@@ -1582,8 +1594,8 @@ static void printBroadcast(const MachineInstr *MI, MCStreamer &OutStreamer,
}
}
-static bool printSignExtend(const MachineInstr *MI, MCStreamer &OutStreamer,
- int SrcEltBits, int DstEltBits) {
+static bool printExtend(const MachineInstr *MI, MCStreamer &OutStreamer,
+ int SrcEltBits, int DstEltBits, bool IsSext) {
auto *C = X86::getConstantFromPool(*MI, 1);
if (C && C->getType()->getScalarSizeInBits() == unsigned(SrcEltBits)) {
if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
@@ -1598,7 +1610,8 @@ static bool printSignExtend(const MachineInstr *MI, MCStreamer &OutStreamer,
if (i != 0)
CS << ",";
if (CDS->getElementType()->isIntegerTy()) {
- APInt Elt = CDS->getElementAsAPInt(i).sext(DstEltBits);
+ APInt Elt = CDS->getElementAsAPInt(i);
+ Elt = IsSext ? Elt.sext(DstEltBits) : Elt.zext(DstEltBits);
printConstant(Elt, CS);
} else
CS << "?";
@@ -1611,6 +1624,36 @@ static bool printSignExtend(const MachineInstr *MI, MCStreamer &OutStreamer,
return false;
}
+static void printSignExtend(const MachineInstr *MI, MCStreamer &OutStreamer,
+ int SrcEltBits, int DstEltBits) {
+ printExtend(MI, OutStreamer, SrcEltBits, DstEltBits, true);
+}
+static void printZeroExtend(const MachineInstr *MI, MCStreamer &OutStreamer,
+ int SrcEltBits, int DstEltBits) {
+ if (printExtend(MI, OutStreamer, SrcEltBits, DstEltBits, false))
+ return;
+
+ // We didn't find a constant load, fallback to a shuffle mask decode.
+ std::string Comment;
+ raw_string_ostream CS(Comment);
+
+ const MachineOperand &DstOp = MI->getOperand(0);
+ CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
+
+ unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]);
+ assert((Width % DstEltBits) == 0 && (DstEltBits % SrcEltBits) == 0 &&
+ "Illegal extension ratio");
+ unsigned NumElts = Width / DstEltBits;
+ unsigned Scale = DstEltBits / SrcEltBits;
+ for (unsigned I = 0; I != NumElts; ++I) {
+ if (I != 0)
+ CS << ",";
+ CS << "mem[" << I << "]";
+ for (unsigned S = 1; S != Scale; ++S)
+ CS << ",zero";
+ }
+ OutStreamer.AddComment(CS.str());
+}
void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
@@ -1688,18 +1731,6 @@ void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
}
}
-static unsigned getRegisterWidth(const MCOperandInfo &Info) {
- if (Info.RegClass == X86::VR128RegClassID ||
- Info.RegClass == X86::VR128XRegClassID)
- return 128;
- if (Info.RegClass == X86::VR256RegClassID ||
- Info.RegClass == X86::VR256XRegClassID)
- return 256;
- if (Info.RegClass == X86::VR512RegClassID)
- return 512;
- llvm_unreachable("Unknown register class!");
-}
-
static void addConstantComments(const MachineInstr *MI,
MCStreamer &OutStreamer) {
switch (MI->getOpcode()) {
@@ -2039,6 +2070,25 @@ static void addConstantComments(const MachineInstr *MI,
CASE_MOVX_RM(SX, WQ)
printSignExtend(MI, OutStreamer, 16, 64);
break;
+
+ CASE_MOVX_RM(ZX, BD)
+ printZeroExtend(MI, OutStreamer, 8, 32);
+ break;
+ CASE_MOVX_RM(ZX, BQ)
+ printZeroExtend(MI, OutStreamer, 8, 64);
+ break;
+ CASE_MOVX_RM(ZX, BW)
+ printZeroExtend(MI, OutStreamer, 8, 16);
+ break;
+ CASE_MOVX_RM(ZX, DQ)
+ printZeroExtend(MI, OutStreamer, 32, 64);
+ break;
+ CASE_MOVX_RM(ZX, WD)
+ printZeroExtend(MI, OutStreamer, 16, 32);
+ break;
+ CASE_MOVX_RM(ZX, WQ)
+ printZeroExtend(MI, OutStreamer, 16, 64);
+ break;
}
}
diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
index 4bf2e2456482e..983c69d1a1c2e 100644
--- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -60,14 +60,14 @@ define <8 x i32> @test_vpslld_var(i32 %shift) {
; X86-LABEL: test_vpslld_var:
; X86: # %bb.0:
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: vpmovsxwd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
+; X86-NEXT: vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X86-NEXT: vpslld %xmm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: test_vpslld_var:
; X64: # %bb.0:
; X64-NEXT: vmovd %edi, %xmm0
-; X64-NEXT: vpmovsxwd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
+; X64-NEXT: vpmovzxbd {{.*#+}} ymm1 = [192,193,194,195,196,197,198,199]
; X64-NEXT: vpslld %xmm0, %ymm1, %ymm0
; X64-NEXT: retq
%amt = insertelement <8 x i32> undef, i32 %shift, i32 0
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index a16659eab9763..6255621d870e1 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -108,7 +108,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -117,7 +117,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
@@ -268,7 +268,7 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -445,7 +445,7 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [1,2,4,8]
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpcmpeqq %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [16,32,64,128]
+; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm2 = [16,32,64,128]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
@@ -505,10 +505,10 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm1
-; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpcmpeqd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index 48abed8b6f222..bc8964f30938b 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -134,7 +134,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $...
[truncated]
|
You can test this locally with the following command: git-clang-format --diff 84ea236af9f36d409d2c45c66f8a8b6eb027935d a371d904bfd61e9c6180c3bf1ac454b8084ad164 -- llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp llvm/lib/Target/X86/X86FixupVectorConstants.cpp llvm/lib/Target/X86/X86MCInstLower.cpp View the diff from clang-format here.diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index e8a044b82e..07a87701dc 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -1318,8 +1318,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXBW, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_MASK_PMOVZX(PMOVZXBW, m)
- CASE_MASKZ_PMOVZX(PMOVZXBW, m)
+ CASE_MASK_PMOVZX(PMOVZXBW, m)
+ CASE_MASKZ_PMOVZX(PMOVZXBW, m)
DecodeZeroExtendMask(8, 16, getRegOperandNumElts(MI, 16, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -1328,8 +1328,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXBD, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_MASK_PMOVZX(PMOVZXBD, m)
- CASE_MASKZ_PMOVZX(PMOVZXBD, m)
+ CASE_MASK_PMOVZX(PMOVZXBD, m)
+ CASE_MASKZ_PMOVZX(PMOVZXBD, m)
DecodeZeroExtendMask(8, 32, getRegOperandNumElts(MI, 32, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -1338,8 +1338,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXBQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_MASK_PMOVZX(PMOVZXBQ, m)
- CASE_MASKZ_PMOVZX(PMOVZXBQ, m)
+ CASE_MASK_PMOVZX(PMOVZXBQ, m)
+ CASE_MASKZ_PMOVZX(PMOVZXBQ, m)
DecodeZeroExtendMask(8, 64, getRegOperandNumElts(MI, 64, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -1348,8 +1348,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXWD, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_MASK_PMOVZX(PMOVZXWD, m)
- CASE_MASKZ_PMOVZX(PMOVZXWD, m)
+ CASE_MASK_PMOVZX(PMOVZXWD, m)
+ CASE_MASKZ_PMOVZX(PMOVZXWD, m)
DecodeZeroExtendMask(16, 32, getRegOperandNumElts(MI, 32, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -1358,8 +1358,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXWQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_MASK_PMOVZX(PMOVZXWQ, m)
- CASE_MASKZ_PMOVZX(PMOVZXWQ, m)
+ CASE_MASK_PMOVZX(PMOVZXWQ, m)
+ CASE_MASKZ_PMOVZX(PMOVZXWQ, m)
DecodeZeroExtendMask(16, 64, getRegOperandNumElts(MI, 64, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
@@ -1368,8 +1368,8 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
CASE_PMOVZX(PMOVZXDQ, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
[[fallthrough]];
- CASE_MASK_PMOVZX(PMOVZXDQ, m)
- CASE_MASKZ_PMOVZX(PMOVZXDQ, m)
+ CASE_MASK_PMOVZX(PMOVZXDQ, m)
+ CASE_MASKZ_PMOVZX(PMOVZXDQ, m)
DecodeZeroExtendMask(32, 64, getRegOperandNumElts(MI, 64, 0), false,
ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
|
@@ -109,13 +109,13 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_ | |||
define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { | |||
; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: | |||
; AVX512F: # %bb.0: | |||
; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't we prefer `vmovd` to `vpmovzxbq`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess it comes down to whether we want to save those 16-bits in the constant pool in exchange for a shuffle uop
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, I forgot the size difference.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
@@ -109,13 +109,13 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_ | |||
define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { | |||
; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: | |||
; AVX512F: # %bb.0: | |||
; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, I forgot the size difference.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
…e stored in a truncated form Further develops the vsextload support added in llvm#79815 - reduces the size of the vector constant by storing it in the constant pool in a truncated form, and zero-extend it as part of the load.
ca2ac31
to
a371d90
Compare
…e stored in a truncated form (llvm#80428) Further develops the vsextload support added in llvm#79815 / b5d35fe - reduces the size of the vector constant by storing it in the constant pool in a truncated form, and zero-extend it as part of the load.
Further develops the vsextload support added in #79815 - reduces the size of the vector constant by storing it in the constant pool in a truncated form, and zero-extend it as part of the load.