250 changes: 163 additions & 87 deletions llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -304,6 +304,8 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::AddrInfo &AI) {

LLVM_ATTRIBUTE_UNUSED
raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) {
OS << "IsLoad:" << (MG.IsLoad ? "yes" : "no");
OS << ", IsHvx:" << (MG.IsHvx ? "yes" : "no") << '\n';
OS << "Main\n";
for (Instruction *I : MG.Main)
OS << " " << *I << '\n';
@@ -316,8 +318,14 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) {
LLVM_ATTRIBUTE_UNUSED
raw_ostream &operator<<(raw_ostream &OS,
const AlignVectors::ByteSpan::Block &B) {
OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] "
<< *B.Seg.Val;
OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] ";
if (B.Seg.Val == reinterpret_cast<const Value *>(&B)) {
OS << "(self:" << B.Seg.Val << ')';
} else if (B.Seg.Val != nullptr) {
OS << *B.Seg.Val;
} else {
OS << "(null)";
}
return OS;
}

@@ -573,26 +581,26 @@ auto AlignVectors::createAdjustedPointer(IRBuilderBase &Builder, Value *Ptr,
Type *ElemTy = PtrTy->getNonOpaquePointerElementType();
int ElemSize = HVC.getSizeOf(ElemTy, HVC.Alloc);
if (Adjust % ElemSize == 0 && Adjust != 0) {
Value *Tmp0 =
Builder.CreateGEP(ElemTy, Ptr, HVC.getConstInt(Adjust / ElemSize));
return Builder.CreatePointerCast(Tmp0, ValTy->getPointerTo());
Value *Tmp0 = Builder.CreateGEP(
ElemTy, Ptr, HVC.getConstInt(Adjust / ElemSize), "gep");
return Builder.CreatePointerCast(Tmp0, ValTy->getPointerTo(), "cst");
}
}

PointerType *CharPtrTy = Type::getInt8PtrTy(HVC.F.getContext());
Value *Tmp0 = Builder.CreatePointerCast(Ptr, CharPtrTy);
Value *Tmp0 = Builder.CreatePointerCast(Ptr, CharPtrTy, "cst");
Value *Tmp1 = Builder.CreateGEP(Type::getInt8Ty(HVC.F.getContext()), Tmp0,
HVC.getConstInt(Adjust));
return Builder.CreatePointerCast(Tmp1, ValTy->getPointerTo());
HVC.getConstInt(Adjust), "gep");
return Builder.CreatePointerCast(Tmp1, ValTy->getPointerTo(), "cst");
}
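Illustration (not part of the patch): both branches of createAdjustedPointer compute the same byte address; the element-typed GEP is simply the natural form when the adjustment is a whole number of elements. A scalar sketch of the address math (adjustedAddress is a hypothetical name):

    #include <cstdint>

    uintptr_t adjustedAddress(uintptr_t Addr, int Adjust, int ElemSize) {
      if (Adjust % ElemSize == 0 && Adjust != 0)
        return Addr + intptr_t(Adjust / ElemSize) * ElemSize; // element-typed GEP
      return Addr + static_cast<intptr_t>(Adjust);            // i8-typed GEP
    }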

auto AlignVectors::createAlignedPointer(IRBuilderBase &Builder, Value *Ptr,
Type *ValTy, int Alignment) const
-> Value * {
Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy());
Value *AsInt = Builder.CreatePtrToInt(Ptr, HVC.getIntTy(), "pti");
Value *Mask = HVC.getConstInt(-Alignment);
Value *And = Builder.CreateAnd(AsInt, Mask);
return Builder.CreateIntToPtr(And, ValTy->getPointerTo());
Value *And = Builder.CreateAnd(AsInt, Mask, "and");
return Builder.CreateIntToPtr(And, ValTy->getPointerTo(), "itp");
}
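Illustration (not part of the patch): AND-ing with -Alignment is the standard align-down trick for a power-of-two alignment. Minimal scalar sketch (alignDown is a hypothetical name):

    #include <cassert>
    #include <cstdint>

    // Mirrors the ptrtoint / and / inttoptr sequence emitted above.
    uintptr_t alignDown(uintptr_t Addr, uintptr_t Alignment) {
      assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0);
      return Addr & -Alignment; // same as Addr & ~(Alignment - 1)
    }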

auto AlignVectors::createAlignedLoad(IRBuilderBase &Builder, Type *ValTy,
@@ -602,8 +610,9 @@ auto AlignVectors::createAlignedLoad(IRBuilderBase &Builder, Type *ValTy,
if (HVC.isZero(Mask))
return PassThru;
if (Mask == ConstantInt::getTrue(Mask->getType()))
return Builder.CreateAlignedLoad(ValTy, Ptr, Align(Alignment));
return Builder.CreateMaskedLoad(ValTy, Ptr, Align(Alignment), Mask, PassThru);
return Builder.CreateAlignedLoad(ValTy, Ptr, Align(Alignment), "ald");
return Builder.CreateMaskedLoad(ValTy, Ptr, Align(Alignment), Mask, PassThru,
"mld");
}
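Illustration (not part of the patch): the masked load emitted above selects per lane between memory and the pass-through vector. Behavioral sketch of that semantics (maskedLoad is a hypothetical name):

    #include <array>
    #include <cstddef>

    template <typename T, std::size_t N>
    std::array<T, N> maskedLoad(const T *Ptr, const std::array<bool, N> &Mask,
                                const std::array<T, N> &PassThru) {
      std::array<T, N> Out{};
      for (std::size_t I = 0; I != N; ++I)
        Out[I] = Mask[I] ? Ptr[I] : PassThru[I]; // enabled lanes read memory
      return Out;
    }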

auto AlignVectors::createAlignedStore(IRBuilderBase &Builder, Value *Val,
@@ -814,6 +823,8 @@ auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
const ByteSpan &VSpan, int ScLen,
Value *AlignVal, Value *AlignAddr) const
-> void {
LLVM_DEBUG(dbgs() << __func__ << "\n");

Type *SecTy = HVC.getByteTy(ScLen);
int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
bool DoAlign = !HVC.isZero(AlignVal);
@@ -869,7 +880,7 @@ auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
assert(I != nullptr && "Load used in a non-instruction?");
// Make sure we only consider users in this block, but we need
// to remember if there were users outside the block too. This is
// because if there are no users, aligned loads will not be created.
// because if no users are found, aligned loads will not be created.
if (I->getParent() == BaseBlock) {
if (!isa<PHINode>(I))
User = std::min(User, I, isEarlier);
@@ -888,6 +899,14 @@ auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
}
}

LLVM_DEBUG({
dbgs() << "ASpan:\n" << ASpan << '\n';
dbgs() << "Earliest users of ASpan:\n";
for (auto &[Val, User] : EarliestUser) {
dbgs() << Val << "\n ->" << *User << '\n';
}
});

auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
int Index) {
Value *Ptr =
@@ -913,28 +932,35 @@ };
};

// Generate necessary loads at appropriate locations.
LLVM_DEBUG(dbgs() << "Creating loads for ASpan sectors\n");
for (int Index = 0; Index != NumSectors + 1; ++Index) {
// In ASpan, each block will be either a single aligned load, or a
// valign of a pair of loads. In the latter case, an aligned load j
// will belong to the current valign, and the one in the previous
// block (for j > 0).
// Place the load at a location which will dominate the valign, assuming
// the valign will be placed right before the earliest user.
Instruction *PrevAt =
DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
Instruction *ThisAt =
Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {
Builder.SetInsertPoint(Where);
Loads[Index] = createLoad(Builder, VSpan, Index);
// We know it's safe to put the load at BasePos, so if it's not safe
// to move it from this location to BasePos, then the current location
// is not valid.
// We know it's safe to put the load at BasePos, but we'd prefer to put
// it at "Where". To see if the load is safe to be placed at Where, put
// it there first and then check if it's safe to move it to BasePos.
// If not, then the load needs to be placed at BasePos.
// We can't do this check proactively because we need the load to exist
// in order to check legality.
if (!HVC.isSafeToMoveBeforeInBB(*Loads[Index], BasePos))
moveBefore(Loads[Index], &*BasePos);
LLVM_DEBUG(dbgs() << "Loads[" << Index << "]:" << *Loads[Index] << '\n');
}
}

// Generate valigns if needed, and fill in proper values in ASpan
LLVM_DEBUG(dbgs() << "Creating values for ASpan sectors\n");
for (int Index = 0; Index != NumSectors; ++Index) {
ASpan[Index].Seg.Val = nullptr;
if (auto *Where = EarliestUser[&ASpan[Index]]) {
@@ -947,6 +973,7 @@ auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);
}
ASpan[Index].Seg.Val = Val;
LLVM_DEBUG(dbgs() << "ASpan[" << Index << "]:" << *Val << '\n');
}
}

@@ -955,15 +982,27 @@ auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));

// We're generating a reduction, where each instruction depends on
// the previous one, so we need to order them according to the position
// of their inputs in the code.
std::vector<ByteSpan::Block *> ABlocks;
for (ByteSpan::Block &S : ASection) {
if (S.Seg.Val == nullptr)
continue;
if (S.Seg.Val != nullptr)
ABlocks.push_back(&S);
}
llvm::sort(ABlocks,
[&](const ByteSpan::Block *A, const ByteSpan::Block *B) {
return isEarlier(cast<Instruction>(A->Seg.Val),
cast<Instruction>(B->Seg.Val));
});
for (ByteSpan::Block *S : ABlocks) {
// The processing of the data loaded by the aligned loads
// needs to be inserted after the data is available.
Instruction *SegI = cast<Instruction>(S.Seg.Val);
Instruction *SegI = cast<Instruction>(S->Seg.Val);
Builder.SetInsertPoint(&*std::next(SegI->getIterator()));
Value *Pay = HVC.vbytes(Builder, getPayload(S.Seg.Val));
Accum = HVC.insertb(Builder, Accum, Pay, S.Seg.Start, S.Seg.Size, S.Pos);
Value *Pay = HVC.vbytes(Builder, getPayload(S->Seg.Val));
Accum =
HVC.insertb(Builder, Accum, Pay, S->Seg.Start, S->Seg.Size, S->Pos);
}
// Instead of casting everything to bytes for the vselect, cast to the
// original value type. This will avoid complications with casting masks.
Expand All @@ -972,9 +1011,9 @@ auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
// but if the mask is not exactly of HVX length, extra handling would be
// needed to make it work.
Type *ValTy = getPayload(B.Seg.Val)->getType();
Value *Cast = Builder.CreateBitCast(Accum, ValTy);
Value *Cast = Builder.CreateBitCast(Accum, ValTy, "cst");
Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
getPassThrough(B.Seg.Val));
getPassThrough(B.Seg.Val), "sel");
B.Seg.Val->replaceAllUsesWith(Sel);
}
}
@@ -983,6 +1022,8 @@ auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
const ByteSpan &VSpan, int ScLen,
Value *AlignVal, Value *AlignAddr) const
-> void {
LLVM_DEBUG(dbgs() << __func__ << "\n");

Type *SecTy = HVC.getByteTy(ScLen);
int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
bool DoAlign = !HVC.isZero(AlignVal);
@@ -997,7 +1038,7 @@ auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
if (Ty->isVectorTy())
return Val;
auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
return Builder.CreateBitCast(Val, VecTy);
return Builder.CreateBitCast(Val, VecTy, "cst");
};

// Create an extra "undef" sector at the beginning and at the end.
@@ -1050,6 +1091,8 @@ auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
}

auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
LLVM_DEBUG(dbgs() << "Realigning group:\n" << Move << '\n');

// TODO: Needs support for masked loads/stores of "scalar" vectors.
if (!Move.IsHvx)
return false;
@@ -1137,7 +1180,8 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
// aligned instruction with an address that is not really aligned.
AlignAddr = createAlignedPointer(Builder, WithMinOffset.Addr,
WithMinOffset.ValTy, MinNeeded.value());
AlignVal = Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy());
AlignVal =
Builder.CreatePtrToInt(WithMinOffset.Addr, HVC.getIntTy(), "pti");
}

ByteSpan VSpan;
@@ -1154,6 +1198,13 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
assert(Move.IsHvx || ScLen == 4 || ScLen == 8);

LLVM_DEBUG({
dbgs() << "ScLen: " << ScLen << "\n";
dbgs() << "AlignVal:" << *AlignVal << "\n";
dbgs() << "AlignAddr:" << *AlignAddr << "\n";
dbgs() << "VSpan:\n" << VSpan << '\n';
});

if (Move.IsLoad)
realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
else
@@ -1175,9 +1226,18 @@ auto AlignVectors::isSectorTy(Type *Ty) const -> bool {
}

auto AlignVectors::run() -> bool {
LLVM_DEBUG(dbgs() << "Running HVC::AlignVectors\n");
if (!createAddressGroups())
return false;

LLVM_DEBUG({
dbgs() << "Address groups(" << AddrGroups.size() << "):\n";
for (auto &[In, AL] : AddrGroups) {
for (const AddrInfo &AI : AL)
dbgs() << "---\n" << AI << '\n';
}
});

bool Changed = false;
MoveList LoadGroups, StoreGroups;

@@ -1186,11 +1246,22 @@ auto AlignVectors::run() -> bool {
llvm::append_range(StoreGroups, createStoreGroups(G.second));
}

LLVM_DEBUG({
dbgs() << "\nLoad groups(" << LoadGroups.size() << "):\n";
for (const MoveGroup &G : LoadGroups)
dbgs() << G << "\n";
dbgs() << "Store groups(" << StoreGroups.size() << "):\n";
for (const MoveGroup &G : StoreGroups)
dbgs() << G << "\n";
});

for (auto &M : LoadGroups)
Changed |= move(M);
for (auto &M : StoreGroups)
Changed |= move(M);

LLVM_DEBUG(dbgs() << "After move:\n" << HVC.F);

for (auto &M : LoadGroups)
Changed |= realignGroup(M);
for (auto &M : StoreGroups)
@@ -1356,13 +1427,13 @@ auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const

auto *ResizeTy = VectorType::get(HVC.getIntTy(Width), VecTy);
if (Width < ElemWidth) {
X = Builder.CreateTrunc(X, ResizeTy);
Y = Builder.CreateTrunc(Y, ResizeTy);
X = Builder.CreateTrunc(X, ResizeTy, "trn");
Y = Builder.CreateTrunc(Y, ResizeTy, "trn");
} else if (Width > ElemWidth) {
X = SignX == Signed ? Builder.CreateSExt(X, ResizeTy)
: Builder.CreateZExt(X, ResizeTy);
Y = SignY == Signed ? Builder.CreateSExt(Y, ResizeTy)
: Builder.CreateZExt(Y, ResizeTy);
X = SignX == Signed ? Builder.CreateSExt(X, ResizeTy, "sxt")
: Builder.CreateZExt(X, ResizeTy, "zxt");
Y = SignY == Signed ? Builder.CreateSExt(Y, ResizeTy, "sxt")
: Builder.CreateZExt(Y, ResizeTy, "zxt");
};

assert(X->getType() == Y->getType() && X->getType() == ResizeTy);
@@ -1387,8 +1458,8 @@ auto HvxIdioms::processFxpMul(Instruction &In, const FxpOp &Op) const

Value *Cat = HVC.concat(Builder, Results);
Value *Ext = SignX == Signed || SignY == Signed
? Builder.CreateSExt(Cat, VecTy)
: Builder.CreateZExt(Cat, VecTy);
? Builder.CreateSExt(Cat, VecTy, "sxt")
: Builder.CreateZExt(Cat, VecTy, "zxt");
return Ext;
}

@@ -1434,14 +1505,14 @@ auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
Value *Prod32 = createMul16(Builder, Op.X, Op.Y);
if (Rounding) {
Value *RoundVal = HVC.getConstSplat(Prod32->getType(), 1 << *Op.RoundAt);
Prod32 = Builder.CreateAdd(Prod32, RoundVal);
Prod32 = Builder.CreateAdd(Prod32, RoundVal, "add");
}

Value *ShiftAmt = HVC.getConstSplat(Prod32->getType(), Op.Frac);
Value *Shifted = Op.X.Sgn == Signed || Op.Y.Sgn == Signed
? Builder.CreateAShr(Prod32, ShiftAmt)
: Builder.CreateLShr(Prod32, ShiftAmt);
return Builder.CreateTrunc(Shifted, InpTy);
? Builder.CreateAShr(Prod32, ShiftAmt, "asr")
: Builder.CreateLShr(Prod32, ShiftAmt, "lsr");
return Builder.CreateTrunc(Shifted, InpTy, "trn");
}

// Width >= 32
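Illustration (not part of the patch): the 16-bit path above widens to 32 bits, multiplies, optionally adds a rounding bias of 1 << RoundAt, shifts right by Frac, and truncates back. A scalar model of the signed case (fxpMul16 is a hypothetical name):

    #include <cstdint>

    // Unsigned operands would take the logical-shift (LShr) branch instead.
    int16_t fxpMul16(int16_t X, int16_t Y, unsigned Frac, unsigned RoundAt,
                     bool Round) {
      int32_t Prod = int32_t(X) * int32_t(Y);
      if (Round)
        Prod += int32_t(1) << RoundAt; // rounding bias, as in the code above
      return static_cast<int16_t>(Prod >> Frac);
    }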
@@ -1475,10 +1546,11 @@ auto HvxIdioms::processFxpMulChopped(IRBuilderBase &Builder, Instruction &In,
if (Src + 1 < End) {
Value *Hi = WordP[Src + 1];
WordP[Dst] = Builder.CreateIntrinsic(HvxWordTy, Intrinsic::fshr,
{Hi, Lo, ShiftAmt});
{Hi, Lo, ShiftAmt},
/*FMFSource*/ nullptr, "int");
} else {
// The shift of the most significant word.
WordP[Dst] = Builder.CreateAShr(Lo, ShiftAmt);
WordP[Dst] = Builder.CreateAShr(Lo, ShiftAmt, "asr");
}
}
if (SkipWords != 0)
@@ -1540,8 +1612,8 @@ auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
}
Value *Ret = HVC.createHvxIntrinsic(Builder, AddCarry,
/*RetTy=*/nullptr, Args);
Value *Result = Builder.CreateExtractValue(Ret, {0});
Value *CarryOut = Builder.CreateExtractValue(Ret, {1});
Value *Result = Builder.CreateExtractValue(Ret, {0}, "ext");
Value *CarryOut = Builder.CreateExtractValue(Ret, {1}, "ext");
return {Result, CarryOut};
}

@@ -1560,13 +1632,13 @@ auto HvxIdioms::createAddCarry(IRBuilderBase &Builder, Value *X, Value *Y,
Value *ValueIn =
HVC.createHvxIntrinsic(Builder, V6_vandqrt, /*RetTy=*/nullptr,
{CarryIn, HVC.getConstInt(Mask)});
Result1 = Builder.CreateAdd(X, ValueIn);
Result1 = Builder.CreateAdd(X, ValueIn, "add");
}

Value *CarryOut1 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result1, X);
Value *Result2 = Builder.CreateAdd(Result1, Y);
Value *CarryOut2 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result2, Y);
return {Result2, Builder.CreateOr(CarryOut1, CarryOut2)};
Value *CarryOut1 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result1, X, "cmp");
Value *Result2 = Builder.CreateAdd(Result1, Y, "add");
Value *CarryOut2 = Builder.CreateCmp(CmpInst::ICMP_ULT, Result2, Y, "cmp");
return {Result2, Builder.CreateOr(CarryOut1, CarryOut2, "orb")};
}
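Illustration (not part of the patch): the fallback above emulates add-with-carry using wrapping adds and unsigned compares; at most one of the two partial carries can be set, so OR-ing them is exact. Scalar model with a 0/1 carry-in (addCarry is a hypothetical name):

    #include <cstdint>
    #include <utility>

    std::pair<uint32_t, bool> addCarry(uint32_t X, uint32_t Y, bool CarryIn) {
      uint32_t R1 = X + (CarryIn ? 1u : 0u);
      bool C1 = R1 < X; // first add wrapped (only possible when X is all-ones)
      uint32_t R2 = R1 + Y;
      bool C2 = R2 < Y; // second add wrapped
      return {R2, C1 || C2};
    }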

auto HvxIdioms::createMul16(IRBuilderBase &Builder, SValue X, SValue Y) const
@@ -1603,15 +1675,16 @@ auto HvxIdioms::createMulH16(IRBuilderBase &Builder, SValue X, SValue Y) const
}

Type *HvxP16Ty = HVC.getHvxTy(HVC.getIntTy(16), /*Pair=*/true);
Value *Pair16 = Builder.CreateBitCast(createMul16(Builder, X, Y), HvxP16Ty);
Value *Pair16 =
Builder.CreateBitCast(createMul16(Builder, X, Y), HvxP16Ty, "cst");
unsigned Len = HVC.length(HvxP16Ty) / 2;

SmallVector<int, 128> PickOdd(Len);
for (int i = 0; i != static_cast<int>(Len); ++i)
PickOdd[i] = 2 * i + 1;

return Builder.CreateShuffleVector(HVC.sublo(Builder, Pair16),
HVC.subhi(Builder, Pair16), PickOdd);
return Builder.CreateShuffleVector(
HVC.sublo(Builder, Pair16), HVC.subhi(Builder, Pair16), PickOdd, "shf");
}
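Illustration (not part of the patch): createMul16 produces 32-bit products laid out as pairs of 16-bit halves, so the PickOdd mask above selects the high half of each product (assuming little-endian halfword order). Per-lane model (mulHi16 is a hypothetical name):

    #include <cstdint>

    int16_t mulHi16(int16_t X, int16_t Y) {
      int32_t Prod = int32_t(X) * int32_t(Y);
      return static_cast<int16_t>(static_cast<uint32_t>(Prod) >> 16);
    }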

auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const
@@ -1632,8 +1705,8 @@ auto HvxIdioms::createMul32(IRBuilderBase &Builder, SValue X, SValue Y) const

Value *Parts = HVC.createHvxIntrinsic(Builder, V6_vmpy_parts, nullptr,
{X.Val, Y.Val}, {HvxI32Ty});
Value *Hi = Builder.CreateExtractValue(Parts, {0});
Value *Lo = Builder.CreateExtractValue(Parts, {1});
Value *Hi = Builder.CreateExtractValue(Parts, {0}, "ext");
Value *Lo = Builder.CreateExtractValue(Parts, {1}, "ext");
return {Lo, Hi};
}

@@ -1899,7 +1972,7 @@ auto HexagonVectorCombine::insertb(IRBuilderBase &Builder, Value *Dst,
(Where <= i && i < Where + Length) ? P2Len + Start + (i - Where) : i;
}

Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask);
Value *P2Insert = Builder.CreateShuffleVector(P2Dst, P2Src, SMask, "shf");
return vresize(Builder, P2Insert, DstLen, Undef);
}
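Illustration (not part of the patch): the shuffle mask above keeps destination lanes outside [Where, Where+Length) and routes the inserted range from the source operand, whose lanes are numbered starting at P2Len in shufflevector index space. Standalone construction (insertMask is a hypothetical name):

    #include <vector>

    std::vector<int> insertMask(int P2Len, int Start, int Where, int Length) {
      std::vector<int> Mask(P2Len);
      for (int I = 0; I != P2Len; ++I)
        Mask[I] = (Where <= I && I < Where + Length)
                      ? P2Len + Start + (I - Where) // take a source lane
                      : I;                          // keep the destination lane
      return Mask;
    }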

@@ -1922,12 +1995,14 @@ auto HexagonVectorCombine::vlalignb(IRBuilderBase &Builder, Value *Lo,

if (VecLen == 4) {
Value *Pair = concat(Builder, {Lo, Hi});
Value *Shift = Builder.CreateLShr(Builder.CreateShl(Pair, Amt), 32);
Value *Trunc = Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()));
return Builder.CreateBitCast(Trunc, Hi->getType());
Value *Shift =
Builder.CreateLShr(Builder.CreateShl(Pair, Amt, "shl"), 32, "lsr");
Value *Trunc =
Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
return Builder.CreateBitCast(Trunc, Hi->getType(), "cst");
}
if (VecLen == 8) {
Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt);
Value *Sub = Builder.CreateSub(getConstInt(VecLen), Amt, "sub");
return vralignb(Builder, Lo, Hi, Sub);
}
llvm_unreachable("Unexpected vector length");
@@ -1951,18 +2026,19 @@ auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,

if (VecLen == 4) {
Value *Pair = concat(Builder, {Lo, Hi});
Value *Shift = Builder.CreateLShr(Pair, Amt);
Value *Trunc = Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()));
return Builder.CreateBitCast(Trunc, Lo->getType());
Value *Shift = Builder.CreateLShr(Pair, Amt, "lsr");
Value *Trunc =
Builder.CreateTrunc(Shift, Type::getInt32Ty(F.getContext()), "trn");
return Builder.CreateBitCast(Trunc, Lo->getType(), "cst");
}
if (VecLen == 8) {
Type *Int64Ty = Type::getInt64Ty(F.getContext());
Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty);
Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty);
Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst");
Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst");
Function *FI = Intrinsic::getDeclaration(F.getParent(),
Intrinsic::hexagon_S2_valignrb);
Value *Call = Builder.CreateCall(FI, {Hi64, Lo64, Amt});
return Builder.CreateBitCast(Call, Lo->getType());
Value *Call = Builder.CreateCall(FI, {Hi64, Lo64, Amt}, "cup");
return Builder.CreateBitCast(Call, Lo->getType(), "cst");
}
llvm_unreachable("Unexpected vector length");
}
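Illustration (not part of the patch): both scalar paths of vralignb treat {Lo, Hi} as one double-width value and shift right by the alignment amount. A host-side model of the 4-byte case, assuming little-endian byte order and a shift amount given in bytes (ralign4 is a hypothetical name):

    #include <cassert>
    #include <cstdint>

    // Byte AmtBytes of the concatenated pair becomes byte 0 of the result.
    uint32_t ralign4(uint32_t Lo, uint32_t Hi, unsigned AmtBytes) {
      assert(AmtBytes < 8 && "amount must stay within the pair");
      uint64_t Pair = (static_cast<uint64_t>(Hi) << 32) | Lo;
      return static_cast<uint32_t>(Pair >> (8 * AmtBytes));
    }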
@@ -1985,8 +2061,8 @@ auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
if (Work[ThisW].size() % 2 != 0)
Work[ThisW].push_back(UndefValue::get(Ty));
for (int i = 0, e = Work[ThisW].size(); i < e; i += 2) {
Value *Joined = Builder.CreateShuffleVector(Work[ThisW][i],
Work[ThisW][i + 1], SMask);
Value *Joined = Builder.CreateShuffleVector(
Work[ThisW][i], Work[ThisW][i + 1], SMask, "shf");
Work[OtherW].push_back(Joined);
}
std::swap(ThisW, OtherW);
Expand All @@ -1998,7 +2074,7 @@ auto HexagonVectorCombine::concat(IRBuilderBase &Builder,
SMask.resize(Vecs.size() * length(Vecs.front()->getType()));
std::iota(SMask.begin(), SMask.end(), 0);
Value *Total = Work[ThisW].front();
return Builder.CreateShuffleVector(Total, SMask);
return Builder.CreateShuffleVector(Total, SMask, "shf");
}
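Illustration (not part of the patch): concat joins its inputs pairwise in rounds, padding odd counts with undef, so N inputs take roughly log2(N) shuffle rounds. The shape of the loop, modeled on strings (concatTree is a hypothetical name; the empty string stands in for undef):

    #include <cstddef>
    #include <string>
    #include <utility>
    #include <vector>

    // Assumes a non-empty input, as the pass does.
    std::string concatTree(std::vector<std::string> Work) {
      while (Work.size() > 1) {
        if (Work.size() % 2 != 0)
          Work.push_back({}); // pad an odd count
        std::vector<std::string> Next;
        for (std::size_t I = 0; I + 1 < Work.size(); I += 2)
          Next.push_back(Work[I] + Work[I + 1]);
        Work = std::move(Next);
      }
      return Work.front();
    }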

auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
@@ -2017,8 +2093,8 @@ auto HexagonVectorCombine::vresize(IRBuilderBase &Builder, Value *Val,
SmallVector<int, 128> SMask(NewSize);
std::iota(SMask.begin(), SMask.begin() + CurSize, 0);
std::fill(SMask.begin() + CurSize, SMask.end(), CurSize);
Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad);
return Builder.CreateShuffleVector(Val, PadVec, SMask);
Value *PadVec = Builder.CreateVectorSplat(CurSize, Pad, "spt");
return Builder.CreateShuffleVector(Val, PadVec, SMask, "shf");
}

auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
@@ -2048,11 +2124,11 @@ auto HexagonVectorCombine::rescale(IRBuilderBase &Builder, Value *Mask,
// Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
// -> trunc to <M x i1>.
Value *Ext = Builder.CreateSExt(
Mask, VectorType::get(FromITy, FromCount, /*Scalable=*/false));
Mask, VectorType::get(FromITy, FromCount, /*Scalable=*/false), "sxt");
Value *Cast = Builder.CreateBitCast(
Ext, VectorType::get(ToITy, ToCount, /*Scalable=*/false));
Ext, VectorType::get(ToITy, ToCount, /*Scalable=*/false), "cst");
return Builder.CreateTrunc(
Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable=*/false));
Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable=*/false), "trn");
}
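Illustration (not part of the patch): the sext / bitcast / trunc chain above works because every sign-extended lane is uniformly all-ones or all-zeros, so reinterpreting the same bits at another lane width still yields uniform lanes. A model of the widening direction only, assuming FromWidth is a multiple of ToWidth (rescaleMaskWiden is a hypothetical name):

    #include <vector>

    std::vector<bool> rescaleMaskWiden(const std::vector<bool> &Mask,
                                       int FromWidth, int ToWidth) {
      std::vector<bool> Out;
      Out.reserve(Mask.size() * (FromWidth / ToWidth));
      for (bool B : Mask)
        for (int I = 0; I != FromWidth / ToWidth; ++I)
          Out.push_back(B); // each input bit fans out to FromWidth/ToWidth bits
      return Out;
    }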

// Bitcast to bytes, and return least significant bits.
@@ -2064,10 +2140,10 @@ auto HexagonVectorCombine::vlsb(IRBuilderBase &Builder, Value *Val) const

Value *Bytes = vbytes(Builder, Val);
if (auto *VecTy = dyn_cast<VectorType>(Bytes->getType()))
return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)));
return Builder.CreateTrunc(Bytes, getBoolTy(getSizeOf(VecTy)), "trn");
// If Bytes is a scalar (i.e. Val was a scalar byte), return i1, not
// <1 x i1>.
return Builder.CreateTrunc(Bytes, getBoolTy());
return Builder.CreateTrunc(Bytes, getBoolTy(), "trn");
}

// Bitcast to bytes for non-bool. For bool, convert i1 -> i8.
@@ -2078,11 +2154,11 @@ auto HexagonVectorCombine::vbytes(IRBuilderBase &Builder, Value *Val) const
return Val;

if (ScalarTy != getBoolTy())
return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)));
return Builder.CreateBitCast(Val, getByteTy(getSizeOf(Val)), "cst");
// For bool, return a sext from i1 to i8.
if (auto *VecTy = dyn_cast<VectorType>(Val->getType()))
return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy));
return Builder.CreateSExt(Val, getByteTy());
return Builder.CreateSExt(Val, VectorType::get(getByteTy(), VecTy), "sxt");
return Builder.CreateSExt(Val, getByteTy(), "sxt");
}

auto HexagonVectorCombine::subvector(IRBuilderBase &Builder, Value *Val,
@@ -2116,7 +2192,7 @@ auto HexagonVectorCombine::vdeal(IRBuilderBase &Builder, Value *Val0,
Mask[i] = 2 * i; // Even
Mask[i + Len] = 2 * i + 1; // Odd
}
return Builder.CreateShuffleVector(Val0, Val1, Mask);
return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
}

auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
@@ -2129,7 +2205,7 @@ auto HexagonVectorCombine::vshuff(IRBuilderBase &Builder, Value *Val0,
Mask[2 * i + 0] = i; // Val0
Mask[2 * i + 1] = i + Len; // Val1
}
return Builder.CreateShuffleVector(Val0, Val1, Mask);
return Builder.CreateShuffleVector(Val0, Val1, Mask, "shf");
}
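Illustration (not part of the patch): vdeal and vshuff build mutually inverse permutations; deal separates even and odd lanes, shuff interleaves the two inputs. The masks, constructed standalone (hypothetical helper names):

    #include <vector>

    std::vector<int> dealMask(int Len) { // even lanes first, then odd lanes
      std::vector<int> M(2 * Len);
      for (int I = 0; I != Len; ++I) {
        M[I] = 2 * I;
        M[I + Len] = 2 * I + 1;
      }
      return M;
    }

    std::vector<int> shuffMask(int Len) { // interleave lanes of Val0 and Val1
      std::vector<int> M(2 * Len);
      for (int I = 0; I != Len; ++I) {
        M[2 * I + 0] = I;
        M[2 * I + 1] = I + Len;
      }
      return M;
    }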

auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
@@ -2149,15 +2225,15 @@ auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,

Type *BoolTy = Type::getInt1Ty(F.getContext());
if (cast<VectorType>(SrcTy)->getElementType() != BoolTy)
return Builder.CreateBitCast(Val, DestTy);
return Builder.CreateBitCast(Val, DestTy, "cst");

// Predicate HVX vector.
unsigned HwLen = HST.getVectorLength();
Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
: Intrinsic::hexagon_V6_pred_typecast_128B;
Function *FI =
Intrinsic::getDeclaration(F.getParent(), TC, {DestTy, Val->getType()});
return Builder.CreateCall(FI, {Val});
return Builder.CreateCall(FI, {Val}, "cup");
};

Function *IntrFn = Intrinsic::getDeclaration(F.getParent(), IntID, ArgTys);
@@ -2173,7 +2249,7 @@ auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
IntrArgs.push_back(A);
}
}
Value *Call = Builder.CreateCall(IntrFn, IntrArgs);
Value *Call = Builder.CreateCall(IntrFn, IntrArgs, "cup");

Type *CallTy = Call->getType();
if (RetTy == nullptr || CallTy == RetTy)
@@ -2223,7 +2299,7 @@ auto HexagonVectorCombine::splitVectorElements(IRBuilderBase &Builder,
unsigned Width = Val->getType()->getScalarSizeInBits();

auto *VTy = VectorType::get(getIntTy(Width / 2), 2 * Length, false);
Value *VVal = Builder.CreateBitCast(Val, VTy);
Value *VVal = Builder.CreateBitCast(Val, VTy, "cst");

Value *Res = vdeal(Builder, sublo(Builder, VVal), subhi(Builder, VVal));

@@ -2265,8 +2341,8 @@ auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
// Having too many inputs is ok: drop the high bits (usual wrap-around).
// If there are too few, fill them with the sign bit.
Value *Last = Inputs.back();
Value *Sign =
Builder.CreateAShr(Last, getConstSplat(Last->getType(), Width - 1));
Value *Sign = Builder.CreateAShr(
Last, getConstSplat(Last->getType(), Width - 1), "asr");
Inputs.resize(NeedInputs, Sign);
}

Expand All @@ -2275,7 +2351,7 @@ auto HexagonVectorCombine::joinVectorElements(IRBuilderBase &Builder,
auto *VTy = VectorType::get(getIntTy(Width), Length, false);
for (int i = 0, e = Inputs.size(); i < e; i += 2) {
Value *Res = vshuff(Builder, Inputs[i], Inputs[i + 1]);
Inputs[i / 2] = Builder.CreateBitCast(Res, VTy);
Inputs[i / 2] = Builder.CreateBitCast(Res, VTy, "cst");
}
Inputs.resize(Inputs.size() / 2);
}
@@ -2468,7 +2544,7 @@ auto HexagonVectorCombine::getElementRange(IRBuilderBase &Builder, Value *Lo,
assert(0 <= Start && size_t(Start + Length) < length(Lo) + length(Hi));
SmallVector<int, 128> SMask(Length);
std::iota(SMask.begin(), SMask.end(), Start);
return Builder.CreateShuffleVector(Lo, Hi, SMask);
return Builder.CreateShuffleVector(Lo, Hi, SMask, "shf");
}

// Pass management.
65 changes: 65 additions & 0 deletions llvm/test/CodeGen/Hexagon/autohvx/vector-align-order.ll
@@ -0,0 +1,65 @@
; RUN: llc -march=hexagon < %s | FileCheck %s

; Check that this doesn't crash.
; CHECK: vmem

target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
target triple = "hexagon"

@g0 = external dllexport local_unnamed_addr global ptr, align 4
@g1 = external dllexport local_unnamed_addr global ptr, align 4

define dso_local fastcc void @f0(ptr %a0, i32 %a1) unnamed_addr #0 {
b0:
%v0 = load ptr, ptr @g1, align 4
%v1 = tail call ptr %v0(i32 16, i32 %a1, i64 2400, i32 0, i32 32)
br i1 poison, label %b2, label %b1

b1: ; preds = %b0
%v2 = load <64 x i8>, ptr poison, align 64
%v3 = zext <64 x i8> %v2 to <64 x i32>
%v4 = load <128 x i8>, ptr poison, align 64
%v5 = zext <128 x i8> %v4 to <128 x i32>
%v6 = load <128 x i8>, ptr poison, align 64
%v7 = zext <128 x i8> %v6 to <128 x i32>
%v8 = getelementptr i8, ptr %a0, i32 576
%v9 = getelementptr i8, ptr %v8, i32 -64
%v10 = load <64 x i8>, ptr %v9, align 64
%v11 = sext <64 x i8> %v10 to <64 x i32>
%v12 = add nsw <64 x i32> %v11, <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>
%v13 = mul nuw nsw <64 x i32> %v12, %v3
%v14 = extractelement <64 x i32> %v13, i32 0
store i32 %v14, ptr %v1, align 8, !tbaa !0
%v15 = getelementptr i8, ptr %v8, i32 384
%v16 = load <128 x i8>, ptr %v15, align 64, !tbaa !3
%v17 = sext <128 x i8> %v16 to <128 x i32>
%v18 = add nsw <128 x i32> %v17, <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>
%v19 = mul nuw nsw <128 x i32> %v18, %v5
%v20 = getelementptr i8, ptr %v8, i32 256
%v21 = load <128 x i8>, ptr %v20, align 64, !tbaa !3
%v22 = sext <128 x i8> %v21 to <128 x i32>
%v23 = add nsw <128 x i32> %v22, <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>
%v24 = mul nuw nsw <128 x i32> %v23, %v7
%v25 = add nsw <128 x i32> %v19, %v24
%v26 = extractelement <128 x i32> %v25, i32 0
%v27 = insertelement <64 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %v26, i64 0
%v28 = extractelement <64 x i32> %v27, i32 0
%v29 = getelementptr i8, ptr %v1, i32 4
store i32 %v28, ptr %v29, align 4
%v30 = load ptr, ptr @g0, align 4
%v31 = call i32 %v30(ptr nonnull @f1, ptr nonnull poison, i32 0)
unreachable

b2: ; preds = %b0
ret void
}

declare dso_local i32 @f1(i32, ptr, ptr) #0

attributes #0 = { "target-features"="+hvxv68,+hvx-length128b,+hvx-qfloat,-hvx-ieee-fp" }

!0 = !{!1, !1, i64 0}
!1 = !{!"0x555e886c7bc0", !2, i64 0}
!2 = !{!"tvm-tbaa"}
!3 = !{!4, !4, i64 0}
!4 = !{!"0x555e88cb2ce0", !2, i64 0}