diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 1cfcd3ffbd664..c2158ed7620ca 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1916,6 +1916,19 @@ class BoUpSLP { class ShuffleCostEstimator; class ShuffleInstructionBuilder; + /// If we decide to generate a strided load / store, this struct contains all + /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate + /// and analyzeConstantStrideCandidate. Note that the stride can be given + /// either as a SCEV or as a Value if it already exists. To get the stride in + /// bytes, StrideVal (or the value obtained from StrideSCEV) has to be + /// multiplied by the size of an element of FixedVectorType. + struct StridedPtrInfo { + Value *StrideVal = nullptr; + const SCEV *StrideSCEV = nullptr; + FixedVectorType *Ty = nullptr; + }; + SmallDenseMap<const TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap; + public: /// Tracks the state we can represent the loads in the given sequence. enum class LoadsState { @@ -2076,6 +2089,7 @@ class BoUpSLP { UserIgnoreList = nullptr; PostponedGathers.clear(); ValueToGatherNodes.clear(); + TreeEntryToStridedPtrInfoMap.clear(); } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -2212,6 +2226,35 @@ class BoUpSLP { /// may not be necessary. bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const; + /// Suppose we are given pointers of the form: %b + x * %s + y * %c + /// where %c is constant. Check if the pointers can be rearranged as follows: + /// %b + 0 * %s + 0 + /// %b + 0 * %s + 1 + /// %b + 0 * %s + 2 + /// ... + /// %b + 0 * %s + w + /// + /// %b + 1 * %s + 0 + /// %b + 1 * %s + 1 + /// %b + 1 * %s + 2 + /// ... + /// %b + 1 * %s + w + /// ... + /// + /// If the pointers can be rearranged in the above pattern, it means that the + /// memory can be accessed with a strided load of width `w` and stride `%s`. + bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ElemTy, + Align CommonAlignment, + SmallVectorImpl<unsigned> &SortedIndices, + StridedPtrInfo &SPtrInfo) const; + + /// Same as analyzeRtStrideCandidate, but for constant strides. + bool analyzeConstantStrideCandidate(ArrayRef<Value *> PointerOps, + Type *ElemTy, Align CommonAlignment, + SmallVectorImpl<unsigned> &SortedIndices, + StridedPtrInfo &SPtrInfo, int64_t Diff, + Value *Ptr0, Value *PtrN) const; + /// Checks if the given array of loads can be represented as a vectorized, /// scatter or just simple gather. /// \param VL list of loads. @@ -2225,6 +2268,7 @@ class BoUpSLP { LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order, SmallVectorImpl<Value *> &PointerOps, + StridedPtrInfo &SPtrInfo, unsigned *BestVF = nullptr, bool TryRecursiveCheck = true) const; @@ -4469,11 +4513,10 @@ class BoUpSLP { /// Checks if the specified list of the instructions/values can be vectorized /// and fills required data before actual scheduling of the instructions. - TreeEntry::EntryState - getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL, - bool IsScatterVectorizeUserTE, - OrdersType &CurrentOrder, - SmallVectorImpl<Value *> &PointerOps); + TreeEntry::EntryState getScalarsVectorizationState( + const InstructionsState &S, ArrayRef<Value *> VL, + bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, + SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo); /// Maps a specific scalar to its tree entry(ies).
SmallDenseMap> ScalarToTreeEntries; @@ -6343,7 +6386,8 @@ static bool isReverseOrder(ArrayRef Order) { /// Otherwise, SCEV* of the stride value is returned. static const SCEV *calculateRtStride(ArrayRef PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, - SmallVectorImpl &SortedIndices) { + SmallVectorImpl &SortedIndices, + SmallVectorImpl &Coeffs) { SmallVector SCEVs; const SCEV *PtrSCEVLowest = nullptr; const SCEV *PtrSCEVHighest = nullptr; @@ -6418,11 +6462,14 @@ static const SCEV *calculateRtStride(ArrayRef PointerOps, Type *ElemTy, const auto *SC = dyn_cast(Coeff); if (!SC || isa(SC)) return nullptr; + Coeffs.push_back((int64_t)SC->getAPInt().getLimitedValue()); if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest, SE.getMulExpr(Stride, SC))) ->isZero()) return nullptr; Dist = SC->getAPInt().getZExtValue(); + } else { + Coeffs.push_back(0); } // If the strides are not the same or repeated, we can't vectorize. if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size()) @@ -6437,18 +6484,153 @@ static const SCEV *calculateRtStride(ArrayRef PointerOps, Type *ElemTy, if (Offsets.size() != SCEVs.size()) return nullptr; SortedIndices.clear(); - if (!IsConsecutive) { - // Fill SortedIndices array only if it is non-consecutive. - SortedIndices.resize(PointerOps.size()); - Cnt = 0; - for (const std::pair &Pair : Offsets) { - SortedIndices[Cnt] = Pair.second; - ++Cnt; - } + SortedIndices.resize(PointerOps.size()); + Cnt = 0; + for (const std::pair &Pair : Offsets) { + SortedIndices[Cnt] = Pair.second; + ++Cnt; } return Stride; } +/// Suppose we are given pointers of the form: %b + x * %s + y * %c +/// where %c is constant. Check if the pointers can be rearranged as follows: +/// %b + 0 * %s + 0 +/// %b + 0 * %s + 1 +/// %b + 0 * %s + 2 +/// ... +/// %b + 0 * %s + w +/// +/// %b + 1 * %s + 0 +/// %b + 1 * %s + 1 +/// %b + 1 * %s + 2 +/// ... +/// %b + 1 * %s + w +/// ... +/// +/// If the pointers can be rearanged in the above pattern, it means that the +/// memory can be accessed with a strided loads of width `w` and stride `%s`. +bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef PointerOps, + Type *ElemTy, Align CommonAlignment, + SmallVectorImpl &SortedIndices, + StridedPtrInfo &SPtrInfo) const { + // Group the pointers by constant offset. 
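+  // E.g., for pointers of the form %b + x * %s + y (y constant), all pointers
+  // with the same constant y land in one group; each group must later be found
+  // to have the same runtime stride %s and the same set of x coefficients.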
+ SmallDenseMap, SmallVector>> + OffsetToPointerOpIdxMap; + for (auto [Idx, Ptr] : enumerate(PointerOps)) { + const SCEV *PtrSCEV = SE->getSCEV(Ptr); + if (!PtrSCEV) + return false; + + const auto *Add = dyn_cast(PtrSCEV); + int64_t Offset = 0; + if (Add) { + for (int I : seq(Add->getNumOperands())) { + const auto *SC = dyn_cast(Add->getOperand(I)); + if (!SC) + continue; + Offset = SC->getAPInt().getSExtValue(); + break; + } + } + OffsetToPointerOpIdxMap[Offset].first.push_back(Ptr); + OffsetToPointerOpIdxMap[Offset].second.push_back(Idx); + } + int NumOffsets = OffsetToPointerOpIdxMap.size(); + + const unsigned Sz = PointerOps.size(); + unsigned VecSz = Sz; + Type *ScalarTy = ElemTy; + if (NumOffsets > 1) { + if (Sz % NumOffsets != 0) + return false; + VecSz = Sz / NumOffsets; + ScalarTy = Type::getIntNTy(SE->getContext(), + DL->getTypeSizeInBits(ElemTy).getFixedValue() * + NumOffsets); + } + FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz); + if (!TTI->isTypeLegal(StridedLoadTy) || + !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment)) + return false; + + SmallVector SortedOffsetsV; + for (auto [K, _] : OffsetToPointerOpIdxMap) + SortedOffsetsV.push_back(K); + sort(SortedOffsetsV); + if (NumOffsets > 1) { + int64_t CommonDiff = SortedOffsetsV[1] - SortedOffsetsV[0]; + if (CommonDiff != 1) + return false; + for (int I : seq(1, SortedOffsetsV.size() - 1)) { + if (SortedOffsetsV[I + 1] - SortedOffsetsV[I] != CommonDiff) + return false; + } + } + + int64_t LowestOffset = SortedOffsetsV[0]; + SmallVector &PointerOps0 = + OffsetToPointerOpIdxMap[LowestOffset].first; + SmallVector &IndicesInAllPointerOps0 = + OffsetToPointerOpIdxMap[LowestOffset].second; + + SmallVector Coeffs0; + SmallVector SortedIndicesForOffset0; + const SCEV *Stride0 = calculateRtStride(PointerOps0, ElemTy, *DL, *SE, + SortedIndicesForOffset0, Coeffs0); + if (!Stride0) + return false; + unsigned NumCoeffs0 = Coeffs0.size(); + if (NumCoeffs0 * NumOffsets != Sz) + return false; + sort(Coeffs0); + + SmallVector SortedIndicesDraft; + SortedIndicesDraft.resize(Sz); + auto UpdateSortedIndices = + [&](SmallVectorImpl &SortedIndicesForOffset, + SmallVectorImpl &IndicesInAllPointerOps, + int64_t OffsetNum) { + for (const auto [Num, Idx] : enumerate(SortedIndicesForOffset)) { + SortedIndicesDraft[Num * NumOffsets + OffsetNum] = + IndicesInAllPointerOps[Idx]; + } + }; + + UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0); + + SmallVector Coeffs; + SmallVector SortedIndicesForOffset; + for (int I : seq(1, NumOffsets)) { + Coeffs.clear(); + SortedIndicesForOffset.clear(); + + int64_t Offset = SortedOffsetsV[I]; + SmallVector &PointerOpsForOffset = + OffsetToPointerOpIdxMap[Offset].first; + SmallVector &IndicesInAllPointerOps = + OffsetToPointerOpIdxMap[Offset].second; + const SCEV *StrideWithinGroup = calculateRtStride( + PointerOpsForOffset, ElemTy, *DL, *SE, SortedIndicesForOffset, Coeffs); + + if (!StrideWithinGroup || StrideWithinGroup != Stride0) + return false; + if (Coeffs.size() != NumCoeffs0) + return false; + sort(Coeffs); + if (Coeffs != Coeffs0) + return false; + + UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, I); + } + + SortedIndices.clear(); + SortedIndices = SortedIndicesDraft; + SPtrInfo.StrideSCEV = Stride0; + SPtrInfo.Ty = StridedLoadTy; + return true; +} + static std::pair getGEPCosts(const TargetTransformInfo &TTI, ArrayRef Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, @@ -6776,77 +6958,132 @@ 
isMaskedLoadCompress(ArrayRef VL, ArrayRef PointerOps, CompressMask, LoadVecTy); } -/// Checks if strided loads can be generated out of \p VL loads with pointers \p -/// PointerOps: -/// 1. Target with strided load support is detected. -/// 2. The number of loads is greater than MinProfitableStridedLoads, or the -/// potential stride <= MaxProfitableLoadStride and the potential stride is -/// power-of-2 (to avoid perf regressions for the very small number of loads) -/// and max distance > number of loads, or potential stride is -1. -/// 3. The loads are ordered, or number of unordered loads <= -/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is -/// to avoid extra costs for very expensive shuffles). -/// 4. Any pointer operand is an instruction with the users outside of the -/// current graph (for masked gathers extra extractelement instructions -/// might be required). -static bool isStridedLoad(ArrayRef VL, ArrayRef PointerOps, - ArrayRef Order, - const TargetTransformInfo &TTI, const DataLayout &DL, - ScalarEvolution &SE, - const bool IsAnyPointerUsedOutGraph, - const int64_t Diff) { - const size_t Sz = VL.size(); - const uint64_t AbsoluteDiff = std::abs(Diff); - Type *ScalarTy = VL.front()->getType(); - auto *VecTy = getWidenedType(ScalarTy, Sz); - if (IsAnyPointerUsedOutGraph || - (AbsoluteDiff > Sz && - (Sz > MinProfitableStridedLoads || - (AbsoluteDiff <= MaxProfitableLoadStride * Sz && - AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) || - Diff == -(static_cast(Sz) - 1)) { - int64_t Stride = Diff / static_cast(Sz - 1); - if (Diff != Stride * static_cast(Sz - 1)) +/// Same as analyzeRtStrideCandidate, but for constant strides. +bool BoUpSLP::analyzeConstantStrideCandidate( + ArrayRef PointerOps, Type *ElemTy, Align CommonAlignment, + SmallVectorImpl &SortedIndices, StridedPtrInfo &SPtrInfo, + int64_t Diff, Value *Ptr0, Value *PtrN) const { + const unsigned Sz = PointerOps.size(); + SmallVector SortedOffsetsFromBase; + SortedOffsetsFromBase.resize(Sz); + for (unsigned I : seq(Sz)) { + Value *Ptr = + SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]]; + SortedOffsetsFromBase[I] = + *getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, *DL, *SE); + } + + // Find where the first group ends. + assert(SortedOffsetsFromBase.size() > 1 && + "Trying to generate strided load for less than 2 loads"); + int64_t StrideWithinGroup = + SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0]; + unsigned GroupSize = 1; + for (; GroupSize != SortedOffsetsFromBase.size(); ++GroupSize) { + if (SortedOffsetsFromBase[GroupSize] - + SortedOffsetsFromBase[GroupSize - 1] != + StrideWithinGroup) + break; + } + unsigned VecSz = Sz; + Type *ScalarTy = ElemTy; + int64_t StrideIntVal = StrideWithinGroup; + FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy, VecSz); + + if (Sz != GroupSize) { + if (Sz % GroupSize != 0) return false; - Align Alignment = - cast(Order.empty() ? VL.front() : VL[Order.front()]) - ->getAlign(); - if (!TTI.isLegalStridedLoadStore(VecTy, Alignment)) + VecSz = Sz / GroupSize; + + if (StrideWithinGroup != 1) return false; - Value *Ptr0; - Value *PtrN; - if (Order.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); - } else { - Ptr0 = PointerOps[Order.front()]; - PtrN = PointerOps[Order.back()]; - } - // Iterate through all pointers and check if all distances are - // unique multiple of Dist. 
- SmallSet Dists; - for (Value *Ptr : PointerOps) { - int64_t Dist = 0; - if (Ptr == PtrN) - Dist = Diff; - else if (Ptr != Ptr0) - Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, DL, SE); - // If the strides are not the same or repeated, we can't - // vectorize. - if (((Dist / Stride) * Stride) != Dist || !Dists.insert(Dist).second) + unsigned VecSz = Sz / GroupSize; + ScalarTy = Type::getIntNTy(SE->getContext(), + DL->getTypeSizeInBits(ElemTy).getFixedValue() * + GroupSize); + StridedLoadTy = getWidenedType(ScalarTy, VecSz); + if (!TTI->isTypeLegal(StridedLoadTy) || + !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment)) + return false; + + unsigned PrevGroupStartIdx = 0; + unsigned CurrentGroupStartIdx = GroupSize; + int64_t StrideBetweenGroups = + SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0]; + StrideIntVal = StrideBetweenGroups; + while (CurrentGroupStartIdx != Sz) { + if (SortedOffsetsFromBase[CurrentGroupStartIdx] - + SortedOffsetsFromBase[PrevGroupStartIdx] != + StrideBetweenGroups) break; + PrevGroupStartIdx = CurrentGroupStartIdx; + CurrentGroupStartIdx += GroupSize; + } + if (CurrentGroupStartIdx != Sz) + return false; + + auto CheckGroup = [&](unsigned StartIdx, unsigned GroupSize0, + int64_t StrideWithinGroup) -> bool { + unsigned GroupEndIdx = StartIdx + 1; + for (; GroupEndIdx != Sz; ++GroupEndIdx) { + if (SortedOffsetsFromBase[GroupEndIdx] - + SortedOffsetsFromBase[GroupEndIdx - 1] != + StrideWithinGroup) + break; + } + return GroupEndIdx - StartIdx == GroupSize0; + }; + for (unsigned I = 0; I < Sz; I += GroupSize) { + if (!CheckGroup(I, GroupSize, StrideWithinGroup)) + return false; } - if (Dists.size() == Sz) - return true; + } + + // Try to generate strided load node if: + // 1. Target with strided load support is detected. + // 2. The number of loads is greater than MinProfitableStridedLoads, + // or the potential stride <= MaxProfitableLoadStride and the + // potential stride is power-of-2 (to avoid perf regressions for the very + // small number of loads) and max distance > number of loads, or potential + // stride is -1. + // 3. The loads are ordered, or number of unordered loads <= + // MaxProfitableUnorderedLoads, or loads are in reversed order. + // (this check is to avoid extra costs for very expensive shuffles). + // 4. Any pointer operand is an instruction with the users outside of the + // current graph (for masked gathers extra extractelement instructions + // might be required). + + if (!TTI->isTypeLegal(StridedLoadTy) || + !TTI->isLegalStridedLoadStore(StridedLoadTy, CommonAlignment)) + return false; + + // Simple check if not a strided access - clear order. 
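+  // (Diff is the element distance between the first and the last pointer in
+  // sorted order, so a true stride must make Diff a multiple of VecSz - 1.)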
+ bool IsPossibleStrided = Diff % (VecSz - 1) == 0; + auto IsAnyPointerUsedOutGraph = + IsPossibleStrided && any_of(PointerOps, [&](Value *V) { + return isa(V) && any_of(V->users(), [&](User *U) { + return !isVectorized(U) && !MustGather.contains(U); + }); + }); + const unsigned AbsoluteDiff = std::abs(Diff); + if (IsAnyPointerUsedOutGraph || + ((VecSz > MinProfitableStridedLoads || + (AbsoluteDiff <= MaxProfitableLoadStride * VecSz && + has_single_bit(AbsoluteDiff))) && + AbsoluteDiff > VecSz) || + Diff == -(static_cast(VecSz) - 1)) { + Type *StrideTy = DL->getIndexType(Ptr0->getType()); + SPtrInfo.StrideVal = ConstantInt::get(StrideTy, StrideIntVal); + SPtrInfo.Ty = StridedLoadTy; + return true; } return false; } -BoUpSLP::LoadsState -BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, - SmallVectorImpl &Order, - SmallVectorImpl &PointerOps, - unsigned *BestVF, bool TryRecursiveCheck) const { +BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( + ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, + SmallVectorImpl &PointerOps, StridedPtrInfo &SPtrInfo, + unsigned *BestVF, bool TryRecursiveCheck) const { // Check that a vectorized load would load the same memory as a scalar // load. For example, we don't want to vectorize loads that are smaller // than 8-bit. Even though we have a packed struct {} LLVM @@ -6883,11 +7120,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, auto *VecTy = getWidenedType(ScalarTy, Sz); Align CommonAlignment = computeCommonAlignment(VL); if (!IsSorted) { - if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) { - if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) && - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order)) - return LoadsState::StridedVectorize; - } + if (Sz > MinProfitableStridedLoads && + analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, Order, + SPtrInfo)) + return LoadsState::StridedVectorize; if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) @@ -6920,17 +7156,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, })) return LoadsState::CompressVectorize; // Simple check if not a strided access - clear order. - bool IsPossibleStrided = *Diff % (Sz - 1) == 0; - // Try to generate strided load node. - auto IsAnyPointerUsedOutGraph = - IsPossibleStrided && any_of(PointerOps, [&](Value *V) { - return isa(V) && any_of(V->users(), [&](User *U) { - return !isVectorized(U) && !MustGather.contains(U); - }); - }); - if (IsPossibleStrided && - isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE, - IsAnyPointerUsedOutGraph, *Diff)) + if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, CommonAlignment, + Order, SPtrInfo, *Diff, Ptr0, PtrN)) return LoadsState::StridedVectorize; } if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || @@ -7014,9 +7241,9 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, ArrayRef Slice = VL.slice(Cnt, VF); SmallVector Order; SmallVector PointerOps; - LoadsState LS = - canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF, - /*TryRecursiveCheck=*/false); + LoadsState LS = canVectorizeLoads(Slice, Slice.front(), Order, + PointerOps, SPtrInfo, BestVF, + /*TryRecursiveCheck=*/false); // Check that the sorted loads are consecutive. 
if (LS == LoadsState::Gather) { if (BestVF) { @@ -7689,8 +7916,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, if (TE.hasState() && TE.getOpcode() == Instruction::Load) { SmallVector PointerOps; OrdersType CurrentOrder; + StridedPtrInfo SPtrInfo; LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), - CurrentOrder, PointerOps); + CurrentOrder, PointerOps, SPtrInfo); if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize || Res == LoadsState::CompressVectorize) return std::move(CurrentOrder); @@ -9193,8 +9421,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads( // Try to build vector load. ArrayRef Values( reinterpret_cast(Slice.begin()), Slice.size()); + StridedPtrInfo SPtrInfo; LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder, - PointerOps, &BestVF); + PointerOps, SPtrInfo, &BestVF); if (LS != LoadsState::Gather || (BestVF > 1 && static_cast(NumElts) == 2 * BestVF)) { if (LS == LoadsState::ScatterVectorize) { @@ -9388,6 +9617,7 @@ void BoUpSLP::tryToVectorizeGatheredLoads( unsigned VF = *CommonVF; OrdersType Order; SmallVector PointerOps; + StridedPtrInfo SPtrInfo; // Segmented load detected - vectorize at maximum vector factor. if (InterleaveFactor <= Slice.size() && TTI.isLegalInterleavedAccessType( @@ -9396,8 +9626,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads( cast(Slice.front())->getAlign(), cast(Slice.front()) ->getPointerAddressSpace()) && - canVectorizeLoads(Slice, Slice.front(), Order, - PointerOps) == LoadsState::Vectorize) { + canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, + SPtrInfo) == LoadsState::Vectorize) { UserMaxVF = InterleaveFactor * VF; } else { InterleaveFactor = 0; @@ -9419,8 +9649,9 @@ void BoUpSLP::tryToVectorizeGatheredLoads( ArrayRef VL = TE.Scalars; OrdersType Order; SmallVector PointerOps; + StridedPtrInfo SPtrInfo; LoadsState State = canVectorizeLoads( - VL, VL.front(), Order, PointerOps); + VL, VL.front(), Order, PointerOps, SPtrInfo); if (State == LoadsState::ScatterVectorize || State == LoadsState::CompressVectorize) return false; @@ -9438,11 +9669,11 @@ void BoUpSLP::tryToVectorizeGatheredLoads( [&, Slice = Slice](unsigned Idx) { OrdersType Order; SmallVector PointerOps; + StridedPtrInfo SPtrInfo; return canVectorizeLoads( Slice.slice(Idx * UserMaxVF, UserMaxVF), - Slice[Idx * UserMaxVF], Order, - PointerOps) == - LoadsState::ScatterVectorize; + Slice[Idx * UserMaxVF], Order, PointerOps, + SPtrInfo) == LoadsState::ScatterVectorize; })) UserMaxVF = MaxVF; if (Slice.size() != ConsecutiveNodesSize) @@ -9799,7 +10030,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( const InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, - SmallVectorImpl &PointerOps) { + SmallVectorImpl &PointerOps, StridedPtrInfo &SPtrInfo) { assert(S.getMainOp() && "Expected instructions with same/alternate opcodes only."); @@ -9901,7 +10132,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( }); }); }; - switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) { + switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps, SPtrInfo)) { case LoadsState::Vectorize: return TreeEntry::Vectorize; case LoadsState::CompressVectorize: @@ -11374,8 +11605,9 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; OrdersType CurrentOrder; SmallVector PointerOps; + StridedPtrInfo SPtrInfo; 
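+      // SPtrInfo is populated by getScalarsVectorizationState (via
+      // canVectorizeLoads) when a strided access pattern is detected; it is
+      // recorded in TreeEntryToStridedPtrInfoMap for the StridedVectorize
+      // tree entry created below.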
TreeEntry::EntryState State = getScalarsVectorizationState( - S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps); + S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo); if (State == TreeEntry::NeedToGather) { newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); return; @@ -11535,6 +11767,7 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, // Vectorizing non-consecutive loads with `llvm.masked.gather`. TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S, UserTreeIdx, ReuseShuffleIndices, CurrentOrder); + TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo; LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n"; TE->dump()); break; @@ -12923,8 +13156,9 @@ void BoUpSLP::transformNodes() { if (S.getOpcode() == Instruction::Load) { OrdersType Order; SmallVector PointerOps; - LoadsState Res = - canVectorizeLoads(Slice, Slice.front(), Order, PointerOps); + StridedPtrInfo SPtrInfo; + LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order, + PointerOps, SPtrInfo); AllStrided &= Res == LoadsState::StridedVectorize || Res == LoadsState::ScatterVectorize || Res == LoadsState::Gather; @@ -13030,10 +13264,18 @@ void BoUpSLP::transformNodes() { InstructionCost StridedCost = TTI->getStridedMemoryOpCost( Instruction::Load, VecTy, BaseLI->getPointerOperand(), /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI); - if (StridedCost < OriginalVecCost) + if (StridedCost < OriginalVecCost) { // Strided load is more profitable than consecutive load + reverse - // transform the node to strided load. + Type *StrideTy = DL->getIndexType(cast(E.Scalars.front()) + ->getPointerOperand() + ->getType()); + StridedPtrInfo SPtrInfo; + SPtrInfo.StrideVal = ConstantInt::get(StrideTy, 1); + SPtrInfo.Ty = VecTy; + TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo; E.State = TreeEntry::StridedVectorize; + } } break; } @@ -14817,11 +15059,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, } break; case TreeEntry::StridedVectorize: { + const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E); + FixedVectorType *StridedLoadTy = SPtrInfo.Ty; + assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry."); Align CommonAlignment = computeCommonAlignment(UniqueValues.getArrayRef()); VecLdCost = TTI->getStridedMemoryOpCost( - Instruction::Load, VecTy, LI0->getPointerOperand(), + Instruction::Load, StridedLoadTy, LI0->getPointerOperand(), /*VariableMask=*/false, CommonAlignment, CostKind); + if (StridedLoadTy != VecTy) + VecLdCost += + TTI->getCastInstrCost(Instruction::BitCast, VecTy, StridedLoadTy, + getCastContextHint(*E), CostKind); + break; } case TreeEntry::CompressVectorize: { @@ -19474,6 +19724,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { LoadInst *LI = cast(VL0); Instruction *NewLI; + FixedVectorType *StridedLoadTy = nullptr; Value *PO = LI->getPointerOperand(); if (E->State == TreeEntry::Vectorize) { NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign()); @@ -19511,43 +19762,36 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *Ptr0 = cast(E->Scalars.front())->getPointerOperand(); Value *PtrN = cast(E->Scalars.back())->getPointerOperand(); PO = IsReverseOrder ? PtrN : Ptr0; - std::optional Diff = getPointersDiff( - VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE); Type *StrideTy = DL->getIndexType(PO->getType()); Value *StrideVal; - if (Diff) { - int64_t Stride = - *Diff / (static_cast(E->Scalars.size()) - 1); - StrideVal = - ConstantInt::get(StrideTy, (IsReverseOrder ? 
-1 : 1) * Stride * - DL->getTypeAllocSize(ScalarTy)); - } else { - SmallVector PointerOps(E->Scalars.size(), nullptr); - transform(E->Scalars, PointerOps.begin(), [](Value *V) { - return cast(V)->getPointerOperand(); - }); - OrdersType Order; - const SCEV *StrideSCEV = - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order); - assert(StrideSCEV && "At this point stride should be known"); + const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(E); + StridedLoadTy = SPtrInfo.Ty; + assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry."); + unsigned StridedLoadEC = + StridedLoadTy->getElementCount().getKnownMinValue(); + + Value *Stride = SPtrInfo.StrideVal; + if (!Stride) { + const SCEV *StrideSCEV = SPtrInfo.StrideSCEV; + assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set."); SCEVExpander Expander(*SE, *DL, "strided-load-vec"); - Value *Stride = Expander.expandCodeFor( - StrideSCEV, StrideSCEV->getType(), &*Builder.GetInsertPoint()); - Value *NewStride = - Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true); - StrideVal = Builder.CreateMul( - NewStride, - ConstantInt::get( - StrideTy, - (IsReverseOrder ? -1 : 1) * - static_cast(DL->getTypeAllocSize(ScalarTy)))); - } + Stride = Expander.expandCodeFor(StrideSCEV, StrideSCEV->getType(), + &*Builder.GetInsertPoint()); + } + Value *NewStride = + Builder.CreateIntCast(Stride, StrideTy, /*isSigned=*/true); + StrideVal = Builder.CreateMul( + NewStride, ConstantInt::get( + StrideTy, (IsReverseOrder ? -1 : 1) * + static_cast( + DL->getTypeAllocSize(ScalarTy)))); Align CommonAlignment = computeCommonAlignment(E->Scalars); auto *Inst = Builder.CreateIntrinsic( Intrinsic::experimental_vp_strided_load, - {VecTy, PO->getType(), StrideTy}, - {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()), - Builder.getInt32(E->Scalars.size())}); + {StridedLoadTy, PO->getType(), StrideTy}, + {PO, StrideVal, + Builder.getAllOnesMask(ElementCount::getFixed(StridedLoadEC)), + Builder.getInt32(StridedLoadEC)}); Inst->addParamAttr( /*ArgNo=*/0, Attribute::getWithAlignment(Inst->getContext(), CommonAlignment)); @@ -19584,6 +19828,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ? 
NewLI : ::propagateMetadata(NewLI, E->Scalars); + if (StridedLoadTy) + V = Builder.CreateBitOrPointerCast(V, VecTy); V = FinalShuffle(V, E); E->VectorizedValue = V; ++NumVectorInstructions; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll index 645dbc49269f0..0135d3c01d9f6 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll @@ -630,25 +630,11 @@ define void @rt_stride_widen_no_reordering(ptr %pl, i64 %stride, ptr %ps) { ; CHECK-LABEL: define void @rt_stride_widen_no_reordering( ; CHECK-SAME: ptr [[PL:%.*]], i64 [[STRIDE:%.*]], ptr [[PS:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[OFFSET0:%.*]] = mul nsw i64 [[STRIDE]], 0 -; CHECK-NEXT: [[OFFSET4:%.*]] = mul nsw i64 [[STRIDE]], 1 -; CHECK-NEXT: [[OFFSET8:%.*]] = mul nsw i64 [[STRIDE]], 2 -; CHECK-NEXT: [[OFFSET12:%.*]] = mul nsw i64 [[STRIDE]], 3 ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET0]] -; CHECK-NEXT: [[GEP_L4:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET4]] -; CHECK-NEXT: [[GEP_L8:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET8]] -; CHECK-NEXT: [[GEP_L12:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 [[OFFSET12]] ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[GEP_L0]], align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_L4]], align 16 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_L8]], align 16 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[GEP_L12]], align 16 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP7]], <16 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[STRIDE]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 16 [[GEP_L0]], i64 [[TMP1]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> ; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[GEP_S0]], align 16 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll new file mode 100644 index 0000000000000..33249a8e66657 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/x264-satd-8x4.ll @@ -0,0 +1,482 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem \ +; RUN: -passes=slp-vectorizer -S < %s | FileCheck %s +; Function Attrs: nounwind uwtable vscale_range(8,1024) +define i32 @x264_pixel_satd_8x4(ptr %pix1, i32 %i_pix1, ptr %pix2, i32 %i_pix2) { +; CHECK-LABEL: define i32 @x264_pixel_satd_8x4( +; CHECK-SAME: ptr [[PIX1:%.*]], i32 [[I_PIX1:%.*]], ptr [[PIX2:%.*]], i32 [[I_PIX2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; 
CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[I_PIX1]] to i64 +; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[I_PIX2]] to i64 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX1]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[PIX2]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[IDX_EXT]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX1]], i64 [[TMP0]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = zext <16 x i8> [[TMP19]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[IDX_EXT63]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[PIX2]], i64 [[TMP4]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8> +; CHECK-NEXT: [[TMP29:%.*]] = zext <16 x i8> [[TMP28]] to <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <16 x i32> [[TMP20]], [[TMP29]] +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[IDX_EXT]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 [[TMP9]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8> +; CHECK-NEXT: [[TMP39:%.*]] = zext <16 x i8> [[TMP38]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[IDX_EXT63]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 [[TMP13]], <4 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP47:%.*]] = bitcast <4 x i32> [[TMP14]] to <16 x i8> +; CHECK-NEXT: [[TMP48:%.*]] = zext <16 x i8> [[TMP47]] to <16 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = sub nsw <16 x i32> [[TMP39]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = shl nsw <16 x i32> [[TMP49]], splat (i32 16) +; CHECK-NEXT: [[TMP51:%.*]] = add nsw <16 x i32> [[TMP50]], [[TMP30]] +; CHECK-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[TMP52]], [[TMP51]] +; CHECK-NEXT: [[TMP54:%.*]] = sub nsw <16 x i32> [[TMP52]], [[TMP51]] +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = add nsw <16 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = sub nsw <16 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = sub nsw <16 x i32> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = add nsw <16 x i32> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP63]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = add nsw <16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = sub nsw <16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <16 x i32> [[TMP65]], <16 x i32> [[TMP66]], <16 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = lshr <16 x i32> [[TMP67]], splat (i32 15) +; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i32> [[TMP68]], splat (i32 65537) +; CHECK-NEXT: 
[[TMP70:%.*]] = mul nuw <16 x i32> [[TMP69]], splat (i32 65535) +; CHECK-NEXT: [[TMP71:%.*]] = add <16 x i32> [[TMP70]], [[TMP67]] +; CHECK-NEXT: [[TMP72:%.*]] = xor <16 x i32> [[TMP71]], [[TMP70]] +; CHECK-NEXT: [[TMP73:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP72]]) +; CHECK-NEXT: [[CONV118:%.*]] = and i32 [[TMP73]], 65535 +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP73]], 16 +; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 [[CONV118]], [[SHR]] +; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 +; CHECK-NEXT: ret i32 [[SHR120]] +; +entry: + %idx.ext = sext i32 %i_pix1 to i64 + %idx.ext63 = sext i32 %i_pix2 to i64 + %0 = load i8, ptr %pix1, align 1 + %conv = zext i8 %0 to i32 + %1 = load i8, ptr %pix2, align 1 + %conv2 = zext i8 %1 to i32 + %sub = sub nsw i32 %conv, %conv2 + %arrayidx3 = getelementptr inbounds nuw i8, ptr %pix1, i64 4 + %2 = load i8, ptr %arrayidx3, align 1 + %conv4 = zext i8 %2 to i32 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %pix2, i64 4 + %3 = load i8, ptr %arrayidx5, align 1 + %conv6 = zext i8 %3 to i32 + %sub7 = sub nsw i32 %conv4, %conv6 + %shl = shl nsw i32 %sub7, 16 + %add = add nsw i32 %shl, %sub + %arrayidx8 = getelementptr inbounds nuw i8, ptr %pix1, i64 1 + %4 = load i8, ptr %arrayidx8, align 1 + %conv9 = zext i8 %4 to i32 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %pix2, i64 1 + %5 = load i8, ptr %arrayidx10, align 1 + %conv11 = zext i8 %5 to i32 + %sub12 = sub nsw i32 %conv9, %conv11 + %arrayidx13 = getelementptr inbounds nuw i8, ptr %pix1, i64 5 + %6 = load i8, ptr %arrayidx13, align 1 + %conv14 = zext i8 %6 to i32 + %arrayidx15 = getelementptr inbounds nuw i8, ptr %pix2, i64 5 + %7 = load i8, ptr %arrayidx15, align 1 + %conv16 = zext i8 %7 to i32 + %sub17 = sub nsw i32 %conv14, %conv16 + %shl18 = shl nsw i32 %sub17, 16 + %add19 = add nsw i32 %shl18, %sub12 + %arrayidx20 = getelementptr inbounds nuw i8, ptr %pix1, i64 2 + %8 = load i8, ptr %arrayidx20, align 1 + %conv21 = zext i8 %8 to i32 + %arrayidx22 = getelementptr inbounds nuw i8, ptr %pix2, i64 2 + %9 = load i8, ptr %arrayidx22, align 1 + %conv23 = zext i8 %9 to i32 + %sub24 = sub nsw i32 %conv21, %conv23 + %arrayidx25 = getelementptr inbounds nuw i8, ptr %pix1, i64 6 + %10 = load i8, ptr %arrayidx25, align 1 + %conv26 = zext i8 %10 to i32 + %arrayidx27 = getelementptr inbounds nuw i8, ptr %pix2, i64 6 + %11 = load i8, ptr %arrayidx27, align 1 + %conv28 = zext i8 %11 to i32 + %sub29 = sub nsw i32 %conv26, %conv28 + %shl30 = shl nsw i32 %sub29, 16 + %add31 = add nsw i32 %shl30, %sub24 + %arrayidx32 = getelementptr inbounds nuw i8, ptr %pix1, i64 3 + %12 = load i8, ptr %arrayidx32, align 1 + %conv33 = zext i8 %12 to i32 + %arrayidx34 = getelementptr inbounds nuw i8, ptr %pix2, i64 3 + %13 = load i8, ptr %arrayidx34, align 1 + %conv35 = zext i8 %13 to i32 + %sub36 = sub nsw i32 %conv33, %conv35 + %arrayidx37 = getelementptr inbounds nuw i8, ptr %pix1, i64 7 + %14 = load i8, ptr %arrayidx37, align 1 + %conv38 = zext i8 %14 to i32 + %arrayidx39 = getelementptr inbounds nuw i8, ptr %pix2, i64 7 + %15 = load i8, ptr %arrayidx39, align 1 + %conv40 = zext i8 %15 to i32 + %sub41 = sub nsw i32 %conv38, %conv40 + %shl42 = shl nsw i32 %sub41, 16 + %add43 = add nsw i32 %shl42, %sub36 + %add44 = add nsw i32 %add19, %add + %sub45 = sub nsw i32 %add, %add19 + %add46 = add nsw i32 %add43, %add31 + %sub47 = sub nsw i32 %add31, %add43 + %add48 = add nsw i32 %add46, %add44 + %sub51 = sub nsw i32 %add44, %add46 + %add55 = add nsw i32 %sub47, %sub45 + %sub59 = sub nsw i32 %sub45, 
%sub47 + %add.ptr = getelementptr inbounds i8, ptr %pix1, i64 %idx.ext + %add.ptr64 = getelementptr inbounds i8, ptr %pix2, i64 %idx.ext63 + %16 = load i8, ptr %add.ptr, align 1 + %conv.1 = zext i8 %16 to i32 + %17 = load i8, ptr %add.ptr64, align 1 + %conv2.1 = zext i8 %17 to i32 + %sub.1 = sub nsw i32 %conv.1, %conv2.1 + %arrayidx3.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 4 + %18 = load i8, ptr %arrayidx3.1, align 1 + %conv4.1 = zext i8 %18 to i32 + %arrayidx5.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 4 + %19 = load i8, ptr %arrayidx5.1, align 1 + %conv6.1 = zext i8 %19 to i32 + %sub7.1 = sub nsw i32 %conv4.1, %conv6.1 + %shl.1 = shl nsw i32 %sub7.1, 16 + %add.1 = add nsw i32 %shl.1, %sub.1 + %arrayidx8.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 1 + %20 = load i8, ptr %arrayidx8.1, align 1 + %conv9.1 = zext i8 %20 to i32 + %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 1 + %21 = load i8, ptr %arrayidx10.1, align 1 + %conv11.1 = zext i8 %21 to i32 + %sub12.1 = sub nsw i32 %conv9.1, %conv11.1 + %arrayidx13.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 5 + %22 = load i8, ptr %arrayidx13.1, align 1 + %conv14.1 = zext i8 %22 to i32 + %arrayidx15.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 5 + %23 = load i8, ptr %arrayidx15.1, align 1 + %conv16.1 = zext i8 %23 to i32 + %sub17.1 = sub nsw i32 %conv14.1, %conv16.1 + %shl18.1 = shl nsw i32 %sub17.1, 16 + %add19.1 = add nsw i32 %shl18.1, %sub12.1 + %arrayidx20.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 2 + %24 = load i8, ptr %arrayidx20.1, align 1 + %conv21.1 = zext i8 %24 to i32 + %arrayidx22.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 2 + %25 = load i8, ptr %arrayidx22.1, align 1 + %conv23.1 = zext i8 %25 to i32 + %sub24.1 = sub nsw i32 %conv21.1, %conv23.1 + %arrayidx25.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 6 + %26 = load i8, ptr %arrayidx25.1, align 1 + %conv26.1 = zext i8 %26 to i32 + %arrayidx27.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 6 + %27 = load i8, ptr %arrayidx27.1, align 1 + %conv28.1 = zext i8 %27 to i32 + %sub29.1 = sub nsw i32 %conv26.1, %conv28.1 + %shl30.1 = shl nsw i32 %sub29.1, 16 + %add31.1 = add nsw i32 %shl30.1, %sub24.1 + %arrayidx32.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 3 + %28 = load i8, ptr %arrayidx32.1, align 1 + %conv33.1 = zext i8 %28 to i32 + %arrayidx34.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 3 + %29 = load i8, ptr %arrayidx34.1, align 1 + %conv35.1 = zext i8 %29 to i32 + %sub36.1 = sub nsw i32 %conv33.1, %conv35.1 + %arrayidx37.1 = getelementptr inbounds nuw i8, ptr %add.ptr, i64 7 + %30 = load i8, ptr %arrayidx37.1, align 1 + %conv38.1 = zext i8 %30 to i32 + %arrayidx39.1 = getelementptr inbounds nuw i8, ptr %add.ptr64, i64 7 + %31 = load i8, ptr %arrayidx39.1, align 1 + %conv40.1 = zext i8 %31 to i32 + %sub41.1 = sub nsw i32 %conv38.1, %conv40.1 + %shl42.1 = shl nsw i32 %sub41.1, 16 + %add43.1 = add nsw i32 %shl42.1, %sub36.1 + %add44.1 = add nsw i32 %add19.1, %add.1 + %sub45.1 = sub nsw i32 %add.1, %add19.1 + %add46.1 = add nsw i32 %add43.1, %add31.1 + %sub47.1 = sub nsw i32 %add31.1, %add43.1 + %add48.1 = add nsw i32 %add46.1, %add44.1 + %sub51.1 = sub nsw i32 %add44.1, %add46.1 + %add55.1 = add nsw i32 %sub47.1, %sub45.1 + %sub59.1 = sub nsw i32 %sub45.1, %sub47.1 + %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext + %add.ptr64.1 = getelementptr inbounds i8, ptr %add.ptr64, i64 %idx.ext63 + %32 = load i8, ptr %add.ptr.1, align 1 + %conv.2 = 
zext i8 %32 to i32 + %33 = load i8, ptr %add.ptr64.1, align 1 + %conv2.2 = zext i8 %33 to i32 + %sub.2 = sub nsw i32 %conv.2, %conv2.2 + %arrayidx3.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 4 + %34 = load i8, ptr %arrayidx3.2, align 1 + %conv4.2 = zext i8 %34 to i32 + %arrayidx5.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 4 + %35 = load i8, ptr %arrayidx5.2, align 1 + %conv6.2 = zext i8 %35 to i32 + %sub7.2 = sub nsw i32 %conv4.2, %conv6.2 + %shl.2 = shl nsw i32 %sub7.2, 16 + %add.2 = add nsw i32 %shl.2, %sub.2 + %arrayidx8.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 1 + %36 = load i8, ptr %arrayidx8.2, align 1 + %conv9.2 = zext i8 %36 to i32 + %arrayidx10.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 1 + %37 = load i8, ptr %arrayidx10.2, align 1 + %conv11.2 = zext i8 %37 to i32 + %sub12.2 = sub nsw i32 %conv9.2, %conv11.2 + %arrayidx13.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 5 + %38 = load i8, ptr %arrayidx13.2, align 1 + %conv14.2 = zext i8 %38 to i32 + %arrayidx15.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 5 + %39 = load i8, ptr %arrayidx15.2, align 1 + %conv16.2 = zext i8 %39 to i32 + %sub17.2 = sub nsw i32 %conv14.2, %conv16.2 + %shl18.2 = shl nsw i32 %sub17.2, 16 + %add19.2 = add nsw i32 %shl18.2, %sub12.2 + %arrayidx20.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 2 + %40 = load i8, ptr %arrayidx20.2, align 1 + %conv21.2 = zext i8 %40 to i32 + %arrayidx22.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 2 + %41 = load i8, ptr %arrayidx22.2, align 1 + %conv23.2 = zext i8 %41 to i32 + %sub24.2 = sub nsw i32 %conv21.2, %conv23.2 + %arrayidx25.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 6 + %42 = load i8, ptr %arrayidx25.2, align 1 + %conv26.2 = zext i8 %42 to i32 + %arrayidx27.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 6 + %43 = load i8, ptr %arrayidx27.2, align 1 + %conv28.2 = zext i8 %43 to i32 + %sub29.2 = sub nsw i32 %conv26.2, %conv28.2 + %shl30.2 = shl nsw i32 %sub29.2, 16 + %add31.2 = add nsw i32 %shl30.2, %sub24.2 + %arrayidx32.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 3 + %44 = load i8, ptr %arrayidx32.2, align 1 + %conv33.2 = zext i8 %44 to i32 + %arrayidx34.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 3 + %45 = load i8, ptr %arrayidx34.2, align 1 + %conv35.2 = zext i8 %45 to i32 + %sub36.2 = sub nsw i32 %conv33.2, %conv35.2 + %arrayidx37.2 = getelementptr inbounds nuw i8, ptr %add.ptr.1, i64 7 + %46 = load i8, ptr %arrayidx37.2, align 1 + %conv38.2 = zext i8 %46 to i32 + %arrayidx39.2 = getelementptr inbounds nuw i8, ptr %add.ptr64.1, i64 7 + %47 = load i8, ptr %arrayidx39.2, align 1 + %conv40.2 = zext i8 %47 to i32 + %sub41.2 = sub nsw i32 %conv38.2, %conv40.2 + %shl42.2 = shl nsw i32 %sub41.2, 16 + %add43.2 = add nsw i32 %shl42.2, %sub36.2 + %add44.2 = add nsw i32 %add19.2, %add.2 + %sub45.2 = sub nsw i32 %add.2, %add19.2 + %add46.2 = add nsw i32 %add43.2, %add31.2 + %sub47.2 = sub nsw i32 %add31.2, %add43.2 + %add48.2 = add nsw i32 %add46.2, %add44.2 + %sub51.2 = sub nsw i32 %add44.2, %add46.2 + %add55.2 = add nsw i32 %sub47.2, %sub45.2 + %sub59.2 = sub nsw i32 %sub45.2, %sub47.2 + %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext + %add.ptr64.2 = getelementptr inbounds i8, ptr %add.ptr64.1, i64 %idx.ext63 + %48 = load i8, ptr %add.ptr.2, align 1 + %conv.3 = zext i8 %48 to i32 + %49 = load i8, ptr %add.ptr64.2, align 1 + %conv2.3 = zext i8 %49 to i32 + %sub.3 = sub nsw i32 %conv.3, %conv2.3 + %arrayidx3.3 = 
getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 4 + %50 = load i8, ptr %arrayidx3.3, align 1 + %conv4.3 = zext i8 %50 to i32 + %arrayidx5.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 4 + %51 = load i8, ptr %arrayidx5.3, align 1 + %conv6.3 = zext i8 %51 to i32 + %sub7.3 = sub nsw i32 %conv4.3, %conv6.3 + %shl.3 = shl nsw i32 %sub7.3, 16 + %add.3 = add nsw i32 %shl.3, %sub.3 + %arrayidx8.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 1 + %52 = load i8, ptr %arrayidx8.3, align 1 + %conv9.3 = zext i8 %52 to i32 + %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 1 + %53 = load i8, ptr %arrayidx10.3, align 1 + %conv11.3 = zext i8 %53 to i32 + %sub12.3 = sub nsw i32 %conv9.3, %conv11.3 + %arrayidx13.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 5 + %54 = load i8, ptr %arrayidx13.3, align 1 + %conv14.3 = zext i8 %54 to i32 + %arrayidx15.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 5 + %55 = load i8, ptr %arrayidx15.3, align 1 + %conv16.3 = zext i8 %55 to i32 + %sub17.3 = sub nsw i32 %conv14.3, %conv16.3 + %shl18.3 = shl nsw i32 %sub17.3, 16 + %add19.3 = add nsw i32 %shl18.3, %sub12.3 + %arrayidx20.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 2 + %56 = load i8, ptr %arrayidx20.3, align 1 + %conv21.3 = zext i8 %56 to i32 + %arrayidx22.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 2 + %57 = load i8, ptr %arrayidx22.3, align 1 + %conv23.3 = zext i8 %57 to i32 + %sub24.3 = sub nsw i32 %conv21.3, %conv23.3 + %arrayidx25.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 6 + %58 = load i8, ptr %arrayidx25.3, align 1 + %conv26.3 = zext i8 %58 to i32 + %arrayidx27.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 6 + %59 = load i8, ptr %arrayidx27.3, align 1 + %conv28.3 = zext i8 %59 to i32 + %sub29.3 = sub nsw i32 %conv26.3, %conv28.3 + %shl30.3 = shl nsw i32 %sub29.3, 16 + %add31.3 = add nsw i32 %shl30.3, %sub24.3 + %arrayidx32.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 3 + %60 = load i8, ptr %arrayidx32.3, align 1 + %conv33.3 = zext i8 %60 to i32 + %arrayidx34.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 3 + %61 = load i8, ptr %arrayidx34.3, align 1 + %conv35.3 = zext i8 %61 to i32 + %sub36.3 = sub nsw i32 %conv33.3, %conv35.3 + %arrayidx37.3 = getelementptr inbounds nuw i8, ptr %add.ptr.2, i64 7 + %62 = load i8, ptr %arrayidx37.3, align 1 + %conv38.3 = zext i8 %62 to i32 + %arrayidx39.3 = getelementptr inbounds nuw i8, ptr %add.ptr64.2, i64 7 + %63 = load i8, ptr %arrayidx39.3, align 1 + %conv40.3 = zext i8 %63 to i32 + %sub41.3 = sub nsw i32 %conv38.3, %conv40.3 + %shl42.3 = shl nsw i32 %sub41.3, 16 + %add43.3 = add nsw i32 %shl42.3, %sub36.3 + %add44.3 = add nsw i32 %add19.3, %add.3 + %sub45.3 = sub nsw i32 %add.3, %add19.3 + %add46.3 = add nsw i32 %add43.3, %add31.3 + %sub47.3 = sub nsw i32 %add31.3, %add43.3 + %add48.3 = add nsw i32 %add46.3, %add44.3 + %sub51.3 = sub nsw i32 %add44.3, %add46.3 + %add55.3 = add nsw i32 %sub47.3, %sub45.3 + %sub59.3 = sub nsw i32 %sub45.3, %sub47.3 + %add78 = add nsw i32 %add48.1, %add48 + %sub86 = sub nsw i32 %add48, %add48.1 + %add94 = add nsw i32 %add48.3, %add48.2 + %sub102 = sub nsw i32 %add48.2, %add48.3 + %add103 = add nsw i32 %add94, %add78 + %sub104 = sub nsw i32 %add78, %add94 + %add105 = add nsw i32 %sub102, %sub86 + %sub106 = sub nsw i32 %sub86, %sub102 + %shr.i = lshr i32 %add103, 15 + %and.i = and i32 %shr.i, 65537 + %mul.i = mul nuw i32 %and.i, 65535 + %add.i = add i32 %mul.i, %add103 + %xor.i = xor i32 %add.i, %mul.i + %shr.i169 = 
lshr i32 %add105, 15 + %and.i170 = and i32 %shr.i169, 65537 + %mul.i171 = mul nuw i32 %and.i170, 65535 + %add.i172 = add i32 %mul.i171, %add105 + %xor.i173 = xor i32 %add.i172, %mul.i171 + %shr.i174 = lshr i32 %sub104, 15 + %and.i175 = and i32 %shr.i174, 65537 + %mul.i176 = mul nuw i32 %and.i175, 65535 + %add.i177 = add i32 %mul.i176, %sub104 + %xor.i178 = xor i32 %add.i177, %mul.i176 + %shr.i179 = lshr i32 %sub106, 15 + %and.i180 = and i32 %shr.i179, 65537 + %mul.i181 = mul nuw i32 %and.i180, 65535 + %add.i182 = add i32 %mul.i181, %sub106 + %xor.i183 = xor i32 %add.i182, %mul.i181 + %add110 = add i32 %xor.i173, %xor.i + %add112 = add i32 %add110, %xor.i178 + %add113 = add i32 %add112, %xor.i183 + %add78.1 = add nsw i32 %add55.1, %add55 + %sub86.1 = sub nsw i32 %add55, %add55.1 + %add94.1 = add nsw i32 %add55.3, %add55.2 + %sub102.1 = sub nsw i32 %add55.2, %add55.3 + %add103.1 = add nsw i32 %add94.1, %add78.1 + %sub104.1 = sub nsw i32 %add78.1, %add94.1 + %add105.1 = add nsw i32 %sub102.1, %sub86.1 + %sub106.1 = sub nsw i32 %sub86.1, %sub102.1 + %shr.i.1 = lshr i32 %add103.1, 15 + %and.i.1 = and i32 %shr.i.1, 65537 + %mul.i.1 = mul nuw i32 %and.i.1, 65535 + %add.i.1 = add i32 %mul.i.1, %add103.1 + %xor.i.1 = xor i32 %add.i.1, %mul.i.1 + %shr.i169.1 = lshr i32 %add105.1, 15 + %and.i170.1 = and i32 %shr.i169.1, 65537 + %mul.i171.1 = mul nuw i32 %and.i170.1, 65535 + %add.i172.1 = add i32 %mul.i171.1, %add105.1 + %xor.i173.1 = xor i32 %add.i172.1, %mul.i171.1 + %shr.i174.1 = lshr i32 %sub104.1, 15 + %and.i175.1 = and i32 %shr.i174.1, 65537 + %mul.i176.1 = mul nuw i32 %and.i175.1, 65535 + %add.i177.1 = add i32 %mul.i176.1, %sub104.1 + %xor.i178.1 = xor i32 %add.i177.1, %mul.i176.1 + %shr.i179.1 = lshr i32 %sub106.1, 15 + %and.i180.1 = and i32 %shr.i179.1, 65537 + %mul.i181.1 = mul nuw i32 %and.i180.1, 65535 + %add.i182.1 = add i32 %mul.i181.1, %sub106.1 + %xor.i183.1 = xor i32 %add.i182.1, %mul.i181.1 + %add108.1 = add i32 %xor.i173.1, %add113 + %add110.1 = add i32 %add108.1, %xor.i.1 + %add112.1 = add i32 %add110.1, %xor.i178.1 + %add113.1 = add i32 %add112.1, %xor.i183.1 + %add78.2 = add nsw i32 %sub51.1, %sub51 + %sub86.2 = sub nsw i32 %sub51, %sub51.1 + %add94.2 = add nsw i32 %sub51.3, %sub51.2 + %sub102.2 = sub nsw i32 %sub51.2, %sub51.3 + %add103.2 = add nsw i32 %add94.2, %add78.2 + %sub104.2 = sub nsw i32 %add78.2, %add94.2 + %add105.2 = add nsw i32 %sub102.2, %sub86.2 + %sub106.2 = sub nsw i32 %sub86.2, %sub102.2 + %shr.i.2 = lshr i32 %add103.2, 15 + %and.i.2 = and i32 %shr.i.2, 65537 + %mul.i.2 = mul nuw i32 %and.i.2, 65535 + %add.i.2 = add i32 %mul.i.2, %add103.2 + %xor.i.2 = xor i32 %add.i.2, %mul.i.2 + %shr.i169.2 = lshr i32 %add105.2, 15 + %and.i170.2 = and i32 %shr.i169.2, 65537 + %mul.i171.2 = mul nuw i32 %and.i170.2, 65535 + %add.i172.2 = add i32 %mul.i171.2, %add105.2 + %xor.i173.2 = xor i32 %add.i172.2, %mul.i171.2 + %shr.i174.2 = lshr i32 %sub104.2, 15 + %and.i175.2 = and i32 %shr.i174.2, 65537 + %mul.i176.2 = mul nuw i32 %and.i175.2, 65535 + %add.i177.2 = add i32 %mul.i176.2, %sub104.2 + %xor.i178.2 = xor i32 %add.i177.2, %mul.i176.2 + %shr.i179.2 = lshr i32 %sub106.2, 15 + %and.i180.2 = and i32 %shr.i179.2, 65537 + %mul.i181.2 = mul nuw i32 %and.i180.2, 65535 + %add.i182.2 = add i32 %mul.i181.2, %sub106.2 + %xor.i183.2 = xor i32 %add.i182.2, %mul.i181.2 + %add108.2 = add i32 %xor.i173.2, %add113.1 + %add110.2 = add i32 %add108.2, %xor.i.2 + %add112.2 = add i32 %add110.2, %xor.i178.2 + %add113.2 = add i32 %add112.2, %xor.i183.2 + %add78.3 = add nsw i32 %sub59.1, %sub59 + 
%sub86.3 = sub nsw i32 %sub59, %sub59.1 + %add94.3 = add nsw i32 %sub59.3, %sub59.2 + %sub102.3 = sub nsw i32 %sub59.2, %sub59.3 + %add103.3 = add nsw i32 %add94.3, %add78.3 + %sub104.3 = sub nsw i32 %add78.3, %add94.3 + %add105.3 = add nsw i32 %sub102.3, %sub86.3 + %sub106.3 = sub nsw i32 %sub86.3, %sub102.3 + %shr.i.3 = lshr i32 %add103.3, 15 + %and.i.3 = and i32 %shr.i.3, 65537 + %mul.i.3 = mul nuw i32 %and.i.3, 65535 + %add.i.3 = add i32 %mul.i.3, %add103.3 + %xor.i.3 = xor i32 %add.i.3, %mul.i.3 + %shr.i169.3 = lshr i32 %add105.3, 15 + %and.i170.3 = and i32 %shr.i169.3, 65537 + %mul.i171.3 = mul nuw i32 %and.i170.3, 65535 + %add.i172.3 = add i32 %mul.i171.3, %add105.3 + %xor.i173.3 = xor i32 %add.i172.3, %mul.i171.3 + %shr.i174.3 = lshr i32 %sub104.3, 15 + %and.i175.3 = and i32 %shr.i174.3, 65537 + %mul.i176.3 = mul nuw i32 %and.i175.3, 65535 + %add.i177.3 = add i32 %mul.i176.3, %sub104.3 + %xor.i178.3 = xor i32 %add.i177.3, %mul.i176.3 + %shr.i179.3 = lshr i32 %sub106.3, 15 + %and.i180.3 = and i32 %shr.i179.3, 65537 + %mul.i181.3 = mul nuw i32 %and.i180.3, 65535 + %add.i182.3 = add i32 %mul.i181.3, %sub106.3 + %xor.i183.3 = xor i32 %add.i182.3, %mul.i181.3 + %add108.3 = add i32 %xor.i173.3, %add113.2 + %add110.3 = add i32 %add108.3, %xor.i.3 + %add112.3 = add i32 %add110.3, %xor.i178.3 + %add113.3 = add i32 %add112.3, %xor.i183.3 + %conv118 = and i32 %add113.3, 65535 + %shr = lshr i32 %add113.3, 16 + %add119 = add nuw nsw i32 %conv118, %shr + %shr120 = lshr i32 %add119, 1 + ret i32 %shr120 +}
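For reference, a minimal hand-written IR sketch (not part of the patch; function name and value names are illustrative only) of the widened form this change emits: sixteen i8 loads at %base + i * %stride + j, with i and j in [0, 4), become a single <4 x i32> strided load whose lane-to-lane distance is %stride bytes, followed by a bitcast back to <16 x i8>.

declare <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr, i64, <4 x i1>, i32)

define <16 x i8> @widened_strided_load_sketch(ptr %base, i64 %stride) {
  ; One i32 lane per group of four consecutive i8 elements; %stride is the
  ; byte distance between the starts of consecutive groups.
  %wide = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(ptr align 1 %base, i64 %stride, <4 x i1> splat (i1 true), i32 4)
  ; Reinterpret the four widened lanes as the original sixteen i8 elements.
  %bytes = bitcast <4 x i32> %wide to <16 x i8>
  ret <16 x i8> %bytes
}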