llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (180 additions, 11 deletions)
@@ -48,7 +48,6 @@

#include "llvm/Transforms/Vectorize.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetVector.h"
@@ -63,6 +62,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -101,6 +101,7 @@
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <functional>
#include <map>
#include <tuple>

@@ -280,7 +281,12 @@ class InnerLoopVectorizer {
AddedSafetyChecks(false) {}

// Perform the actual loop widening (vectorization).
void vectorize(LoopVectorizationLegality *L) {
// MinimumBitWidths maps scalar integer values to the smallest bitwidth they
// can be validly truncated to. The cost model has assumed this truncation
// will happen when vectorizing.
void vectorize(LoopVectorizationLegality *L,
DenseMap<Instruction*,uint64_t> MinimumBitWidths) {
MinBWs = MinimumBitWidths;
Legal = L;
// Create a new empty loop. Unlink the old loop and connect the new one.
createEmptyLoop();
@@ -329,6 +335,9 @@ class InnerLoopVectorizer {
/// See PR14725.
void fixLCSSAPHIs();

/// Shrinks vector element sizes based on information in "MinBWs".
void truncateToMinimalBitwidths();

/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True. It returns the *entry*
/// mask for the block BB.
@@ -339,7 +348,7 @@

/// A helper function to vectorize a single BB within the innermost loop.
void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);

/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
/// arbitrary length vectors.
@@ -499,6 +508,10 @@ class InnerLoopVectorizer {
/// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
Value *VectorTripCount;

/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
DenseMap<Instruction*,uint64_t> MinBWs;
LoopVectorizationLegality *Legal;

// Record whether runtime check is added.
@@ -1346,10 +1359,11 @@ class LoopVectorizationCostModel {
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
const TargetTransformInfo &TTI,
const TargetLibraryInfo *TLI, AssumptionCache *AC,
const TargetLibraryInfo *TLI, DemandedBits *DB,
AssumptionCache *AC,
const Function *F, const LoopVectorizeHints *Hints,
SmallPtrSetImpl<const Value *> &ValuesToIgnore)
: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI),
: TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
TheFunction(F), Hints(Hints), ValuesToIgnore(ValuesToIgnore) {}

/// Information about vectorization costs
Expand Down Expand Up @@ -1419,6 +1433,12 @@ class LoopVectorizationCostModel {
emitAnalysisDiag(TheFunction, TheLoop, *Hints, Message);
}

public:
/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
/// to this type.
DenseMap<Instruction*,uint64_t> MinBWs;

/// The loop that we evaluate.
Loop *TheLoop;
/// Scev analysis.
@@ -1431,6 +1451,8 @@
const TargetTransformInfo &TTI;
/// Target Library Info.
const TargetLibraryInfo *TLI;
/// Demanded bits analysis.
DemandedBits *DB;
const Function *TheFunction;
// Loop Vectorize Hint.
const LoopVectorizeHints *Hints;
@@ -1523,6 +1545,7 @@ struct LoopVectorize : public FunctionPass {
DominatorTree *DT;
BlockFrequencyInfo *BFI;
TargetLibraryInfo *TLI;
DemandedBits *DB;
AliasAnalysis *AA;
AssumptionCache *AC;
LoopAccessAnalysis *LAA;
@@ -1542,6 +1565,7 @@ struct LoopVectorize : public FunctionPass {
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
LAA = &getAnalysis<LoopAccessAnalysis>();
DB = &getAnalysis<DemandedBits>();

// Compute some weights outside of the loop over the loops. Compute this
// using a BranchProbability to re-use its scaling math.
@@ -1687,7 +1711,7 @@ struct LoopVectorize : public FunctionPass {
}

// Use the cost model.
LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, AC, F, &Hints,
LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, TLI, DB, AC, F, &Hints,
ValuesToIgnore);

// Check the function attributes to find out if this function should be
@@ -1800,15 +1824,15 @@ struct LoopVectorize : public FunctionPass {
// If we decided that it is not legal to vectorize the loop then
// interleave it.
InnerLoopUnroller Unroller(L, SE, LI, DT, TLI, TTI, IC);
Unroller.vectorize(&LVL);
Unroller.vectorize(&LVL, CM.MinBWs);

emitOptimizationRemark(F->getContext(), LV_NAME, *F, L->getStartLoc(),
Twine("interleaved loop (interleaved count: ") +
Twine(IC) + ")");
} else {
// If we decided that it is *legal* to vectorize the loop then do it.
InnerLoopVectorizer LB(L, SE, LI, DT, TLI, TTI, VF.Width, IC);
LB.vectorize(&LVL);
LB.vectorize(&LVL, CM.MinBWs);
++LoopsVectorized;

// Add metadata to disable runtime unrolling scalar loop when there's no
@@ -1842,6 +1866,7 @@ struct LoopVectorize : public FunctionPass {
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LoopAccessAnalysis>();
AU.addRequired<DemandedBits>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<BasicAAWrapperPass>();
@@ -2009,6 +2034,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// If this scalar is unknown, assume that it is a constant or that it is
// loop invariant. Broadcast V and save the value for future uses.
Value *B = getBroadcastInstrs(V);

return WidenMap.splat(V, B);
}

@@ -3102,6 +3128,117 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
return TTI.getIntrinsicInstrCost(ID, RetTy, Tys);
}

/// \return The smaller of the two integer vector types, judged by the
/// bitwidth of their elements.
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}

/// \return The larger of the two integer vector types, judged by the
/// bitwidth of their elements.
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
IntegerType *I1 = cast<IntegerType>(T1->getVectorElementType());
IntegerType *I2 = cast<IntegerType>(T2->getVectorElementType());
return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}

void InnerLoopVectorizer::truncateToMinimalBitwidths() {
// For every instruction `I` in MinBWs, truncate the operands, create a
// truncated version of `I` and reextend its result. InstCombine runs
// later and will remove any ext/trunc pairs.
//
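// For example, assuming a minimum bitwidth of 8 for the add (an
// illustrative value, not one computed from a real loop):
//   %a = add <4 x i32> %x, %y
// becomes
//   %x.tr = trunc <4 x i32> %x to <4 x i8>
//   %y.tr = trunc <4 x i32> %y to <4 x i8>
//   %a.tr = add <4 x i8> %x.tr, %y.tr
//   %a    = zext <4 x i8> %a.tr to <4 x i32>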
for (auto &KV : MinBWs) {
VectorParts &Parts = WidenMap.get(KV.first);
for (Value *&I : Parts) {
if (I->use_empty())
continue;
Type *OriginalTy = I->getType();
Type *ScalarTruncatedTy = IntegerType::get(OriginalTy->getContext(),
KV.second);
Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
OriginalTy->getVectorNumElements());
if (TruncatedTy == OriginalTy)
continue;

IRBuilder<> B(cast<Instruction>(I));
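// If the operand is already a zero-extension from the truncated type, use
// its source directly; otherwise zext or trunc the operand to match
// TruncatedTy.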
auto ShrinkOperand = [&](Value *V) -> Value* {
if (auto *ZI = dyn_cast<ZExtInst>(V))
if (ZI->getSrcTy() == TruncatedTy)
return ZI->getOperand(0);
return B.CreateZExtOrTrunc(V, TruncatedTy);
};

// The actual instruction modification depends on the instruction type,
// unfortunately.
Value *NewI = nullptr;
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
NewI = B.CreateBinOp(BO->getOpcode(),
ShrinkOperand(BO->getOperand(0)),
ShrinkOperand(BO->getOperand(1)));
cast<BinaryOperator>(NewI)->copyIRFlags(I);
} else if (ICmpInst *CI = dyn_cast<ICmpInst>(I)) {
NewI = B.CreateICmp(CI->getPredicate(),
ShrinkOperand(CI->getOperand(0)),
ShrinkOperand(CI->getOperand(1)));
} else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
NewI = B.CreateSelect(SI->getCondition(),
ShrinkOperand(SI->getTrueValue()),
ShrinkOperand(SI->getFalseValue()));
} else if (CastInst *CI = dyn_cast<CastInst>(I)) {
switch (CI->getOpcode()) {
default: llvm_unreachable("Unhandled cast!");
case Instruction::Trunc:
NewI = ShrinkOperand(CI->getOperand(0));
break;
case Instruction::SExt:
NewI = B.CreateSExtOrTrunc(CI->getOperand(0),
smallestIntegerVectorType(OriginalTy,
TruncatedTy));
break;
case Instruction::ZExt:
NewI = B.CreateZExtOrTrunc(CI->getOperand(0),
smallestIntegerVectorType(OriginalTy,
TruncatedTy));
break;
}
} else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
auto *O0 =
B.CreateZExtOrTrunc(SI->getOperand(0),
VectorType::get(ScalarTruncatedTy, Elements0));
auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
auto *O1 =
B.CreateZExtOrTrunc(SI->getOperand(1),
VectorType::get(ScalarTruncatedTy, Elements1));

NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
} else if (isa<LoadInst>(I)) {
// Don't do anything with the operands, just extend the result.
continue;
} else {
llvm_unreachable("Unhandled instruction type!");
}

// Lastly, extend the result.
NewI->takeName(cast<Instruction>(I));
Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
I->replaceAllUsesWith(Res);
cast<Instruction>(I)->eraseFromParent();
I = Res;
}
}

// We'll have created a bunch of ZExts that are now dead. Clean them up.
for (auto &KV : MinBWs) {
VectorParts &Parts = WidenMap.get(KV.first);
for (Value *&I : Parts) {
ZExtInst *Inst = dyn_cast<ZExtInst>(I);
if (Inst && Inst->use_empty()) {
Value *NewI = Inst->getOperand(0);
Inst->eraseFromParent();
I = NewI;
}
}
}
}

void InnerLoopVectorizer::vectorizeLoop() {
//===------------------------------------------------===//
//
Expand Down Expand Up @@ -3132,6 +3269,11 @@ void InnerLoopVectorizer::vectorizeLoop() {
be = DFS.endRPO(); bb != be; ++bb)
vectorizeBlockInLoop(*bb, &RdxPHIsToFix);

// Insert truncates and extends for any truncated instructions as hints to
// InstCombine.
if (VF > 1)
truncateToMinimalBitwidths();

// At this point every instruction in the original loop is widened to
// a vector form. We are almost done. Now, we need to fix the PHI nodes
// that we vectorized. The PHI nodes are currently empty because we did
@@ -3565,6 +3707,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// For each instruction in the old loop.
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
VectorParts &Entry = WidenMap.get(it);

switch (it->getOpcode()) {
case Instruction::Br:
// Nothing to do for PHIs and BR, since we already took care of the
@@ -3628,7 +3771,7 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
VectorParts &Cond = getVectorValue(it->getOperand(0));
VectorParts &Op0 = getVectorValue(it->getOperand(1));
VectorParts &Op1 = getVectorValue(it->getOperand(2));

Value *ScalarCond = (VF == 1) ? Cond[0] :
Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));

@@ -4563,6 +4706,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
unsigned TC = SE->getSmallConstantTripCount(TheLoop);
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');

MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
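// MinBWs now maps each integer instruction in the loop to the narrowest
// width it can be computed in without changing the loop's semantics; both
// the cost model below and truncateToMinimalBitwidths() rely on it.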
unsigned WidestType = getWidestType();
unsigned WidestRegister = TTI.getRegisterBitWidth(true);
unsigned MaxSafeDepDist = -1U;
@@ -5086,6 +5230,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
VF = 1;

Type *RetTy = I->getType();
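// If this instruction's result will be shrunk by truncateToMinimalBitwidths,
// cost the instruction at its narrowed type rather than its declared type.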
if (VF > 1 && MinBWs.count(I))
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
Type *VectorTy = ToVectorTy(RetTy, VF);

// TODO: We need to estimate the cost of intrinsic calls.
Expand Down Expand Up @@ -5168,6 +5314,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
case Instruction::ICmp:
case Instruction::FCmp: {
Type *ValTy = I->getOperand(0)->getType();
auto *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
if (VF > 1 && MinBWs.count(Op0AsInstruction))
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
}
@@ -5291,8 +5439,28 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
Legal->isInductionVariable(I->getOperand(0)))
return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
I->getOperand(0)->getType());

Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);

Type *SrcScalarTy = I->getOperand(0)->getType();
Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
if (VF > 1 && MinBWs.count(I)) {
// This cast is going to be shrunk. This may remove the cast or it might
// turn it into a slightly different cast. For example, if MinBW == 16,
// "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
//
// Calculate the modified src and dest types.
Type *MinVecTy = VectorTy;
if (I->getOpcode() == Instruction::Trunc) {
SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
VectorTy = largestIntegerVectorType(ToVectorTy(I->getType(), VF),
MinVecTy);
} else if (I->getOpcode() == Instruction::ZExt ||
I->getOpcode() == Instruction::SExt) {
SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
VectorTy = smallestIntegerVectorType(ToVectorTy(I->getType(), VF),
MinVecTy);
}
}

return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
}
case Instruction::Call: {
@@ -5343,6 +5511,7 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBits)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)

namespace llvm {
@@ -0,0 +1,243 @@
; RUN: opt -S < %s -basicaa -loop-vectorize -simplifycfg -instsimplify -instcombine -licm -force-vector-interleave=1 2>&1 | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"
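
; These tests check that the vectorizer, guided by the DemandedBits-driven
; MinBWs map, narrows arithmetic that is only wide because of integer
; promotion back down to the width actually demanded by the loads and
; stores. For example, @add_a below corresponds roughly to this C loop:
;   for (int i = 0; i < len; i++)
;     q[i] = p[i] + 2;   // i8 elements: the add should stay in i8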

; CHECK-LABEL: @add_a(
; CHECK: load <16 x i8>, <16 x i8>*
; CHECK: add nuw nsw <16 x i8>
; CHECK: store <16 x i8>
; Function Attrs: nounwind
define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp8 = icmp sgt i32 %len, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%conv = zext i8 %0 to i32
%add = add nuw nsw i32 %conv, 2
%conv1 = trunc i32 %add to i8
%arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv1, i8* %arrayidx3
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; CHECK-LABEL: @add_b(
; CHECK: load <8 x i16>, <8 x i16>*
; CHECK: add nuw nsw <8 x i16>
; CHECK: store <8 x i16>
; Function Attrs: nounwind
define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp9 = icmp sgt i32 %len, 0
br i1 %cmp9, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
%0 = load i16, i16* %arrayidx
%conv8 = zext i16 %0 to i32
%add = add nuw nsw i32 %conv8, 2
%conv1 = trunc i32 %add to i16
%arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
store i16 %conv1, i16* %arrayidx3
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
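
; The sum here is stored to an i16 array, so it can be narrowed to i16 but
; no further: expect <8 x i16> arithmetic fed by widened <8 x i8> loads.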

; CHECK-LABEL: @add_c(
; CHECK: load <8 x i8>, <8 x i8>*
; CHECK: add nuw nsw <8 x i16>
; CHECK: store <8 x i16>
; Function Attrs: nounwind
define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp8 = icmp sgt i32 %len, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%conv = zext i8 %0 to i32
%add = add nuw nsw i32 %conv, 2
%conv1 = trunc i32 %add to i16
%arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
store i16 %conv1, i16* %arrayidx3
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
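
; The sign-extended value below is stored as a full i32, so no narrowing is
; possible and the add must stay <4 x i32>.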

; CHECK-LABEL: @add_d(
; CHECK: load <4 x i16>
; CHECK: add nsw <4 x i32>
; CHECK: store <4 x i32>
define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
entry:
%cmp7 = icmp sgt i32 %len, 0
br i1 %cmp7, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
%0 = load i16, i16* %arrayidx
%conv = sext i16 %0 to i32
%add = add nsw i32 %conv, 2
%arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
store i32 %add, i32* %arrayidx2
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
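
; Every intermediate value below ultimately feeds an i8 store, so the whole
; chain of shifts, adds, ors, ands, muls and xors should be narrowed to
; <16 x i8>.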

; CHECK-LABEL: @add_e(
; CHECK: load <16 x i8>
; CHECK: shl <16 x i8>
; CHECK: add nuw nsw <16 x i8>
; CHECK: or <16 x i8>
; CHECK: mul nuw nsw <16 x i8>
; CHECK: and <16 x i8>
; CHECK: xor <16 x i8>
; CHECK: mul nuw nsw <16 x i8>
; CHECK: store <16 x i8>
define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
entry:
%cmp.32 = icmp sgt i32 %len, 0
br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph: ; preds = %entry
%conv11 = zext i8 %arg2 to i32
%conv13 = zext i8 %arg1 to i32
br label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %for.body, %for.body.lr.ph
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%0 = load i8, i8* %arrayidx
%conv = zext i8 %0 to i32
%add = shl i32 %conv, 4
%conv2 = add nuw nsw i32 %add, 32
%or = or i32 %conv, 51
%mul = mul nuw nsw i32 %or, 60
%and = and i32 %conv2, %conv13
%mul.masked = and i32 %mul, 252
%conv17 = xor i32 %mul.masked, %conv11
%mul18 = mul nuw nsw i32 %conv17, %and
%conv19 = trunc i32 %mul18 to i8
%arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv19, i8* %arrayidx21
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
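
; As @add_e, but the input is loaded as i16 and sign-extended, so the
; vectorizer has to truncate the <8 x i16> load before doing the arithmetic
; in <8 x i8>.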

; CHECK-LABEL: @add_f(
; CHECK: load <8 x i16>
; CHECK: trunc <8 x i16>
; CHECK: shl <8 x i8>
; CHECK: add nsw <8 x i8>
; CHECK: or <8 x i8>
; CHECK: mul nuw nsw <8 x i8>
; CHECK: and <8 x i8>
; CHECK: xor <8 x i8>
; CHECK: mul nuw nsw <8 x i8>
; CHECK: store <8 x i8>
define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
entry:
%cmp.32 = icmp sgt i32 %len, 0
br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph: ; preds = %entry
%conv11 = zext i8 %arg2 to i32
%conv13 = zext i8 %arg1 to i32
br label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
ret void

for.body: ; preds = %for.body, %for.body.lr.ph
%indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
%0 = load i16, i16* %arrayidx
%conv = sext i16 %0 to i32
%add = shl i32 %conv, 4
%conv2 = add nsw i32 %add, 32
%or = and i32 %conv, 204
%conv8 = or i32 %or, 51
%mul = mul nuw nsw i32 %conv8, 60
%and = and i32 %conv2, %conv13
%mul.masked = and i32 %mul, 252
%conv17 = xor i32 %mul.masked, %conv11
%mul18 = mul nuw nsw i32 %conv17, %and
%conv19 = trunc i32 %mul18 to i8
%arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
store i8 %conv19, i8* %arrayidx21
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
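
; Only the low 8 bits of the xor/icmp/select chain are demanded, so it
; should be narrowed to <16 x i8> operations selected by a <16 x i1> mask.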

; CHECK-LABEL: @add_g(
; CHECK: load <16 x i8>
; CHECK: xor <16 x i8>
; CHECK: icmp ult <16 x i8>
; CHECK: select <16 x i1> {{.*}}, <16 x i8>
; CHECK: store <16 x i8>
define void @add_g(i8* noalias nocapture readonly %p, i8* noalias nocapture readonly %q, i8* noalias nocapture %r, i8 %arg1, i32 %len) #0 {
%1 = icmp sgt i32 %len, 0
br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph: ; preds = %0
%2 = sext i8 %arg1 to i64
br label %3

._crit_edge: ; preds = %3, %0
ret void

; <label>:3 ; preds = %3, %.lr.ph
%indvars.iv = phi i64 [ 0, %.lr.ph ], [ %indvars.iv.next, %3 ]
%x4 = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
%x5 = load i8, i8* %x4
%x7 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
%x8 = load i8, i8* %x7
%x9 = zext i8 %x5 to i32
%x10 = xor i32 %x9, 255
%x11 = icmp ult i32 %x10, 24
%x12 = select i1 %x11, i32 %x10, i32 24
%x13 = trunc i32 %x12 to i8
store i8 %x13, i8* %x4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp eq i32 %lftr.wideiv, %len
br i1 %exitcond, label %._crit_edge, label %3
}

attributes #0 = { nounwind }