From 1b889fd5e34bef4ce346f7de34cb51c6ffd6222d Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Wed, 3 Jul 2024 12:41:49 +0200
Subject: [PATCH] [CVP][LVI] Add support for vectors (#97428)

The core change here is to add support for converting vector constants
into constant ranges. The rest is just relaxing isIntegerTy() checks and
making sure we don't use APIs that assume scalars.

There are a couple of places that don't support vectors yet; most
notably, the "simplest" fold (comparisons to a constant) is still
missing. I'll leave these to a followup.
---
 llvm/lib/Analysis/LazyValueInfo.cpp           | 32 +++++++++--
 .../Scalar/CorrelatedValuePropagation.cpp     | 53 ++++-------------
 .../CorrelatedValuePropagation/icmp.ll        |  4 +-
 .../CorrelatedValuePropagation/vectors.ll     | 57 ++++++++++++++-----
 4 files changed, 83 insertions(+), 63 deletions(-)
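Illustrative sketch (not part of the commit; @example and example.ll are
made-up names): a non-splat vector constant such as <i16 1, i16 2> is now
converted to a constant range by unioning the lanes (here [1, 2]) via the
new getConstantRangeFromFixedVector() below; splats, including splats with
some poison lanes, take the getSplatValue() fast path instead. That lets
CVP prove facts like this:

    ; opt -S -passes=correlated-propagation example.ll
    define <2 x i16> @example(<2 x i8> %a) {
      %zext = zext <2 x i8> %a to <2 x i16>       ; lane range [0, 255]
      %res = add <2 x i16> %zext, <i16 1, i16 2>  ; lane range [1, 2]
      ret <2 x i16> %res
    }

Since [0, 255] + [1, 2] cannot wrap in i16, the add is tagged
"add nuw nsw" (see the @infer_nowrap_nonsplat test in vectors.ll below).
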
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index d8b03eaa3928fb..e9051e74b4577a 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -650,7 +650,7 @@ LazyValueInfoImpl::solveBlockValueImpl(Value *Val, BasicBlock *BB) {
   if (PT && isKnownNonZero(BBI, DL))
     return ValueLatticeElement::getNot(ConstantPointerNull::get(PT));
 
-  if (BBI->getType()->isIntegerTy()) {
+  if (BBI->getType()->isIntOrIntVectorTy()) {
     if (auto *CI = dyn_cast<CastInst>(BBI))
       return solveBlockValueCast(CI, BB);
 
@@ -836,6 +836,24 @@ void LazyValueInfoImpl::intersectAssumeOrGuardBlockValueConstantRange(
   }
 }
 
+static ConstantRange getConstantRangeFromFixedVector(Constant *C,
+                                                     FixedVectorType *Ty) {
+  unsigned BW = Ty->getScalarSizeInBits();
+  ConstantRange CR = ConstantRange::getEmpty(BW);
+  for (unsigned I = 0; I < Ty->getNumElements(); ++I) {
+    Constant *Elem = C->getAggregateElement(I);
+    if (!Elem)
+      return ConstantRange::getFull(BW);
+    if (isa<PoisonValue>(Elem))
+      continue;
+    auto *CI = dyn_cast<ConstantInt>(Elem);
+    if (!CI)
+      return ConstantRange::getFull(BW);
+    CR = CR.unionWith(CI->getValue());
+  }
+  return CR;
+}
+
 static ConstantRange toConstantRange(const ValueLatticeElement &Val,
                                      Type *Ty, bool UndefAllowed = false) {
   assert(Ty->isIntOrIntVectorTy() && "Must be integer type");
@@ -844,6 +862,13 @@ static ConstantRange toConstantRange(const ValueLatticeElement &Val,
   unsigned BW = Ty->getScalarSizeInBits();
   if (Val.isUnknown())
     return ConstantRange::getEmpty(BW);
+  if (Val.isConstant() && Ty->isVectorTy()) {
+    if (auto *CI = dyn_cast_or_null<ConstantInt>(
+            Val.getConstant()->getSplatValue(/*AllowPoison=*/true)))
+      return ConstantRange(CI->getValue());
+    if (auto *VTy = dyn_cast<FixedVectorType>(Ty))
+      return getConstantRangeFromFixedVector(Val.getConstant(), VTy);
+  }
   return ConstantRange::getFull(BW);
 }
 
@@ -968,7 +993,7 @@ LazyValueInfoImpl::solveBlockValueCast(CastInst *CI, BasicBlock *BB) {
     return std::nullopt;
   const ConstantRange &LHSRange = *LHSRes;
 
-  const unsigned ResultBitWidth = CI->getType()->getIntegerBitWidth();
+  const unsigned ResultBitWidth = CI->getType()->getScalarSizeInBits();
 
   // NOTE: We're currently limited by the set of operations that ConstantRange
   // can evaluate symbolically. Enhancing that set will allow us to analyze
@@ -1108,7 +1133,7 @@ LazyValueInfoImpl::getValueFromSimpleICmpCondition(CmpInst::Predicate Pred,
                                                    const APInt &Offset,
                                                    Instruction *CxtI,
                                                    bool UseBlockValue) {
-  ConstantRange RHSRange(RHS->getType()->getIntegerBitWidth(),
+  ConstantRange RHSRange(RHS->getType()->getScalarSizeInBits(),
                          /*isFullSet=*/true);
   if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
     RHSRange = ConstantRange(CI->getValue());
@@ -1728,7 +1753,6 @@ Constant *LazyValueInfo::getConstant(Value *V, Instruction *CxtI) {
 
 ConstantRange LazyValueInfo::getConstantRange(Value *V, Instruction *CxtI,
                                               bool UndefAllowed) {
-  assert(V->getType()->isIntegerTy());
   BasicBlock *BB = CxtI->getParent();
   ValueLatticeElement Result =
       getOrCreateImpl(BB->getModule()).getValueInBlock(V, BB, CxtI);
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 875d3ea78fae50..34304c2245e304 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -288,9 +288,8 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT,
 }
 
 static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) {
-  // Only for signed relational comparisons of scalar integers.
-  if (Cmp->getType()->isVectorTy() ||
-      !Cmp->getOperand(0)->getType()->isIntegerTy())
+  // Only for signed relational comparisons of integers.
+  if (!Cmp->getOperand(0)->getType()->isIntOrIntVectorTy())
     return false;
 
   if (!Cmp->isSigned())
@@ -505,12 +504,8 @@ static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI);
 // because it is negation-invariant.
 static bool processAbsIntrinsic(IntrinsicInst *II, LazyValueInfo *LVI) {
   Value *X = II->getArgOperand(0);
-  Type *Ty = X->getType();
-  if (!Ty->isIntegerTy())
-    return false;
-
   bool IsIntMinPoison = cast<ConstantInt>(II->getArgOperand(1))->isOne();
-  APInt IntMin = APInt::getSignedMinValue(Ty->getScalarSizeInBits());
+  APInt IntMin = APInt::getSignedMinValue(X->getType()->getScalarSizeInBits());
   ConstantRange Range = LVI->getConstantRangeAtUse(
       II->getOperandUse(0), /*UndefAllowed*/ IsIntMinPoison);
@@ -679,15 +674,13 @@ static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
   }
 
   if (auto *WO = dyn_cast<WithOverflowInst>(&CB)) {
-    if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) {
+    if (willNotOverflow(WO, LVI))
       return processOverflowIntrinsic(WO, LVI);
-    }
   }
 
   if (auto *SI = dyn_cast<SaturatingInst>(&CB)) {
-    if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) {
+    if (willNotOverflow(SI, LVI))
       return processSaturatingInst(SI, LVI);
-    }
   }
 
   bool Changed = false;
@@ -761,11 +754,10 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, const ConstantRange &LCR,
                              const ConstantRange &RCR) {
   assert(Instr->getOpcode() == Instruction::SDiv ||
          Instr->getOpcode() == Instruction::SRem);
-  assert(!Instr->getType()->isVectorTy());
 
   // Find the smallest power of two bitwidth that's sufficient to hold Instr's
   // operands.
-  unsigned OrigWidth = Instr->getType()->getIntegerBitWidth();
+  unsigned OrigWidth = Instr->getType()->getScalarSizeInBits();
 
   // What is the smallest bit width that can accommodate the entire value ranges
   // of both of the operands?
@@ -788,7 +780,7 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, const ConstantRange &LCR, ++NumSDivSRemsNarrowed; IRBuilder<> B{Instr}; - auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth); + auto *TruncTy = Instr->getType()->getWithNewBitWidth(NewWidth); auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy, Instr->getName() + ".lhs.trunc"); auto *RHS = B.CreateTruncOrBitCast(Instr->getOperand(1), TruncTy, @@ -809,7 +801,6 @@ static bool expandUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR, Type *Ty = Instr->getType(); assert(Instr->getOpcode() == Instruction::UDiv || Instr->getOpcode() == Instruction::URem); - assert(!Ty->isVectorTy()); bool IsRem = Instr->getOpcode() == Instruction::URem; Value *X = Instr->getOperand(0); @@ -892,7 +883,6 @@ static bool narrowUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR, const ConstantRange &YCR) { assert(Instr->getOpcode() == Instruction::UDiv || Instr->getOpcode() == Instruction::URem); - assert(!Instr->getType()->isVectorTy()); // Find the smallest power of two bitwidth that's sufficient to hold Instr's // operands. @@ -905,12 +895,12 @@ static bool narrowUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR, // NewWidth might be greater than OrigWidth if OrigWidth is not a power of // two. - if (NewWidth >= Instr->getType()->getIntegerBitWidth()) + if (NewWidth >= Instr->getType()->getScalarSizeInBits()) return false; ++NumUDivURemsNarrowed; IRBuilder<> B{Instr}; - auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth); + auto *TruncTy = Instr->getType()->getWithNewBitWidth(NewWidth); auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy, Instr->getName() + ".lhs.trunc"); auto *RHS = B.CreateTruncOrBitCast(Instr->getOperand(1), TruncTy, @@ -929,9 +919,6 @@ static bool narrowUDivOrURem(BinaryOperator *Instr, const ConstantRange &XCR, static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) { assert(Instr->getOpcode() == Instruction::UDiv || Instr->getOpcode() == Instruction::URem); - if (Instr->getType()->isVectorTy()) - return false; - ConstantRange XCR = LVI->getConstantRangeAtUse(Instr->getOperandUse(0), /*UndefAllowed*/ false); // Allow undef for RHS, as we can assume it is division by zero UB. @@ -946,7 +933,6 @@ static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) { static bool processSRem(BinaryOperator *SDI, const ConstantRange &LCR, const ConstantRange &RCR, LazyValueInfo *LVI) { assert(SDI->getOpcode() == Instruction::SRem); - assert(!SDI->getType()->isVectorTy()); if (LCR.abs().icmp(CmpInst::ICMP_ULT, RCR.abs())) { SDI->replaceAllUsesWith(SDI->getOperand(0)); @@ -1006,7 +992,6 @@ static bool processSRem(BinaryOperator *SDI, const ConstantRange &LCR, static bool processSDiv(BinaryOperator *SDI, const ConstantRange &LCR, const ConstantRange &RCR, LazyValueInfo *LVI) { assert(SDI->getOpcode() == Instruction::SDiv); - assert(!SDI->getType()->isVectorTy()); // Check whether the division folds to a constant. ConstantRange DivCR = LCR.sdiv(RCR); @@ -1064,9 +1049,6 @@ static bool processSDiv(BinaryOperator *SDI, const ConstantRange &LCR, static bool processSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) { assert(Instr->getOpcode() == Instruction::SDiv || Instr->getOpcode() == Instruction::SRem); - if (Instr->getType()->isVectorTy()) - return false; - ConstantRange LCR = LVI->getConstantRangeAtUse(Instr->getOperandUse(0), /*AllowUndef*/ false); // Allow undef for RHS, as we can assume it is division by zero UB. 
@@ -1085,12 +1067,9 @@ static bool processSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
 }
 
 static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
-  if (SDI->getType()->isVectorTy())
-    return false;
-
   ConstantRange LRange =
       LVI->getConstantRangeAtUse(SDI->getOperandUse(0), /*UndefAllowed*/ false);
-  unsigned OrigWidth = SDI->getType()->getIntegerBitWidth();
+  unsigned OrigWidth = SDI->getType()->getScalarSizeInBits();
   ConstantRange NegOneOrZero =
       ConstantRange(APInt(OrigWidth, (uint64_t)-1, true), APInt(OrigWidth, 1));
   if (NegOneOrZero.contains(LRange)) {
@@ -1117,9 +1096,6 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
 }
 
 static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) {
-  if (SDI->getType()->isVectorTy())
-    return false;
-
   const Use &Base = SDI->getOperandUse(0);
   if (!LVI->getConstantRangeAtUse(Base, /*UndefAllowed*/ false)
            .isAllNonNegative())
@@ -1138,9 +1114,6 @@ static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) {
 }
 
 static bool processPossibleNonNeg(PossiblyNonNegInst *I, LazyValueInfo *LVI) {
-  if (I->getType()->isVectorTy())
-    return false;
-
   if (I->hasNonNeg())
     return false;
 
@@ -1164,9 +1137,6 @@ static bool processUIToFP(UIToFPInst *UIToFP, LazyValueInfo *LVI) {
 }
 
 static bool processSIToFP(SIToFPInst *SIToFP, LazyValueInfo *LVI) {
-  if (SIToFP->getType()->isVectorTy())
-    return false;
-
   const Use &Base = SIToFP->getOperandUse(0);
   if (!LVI->getConstantRangeAtUse(Base, /*UndefAllowed*/ false)
            .isAllNonNegative())
@@ -1187,9 +1157,6 @@ static bool processSIToFP(SIToFPInst *SIToFP, LazyValueInfo *LVI) {
 static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) {
   using OBO = OverflowingBinaryOperator;
 
-  if (BinOp->getType()->isVectorTy())
-    return false;
-
   bool NSW = BinOp->hasNoSignedWrap();
   bool NUW = BinOp->hasNoUnsignedWrap();
   if (NSW && NUW)
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll b/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll
index ca707134402193..200793918f0ef2 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll
@@ -1246,13 +1246,11 @@ define i1 @non_const_range_minmax(i8 %a, i8 %b) {
   ret i1 %cmp1
 }
 
-; FIXME: Also support vectors.
 define <2 x i1> @non_const_range_minmax_vec(<2 x i8> %a, <2 x i8> %b) {
 ; CHECK-LABEL: @non_const_range_minmax_vec(
 ; CHECK-NEXT: [[A2:%.*]] = call <2 x i8> @llvm.umin.v2i8(<2 x i8> [[A:%.*]], <2 x i8> <i8 10, i8 10>)
 ; CHECK-NEXT: [[B2:%.*]] = call <2 x i8> @llvm.umax.v2i8(<2 x i8> [[B:%.*]], <2 x i8> <i8 11, i8 11>)
-; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <2 x i8> [[A2]], [[B2]]
-; CHECK-NEXT: ret <2 x i1> [[CMP1]]
+; CHECK-NEXT: ret <2 x i1> <i1 true, i1 true>
 ;
   %a2 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> %a, <2 x i8> <i8 10, i8 10>)
   %b2 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> %b, <2 x i8> <i8 11, i8 11>)
diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll
index 9862dd56e31b23..a06fa2c1066090 100644
--- a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll
+++ b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -S -passes=correlated-propagation < %s | FileCheck %s
 
+; TODO: Add support for this.
 define <2 x i1> @cmp1(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i1> @cmp1(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
@@ -13,6 +14,7 @@ define <2 x i1> @cmp1(<2 x i8> %a) {
   ret <2 x i1> %cmp
 }
 
+; TODO: Add support for this.
 define <2 x i1> @cmp2(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i1> @cmp2(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
@@ -29,7 +31,7 @@ define <2 x i1> @cmp_signedness(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i1> @cmp_signedness(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt <2 x i16> [[ZEXT]], <i16 128, i16 128>
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i16> [[ZEXT]], <i16 128, i16 128>
 ; CHECK-NEXT: ret <2 x i1> [[CMP]]
 ;
   %zext = zext <2 x i8> %a to <2 x i16>
@@ -41,7 +43,7 @@ define <2 x i16> @infer_nowrap(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i16> @infer_nowrap(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[RES:%.*]] = add <2 x i16> [[ZEXT]], <i16 1, i16 1>
+; CHECK-NEXT: [[RES:%.*]] = add nuw nsw <2 x i16> [[ZEXT]], <i16 1, i16 1>
 ; CHECK-NEXT: ret <2 x i16> [[RES]]
 ;
   %zext = zext <2 x i8> %a to <2 x i16>
@@ -53,7 +55,7 @@ define <2 x i16> @infer_nowrap_nonsplat(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i16> @infer_nowrap_nonsplat(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[RES:%.*]] = add <2 x i16> [[ZEXT]], <i16 1, i16 2>
+; CHECK-NEXT: [[RES:%.*]] = add nuw nsw <2 x i16> [[ZEXT]], <i16 1, i16 2>
 ; CHECK-NEXT: ret <2 x i16> [[RES]]
 ;
   %zext = zext <2 x i8> %a to <2 x i16>
@@ -61,11 +63,35 @@ define <2 x i16> @infer_nowrap_nonsplat(<2 x i8> %a) {
   ret <2 x i16> %res
 }
 
+define <vscale x 2 x i16> @infer_nowrap_scalable(<vscale x 2 x i8> %a) {
+; CHECK-LABEL: define <vscale x 2 x i16> @infer_nowrap_scalable(
+; CHECK-SAME: <vscale x 2 x i8> [[A:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <vscale x 2 x i8> [[A]] to <vscale x 2 x i16>
+; CHECK-NEXT: [[RES:%.*]] = add nuw nsw <vscale x 2 x i16> [[ZEXT]], shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i64 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: ret <vscale x 2 x i16> [[RES]]
+;
+  %zext = zext <vscale x 2 x i8> %a to <vscale x 2 x i16>
+  %res = add <vscale x 2 x i16> %zext, splat (i16 1)
+  ret <vscale x 2 x i16> %res
+}
+
+define <2 x i16> @infer_nowrap_poison(<2 x i8> %a) {
+; CHECK-LABEL: define <2 x i16> @infer_nowrap_poison(
+; CHECK-SAME: <2 x i8> [[A:%.*]]) {
+; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
+; CHECK-NEXT: [[RES:%.*]] = add nuw nsw <2 x i16> [[ZEXT]], <i16 1, i16 poison>
+; CHECK-NEXT: ret <2 x i16> [[RES]]
+;
+  %zext = zext <2 x i8> %a to <2 x i16>
+  %res = add <2 x i16> %zext, <i16 1, i16 poison>
+  ret <2 x i16> %res
+}
+
 define <2 x i16> @infer_nowrap_nonsplat_nsw_only(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i16> @infer_nowrap_nonsplat_nsw_only(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[RES:%.*]] = add <2 x i16> [[ZEXT]], <i16 1, i16 -1>
+; CHECK-NEXT: [[RES:%.*]] = add nsw <2 x i16> [[ZEXT]], <i16 1, i16 -1>
 ; CHECK-NEXT: ret <2 x i16> [[RES]]
 ;
   %zext = zext <2 x i8> %a to <2 x i16>
@@ -77,8 +103,7 @@ define <2 x i16> @abs(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i16> @abs(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[RES:%.*]] = call <2 x i16> @llvm.abs.v2i16(<2 x i16> [[ZEXT]], i1 false)
-; CHECK-NEXT: ret <2 x i16> [[RES]]
+; CHECK-NEXT: ret <2 x i16> [[ZEXT]]
 ;
   %zext = zext <2 x i8> %a to <2 x i16>
   %res = call <2 x i16> @llvm.abs(<2 x i16> %zext, i1 false)
@@ -89,7 +114,7 @@ define <2 x i16> @saturating(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i16> @saturating(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[RES:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[ZEXT]], <2 x i16> <i16 1, i16 1>)
+; CHECK-NEXT: [[RES:%.*]] = add nuw nsw <2 x i16> [[ZEXT]], <i16 1, i16 1>
 ; CHECK-NEXT: ret <2 x i16> [[RES]]
 ;
   %zext = zext <2 x i8> %a to <2 x i16>
@@ -101,7 +126,8 @@ define {<2 x i16>, <2 x i1>} @with_overflow(<2 x i8> %a) {
 ; CHECK-LABEL: define { <2 x i16>, <2 x i1> } @with_overflow(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[RES:%.*]] = call { <2 x i16>, <2 x i1> } @llvm.uadd.with.overflow.v2i16(<2 x i16> [[ZEXT]], <2 x i16> <i16 1, i16 1>)
+; CHECK-NEXT: [[RES1:%.*]] = add nuw nsw <2 x i16> [[ZEXT]], <i16 1, i16 1>
+; CHECK-NEXT: [[RES:%.*]] = insertvalue { <2 x i16>, <2 x i1> } { <2 x i16> poison, <2 x i1> zeroinitializer }, <2 x i16> [[RES1]], 0
 ; CHECK-NEXT: ret { <2 x i16>, <2 x i1> } [[RES]]
 ;
   %zext = zext <2 x i8> %a to <2 x i16>
@@ -113,7 +139,9 @@ define <2 x i16> @srem1(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i16> @srem1(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[RES:%.*]] = srem <2 x i16> [[ZEXT]], <i16 42, i16 42>
+; CHECK-NEXT: [[RES1_LHS_TRUNC:%.*]] = trunc <2 x i16> [[ZEXT]] to <2 x i8>
+; CHECK-NEXT: [[RES12:%.*]] = urem <2 x i8> [[RES1_LHS_TRUNC]], <i8 42, i8 42>
+; CHECK-NEXT: [[RES:%.*]] = zext <2 x i8> [[RES12]] to <2 x i16>
 ; CHECK-NEXT: ret <2 x i16> [[RES]]
 ;
   %zext = zext <2 x i8> %a to <2 x i16>
@@ -125,7 +153,9 @@ define <2 x i16> @srem2(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i16> @srem2(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = sext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[RES:%.*]] = srem <2 x i16> [[ZEXT]], <i16 42, i16 42>
+; CHECK-NEXT: [[RES_LHS_TRUNC:%.*]] = trunc <2 x i16> [[ZEXT]] to <2 x i8>
+; CHECK-NEXT: [[RES1:%.*]] = srem <2 x i8> [[RES_LHS_TRUNC]], <i8 42, i8 42>
+; CHECK-NEXT: [[RES:%.*]] = sext <2 x i8> [[RES1]] to <2 x i16>
 ; CHECK-NEXT: ret <2 x i16> [[RES]]
 ;
   %zext = sext <2 x i8> %a to <2 x i16>
@@ -137,7 +167,7 @@ define <2 x i16> @ashr(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i16> @ashr(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[RES:%.*]] = ashr <2 x i16> [[ZEXT]], <i16 1, i16 1>
+; CHECK-NEXT: [[RES:%.*]] = lshr <2 x i16> [[ZEXT]], <i16 1, i16 1>
 ; CHECK-NEXT: ret <2 x i16> [[RES]]
 ;
   %zext = zext <2 x i8> %a to <2 x i16>
@@ -149,7 +179,7 @@ define <2 x i32> @sext(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i32> @sext(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[RES:%.*]] = sext <2 x i16> [[ZEXT]] to <2 x i32>
+; CHECK-NEXT: [[RES:%.*]] = zext nneg <2 x i16> [[ZEXT]] to <2 x i32>
 ; CHECK-NEXT: ret <2 x i32> [[RES]]
 ;
   %zext = zext <2 x i8> %a to <2 x i16>
@@ -161,7 +191,7 @@ define <2 x float> @sitofp(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x float> @sitofp(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16>
-; CHECK-NEXT: [[RES:%.*]] = sitofp <2 x i16> [[ZEXT]] to <2 x float>
+; CHECK-NEXT: [[RES:%.*]] = uitofp nneg <2 x i16> [[ZEXT]] to <2 x float>
 ; CHECK-NEXT: ret <2 x float> [[RES]]
 ;
   %zext = zext <2 x i8> %a to <2 x i16>
@@ -169,6 +199,7 @@ define <2 x float> @sitofp(<2 x i8> %a) {
   ret <2 x float> %res
 }
 
+; TODO: Add support for this.
 define <2 x i16> @and(<2 x i8> %a) {
 ; CHECK-LABEL: define <2 x i16> @and(
 ; CHECK-SAME: <2 x i8> [[A:%.*]]) {
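
A sketch of the main missing piece mentioned in the commit message (not
part of the commit; @cmp_to_constant is a made-up name): the "simplest"
fold, comparing against a constant, still bails out on vectors, which is
what the "; TODO: Add support for this." markers in vectors.ll cover. A
followup could fold, for example:

    define <2 x i1> @cmp_to_constant(<2 x i8> %a) {
      %zext = zext <2 x i8> %a to <2 x i16>   ; lane range [0, 255]
      %cmp = icmp ugt <2 x i16> %zext, <i16 255, i16 255>
      ret <2 x i1> %cmp                       ; always false
    }

to "ret <2 x i1> zeroinitializer", since no lane of %zext can exceed 255.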