diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def index 1eae3586d16b8..23108e429eda8 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -28,6 +28,7 @@ HANDLE_NODETYPE(BR_IF) HANDLE_NODETYPE(BR_TABLE) HANDLE_NODETYPE(DOT) HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U) +HANDLE_NODETYPE(EXT_ADD_PAIRWISE_S) HANDLE_NODETYPE(SHUFFLE) HANDLE_NODETYPE(SWIZZLE) HANDLE_NODETYPE(VEC_SHL) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index fe100dab427ef..aea27ba32d37e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -422,24 +422,30 @@ bool WebAssemblyTargetLowering::shouldExpandPartialReductionIntrinsic( return true; EVT VT = EVT::getEVT(I->getType()); + if (VT.getSizeInBits() > 128) + return true; + auto Op1 = I->getOperand(1); if (auto *InputInst = dyn_cast(Op1)) { - if (InstructionOpcodeToISD(InputInst->getOpcode()) != ISD::MUL) - return true; - - if (isa(InputInst->getOperand(0)) && - isa(InputInst->getOperand(1))) { - // dot only supports signed inputs but also support lowering unsigned. - if (cast(InputInst->getOperand(0))->getOpcode() != - cast(InputInst->getOperand(1))->getOpcode()) - return true; - - EVT Op1VT = EVT::getEVT(Op1->getType()); - if (Op1VT.getVectorElementType() == VT.getVectorElementType() && - ((VT.getVectorElementCount() * 2 == Op1VT.getVectorElementCount()) || - (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount()))) - return false; + unsigned Opcode = InstructionOpcodeToISD(InputInst->getOpcode()); + if (Opcode == ISD::MUL) { + if (isa(InputInst->getOperand(0)) && + isa(InputInst->getOperand(1))) { + // dot only supports signed inputs but also support lowering unsigned. + if (cast(InputInst->getOperand(0))->getOpcode() != + cast(InputInst->getOperand(1))->getOpcode()) + return true; + + EVT Op1VT = EVT::getEVT(Op1->getType()); + if (Op1VT.getVectorElementType() == VT.getVectorElementType() && + ((VT.getVectorElementCount() * 2 == + Op1VT.getVectorElementCount()) || + (VT.getVectorElementCount() * 4 == Op1VT.getVectorElementCount()))) + return false; + } + } else if (ISD::isExtOpcode(Opcode)) { + return false; } } return true; @@ -2117,77 +2123,93 @@ SDValue performLowerPartialReduction(SDNode *N, SelectionDAG &DAG) { assert(N->getValueType(0) == MVT::v4i32 && "can only support v4i32"); SDLoc DL(N); - SDValue Mul = N->getOperand(2); - assert(Mul->getOpcode() == ISD::MUL && "expected mul input"); - - SDValue ExtendLHS = Mul->getOperand(0); - SDValue ExtendRHS = Mul->getOperand(1); - assert((ISD::isExtOpcode(ExtendLHS.getOpcode()) && - ISD::isExtOpcode(ExtendRHS.getOpcode())) && - "expected widening mul"); - assert(ExtendLHS.getOpcode() == ExtendRHS.getOpcode() && - "expected mul to use the same extend for both operands"); - - SDValue ExtendInLHS = ExtendLHS->getOperand(0); - SDValue ExtendInRHS = ExtendRHS->getOperand(0); - bool IsSigned = ExtendLHS->getOpcode() == ISD::SIGN_EXTEND; - - if (ExtendInLHS->getValueType(0) == MVT::v8i16) { - if (IsSigned) { - // i32x4.dot_i16x8_s - SDValue Dot = DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, - ExtendInLHS, ExtendInRHS); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Dot); - } - unsigned LowOpc = WebAssemblyISD::EXTEND_LOW_U; - unsigned HighOpc = WebAssemblyISD::EXTEND_HIGH_U; + SDValue Input = N->getOperand(2); + if (Input->getOpcode() == ISD::MUL) { + SDValue ExtendLHS = Input->getOperand(0); + SDValue ExtendRHS = Input->getOperand(1); + assert((ISD::isExtOpcode(ExtendLHS.getOpcode()) && + ISD::isExtOpcode(ExtendRHS.getOpcode())) && + "expected widening mul or add"); + assert(ExtendLHS.getOpcode() == ExtendRHS.getOpcode() && + "expected binop to use the same extend for both operands"); + + SDValue ExtendInLHS = ExtendLHS->getOperand(0); + SDValue ExtendInRHS = ExtendRHS->getOperand(0); + bool IsSigned = ExtendLHS->getOpcode() == ISD::SIGN_EXTEND; + unsigned LowOpc = + IsSigned ? WebAssemblyISD::EXTEND_LOW_S : WebAssemblyISD::EXTEND_LOW_U; + unsigned HighOpc = IsSigned ? WebAssemblyISD::EXTEND_HIGH_S + : WebAssemblyISD::EXTEND_HIGH_U; + SDValue LowLHS; + SDValue LowRHS; + SDValue HighLHS; + SDValue HighRHS; + + auto AssignInputs = [&](MVT VT) { + LowLHS = DAG.getNode(LowOpc, DL, VT, ExtendInLHS); + LowRHS = DAG.getNode(LowOpc, DL, VT, ExtendInRHS); + HighLHS = DAG.getNode(HighOpc, DL, VT, ExtendInLHS); + HighRHS = DAG.getNode(HighOpc, DL, VT, ExtendInRHS); + }; - // (add (add (extmul_low_sx lhs, rhs), (extmul_high_sx lhs, rhs))) - SDValue LowLHS = DAG.getNode(LowOpc, DL, MVT::v4i32, ExtendInLHS); - SDValue LowRHS = DAG.getNode(LowOpc, DL, MVT::v4i32, ExtendInRHS); - SDValue HighLHS = DAG.getNode(HighOpc, DL, MVT::v4i32, ExtendInLHS); - SDValue HighRHS = DAG.getNode(HighOpc, DL, MVT::v4i32, ExtendInRHS); + if (ExtendInLHS->getValueType(0) == MVT::v8i16) { + if (IsSigned) { + // i32x4.dot_i16x8_s + SDValue Dot = DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, + ExtendInLHS, ExtendInRHS); + return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Dot); + } - SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v4i32, LowLHS, LowRHS); - SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v4i32, HighLHS, HighRHS); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, MulLow, MulHigh); - return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); + // (add (add (extmul_low_sx lhs, rhs), (extmul_high_sx lhs, rhs))) + MVT VT = MVT::v4i32; + AssignInputs(VT); + SDValue MulLow = DAG.getNode(ISD::MUL, DL, VT, LowLHS, LowRHS); + SDValue MulHigh = DAG.getNode(ISD::MUL, DL, VT, HighLHS, HighRHS); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, MulLow, MulHigh); + return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(1), Add); + } else { + assert(ExtendInLHS->getValueType(0) == MVT::v16i8 && + "expected v16i8 input types"); + AssignInputs(MVT::v8i16); + // Lower to a wider tree, using twice the operations compared to above. + if (IsSigned) { + // Use two dots + SDValue DotLHS = + DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, LowLHS, LowRHS); + SDValue DotRHS = + DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, HighLHS, HighRHS); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, DotLHS, DotRHS); + return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); + } + + SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS); + SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS); + + SDValue AddLow = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, + MVT::v4i32, MulLow); + SDValue AddHigh = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, + MVT::v4i32, MulHigh); + SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, AddLow, AddHigh); + return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); + } } else { - assert(ExtendInLHS->getValueType(0) == MVT::v16i8 && - "expected v16i8 input types"); - // Lower to a wider tree, using twice the operations compared to above. - if (IsSigned) { - // Use two dots - unsigned LowOpc = WebAssemblyISD::EXTEND_LOW_S; - unsigned HighOpc = WebAssemblyISD::EXTEND_HIGH_S; - SDValue LowLHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInLHS); - SDValue LowRHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInRHS); - SDValue HighLHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInLHS); - SDValue HighRHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInRHS); - SDValue DotLHS = - DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, LowLHS, LowRHS); - SDValue DotRHS = - DAG.getNode(WebAssemblyISD::DOT, DL, MVT::v4i32, HighLHS, HighRHS); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, DotLHS, DotRHS); + // Accumulate the input using extadd_pairwise. + assert(ISD::isExtOpcode(Input.getOpcode()) && "expected extend"); + bool IsSigned = Input->getOpcode() == ISD::SIGN_EXTEND; + unsigned PairwiseOpc = IsSigned ? WebAssemblyISD::EXT_ADD_PAIRWISE_S + : WebAssemblyISD::EXT_ADD_PAIRWISE_U; + SDValue ExtendIn = Input->getOperand(0); + if (ExtendIn->getValueType(0) == MVT::v8i16) { + SDValue Add = DAG.getNode(PairwiseOpc, DL, MVT::v4i32, ExtendIn); return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); } - unsigned LowOpc = WebAssemblyISD::EXTEND_LOW_U; - unsigned HighOpc = WebAssemblyISD::EXTEND_HIGH_U; - SDValue LowLHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInLHS); - SDValue LowRHS = DAG.getNode(LowOpc, DL, MVT::v8i16, ExtendInRHS); - SDValue HighLHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInLHS); - SDValue HighRHS = DAG.getNode(HighOpc, DL, MVT::v8i16, ExtendInRHS); - - SDValue MulLow = DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS); - SDValue MulHigh = DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS); - - SDValue AddLow = - DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, MVT::v4i32, MulLow); - SDValue AddHigh = DAG.getNode(WebAssemblyISD::EXT_ADD_PAIRWISE_U, DL, - MVT::v4i32, MulHigh); - SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::v4i32, AddLow, AddHigh); + assert(ExtendIn->getValueType(0) == MVT::v16i8 && + "expected v16i8 input types"); + SDValue Add = + DAG.getNode(PairwiseOpc, DL, MVT::v4i32, + DAG.getNode(PairwiseOpc, DL, MVT::v8i16, ExtendIn)); return DAG.getNode(ISD::ADD, DL, MVT::v4i32, N->getOperand(1), Add); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 3c26b453c4482..d8948ad2df037 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -1454,12 +1454,13 @@ def : Pat<(t1.vt (bitconvert (t2.vt V128:$v))), (t1.vt V128:$v)>; // Extended pairwise addition def extadd_pairwise_u : SDNode<"WebAssemblyISD::EXT_ADD_PAIRWISE_U", extend_t>; +def extadd_pairwise_s : SDNode<"WebAssemblyISD::EXT_ADD_PAIRWISE_S", extend_t>; -defm "" : SIMDConvert; defm "" : SIMDConvert; -defm "" : SIMDConvert; defm "" : SIMDConvert; @@ -1468,6 +1469,10 @@ def : Pat<(v4i32 (int_wasm_extadd_pairwise_unsigned (v8i16 V128:$in))), (extadd_pairwise_u_I32x4 V128:$in)>; def : Pat<(v8i16 (int_wasm_extadd_pairwise_unsigned (v16i8 V128:$in))), (extadd_pairwise_u_I16x8 V128:$in)>; +def : Pat<(v4i32 (int_wasm_extadd_pairwise_signed (v8i16 V128:$in))), + (extadd_pairwise_s_I32x4 V128:$in)>; +def : Pat<(v8i16 (int_wasm_extadd_pairwise_signed (v16i8 V128:$in))), + (extadd_pairwise_s_I16x8 V128:$in)>; // f64x2 <-> f32x4 conversions def demote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 0eefd3e2b3500..92a9812df2127 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -316,7 +316,13 @@ InstructionCost WebAssemblyTTIImpl::getPartialReductionCost( if (CostKind != TTI::TCK_RecipThroughput) return Invalid; - InstructionCost Cost(TTI::TCC_Basic); + if (Opcode != Instruction::Add) + return Invalid; + + EVT AccumEVT = EVT::getEVT(AccumType); + // TODO: Add i64 accumulator. + if (AccumEVT != MVT::i32) + return Invalid; // Possible options: // - i16x8.extadd_pairwise_i8x16_sx @@ -324,23 +330,26 @@ InstructionCost WebAssemblyTTIImpl::getPartialReductionCost( // - i32x4.dot_i16x8_s // Only try to support dot, for now. - if (Opcode != Instruction::Add) + EVT InputEVT = EVT::getEVT(InputTypeA); + if (!((InputEVT == MVT::i16 && VF.getFixedValue() == 8) || + (InputEVT == MVT::i8 && VF.getFixedValue() == 16))) { return Invalid; + } - if (!BinOp || *BinOp != Instruction::Mul) + if (OpAExtend == TTI::PR_None) return Invalid; - if (InputTypeA != InputTypeB) - return Invalid; + InstructionCost Cost(TTI::TCC_Basic); + if (!BinOp) + return Cost; if (OpAExtend != OpBExtend) return Invalid; - EVT InputEVT = EVT::getEVT(InputTypeA); - EVT AccumEVT = EVT::getEVT(AccumType); + if (*BinOp != Instruction::Mul) + return Invalid; - // TODO: Add i64 accumulator. - if (AccumEVT != MVT::i32) + if (InputTypeA != InputTypeB) return Invalid; // Signed inputs can lower to dot diff --git a/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll new file mode 100644 index 0000000000000..47ea762864cc2 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll @@ -0,0 +1,609 @@ +; RUN: opt -mattr=+simd128 -passes=loop-vectorize %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s +; RUN: opt -mattr=+simd128 -passes=loop-vectorize -vectorizer-maximize-bandwidth %s | llc -mtriple=wasm32 -mattr=+simd128 -verify-machineinstrs -o - | FileCheck %s --check-prefix=MAX-BANDWIDTH + +target triple = "wasm32" + +define hidden i32 @accumulate_add_u8_u8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_u8_u8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add + +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.010 + %0 = load i8, ptr %arrayidx, align 1 + %conv = zext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.010 + %1 = load i8, ptr %arrayidx1, align 1 + %conv2 = zext i8 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_add_s8_s8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_s8_s8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.010 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.010 + %1 = load i8, ptr %arrayidx1, align 1 + %conv2 = sext i8 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_add_s8_u8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_s8_u8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.010 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.010 + %1 = load i8, ptr %arrayidx1, align 1 + %conv2 = zext i8 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_add_s8_s16(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_s8_s16: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add +; CHECK: i32x4.load16x4_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.010 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i16, ptr %b, i32 %i.010 + %1 = load i16, ptr %arrayidx1, align 2 + %conv2 = sext i16 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_shr_u8(ptr noundef readonly %a, i32 noundef %N) { +; CHECK-LABEL: accumulate_shr_u8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i8x16.shr_u +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shr_u +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp4.not = icmp eq i32 %N, 0 + br i1 %cmp4.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.06 + %0 = load i8, ptr %arrayidx, align 1 + %1 = lshr i8 %0, 1 + %shr = zext nneg i8 %1 to i32 + %add = add i32 %result.05, %shr + %inc = add nuw i32 %i.06, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_shr_s8(ptr noundef readonly %a, i32 noundef %N) { +; CHECK-LABEL: accumulate_shr_s8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i8x16.shr_s +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shr_s +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp4.not = icmp eq i32 %N, 0 + br i1 %cmp4.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.06 + %0 = load i8, ptr %arrayidx, align 1 + %1 = ashr i8 %0, 1 + %shr = sext i8 %1 to i32 + %add = add nsw i32 %result.05, %shr + %inc = add nuw i32 %i.06, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_max_u8_u8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_max_u8_u8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: v128.load32_zero +; CHECK: i8x16.max_u +; CHECK: i16x8.extend_low_i8x16_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.max_u +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp17.not = icmp eq i32 %N, 0 + br i1 %cmp17.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.019 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.018 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.019 + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.019 + %1 = load i8, ptr %arrayidx1, align 1 + %. = tail call i8 @llvm.umax.i8(i8 %0, i8 %1) + %cond = zext i8 %. to i32 + %add = add i32 %result.018, %cond + %inc = add nuw i32 %i.019, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_min_s8_s8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_min_s8_s8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: v128.load32_zero +; CHECK: i8x16.min_s +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.min_s +; MAX-BANDWIDTH: i16x8.extadd_pairwise_i8x16_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp17.not = icmp eq i32 %N, 0 + br i1 %cmp17.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.019 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.018 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.019 + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.019 + %1 = load i8, ptr %arrayidx1, align 1 + %. = tail call i8 @llvm.smin.i8(i8 %0, i8 %1) + %cond = sext i8 %. to i32 + %add = add nsw i32 %result.018, %cond + %inc = add nuw i32 %i.019, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_add_u16_u16(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_u16_u16: +; CHECK: loop +; CHECK: i32x4.load16x4_u +; CHECK: i32x4.add +; CHECK: i32x4.load16x4_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i16, ptr %a, i32 %i.010 + %0 = load i16, ptr %arrayidx, align 2 + %conv = zext i16 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i16, ptr %b, i32 %i.010 + %1 = load i16, ptr %arrayidx1, align 2 + %conv2 = zext i16 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_add_s16_s16(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_add_s16_s16: +; CHECK: loop +; CHECK: i32x4.load16x4_s +; CHECK: i32x4.add +; CHECK: i32x4.load16x4_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp8.not = icmp eq i32 %N, 0 + br i1 %cmp8.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.010 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.09 = phi i32 [ %add3, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i16, ptr %a, i32 %i.010 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sext i16 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i16, ptr %b, i32 %i.010 + %1 = load i16, ptr %arrayidx1, align 2 + %conv2 = sext i16 %1 to i32 + %add = add i32 %result.09, %conv + %add3 = add i32 %add, %conv2 + %inc = add nuw i32 %i.010, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_shr_u16(ptr noundef readonly %a, i32 noundef %N) { +; CHECK-LABEL: accumulate_shr_u16: +; CHECK: loop +; CHECK: v128.load64_zero +; CHECK: i16x8.shr_u +; CHECK: i32x4.extend_low_i16x8_u +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.shr_u +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_u +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp4.not = icmp eq i32 %N, 0 + br i1 %cmp4.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i16, ptr %a, i32 %i.06 + %0 = load i16, ptr %arrayidx, align 2 + %1 = lshr i16 %0, 1 + %shr = zext nneg i16 %1 to i32 + %add = add i32 %result.05, %shr + %inc = add nuw i32 %i.06, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_shr_s16(ptr noundef readonly %a, i32 noundef %N) { +; CHECK-LABEL: accumulate_shr_s16: +; CHECK: loop +; CHECK: v128.load64_zero +; CHECK: i16x8.shr_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i16x8.shr_s +; MAX-BANDWIDTH: i32x4.extadd_pairwise_i16x8_s +; MAX-BANDWIDTH: i32x4.add +entry: + %cmp4.not = icmp eq i32 %N, 0 + br i1 %cmp4.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i16, ptr %a, i32 %i.06 + %0 = load i16, ptr %arrayidx, align 2 + %1 = ashr i16 %0, 1 + %shr = sext i16 %1 to i32 + %add = add nsw i32 %result.05, %shr + %inc = add nuw i32 %i.06, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_sub_s8_s8(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_sub_s8_s8: +; CHECK: loop +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.add +; CHECK: v128.load32_zero +; CHECK: i16x8.extend_low_i8x16_s +; CHECK: i32x4.extend_low_i16x8_s +; CHECK: i32x4.sub + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.sub +; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.sub +; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.sub +entry: + %cmp7.not = icmp eq i32 %N, 0 + br i1 %cmp7.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i8, ptr %a, i32 %i.09 + %0 = load i8, ptr %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %b, i32 %i.09 + %1 = load i8, ptr %arrayidx1, align 1 + %conv2 = sext i8 %1 to i32 + %sub = add i32 %result.08, %conv + %add = sub i32 %sub, %conv2 + %inc = add nuw i32 %i.09, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +define hidden i32 @accumulate_sub_s16_s16(ptr noundef readonly %a, ptr noundef readonly %b, i32 noundef %N) { +; CHECK-LABEL: accumulate_sub_s16_s16: +; CHECK: loop +; CHECK: i32x4.load16x4_s +; CHECK: i32x4.add +; CHECK: i32x4.load16x4_s +; CHECK: i32x4.sub + +; MAX-BANDWIDTH: loop +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: v128.load +; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s +; MAX-BANDWIDTH: i32x4.sub +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.add +; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s +; MAX-BANDWIDTH: i32x4.sub +entry: + %cmp7.not = icmp eq i32 %N, 0 + br i1 %cmp7.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ] + ret i32 %result.0.lcssa + +for.body: ; preds = %entry, %for.body + %i.09 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds nuw i16, ptr %a, i32 %i.09 + %0 = load i16, ptr %arrayidx, align 2 + %conv = sext i16 %0 to i32 + %arrayidx1 = getelementptr inbounds nuw i16, ptr %b, i32 %i.09 + %1 = load i16, ptr %arrayidx1, align 2 + %conv2 = sext i16 %1 to i32 + %sub = add i32 %result.08, %conv + %add = sub i32 %sub, %conv2 + %inc = add nuw i32 %i.09, 1 + %exitcond.not = icmp eq i32 %inc, %N + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare i8 @llvm.umax.i8(i8, i8) + +declare i8 @llvm.smin.i8(i8, i8)