diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index c2b6ad43ccd6d..4f999edf3d571 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -184,3 +184,355 @@ vector.body:                                      ; preds = %vector.header, %vec
 for.end12:                                        ; preds = %vector.body
   ret void
 }
+
+; @larger_smull: s[i] = sext(x[i]) * sext(y) for 0 <= i < n, written as a vector loop (16 x i16 per iteration) plus a scalar remainder loop.
+define void @larger_smull(i16* nocapture noundef readonly %x, i16 noundef %y, i32* noalias nocapture noundef writeonly %s, i32 noundef %n) {
+; CHECK-LABEL: larger_smull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w3, #1
+; CHECK-NEXT:    b.lt .LBB3_8
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    sxth w8, w1
+; CHECK-NEXT:    mov w9, w3
+; CHECK-NEXT:    cmp w3, #15
+; CHECK-NEXT:    b.hi .LBB3_3
+; CHECK-NEXT:  // %bb.2:
+; CHECK-NEXT:    mov x10, xzr
+; CHECK-NEXT:    b .LBB3_6
+; CHECK-NEXT:  .LBB3_3: // %vector.ph
+; CHECK-NEXT:    and x10, x9, #0xfffffff0
+; CHECK-NEXT:    add x11, x2, #32
+; CHECK-NEXT:    add x12, x0, #16
+; CHECK-NEXT:    mov x13, x10
+; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:  .LBB3_4: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
+; CHECK-NEXT:    subs x13, x13, #16
+; CHECK-NEXT:    add x12, x12, #32
+; CHECK-NEXT:    sshll2 v3.4s, v1.8h, #0
+; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
+; CHECK-NEXT:    sshll2 v4.4s, v2.8h, #0
+; CHECK-NEXT:    sshll v2.4s, v2.4h, #0
+; CHECK-NEXT:    mul v3.4s, v0.4s, v3.4s
+; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mul v4.4s, v0.4s, v4.4s
+; CHECK-NEXT:    mul v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    stp q1, q3, [x11, #-32]
+; CHECK-NEXT:    stp q2, q4, [x11], #64
+; CHECK-NEXT:    b.ne .LBB3_4
+; CHECK-NEXT:  // %bb.5: // %middle.block
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    b.eq .LBB3_8
+; CHECK-NEXT:  .LBB3_6: // %for.body.preheader1
+; CHECK-NEXT:    sub x9, x9, x10
+; CHECK-NEXT:    add x11, x2, x10, lsl #2
+; CHECK-NEXT:    add x10, x0, x10, lsl #1
+; CHECK-NEXT:  .LBB3_7: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrsh w12, [x10], #2
+; CHECK-NEXT:    subs x9, x9, #1
+; CHECK-NEXT:    mul w12, w12, w8
+; CHECK-NEXT:    str w12, [x11], #4
+; CHECK-NEXT:    b.ne .LBB3_7
+; CHECK-NEXT:  .LBB3_8: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %conv1 = sext i16 %y to i32  ; scalar multiplier, sign-extended once up front
+  %cmp8 = icmp sgt i32 %n, 0  ; nothing to do when n <= 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %min.iters.check = icmp ult i32 %n, 16  ; fewer than 16 elements: use only the scalar loop
+  br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i64 %wide.trip.count, 4294967280  ; 4294967280 = 0xFFFFFFF0: trip count rounded down to a multiple of 16
+  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
+  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer  ; multiplier splatted to all 8 lanes
+  %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
+  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer  ; identical splat used for the second half of each iteration
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i16, i16* %x, i64 %index
+  %1 = bitcast i16* %0 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2  ; x[index .. index+7]
+  %2 = getelementptr inbounds i16, i16* %0, i64 8
+  %3 = bitcast i16* %2 to <8 x i16>*
+  %wide.load11 = load <8 x i16>, <8 x i16>* %3, align 2  ; x[index+8 .. index+15]
+  %4 = sext <8 x i16> %wide.load to <8 x i32>  ; widen i16 -> i32; the expected assembly above uses sshll/sshll2 followed by mul
+  %5 = sext <8 x i16> %wide.load11 to <8 x i32>
+  %6 = mul nsw <8 x i32> %broadcast.splat, %4
+  %7 = mul nsw <8 x i32> %broadcast.splat13, %5
+  %8 = getelementptr inbounds i32, i32* %s, i64 %index
+  %9 = bitcast i32* %8 to <8 x i32>*
+  store <8 x i32> %6, <8 x i32>* %9, align 4  ; s[index .. index+7]
+  %10 = getelementptr inbounds i32, i32* %8, i64 8
+  %11 = bitcast i32* %10 to <8 x i32>*
+  store <8 x i32> %7, <8 x i32>* %11, align 4  ; s[index+8 .. index+15]
+  %index.next = add nuw i64 %index, 16  ; 16 elements consumed per iteration
+  %12 = icmp eq i64 %index.next, %n.vec
+  br i1 %12, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count  ; true when n is a multiple of 16, so no remainder is left
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14
+
+for.body.preheader14:                             ; preds = %for.body.preheader, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]  ; scalar loop starts at 0 (vector loop skipped) or at n.vec
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader14, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
+  %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv
+  %13 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %13 to i32
+  %mul = mul nsw i32 %conv, %conv1  ; scalar form of the same product: sext(x[i]) * sext(y)
+  %arrayidx3 = getelementptr inbounds i32, i32* %s, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; @larger_umull: s[i] = zext(x[i]) * zext(y) for 0 <= i < n, written as a vector loop (16 x i16 per iteration) plus a scalar remainder loop.
+define void @larger_umull(i16* nocapture noundef readonly %x, i16 noundef %y, i32* noalias nocapture noundef writeonly %s, i32 noundef %n) {
+; CHECK-LABEL: larger_umull:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cmp w3, #1
+; CHECK-NEXT:    b.lt .LBB4_8
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    and w8, w1, #0xffff
+; CHECK-NEXT:    mov w9, w3
+; CHECK-NEXT:    cmp w3, #15
+; CHECK-NEXT:    b.hi .LBB4_3
+; CHECK-NEXT:  // %bb.2:
+; CHECK-NEXT:    mov x10, xzr
+; CHECK-NEXT:    b .LBB4_6
+; CHECK-NEXT:  .LBB4_3: // %vector.ph
+; CHECK-NEXT:    and x10, x9, #0xfffffff0
+; CHECK-NEXT:    add x11, x2, #32
+; CHECK-NEXT:    add x12, x0, #16
+; CHECK-NEXT:    mov x13, x10
+; CHECK-NEXT:    dup v0.4s, w8
+; CHECK-NEXT:  .LBB4_4: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldp q1, q2, [x12, #-16]
+; CHECK-NEXT:    subs x13, x13, #16
+; CHECK-NEXT:    add x12, x12, #32
+; CHECK-NEXT:    ushll2 v3.4s, v1.8h, #0
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ushll2 v4.4s, v2.8h, #0
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-NEXT:    mul v3.4s, v0.4s, v3.4s
+; CHECK-NEXT:    mul v1.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mul v4.4s, v0.4s, v4.4s
+; CHECK-NEXT:    mul v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    stp q1, q3, [x11, #-32]
+; CHECK-NEXT:    stp q2, q4, [x11], #64
+; CHECK-NEXT:    b.ne .LBB4_4
+; CHECK-NEXT:  // %bb.5: // %middle.block
+; CHECK-NEXT:    cmp x10, x9
+; CHECK-NEXT:    b.eq .LBB4_8
+; CHECK-NEXT:  .LBB4_6: // %for.body.preheader1
+; CHECK-NEXT:    sub x9, x9, x10
+; CHECK-NEXT:    add x11, x2, x10, lsl #2
+; CHECK-NEXT:    add x10, x0, x10, lsl #1
+; CHECK-NEXT:  .LBB4_7: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrh w12, [x10], #2
+; CHECK-NEXT:    subs x9, x9, #1
+; CHECK-NEXT:    mul w12, w12, w8
+; CHECK-NEXT:    str w12, [x11], #4
+; CHECK-NEXT:    b.ne .LBB4_7
+; CHECK-NEXT:  .LBB4_8: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %conv1 = zext i16 %y to i32  ; scalar multiplier, zero-extended once up front
+  %cmp8 = icmp sgt i32 %n, 0  ; nothing to do when n <= 0
+  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %min.iters.check = icmp ult i32 %n, 16  ; fewer than 16 elements: use only the scalar loop
+  br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i64 %wide.trip.count, 4294967280  ; 4294967280 = 0xFFFFFFF0: trip count rounded down to a multiple of 16
+  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
+  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer  ; multiplier splatted to all 8 lanes
+  %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
+  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer  ; identical splat used for the second half of each iteration
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i16, i16* %x, i64 %index
+  %1 = bitcast i16* %0 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2  ; x[index .. index+7]
+  %2 = getelementptr inbounds i16, i16* %0, i64 8
+  %3 = bitcast i16* %2 to <8 x i16>*
+  %wide.load11 = load <8 x i16>, <8 x i16>* %3, align 2  ; x[index+8 .. index+15]
+  %4 = zext <8 x i16> %wide.load to <8 x i32>  ; widen i16 -> i32; the expected assembly above uses ushll/ushll2 followed by mul
+  %5 = zext <8 x i16> %wide.load11 to <8 x i32>
+  %6 = mul nuw <8 x i32> %broadcast.splat, %4
+  %7 = mul nuw <8 x i32> %broadcast.splat13, %5
+  %8 = getelementptr inbounds i32, i32* %s, i64 %index
+  %9 = bitcast i32* %8 to <8 x i32>*
+  store <8 x i32> %6, <8 x i32>* %9, align 4  ; s[index .. index+7]
+  %10 = getelementptr inbounds i32, i32* %8, i64 8
+  %11 = bitcast i32* %10 to <8 x i32>*
+  store <8 x i32> %7, <8 x i32>* %11, align 4  ; s[index+8 .. index+15]
+  %index.next = add nuw i64 %index, 16  ; 16 elements consumed per iteration
+  %12 = icmp eq i64 %index.next, %n.vec
+  br i1 %12, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count  ; true when n is a multiple of 16, so no remainder is left
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14
+
+for.body.preheader14:                             ; preds = %for.body.preheader, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]  ; scalar loop starts at 0 (vector loop skipped) or at n.vec
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader14, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
+  %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv
+  %13 = load i16, i16* %arrayidx, align 2
+  %conv = zext i16 %13 to i32
+  %mul = mul nuw i32 %conv, %conv1  ; scalar form of the same product: zext(x[i]) * zext(y)
+  %arrayidx3 = getelementptr inbounds i32, i32* %s, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+; @red_mla_dup_ext_u8_s8_s16: returns the i16 sum over 0 <= i < n of zext(A[i]) * sext(B) (0 when n == 0), as a vector loop with two accumulators plus a scalar remainder loop.
+define i16 @red_mla_dup_ext_u8_s8_s16(i8* noalias nocapture noundef readonly %A, i8 noundef %B, i32 noundef %n) {
+; CHECK-LABEL: red_mla_dup_ext_u8_s8_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    cbz w2, .LBB5_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    sxtb w9, w1
+; CHECK-NEXT:    mov w10, w2
+; CHECK-NEXT:    cmp w2, #15
+; CHECK-NEXT:    b.hi .LBB5_4
+; CHECK-NEXT:  // %bb.2:
+; CHECK-NEXT:    mov x11, xzr
+; CHECK-NEXT:    mov w8, wzr
+; CHECK-NEXT:    b .LBB5_7
+; CHECK-NEXT:  .LBB5_3:
+; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB5_4: // %vector.ph
+; CHECK-NEXT:    and x11, x10, #0xfffffff0
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    mov x12, x11
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    dup v2.8h, w9
+; CHECK-NEXT:  .LBB5_5: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldp d3, d4, [x8, #-8]
+; CHECK-NEXT:    subs x12, x12, #16
+; CHECK-NEXT:    add x8, x8, #16
+; CHECK-NEXT:    ushll v3.8h, v3.8b, #0
+; CHECK-NEXT:    ushll v4.8h, v4.8b, #0
+; CHECK-NEXT:    mla v0.8h, v2.8h, v3.8h
+; CHECK-NEXT:    mla v1.8h, v2.8h, v4.8h
+; CHECK-NEXT:    b.ne .LBB5_5
+; CHECK-NEXT:  // %bb.6: // %middle.block
+; CHECK-NEXT:    add v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    cmp x11, x10
+; CHECK-NEXT:    addv h0, v0.8h
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    b.eq .LBB5_9
+; CHECK-NEXT:  .LBB5_7: // %for.body.preheader1
+; CHECK-NEXT:    sub x10, x10, x11
+; CHECK-NEXT:    add x11, x0, x11
+; CHECK-NEXT:  .LBB5_8: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrb w12, [x11], #1
+; CHECK-NEXT:    subs x10, x10, #1
+; CHECK-NEXT:    madd w8, w12, w9, w8
+; CHECK-NEXT:    b.ne .LBB5_8
+; CHECK-NEXT:  .LBB5_9: // %for.cond.cleanup
+; CHECK-NEXT:    mov w0, w8
+; CHECK-NEXT:    ret
+entry:
+  %conv2 = sext i8 %B to i16  ; scalar multiplier, sign-extended once up front
+  %cmp10.not = icmp eq i32 %n, 0  ; n == 0 returns 0 via the phi in for.cond.cleanup
+  br i1 %cmp10.not, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %min.iters.check = icmp ult i32 %n, 16  ; fewer than 16 elements: use only the scalar loop
+  br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i64 %wide.trip.count, 4294967280  ; 4294967280 = 0xFFFFFFF0: trip count rounded down to a multiple of 16
+  %broadcast.splatinsert = insertelement <8 x i16> poison, i16 %conv2, i64 0
+  %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer  ; multiplier splatted to all 8 lanes
+  %broadcast.splatinsert15 = insertelement <8 x i16> poison, i16 %conv2, i64 0
+  %broadcast.splat16 = shufflevector <8 x i16> %broadcast.splatinsert15, <8 x i16> poison, <8 x i32> zeroinitializer  ; identical splat used for the second half of each iteration
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %8, %vector.body ]  ; accumulator for A[index .. index+7]
+  %vec.phi13 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]  ; accumulator for A[index+8 .. index+15]
+  %0 = getelementptr inbounds i8, i8* %A, i64 %index
+  %1 = bitcast i8* %0 to <8 x i8>*
+  %wide.load = load <8 x i8>, <8 x i8>* %1, align 1
+  %2 = getelementptr inbounds i8, i8* %0, i64 8
+  %3 = bitcast i8* %2 to <8 x i8>*
+  %wide.load14 = load <8 x i8>, <8 x i8>* %3, align 1
+  %4 = zext <8 x i8> %wide.load to <8 x i16>  ; widen u8 -> i16; the expected assembly above uses ushll followed by mla
+  %5 = zext <8 x i8> %wide.load14 to <8 x i16>
+  %6 = mul nsw <8 x i16> %broadcast.splat, %4
+  %7 = mul nsw <8 x i16> %broadcast.splat16, %5
+  %8 = add <8 x i16> %6, %vec.phi
+  %9 = add <8 x i16> %7, %vec.phi13
+  %index.next = add nuw i64 %index, 16  ; 16 elements consumed per iteration
+  %10 = icmp eq i64 %index.next, %n.vec
+  br i1 %10, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %bin.rdx = add <8 x i16> %9, %8  ; combine the two accumulators lane by lane
+  %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %bin.rdx)  ; sum the 8 lanes into one i16
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count  ; true when n is a multiple of 16, so no remainder is left
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17
+
+for.body.preheader17:                             ; preds = %for.body.preheader, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]  ; scalar loop starts at 0 (vector loop skipped) or at n.vec
+  %s.011.ph = phi i16 [ 0, %for.body.preheader ], [ %11, %middle.block ]  ; running sum carried over from the vector loop, or 0
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  %s.0.lcssa = phi i16 [ 0, %entry ], [ %11, %middle.block ], [ %add, %for.body ]  ; result: 0, vector-only sum, or sum after the remainder loop
+  ret i16 %s.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader17, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ]
+  %s.011 = phi i16 [ %add, %for.body ], [ %s.011.ph, %for.body.preheader17 ]
+  %arrayidx = getelementptr inbounds i8, i8* %A, i64 %indvars.iv
+  %12 = load i8, i8* %arrayidx, align 1
+  %13 = zext i8 %12 to i16
+  %mul = mul nsw i16 %13, %conv2  ; scalar form of the same product: zext(A[i]) * sext(B)
+  %add = add i16 %mul, %s.011
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)  ; lane-sum intrinsic called from middle.block of @red_mla_dup_ext_u8_s8_s16