diff --git a/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem-constrained.c b/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem-constrained.c
index 6371339f0a40d..1d0db697e4fdd 100644
--- a/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem-constrained.c
+++ b/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem-constrained.c
@@ -103,7 +103,7 @@ float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
 // COMMONIR:        [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
 // UNCONSTRAINED:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
 // CONSTRAINED:     [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM:       fmla d{{[0-9]+}}, d{{[0-9]+}}, v{{[0-9]+}}.d[{{[0-9]+}}]
+// CHECK-ASM:       fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 // COMMONIR:        [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
 // COMMONIR:        ret <1 x double> [[TMP7]]
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
@@ -122,7 +122,7 @@ float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
 // COMMONIR:        [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
 // UNCONSTRAINED:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
 // CONSTRAINED:     [[TMP6:%.*]] = call double @llvm.experimental.constrained.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]], metadata !"round.tonearest", metadata !"fpexcept.strict")
-// CHECK-ASM:       fmla d{{[0-9]+}}, d{{[0-9]+}}, v{{[0-9]+}}.d[{{[0-9]+}}]
+// CHECK-ASM:       fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 // COMMONIR:        [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
 // COMMONIR:        ret <1 x double> [[TMP7]]
 float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 885b70a50121f..57d69ae05c47f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5409,6 +5409,44 @@ multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
                   (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
     let Inst{23-22} = 0b01; // 64-bit size flag
   }
+  // Lane-0 multiplicand: use the scalar rrr form on the h/s/d subregister.
+  let Predicates = [HasFullFP16] in { // f16 patterns need HasFullFP16.
+  def : Pat<(f16 (node (f16 FPR16:$Rn),
+                       (f16 (extractelt (v8f16 V128:$Rm), (i64 0))),
+                       (f16 FPR16:$Ra))),
+            (!cast<Instruction>(NAME # Hrrr)
+              FPR16:$Rn, (f16 (EXTRACT_SUBREG V128:$Rm, hsub)), FPR16:$Ra)>;
+  // Same, but with the lane-0 extract as the first operand ($Rn).
+  def : Pat<(f16 (node (f16 (extractelt (v8f16 V128:$Rn), (i64 0))),
+                       (f16 FPR16:$Rm),
+                       (f16 FPR16:$Ra))),
+            (!cast<Instruction>(NAME # Hrrr)
+              (f16 (EXTRACT_SUBREG V128:$Rn, hsub)), FPR16:$Rm, FPR16:$Ra)>;
+  }
+
+  def : Pat<(f32 (node (f32 FPR32:$Rn),
+                       (f32 (extractelt (v4f32 V128:$Rm), (i64 0))),
+                       (f32 FPR32:$Ra))),
+            (!cast<Instruction>(NAME # Srrr)
+              FPR32:$Rn, (EXTRACT_SUBREG V128:$Rm, ssub), FPR32:$Ra)>;
+  // Same, but with the lane-0 extract as the first operand ($Rn).
+  def : Pat<(f32 (node (f32 (extractelt (v4f32 V128:$Rn), (i64 0))),
+                       (f32 FPR32:$Rm),
+                       (f32 FPR32:$Ra))),
+            (!cast<Instruction>(NAME # Srrr)
+              (EXTRACT_SUBREG V128:$Rn, ssub), FPR32:$Rm, FPR32:$Ra)>;
+
+  def : Pat<(f64 (node (f64 FPR64:$Rn),
+                       (f64 (extractelt (v2f64 V128:$Rm), (i64 0))),
+                       (f64 FPR64:$Ra))),
+            (!cast<Instruction>(NAME # Drrr)
+              FPR64:$Rn, (EXTRACT_SUBREG V128:$Rm, dsub), FPR64:$Ra)>;
+  // Same, but with the lane-0 extract as the first operand ($Rn).
+  def : Pat<(f64 (node (f64 (extractelt (v2f64 V128:$Rn), (i64 0))),
+                       (f64 FPR64:$Rm),
+                       (f64 FPR64:$Ra))),
+            (!cast<Instruction>(NAME # Drrr)
+              (EXTRACT_SUBREG V128:$Rn, dsub), FPR64:$Rm, FPR64:$Ra)>;
 }
 
 //---
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
index f4e72ac5f810d..40433e2e076aa 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
@@ -7,16 +7,15 @@ target triple = "aarch64"
 define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
 ; CHECK-LABEL: complex_mul_v2f16:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov h3, v0.h[1]
-; CHECK-NEXT:    mov h2, v1.h[1]
-; CHECK-NEXT:    fmul h4, h0, v1.h[1]
-; CHECK-NEXT:    fnmul h2, h3, h2
-; CHECK-NEXT:    fmla h4, h3, v1.h[0]
-; CHECK-NEXT:    fmla h2, h0, v1.h[0]
-; CHECK-NEXT:    mov v2.h[1], v4.h[0]
-; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    mov h2, v0.h[1]
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    fmul h3, h0, v1.h[1]
+; CHECK-NEXT:    fmul h4, h2, v1.h[1]
+; CHECK-NEXT:    fmadd h2, h1, h2, h3
+; CHECK-NEXT:    fnmsub h0, h1, h0, h4
+; CHECK-NEXT:    mov v0.h[1], v2.h[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
 entry:
   %a.real   = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> <i32 0>
diff --git a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
index 5d956332de977..f68691ac15c62 100644
--- a/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
+++ b/llvm/test/CodeGen/AArch64/fp16_intrinsic_lane.ll
@@ -80,11 +80,11 @@ entry:
   ret <8 x half> %0
 }
 
-define dso_local half @t_vfmah_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
-; CHECK-LABEL: t_vfmah_lane_f16:
+define dso_local half @t_vfmah_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmah_lane_f16_0:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    fmla h0, h1, v2.h[0]
+; CHECK-NEXT:    fmadd h0, h1, h2, h0
 ; CHECK-NEXT:    ret
 entry:
   %extract = extractelement <4 x half> %c, i32 0
@@ -92,10 +92,34 @@ entry:
   ret half %0
 }
 
-define dso_local half @t_vfmah_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
-; CHECK-LABEL: t_vfmah_laneq_f16:
+define dso_local half @t_vfmah_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmah_lane_f16_0_swap:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmla h0, h1, v2.h[0]
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmadd h0, h2, h1, h0
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <4 x half> %c, i32 0
+  %0 = tail call half @llvm.fma.f16(half %extract, half %b, half %a)
+  ret half %0
+}
+
+define dso_local half @t_vfmah_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmah_lane_f16_3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmla h0, h1, v2.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <4 x half> %c, i32 3
+  %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
+  ret half %0
+}
+
+define dso_local half @t_vfmah_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmah_laneq_f16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmadd h0, h1, h2, h0
 ; CHECK-NEXT:    ret
 entry:
   %extract = extractelement <8 x half> %c, i32 0
@@ -103,6 +127,28 @@ entry:
   ret half %0
 }
 
+define dso_local half @t_vfmah_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmah_laneq_f16_0_swap:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmadd h0, h2, h1, h0
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <8 x half> %c, i32 0
+  %0 = tail call half @llvm.fma.f16(half %extract, half %b, half %a)
+  ret half %0
+}
+
+define dso_local half @t_vfmah_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmah_laneq_f16_7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmla h0, h1, v2.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %extract = extractelement <8 x half> %c, i32 7
+  %0 = tail call half @llvm.fma.f16(half %b, half %extract, half %a)
+  ret half %0
+}
+
 define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
 ; CHECK-LABEL: t_vfms_lane_f16:
 ; CHECK:       // %bb.0: // %entry
@@ -181,23 +227,49 @@ entry:
   ret <8 x half> %0
 }
 
-define dso_local half @t_vfmsh_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
-; CHECK-LABEL: t_vfmsh_lane_f16:
+define dso_local half @t_vfmsh_lane_f16_0(half %a, half %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsh_lane_f16_0:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmsub h0, h2, h1, h0
+; CHECK-NEXT:    ret
+entry:
+  %0 = fsub half 0xH8000, %b
+  %extract = extractelement <4 x half> %c, i32 0
+  %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
+  ret half %1
+}
+
+define dso_local half @t_vfmsh_lane_f16_0_swap(half %a, half %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsh_lane_f16_0_swap:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT:    fmls h0, h1, v2.h[0]
+; CHECK-NEXT:    fmsub h0, h2, h1, h0
 ; CHECK-NEXT:    ret
 entry:
   %0 = fsub half 0xH8000, %b
   %extract = extractelement <4 x half> %c, i32 0
+  %1 = tail call half @llvm.fma.f16(half %extract, half %0, half %a)
+  ret half %1
+}
+
+define dso_local half @t_vfmsh_lane_f16_3(half %a, half %b, <4 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsh_lane_f16_3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    fmls h0, h1, v2.h[3]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fsub half 0xH8000, %b
+  %extract = extractelement <4 x half> %c, i32 3
   %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
   ret half %1
 }
 
-define dso_local half @t_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
-; CHECK-LABEL: t_vfmsh_laneq_f16:
+define dso_local half @t_vfmsh_laneq_f16_0(half %a, half %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsh_laneq_f16_0:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmls h0, h1, v2.h[0]
+; CHECK-NEXT:    fmsub h0, h2, h1, h0
 ; CHECK-NEXT:    ret
 entry:
   %0 = fsub half 0xH8000, %b
@@ -206,6 +278,30 @@ entry:
   ret half %1
 }
 
+define dso_local half @t_vfmsh_laneq_f16_0_swap(half %a, half %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsh_laneq_f16_0_swap:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmsub h0, h2, h1, h0
+; CHECK-NEXT:    ret
+entry:
+  %0 = fsub half 0xH8000, %b
+  %extract = extractelement <8 x half> %c, i32 0
+  %1 = tail call half @llvm.fma.f16(half %extract, half %0, half %a)
+  ret half %1
+}
+
+define dso_local half @t_vfmsh_laneq_f16_7(half %a, half %b, <8 x half> %c, i32 %lane) {
+; CHECK-LABEL: t_vfmsh_laneq_f16_7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmls h0, h1, v2.h[7]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fsub half 0xH8000, %b
+  %extract = extractelement <8 x half> %c, i32 7
+  %1 = tail call half @llvm.fma.f16(half %0, half %extract, half %a)
+  ret half %1
+}
+
 define dso_local <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
 ; CHECK-LABEL: t_vmul_laneq_f16:
 ; CHECK:       // %bb.0: // %entry
diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
index 14b6e4b383adf..ed88293fcf7e3 100644
--- a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
+++ b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -7,56 +7,132 @@ declare double @llvm.fma.f64(double, double, double)
 declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
 declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
 
-define float @test_fmla_ss4S(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmla_ss4S
+define float @test_fmla_ss4S_0(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss4S_0
+  ; CHECK: fmadd s0, s1, s2, s0
+  %tmp1 = extractelement <4 x float> %v, i32 0
+  %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
+  ret float %tmp2
+}
+
+define float @test_fmla_ss4S_0_swap(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss4S_0_swap
+  ; CHECK: fmadd s0, s2, s1, s0
+  %tmp1 = extractelement <4 x float> %v, i32 0
+  %tmp2 = call float @llvm.fma.f32(float %tmp1, float %b, float %a)
+  ret float %tmp2
+}
+
+define float @test_fmla_ss4S_3(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss4S_3
   ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
   ret float %tmp2
 }
 
-define float @test_fmla_ss4S_swap(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmla_ss4S_swap
+define float @test_fmla_ss4S_3_swap(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss4S_3_swap
   ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = call float @llvm.fma.f32(float %tmp1, float %a, float %a)
   ret float %tmp2
 }
 
-define float @test_fmla_ss2S(float %a, float %b, <2 x float> %v) {
-  ; CHECK-LABEL: test_fmla_ss2S
+define float @test_fmla_ss2S_0(float %a, float %b, <2 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss2S_0
+  ; CHECK: fmadd s0, s1, s2, s0
+  %tmp1 = extractelement <2 x float> %v, i32 0
+  %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
+  ret float %tmp2
+}
+
+define float @test_fmla_ss2S_0_swap(float %a, float %b, <2 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss2S_0_swap
+  ; CHECK: fmadd s0, s2, s1, s0
+  %tmp1 = extractelement <2 x float> %v, i32 0
+  %tmp2 = call float @llvm.fma.f32(float %tmp1, float %b, float %a)
+  ret float %tmp2
+}
+
+define float @test_fmla_ss2S_1(float %a, float %b, <2 x float> %v) {
+  ; CHECK-LABEL: test_fmla_ss2S_1
   ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
   %tmp1 = extractelement <2 x float> %v, i32 1
   %tmp2 = call float @llvm.fma.f32(float %b, float %tmp1, float %a)
   ret float %tmp2
 }
 
-define double @test_fmla_ddD(double %a, double %b, <1 x double> %v) {
-  ; CHECK-LABEL: test_fmla_ddD
-  ; CHECK: {{fmla d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmadd d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
+define double @test_fmla_ddD_0(double %a, double %b, <1 x double> %v) {
+  ; CHECK-LABEL: test_fmla_ddD_0
+  ; CHECK: fmadd d0, d1, d2, d0
   %tmp1 = extractelement <1 x double> %v, i32 0
   %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
   ret double %tmp2
 }
 
-define double @test_fmla_dd2D(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmla_dd2D
+define double @test_fmla_ddD_0_swap(double %a, double %b, <1 x double> %v) {
+  ; CHECK-LABEL: test_fmla_ddD_0_swap
+  ; CHECK: fmadd d0, d2, d1, d0
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D_0(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmla_dd2D_0
+  ; CHECK: fmadd d0, d1, d2, d0
+  %tmp1 = extractelement <2 x double> %v, i32 0
+  %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D_0_swap(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmla_dd2D_0_swap
+  ; CHECK: fmadd d0, d2, d1, d0
+  %tmp1 = extractelement <2 x double> %v, i32 0
+  %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D_1(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmla_dd2D_1
   ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = call double @llvm.fma.f64(double %b, double %tmp1, double %a)
   ret double %tmp2
 }
 
-define double @test_fmla_dd2D_swap(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmla_dd2D_swap
+define double @test_fmla_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmla_dd2D_1_swap
   ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = call double @llvm.fma.f64(double %tmp1, double %b, double %a)
   ret double %tmp2
 }
 
-define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmls_ss4S
+define float @test_fmls_ss4S_0(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss4S_0
+  ; CHECK: fmsub s0, s2, s1, s0
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <4 x float> %v, i64 0
+  %0 = tail call float @llvm.fma.f32(float %fneg, float %extract, float %a)
+  ret float %0
+}
+
+define float @test_fmls_ss4S_0_swap(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss4S_0_swap
+  ; CHECK: fmsub s0, s2, s1, s0
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <4 x float> %v, i64 0
+  %0 = tail call float @llvm.fma.f32(float %extract, float %fneg, float %a)
+  ret float %0
+}
+
+define float @test_fmls_ss4S_3(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss4S_3
   ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = fsub float -0.0, %tmp1
@@ -64,8 +140,8 @@ define float @test_fmls_ss4S(float %a, float %b, <4 x float> %v) {
   ret float %tmp3
 }
 
-define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) {
-  ; CHECK-LABEL: test_fmls_ss4S_swap
+define float @test_fmls_ss4S_3_swap(float %a, float %b, <4 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss4S_3_swap
   ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = fsub float -0.0, %tmp1
@@ -74,8 +150,28 @@ define float @test_fmls_ss4S_swap(float %a, float %b, <4 x float> %v) {
 }
 
 
-define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) {
-  ; CHECK-LABEL: test_fmls_ss2S
+define float @test_fmls_ss2S_0(float %a, float %b, <2 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss2S_0
+  ; CHECK: fmsub s0, s2, s1, s0
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <2 x float> %v, i64 0
+  %0 = tail call float @llvm.fma.f32(float %fneg, float %extract, float %a)
+  ret float %0
+}
+
+define float @test_fmls_ss2S_0_swap(float %a, float %b, <2 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss2S_0_swap
+  ; CHECK: fmsub s0, s2, s1, s0
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <2 x float> %v, i64 0
+  %0 = tail call float @llvm.fma.f32(float %extract, float %fneg, float %a)
+  ret float %0
+}
+
+define float @test_fmls_ss2S_1(float %a, float %b, <2 x float> %v) {
+  ; CHECK-LABEL: test_fmls_ss2S_1
   ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
   %tmp1 = extractelement <2 x float> %v, i32 1
   %tmp2 = fsub float -0.0, %tmp1
@@ -83,17 +179,48 @@ define float @test_fmls_ss2S(float %a, float %b, <2 x float> %v) {
   ret float %tmp3
 }
 
-define double @test_fmls_ddD(double %a, double %b, <1 x double> %v) {
-  ; CHECK-LABEL: test_fmls_ddD
-  ; CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmsub d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
-  %tmp1 = extractelement <1 x double> %v, i32 0
-  %tmp2 = fsub double -0.0, %tmp1
-  %tmp3 = call double @llvm.fma.f64(double %tmp2, double %tmp1, double %a)
-  ret double %tmp3
+define double @test_fmls_ddD_0(double %a, double %b, <1 x double> %v) {
+  ; CHECK-LABEL: test_fmls_ddD_0
+  ; CHECK: fmsub d0, d1, d2, d0
+entry:
+  %fneg = fneg double %b
+  %extract = extractelement <1 x double> %v, i64 0
+  %0 = tail call double @llvm.fma.f64(double %fneg, double %extract, double %a)
+  ret double %0
+}
+
+define double @test_fmls_ddD_0_swap(double %a, double %b, <1 x double> %v) {
+  ; CHECK-LABEL: test_fmls_ddD_0_swap
+  ; CHECK: fmsub d0, d2, d1, d0
+entry:
+  %fneg = fneg double %b
+  %extract = extractelement <1 x double> %v, i64 0
+  %0 = tail call double @llvm.fma.f64(double %extract, double %fneg, double %a)
+  ret double %0
+}
+
+define double @test_fmls_dd2D_0(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmls_dd2D_0
+  ; CHECK: fmsub d0, d2, d1, d0
+entry:
+  %fneg = fneg double %b
+  %extract = extractelement <2 x double> %v, i64 0
+  %0 = tail call double @llvm.fma.f64(double %fneg, double %extract, double %a)
+  ret double %0
+}
+
+define double @test_fmls_dd2D_0_swap(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmls_dd2D_0_swap
+  ; CHECK: fmsub d0, d2, d1, d0
+entry:
+  %fneg = fneg double %b
+  %extract = extractelement <2 x double> %v, i64 0
+  %0 = tail call double @llvm.fma.f64(double %extract, double %fneg, double %a)
+  ret double %0
 }
 
-define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmls_dd2D
+define double @test_fmls_dd2D_1(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmls_dd2D_1
   ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = fsub double -0.0, %tmp1
@@ -101,8 +228,8 @@ define double @test_fmls_dd2D(double %a, double %b, <2 x double> %v) {
   ret double %tmp3
 }
 
-define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) {
-  ; CHECK-LABEL: test_fmls_dd2D_swap
+define double @test_fmls_dd2D_1_swap(double %a, double %b, <2 x double> %v) {
+  ; CHECK-LABEL: test_fmls_dd2D_1_swap
   ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = fsub double -0.0, %tmp1
@@ -110,56 +237,132 @@ define double @test_fmls_dd2D_swap(double %a, double %b, <2 x double> %v) {
   ret double %tmp3
 }
 
-define float @test_fmla_ss4S_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ss4S_strict
+define float @test_fmla_ss4S_0_strict(float %a, float %b, <4 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmla_ss4S_0_strict
+  ; CHECK: fmadd s0, s1, s2, s0
+  %tmp1 = extractelement <4 x float> %v, i32 0
+  %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret float %tmp2
+}
+
+define float @test_fmla_ss4S_0_swap_strict(float %a, float %b, <4 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmla_ss4S_0_swap_strict
+  ; CHECK: fmadd s0, s2, s1, s0
+  %tmp1 = extractelement <4 x float> %v, i32 0
+  %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %b, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret float %tmp2
+}
+
+define float @test_fmla_ss4S_3_strict(float %a, float %b, <4 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmla_ss4S_3_strict
   ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %tmp2
 }
 
-define float @test_fmla_ss4S_swap_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ss4S_swap_strict
+define float @test_fmla_ss4S_3_swap_strict(float %a, float %b, <4 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmla_ss4S_3_swap_strict
   ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %a, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %tmp2
 }
 
-define float @test_fmla_ss2S_strict(float %a, float %b, <2 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ss2S_strict
+define float @test_fmla_ss2S_0_strict(float %a, float %b, <2 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmla_ss2S_0_strict
+  ; CHECK: fmadd s0, s1, s2, s0
+  %tmp1 = extractelement <2 x float> %v, i32 0
+  %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret float %tmp2
+}
+
+define float @test_fmla_ss2S_0_swap_strict(float %a, float %b, <2 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmla_ss2S_0_swap_strict
+  ; CHECK: fmadd s0, s2, s1, s0
+  %tmp1 = extractelement <2 x float> %v, i32 0
+  %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %tmp1, float %b, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret float %tmp2
+}
+
+define float @test_fmla_ss2S_1_strict(float %a, float %b, <2 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmla_ss2S_1_strict
   ; CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
   %tmp1 = extractelement <2 x float> %v, i32 1
   %tmp2 = call float @llvm.experimental.constrained.fma.f32(float %b, float %tmp1, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret float %tmp2
 }
 
-define double @test_fmla_ddD_strict(double %a, double %b, <1 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmla_ddD_strict
-  ; CHECK: {{fmla d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmadd d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
+define double @test_fmla_ddD_0_strict(double %a, double %b, <1 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmla_ddD_0_strict
+  ; CHECK: fmadd d0, d1, d2, d0
   %tmp1 = extractelement <1 x double> %v, i32 0
   %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %tmp2
 }
 
-define double @test_fmla_dd2D_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmla_dd2D_strict
+define double @test_fmla_ddD_0_swap_strict(double %a, double %b, <1 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmla_ddD_0_swap_strict
+  ; CHECK: fmadd d0, d2, d1, d0
+  %tmp1 = extractelement <1 x double> %v, i32 0
+  %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D_0_strict(double %a, double %b, <2 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmla_dd2D_0_strict
+  ; CHECK: fmadd d0, d1, d2, d0
+  %tmp1 = extractelement <2 x double> %v, i32 0
+  %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D_0_swap_strict(double %a, double %b, <2 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmla_dd2D_0_swap_strict
+  ; CHECK: fmadd d0, d2, d1, d0
+  %tmp1 = extractelement <2 x double> %v, i32 0
+  %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret double %tmp2
+}
+
+define double @test_fmla_dd2D_1_strict(double %a, double %b, <2 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmla_dd2D_1_strict
   ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %b, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %tmp2
 }
 
-define double @test_fmla_dd2D_swap_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmla_dd2D_swap_strict
+define double @test_fmla_dd2D_1_swap_strict(double %a, double %b, <2 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmla_dd2D_1_swap_strict
   ; CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = call double @llvm.experimental.constrained.fma.f64(double %tmp1, double %b, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
   ret double %tmp2
 }
 
-define float @test_fmls_ss4S_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ss4S_strict
+define float @test_fmls_ss4S_0_strict(float %a, float %b, <4 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmls_ss4S_0_strict
+  ; CHECK: fmsub s0, s2, s1, s0
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <4 x float> %v, i64 0
+  %0 = tail call float @llvm.experimental.constrained.fma.f32(float %fneg, float %extract, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret float %0
+}
+
+define float @test_fmls_ss4S_0_swap_strict(float %a, float %b, <4 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmls_ss4S_0_swap_strict
+  ; CHECK: fmsub s0, s2, s1, s0
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <4 x float> %v, i64 0
+  %0 = tail call float @llvm.experimental.constrained.fma.f32(float %extract, float %fneg, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret float %0
+}
+
+define float @test_fmls_ss4S_3_strict(float %a, float %b, <4 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmls_ss4S_3_strict
   ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = fneg float %tmp1
@@ -167,8 +370,8 @@ define float @test_fmls_ss4S_strict(float %a, float %b, <4 x float> %v) #0 {
   ret float %tmp3
 }
 
-define float @test_fmls_ss4S_swap_strict(float %a, float %b, <4 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ss4S_swap_strict
+define float @test_fmls_ss4S_3_swap_strict(float %a, float %b, <4 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmls_ss4S_3_swap_strict
   ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
   %tmp1 = extractelement <4 x float> %v, i32 3
   %tmp2 = fneg float %tmp1
@@ -176,8 +379,28 @@ define float @test_fmls_ss4S_swap_strict(float %a, float %b, <4 x float> %v) #0
   ret float %tmp3
 }
 
-define float @test_fmls_ss2S_strict(float %a, float %b, <2 x float> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ss2S_strict
+define float @test_fmls_ss2S_0_strict(float %a, float %b, <2 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmls_ss2S_0_strict
+  ; CHECK: fmsub s0, s2, s1, s0
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <2 x float> %v, i64 0
+  %0 = tail call float @llvm.experimental.constrained.fma.f32(float %fneg, float %extract, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret float %0
+}
+
+define float @test_fmls_ss2S_0_swap_strict(float %a, float %b, <2 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmls_ss2S_0_swap_strict
+  ; CHECK: fmsub s0, s2, s1, s0
+entry:
+  %fneg = fneg float %b
+  %extract = extractelement <2 x float> %v, i64 0
+  %0 = tail call float @llvm.experimental.constrained.fma.f32(float %extract, float %fneg, float %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret float %0
+}
+
+define float @test_fmls_ss2S_1_strict(float %a, float %b, <2 x float> %v) #0 {
+  ; CHECK-LABEL: test_fmls_ss2S_1_strict
   ; CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
   %tmp1 = extractelement <2 x float> %v, i32 1
   %tmp2 = fneg float %tmp1
@@ -185,17 +408,48 @@ define float @test_fmls_ss2S_strict(float %a, float %b, <2 x float> %v) #0 {
   ret float %tmp3
 }
 
-define double @test_fmls_ddD_strict(double %a, double %b, <1 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmls_ddD_strict
-  ; CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+.d\[0]|fmsub d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}
-  %tmp1 = extractelement <1 x double> %v, i32 0
-  %tmp2 = fneg double %tmp1
-  %tmp3 = call double @llvm.experimental.constrained.fma.f64(double %tmp2, double %tmp1, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
-  ret double %tmp3
+define double @test_fmls_ddD_0_strict(double %a, double %b, <1 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmls_ddD_0_strict
+  ; CHECK: fmsub d0, d2, d1, d0
+entry:
+  %fneg = fneg double %b
+  %extract = extractelement <1 x double> %v, i64 0
+  %0 = tail call double @llvm.experimental.constrained.fma.f64(double %fneg, double %extract, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret double %0
+}
+
+define double @test_fmls_ddD_0_swap_strict(double %a, double %b, <1 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmls_ddD_0_swap_strict
+  ; CHECK: fmsub d0, d2, d1, d0
+entry:
+  %fneg = fneg double %b
+  %extract = extractelement <1 x double> %v, i64 0
+  %0 = tail call double @llvm.experimental.constrained.fma.f64(double %extract, double %fneg, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret double %0
+}
+
+define double @test_fmls_dd2D_0_strict(double %a, double %b, <2 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmls_dd2D_0_strict
+  ; CHECK: fmsub d0, d2, d1, d0
+entry:
+  %fneg = fneg double %b
+  %extract = extractelement <2 x double> %v, i64 0
+  %0 = tail call double @llvm.experimental.constrained.fma.f64(double %fneg, double %extract, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret double %0
+}
+
+define double @test_fmls_dd2D_0_swap_strict(double %a, double %b, <2 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmls_dd2D_0_swap_strict
+  ; CHECK: fmsub d0, d2, d1, d0
+entry:
+  %fneg = fneg double %b
+  %extract = extractelement <2 x double> %v, i64 0
+  %0 = tail call double @llvm.experimental.constrained.fma.f64(double %extract, double %fneg, double %a, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret double %0
 }
 
-define double @test_fmls_dd2D_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmls_dd2D_strict
+define double @test_fmls_dd2D_1_strict(double %a, double %b, <2 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmls_dd2D_1_strict
   ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = fneg double %tmp1
@@ -203,8 +457,8 @@ define double @test_fmls_dd2D_strict(double %a, double %b, <2 x double> %v) #0 {
   ret double %tmp3
 }
 
-define double @test_fmls_dd2D_swap_strict(double %a, double %b, <2 x double> %v) #0 {
-  ; CHECK-LABEL: test_fmls_dd2D_swap_strict
+define double @test_fmls_dd2D_1_swap_strict(double %a, double %b, <2 x double> %v) #0 {
+  ; CHECK-LABEL: test_fmls_dd2D_1_swap_strict
   ; CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   %tmp1 = extractelement <2 x double> %v, i32 1
   %tmp2 = fneg double %tmp1