diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll index 6613f911f82585..0fdb60cc08b0e6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-dup.ll +++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll @@ -6,15 +6,15 @@ define <8 x i8> @v_dup8(i8 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.8b v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0 - %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1 - %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2 - %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3 - %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4 - %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5 - %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6 - %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7 - ret <8 x i8> %tmp8 + %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0 + %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1 + %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2 + %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3 + %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4 + %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5 + %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6 + %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7 + ret <8 x i8> %tmp8 } define <4 x i16> @v_dup16(i16 %A) nounwind { @@ -22,11 +22,11 @@ define <4 x i16> @v_dup16(i16 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.4h v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0 - %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1 - %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2 - %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3 - ret <4 x i16> %tmp4 + %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0 + %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1 + %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2 + %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3 + ret <4 x i16> %tmp4 } define <2 x i32> @v_dup32(i32 %A) nounwind { @@ -34,9 +34,9 @@ define <2 x i32> @v_dup32(i32 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.2s v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0 - %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1 - ret <2 x i32> %tmp2 + %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0 + %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1 + ret <2 x i32> %tmp2 } define <2 x float> @v_dupfloat(float %A) nounwind { @@ -45,9 +45,9 @@ define <2 x float> @v_dupfloat(float %A) nounwind { ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: dup.2s v0, v0[0] ; CHECK-NEXT: ret - %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0 - %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1 - ret <2 x float> %tmp2 + %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0 + %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1 + ret <2 x float> %tmp2 } define <16 x i8> @v_dupQ8(i8 %A) nounwind { @@ -55,23 +55,23 @@ define <16 x i8> @v_dupQ8(i8 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.16b v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0 - %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1 - %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2 - %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3 - %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4 - %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5 - %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6 - %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7 - %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8 - %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9 - %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10 - %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11 - %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12 - %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13 - %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14 - %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15 - ret <16 x i8> %tmp16 + %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0 + %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1 + %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2 + %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3 + %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4 + %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5 + %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6 + %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7 + %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8 + %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9 + %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10 + %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11 + %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12 + %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13 + %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14 + %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15 + ret <16 x i8> %tmp16 } define <8 x i16> @v_dupQ16(i16 %A) nounwind { @@ -79,15 +79,15 @@ define <8 x i16> @v_dupQ16(i16 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.8h v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0 - %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1 - %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2 - %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3 - %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4 - %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5 - %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6 - %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7 - ret <8 x i16> %tmp8 + %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0 + %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1 + %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2 + %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3 + %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, i32 4 + %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5 + %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6 + %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7 + ret <8 x i16> %tmp8 } define <4 x i32> @v_dupQ32(i32 %A) nounwind { @@ -95,11 +95,11 @@ define <4 x i32> @v_dupQ32(i32 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.4s v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0 - %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1 - %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2 - %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3 - ret <4 x i32> %tmp4 + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0 + %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1 + %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2 + %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3 + ret <4 x i32> %tmp4 } define <4 x float> @v_dupQfloat(float %A) nounwind { @@ -108,11 +108,11 @@ define <4 x float> @v_dupQfloat(float %A) nounwind { ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: dup.4s v0, v0[0] ; CHECK-NEXT: ret - %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0 - %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1 - %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2 - %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3 - ret <4 x float> %tmp4 + %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0 + %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1 + %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2 + %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3 + ret <4 x float> %tmp4 } ; Check to make sure it works with shuffles, too. @@ -122,9 +122,9 @@ define <8 x i8> @v_shuffledup8(i8 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.8b v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0 - %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer - ret <8 x i8> %tmp2 + %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer + ret <8 x i8> %tmp2 } define <4 x i16> @v_shuffledup16(i16 %A) nounwind { @@ -132,9 +132,9 @@ define <4 x i16> @v_shuffledup16(i16 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.4h v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0 - %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer - ret <4 x i16> %tmp2 + %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %tmp2 } define <2 x i32> @v_shuffledup32(i32 %A) nounwind { @@ -142,9 +142,9 @@ define <2 x i32> @v_shuffledup32(i32 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.2s v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0 - %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer - ret <2 x i32> %tmp2 + %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0 + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer + ret <2 x i32> %tmp2 } define <2 x float> @v_shuffledupfloat(float %A) nounwind { @@ -153,9 +153,9 @@ define <2 x float> @v_shuffledupfloat(float %A) nounwind { ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: dup.2s v0, v0[0] ; CHECK-NEXT: ret - %tmp1 = insertelement <2 x float> undef, float %A, i32 0 - %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer - ret <2 x float> %tmp2 + %tmp1 = insertelement <2 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer + ret <2 x float> %tmp2 } define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind { @@ -163,9 +163,9 @@ define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.16b v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0 - %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer - ret <16 x i8> %tmp2 + %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %tmp2 } define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind { @@ -173,9 +173,9 @@ define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.8h v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0 - %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer - ret <8 x i16> %tmp2 + %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %tmp2 } define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind { @@ -183,9 +183,9 @@ define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: dup.4s v0, w0 ; CHECK-NEXT: ret - %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0 - %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer - ret <4 x i32> %tmp2 + %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0 + %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %tmp2 } define <4 x float> @v_shuffledupQfloat(float %A) nounwind { @@ -194,97 +194,89 @@ define <4 x float> @v_shuffledupQfloat(float %A) nounwind { ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-NEXT: dup.4s v0, v0[0] ; CHECK-NEXT: ret - %tmp1 = insertelement <4 x float> undef, float %A, i32 0 - %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer - ret <4 x float> %tmp2 + %tmp1 = insertelement <4 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %tmp2 } -define <8 x i8> @vduplane8(ptr %A) nounwind { +define <8 x i8> @vduplane8(<8 x i8> %A) nounwind { ; CHECK-LABEL: vduplane8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: dup.8b v0, v0[1] ; CHECK-NEXT: ret - %tmp1 = load <8 x i8>, ptr %A - %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > - ret <8 x i8> %tmp2 + %tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i8> %tmp2 } -define <4 x i16> @vduplane16(ptr %A) nounwind { +define <4 x i16> @vduplane16(<4 x i16> %A) nounwind { ; CHECK-LABEL: vduplane16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: dup.4h v0, v0[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > - ret <4 x i16> %tmp2 + %tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i16> %tmp2 } -define <2 x i32> @vduplane32(ptr %A) nounwind { +define <2 x i32> @vduplane32(<2 x i32> %A) nounwind { ; CHECK-LABEL: vduplane32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: dup.2s v0, v0[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > - ret <2 x i32> %tmp2 + %tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x i32> %tmp2 } -define <2 x float> @vduplanefloat(ptr %A) nounwind { +define <2 x float> @vduplanefloat(<2 x float> %A) nounwind { ; CHECK-LABEL: vduplanefloat: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: dup.2s v0, v0[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x float>, ptr %A - %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 > - ret <2 x float> %tmp2 + %tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x float> %tmp2 } -define <16 x i8> @vduplaneQ8(ptr %A) nounwind { +define <16 x i8> @vduplaneQ8(<8 x i8> %A) nounwind { ; CHECK-LABEL: vduplaneQ8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: dup.16b v0, v0[1] ; CHECK-NEXT: ret - %tmp1 = load <8 x i8>, ptr %A - %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > - ret <16 x i8> %tmp2 + %tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <16 x i8> %tmp2 } -define <8 x i16> @vduplaneQ16(ptr %A) nounwind { +define <8 x i16> @vduplaneQ16(<4 x i16> %A) nounwind { ; CHECK-LABEL: vduplaneQ16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: dup.8h v0, v0[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > - ret <8 x i16> %tmp2 + %tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i16> %tmp2 } -define <4 x i32> @vduplaneQ32(ptr %A) nounwind { +define <4 x i32> @vduplaneQ32(<2 x i32> %A) nounwind { ; CHECK-LABEL: vduplaneQ32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: dup.4s v0, v0[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > - ret <4 x i32> %tmp2 + %tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i32> %tmp2 } -define <4 x float> @vduplaneQfloat(ptr %A) nounwind { +define <4 x float> @vduplaneQfloat(<2 x float> %A) nounwind { ; CHECK-LABEL: vduplaneQfloat: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-NEXT: dup.4s v0, v0[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x float>, ptr %A - %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > - ret <4 x float> %tmp2 + %tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x float> %tmp2 } define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone { diff --git a/llvm/test/CodeGen/AArch64/arm64-vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll index 0a29d6b86659ea..7f743f605f255d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vmul.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll @@ -1081,59 +1081,45 @@ declare <2 x float> @llvm.experimental.constrained.fma.v2f32(<2 x float>, <2 x f declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata) declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata) -define <4 x i16> @mul_4h(ptr %A, ptr %B) nounwind { +define <4 x i16> @mul_4h(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: mul_4h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: mul.4h v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp4 = mul <4 x i16> %tmp1, %tmp3 + %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp4 = mul <4 x i16> %A, %tmp3 ret <4 x i16> %tmp4 } -define <8 x i16> @mul_8h(ptr %A, ptr %B) nounwind { +define <8 x i16> @mul_8h(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: mul_8h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: mul.8h v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <8 x i16>, ptr %A - %tmp2 = load <8 x i16>, ptr %B - %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> - %tmp4 = mul <8 x i16> %tmp1, %tmp3 + %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> + %tmp4 = mul <8 x i16> %A, %tmp3 ret <8 x i16> %tmp4 } -define <2 x i32> @mul_2s(ptr %A, ptr %B) nounwind { +define <2 x i32> @mul_2s(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: mul_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: mul.2s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp4 = mul <2 x i32> %tmp1, %tmp3 + %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp4 = mul <2 x i32> %A, %tmp3 ret <2 x i32> %tmp4 } -define <4 x i32> @mul_4s(ptr %A, ptr %B) nounwind { +define <4 x i32> @mul_4s(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: mul_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: mul.4s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x i32>, ptr %A - %tmp2 = load <4 x i32>, ptr %B - %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> - %tmp4 = mul <4 x i32> %tmp1, %tmp3 + %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> + %tmp4 = mul <4 x i32> %A, %tmp3 ret <4 x i32> %tmp4 } @@ -1153,45 +1139,34 @@ define <2 x i64> @mul_2d(<2 x i64> %A, <2 x i64> %B) nounwind { ret <2 x i64> %tmp1 } -define <2 x float> @fmul_lane_2s(ptr %A, ptr %B) nounwind { +define <2 x float> @fmul_lane_2s(<2 x float> %A, <2 x float> %B) nounwind { ; CHECK-LABEL: fmul_lane_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: fmul.2s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x float>, ptr %A - %tmp2 = load <2 x float>, ptr %B - %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> - %tmp4 = fmul <2 x float> %tmp1, %tmp3 + %tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> + %tmp4 = fmul <2 x float> %A, %tmp3 ret <2 x float> %tmp4 } -define <4 x float> @fmul_lane_4s(ptr %A, ptr %B) nounwind { +define <4 x float> @fmul_lane_4s(<4 x float> %A, <4 x float> %B) nounwind { ; CHECK-LABEL: fmul_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: fmul.4s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x float>, ptr %A - %tmp2 = load <4 x float>, ptr %B - %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> - %tmp4 = fmul <4 x float> %tmp1, %tmp3 + %tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> + %tmp4 = fmul <4 x float> %A, %tmp3 ret <4 x float> %tmp4 } -define <2 x double> @fmul_lane_2d(ptr %A, ptr %B) nounwind { +define <2 x double> @fmul_lane_2d(<2 x double> %A, <2 x double> %B) nounwind { ; CHECK-LABEL: fmul_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: fmul.2d v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x double>, ptr %A - %tmp2 = load <2 x double>, ptr %B - %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> - %tmp4 = fmul <2 x double> %tmp1, %tmp3 + %tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> + %tmp4 = fmul <2 x double> %A, %tmp3 ret <2 x double> %tmp4 } @@ -1217,101 +1192,76 @@ define double @fmul_lane_d(double %A, <2 x double> %vec) nounwind { -define <2 x float> @fmulx_lane_2s(ptr %A, ptr %B) nounwind { +define <2 x float> @fmulx_lane_2s(<2 x float> %A, <2 x float> %B) nounwind { ; CHECK-LABEL: fmulx_lane_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: fmulx.2s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x float>, ptr %A - %tmp2 = load <2 x float>, ptr %B - %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> - %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3) + %tmp3 = shufflevector <2 x float> %B, <2 x float> poison, <2 x i32> + %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %A, <2 x float> %tmp3) ret <2 x float> %tmp4 } -define <4 x float> @fmulx_lane_4s(ptr %A, ptr %B) nounwind { +define <4 x float> @fmulx_lane_4s(<4 x float> %A, <4 x float> %B) nounwind { ; CHECK-LABEL: fmulx_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: fmulx.4s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x float>, ptr %A - %tmp2 = load <4 x float>, ptr %B - %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> - %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3) + %tmp3 = shufflevector <4 x float> %B, <4 x float> poison, <4 x i32> + %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %A, <4 x float> %tmp3) ret <4 x float> %tmp4 } -define <2 x double> @fmulx_lane_2d(ptr %A, ptr %B) nounwind { +define <2 x double> @fmulx_lane_2d(<2 x double> %A, <2 x double> %B) nounwind { ; CHECK-LABEL: fmulx_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: fmulx.2d v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x double>, ptr %A - %tmp2 = load <2 x double>, ptr %B - %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> - %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3) + %tmp3 = shufflevector <2 x double> %B, <2 x double> poison, <2 x i32> + %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %A, <2 x double> %tmp3) ret <2 x double> %tmp4 } -define <4 x i16> @sqdmulh_lane_4h(ptr %A, ptr %B) nounwind { +define <4 x i16> @sqdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_4h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: sqdmulh.4h v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3) + %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3) ret <4 x i16> %tmp4 } -define <8 x i16> @sqdmulh_lane_8h(ptr %A, ptr %B) nounwind { +define <8 x i16> @sqdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_8h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: sqdmulh.8h v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <8 x i16>, ptr %A - %tmp2 = load <8 x i16>, ptr %B - %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> - %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3) + %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> + %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3) ret <8 x i16> %tmp4 } -define <2 x i32> @sqdmulh_lane_2s(ptr %A, ptr %B) nounwind { +define <2 x i32> @sqdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: sqdmulh.2s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3) + %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3) ret <2 x i32> %tmp4 } -define <4 x i32> @sqdmulh_lane_4s(ptr %A, ptr %B) nounwind { +define <4 x i32> @sqdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: sqdmulh_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: sqdmulh.4s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x i32>, ptr %A - %tmp2 = load <4 x i32>, ptr %B - %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> - %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3) + %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> + %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3) ret <4 x i32> %tmp4 } @@ -1327,59 +1277,45 @@ define i32 @sqdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind { ret i32 %tmp2 } -define <4 x i16> @sqrdmulh_lane_4h(ptr %A, ptr %B) nounwind { +define <4 x i16> @sqrdmulh_lane_4h(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_4h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: sqrdmulh.4h v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3) + %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %A, <4 x i16> %tmp3) ret <4 x i16> %tmp4 } -define <8 x i16> @sqrdmulh_lane_8h(ptr %A, ptr %B) nounwind { +define <8 x i16> @sqrdmulh_lane_8h(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_8h: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: sqrdmulh.8h v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <8 x i16>, ptr %A - %tmp2 = load <8 x i16>, ptr %B - %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> - %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3) + %tmp3 = shufflevector <8 x i16> %B, <8 x i16> poison, <8 x i32> + %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %A, <8 x i16> %tmp3) ret <8 x i16> %tmp4 } -define <2 x i32> @sqrdmulh_lane_2s(ptr %A, ptr %B) nounwind { +define <2 x i32> @sqrdmulh_lane_2s(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_2s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: sqrdmulh.2s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3) + %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %A, <2 x i32> %tmp3) ret <2 x i32> %tmp4 } -define <4 x i32> @sqrdmulh_lane_4s(ptr %A, ptr %B) nounwind { +define <4 x i32> @sqrdmulh_lane_4s(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: sqrdmulh_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x0] -; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: sqrdmulh.4s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x i32>, ptr %A - %tmp2 = load <4 x i32>, ptr %B - %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> - %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3) + %tmp3 = shufflevector <4 x i32> %B, <4 x i32> poison, <4 x i32> + %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %A, <4 x i32> %tmp3) ret <4 x i32> %tmp4 } @@ -1395,221 +1331,169 @@ define i32 @sqrdmulh_lane_1s(i32 %A, <4 x i32> %B) nounwind { ret i32 %tmp2 } -define <4 x i32> @sqdmull_lane_4s(ptr %A, ptr %B) nounwind { +define <4 x i32> @sqdmull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: sqdmull_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: sqdmull.4s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) + %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp3) ret <4 x i32> %tmp4 } -define <2 x i64> @sqdmull_lane_2d(ptr %A, ptr %B) nounwind { +define <2 x i64> @sqdmull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: sqdmull_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: sqdmull.2d v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) + %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp3) ret <2 x i64> %tmp4 } -define <4 x i32> @sqdmull2_lane_4s(ptr %A, ptr %B) nounwind { +define <4 x i32> @sqdmull2_lane_4s(<8 x i16> %A, <8 x i16> %B) nounwind { ; CHECK-LABEL: sqdmull2_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0, #8] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqdmull.4s v0, v0, v1[1] +; CHECK-NEXT: sqdmull2.4s v0, v0, v1[1] ; CHECK-NEXT: ret - %load1 = load <8 x i16>, ptr %A - %load2 = load <8 x i16>, ptr %B - %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> - %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> + %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> + %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i32> %tmp4 } -define <2 x i64> @sqdmull2_lane_2d(ptr %A, ptr %B) nounwind { +define <2 x i64> @sqdmull2_lane_2d(<4 x i32> %A, <4 x i32> %B) nounwind { ; CHECK-LABEL: sqdmull2_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0, #8] -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: sqdmull.2d v0, v0, v1[1] +; CHECK-NEXT: sqdmull2.2d v0, v0, v1[1] ; CHECK-NEXT: ret - %load1 = load <4 x i32>, ptr %A - %load2 = load <4 x i32>, ptr %B - %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> - %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> + %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> + %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i64> %tmp4 } -define <4 x i32> @umull_lane_4s(ptr %A, ptr %B) nounwind { +define <4 x i32> @umull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: umull_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: umull.4s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) + %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp3) ret <4 x i32> %tmp4 } -define <2 x i64> @umull_lane_2d(ptr %A, ptr %B) nounwind { +define <2 x i64> @umull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: umull_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: umull.2d v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) + %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp3) ret <2 x i64> %tmp4 } -define <4 x i32> @smull_lane_4s(ptr %A, ptr %B) nounwind { +define <4 x i32> @smull_lane_4s(<4 x i16> %A, <4 x i16> %B) nounwind { ; CHECK-LABEL: smull_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: smull.4s v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) + %tmp3 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp3) ret <4 x i32> %tmp4 } -define <2 x i64> @smull_lane_2d(ptr %A, ptr %B) nounwind { +define <2 x i64> @smull_lane_2d(<2 x i32> %A, <2 x i32> %B) nounwind { ; CHECK-LABEL: smull_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: smull.2d v0, v0, v1[1] ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) + %tmp3 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp3) ret <2 x i64> %tmp4 } -define <4 x i32> @smlal_lane_4s(ptr %A, ptr %B, ptr %C) nounwind { +define <4 x i32> @smlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind { ; CHECK-LABEL: smlal_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlal.4s v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: smlal.4s v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = load <4 x i32>, ptr %C - %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) - %tmp6 = add <4 x i32> %tmp3, %tmp5 + %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) + %tmp6 = add <4 x i32> %C, %tmp5 ret <4 x i32> %tmp6 } -define <2 x i64> @smlal_lane_2d(ptr %A, ptr %B, ptr %C) nounwind { +define <2 x i64> @smlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind { ; CHECK-LABEL: smlal_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlal.2d v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: smlal.2d v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = load <2 x i64>, ptr %C - %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) - %tmp6 = add <2 x i64> %tmp3, %tmp5 + %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) + %tmp6 = add <2 x i64> %C, %tmp5 ret <2 x i64> %tmp6 } -define <4 x i32> @sqdmlal_lane_4s(ptr %A, ptr %B, ptr %C) nounwind { +define <4 x i32> @sqdmlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind { ; CHECK-LABEL: sqdmlal_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlal.4s v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: sqdmlal.4s v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = load <4 x i32>, ptr %C - %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) - %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) + %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) + %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %tmp5) ret <4 x i32> %tmp6 } -define <2 x i64> @sqdmlal_lane_2d(ptr %A, ptr %B, ptr %C) nounwind { +define <2 x i64> @sqdmlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind { ; CHECK-LABEL: sqdmlal_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlal.2d v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: sqdmlal.2d v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = load <2 x i64>, ptr %C - %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) - %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) + %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) + %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %tmp5) ret <2 x i64> %tmp6 } -define <4 x i32> @sqdmlal2_lane_4s(ptr %A, ptr %B, ptr %C) nounwind { +define <4 x i32> @sqdmlal2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind { ; CHECK-LABEL: sqdmlal2_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1] -; CHECK-NEXT: sqdmlal.4s v0, v1, v2[1] +; CHECK-NEXT: sqdmlal2.4s v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %load1 = load <8 x i16>, ptr %A - %load2 = load <8 x i16>, ptr %B - %tmp3 = load <4 x i32>, ptr %C - %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> - %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> + %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> + %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) + %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %C, <4 x i32> %tmp5) ret <4 x i32> %tmp6 } -define <2 x i64> @sqdmlal2_lane_2d(ptr %A, ptr %B, ptr %C) nounwind { +define <2 x i64> @sqdmlal2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind { ; CHECK-LABEL: sqdmlal2_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1] -; CHECK-NEXT: sqdmlal.2d v0, v1, v2[1] +; CHECK-NEXT: sqdmlal2.2d v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %load1 = load <4 x i32>, ptr %A - %load2 = load <4 x i32>, ptr %B - %tmp3 = load <2 x i64>, ptr %C - %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> - %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> + %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> + %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) + %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %C, <2 x i64> %tmp5) ret <2 x i64> %tmp6 } @@ -1715,176 +1599,134 @@ define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind { declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64) -define <4 x i32> @umlal_lane_4s(ptr %A, ptr %B, ptr %C) nounwind { +define <4 x i32> @umlal_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind { ; CHECK-LABEL: umlal_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlal.4s v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umlal.4s v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = load <4 x i32>, ptr %C - %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) - %tmp6 = add <4 x i32> %tmp3, %tmp5 + %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) + %tmp6 = add <4 x i32> %C, %tmp5 ret <4 x i32> %tmp6 } -define <2 x i64> @umlal_lane_2d(ptr %A, ptr %B, ptr %C) nounwind { +define <2 x i64> @umlal_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind { ; CHECK-LABEL: umlal_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlal.2d v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umlal.2d v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = load <2 x i64>, ptr %C - %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) - %tmp6 = add <2 x i64> %tmp3, %tmp5 + %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) + %tmp6 = add <2 x i64> %C, %tmp5 ret <2 x i64> %tmp6 } -define <4 x i32> @smlsl_lane_4s(ptr %A, ptr %B, ptr %C) nounwind { +define <4 x i32> @smlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind { ; CHECK-LABEL: smlsl_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlsl.4s v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: smlsl.4s v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = load <4 x i32>, ptr %C - %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) - %tmp6 = sub <4 x i32> %tmp3, %tmp5 + %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) + %tmp6 = sub <4 x i32> %C, %tmp5 ret <4 x i32> %tmp6 } -define <2 x i64> @smlsl_lane_2d(ptr %A, ptr %B, ptr %C) nounwind { +define <2 x i64> @smlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind { ; CHECK-LABEL: smlsl_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: smlsl.2d v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: smlsl.2d v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = load <2 x i64>, ptr %C - %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) - %tmp6 = sub <2 x i64> %tmp3, %tmp5 + %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) + %tmp6 = sub <2 x i64> %C, %tmp5 ret <2 x i64> %tmp6 } -define <4 x i32> @sqdmlsl_lane_4s(ptr %A, ptr %B, ptr %C) nounwind { +define <4 x i32> @sqdmlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind { ; CHECK-LABEL: sqdmlsl_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlsl.4s v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: sqdmlsl.4s v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = load <4 x i32>, ptr %C - %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) - %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) + %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) + %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5) ret <4 x i32> %tmp6 } -define <2 x i64> @sqdmlsl_lane_2d(ptr %A, ptr %B, ptr %C) nounwind { +define <2 x i64> @sqdmlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind { ; CHECK-LABEL: sqdmlsl_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: sqdmlsl.2d v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: sqdmlsl.2d v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = load <2 x i64>, ptr %C - %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) - %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) + %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) + %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5) ret <2 x i64> %tmp6 } -define <4 x i32> @sqdmlsl2_lane_4s(ptr %A, ptr %B, ptr %C) nounwind { +define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16> %A, <8 x i16> %B, <4 x i32> %C) nounwind { ; CHECK-LABEL: sqdmlsl2_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1] -; CHECK-NEXT: sqdmlsl.4s v0, v1, v2[1] +; CHECK-NEXT: sqdmlsl2.4s v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %load1 = load <8 x i16>, ptr %A - %load2 = load <8 x i16>, ptr %B - %tmp3 = load <4 x i32>, ptr %C - %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> - %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> + %tmp1 = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> + %tmp2 = shufflevector <8 x i16> %B, <8 x i16> undef, <4 x i32> %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5) + %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %C, <4 x i32> %tmp5) ret <4 x i32> %tmp6 } -define <2 x i64> @sqdmlsl2_lane_2d(ptr %A, ptr %B, ptr %C) nounwind { +define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32> %A, <4 x i32> %B, <2 x i64> %C) nounwind { ; CHECK-LABEL: sqdmlsl2_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: ldr d1, [x0, #8] -; CHECK-NEXT: ldr d2, [x1] -; CHECK-NEXT: sqdmlsl.2d v0, v1, v2[1] +; CHECK-NEXT: sqdmlsl2.2d v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %load1 = load <4 x i32>, ptr %A - %load2 = load <4 x i32>, ptr %B - %tmp3 = load <2 x i64>, ptr %C - %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> - %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> + %tmp1 = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> + %tmp2 = shufflevector <4 x i32> %B, <4 x i32> undef, <2 x i32> %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5) + %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %C, <2 x i64> %tmp5) ret <2 x i64> %tmp6 } -define <4 x i32> @umlsl_lane_4s(ptr %A, ptr %B, ptr %C) nounwind { +define <4 x i32> @umlsl_lane_4s(<4 x i16> %A, <4 x i16> %B, <4 x i32> %C) nounwind { ; CHECK-LABEL: umlsl_lane_4s: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlsl.4s v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umlsl.4s v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = load <4 x i32>, ptr %C - %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> - %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4) - %tmp6 = sub <4 x i32> %tmp3, %tmp5 + %tmp4 = shufflevector <4 x i16> %B, <4 x i16> poison, <4 x i32> + %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %A, <4 x i16> %tmp4) + %tmp6 = sub <4 x i32> %C, %tmp5 ret <4 x i32> %tmp6 } -define <2 x i64> @umlsl_lane_2d(ptr %A, ptr %B, ptr %C) nounwind { +define <2 x i64> @umlsl_lane_2d(<2 x i32> %A, <2 x i32> %B, <2 x i64> %C) nounwind { ; CHECK-LABEL: umlsl_lane_2d: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d1, [x1] -; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ldr q0, [x2] -; CHECK-NEXT: umlsl.2d v0, v2, v1[1] +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umlsl.2d v2, v0, v1[1] +; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = load <2 x i32>, ptr %B - %tmp3 = load <2 x i64>, ptr %C - %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> - %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4) - %tmp6 = sub <2 x i64> %tmp3, %tmp5 + %tmp4 = shufflevector <2 x i32> %B, <2 x i32> poison, <2 x i32> + %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %A, <2 x i32> %tmp4) + %tmp6 = sub <2 x i64> %C, %tmp5 ret <2 x i64> %tmp6 } diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll index 9a792035a469b8..3f6d38c929b98a 100644 --- a/llvm/test/CodeGen/ARM/vdup.ll +++ b/llvm/test/CodeGen/ARM/vdup.ll @@ -219,103 +219,79 @@ define <4 x float> @v_shuffledupQfloat(float %A) nounwind { ret <4 x float> %tmp2 } -define <8 x i8> @vduplane8(ptr %A) nounwind { +define arm_aapcs_vfpcc <8 x i8> @vduplane8(<8 x i8> %A) nounwind { ; CHECK-LABEL: vduplane8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.8 d16, d16[1] -; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vdup.8 d0, d0[1] ; CHECK-NEXT: mov pc, lr - %tmp1 = load <8 x i8>, ptr %A - %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + %tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > ret <8 x i8> %tmp2 } -define <4 x i16> @vduplane16(ptr %A) nounwind { +define arm_aapcs_vfpcc <4 x i16> @vduplane16(<4 x i16> %A) nounwind { ; CHECK-LABEL: vduplane16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.16 d16, d16[1] -; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vdup.16 d0, d0[1] ; CHECK-NEXT: mov pc, lr - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + %tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > ret <4 x i16> %tmp2 } -define <2 x i32> @vduplane32(ptr %A) nounwind { +define arm_aapcs_vfpcc <2 x i32> @vduplane32(<2 x i32> %A) nounwind { ; CHECK-LABEL: vduplane32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.32 d16, d16[1] -; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vdup.32 d0, d0[1] ; CHECK-NEXT: mov pc, lr - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > + %tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > ret <2 x i32> %tmp2 } -define <2 x float> @vduplanefloat(ptr %A) nounwind { +define arm_aapcs_vfpcc <2 x float> @vduplanefloat(<2 x float> %A) nounwind { ; CHECK-LABEL: vduplanefloat: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.32 d16, d16[1] -; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vdup.32 d0, d0[1] ; CHECK-NEXT: mov pc, lr - %tmp1 = load <2 x float>, ptr %A - %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 > + %tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <2 x i32> < i32 1, i32 1 > ret <2 x float> %tmp2 } -define <16 x i8> @vduplaneQ8(ptr %A) nounwind { +define arm_aapcs_vfpcc <16 x i8> @vduplaneQ8(<8 x i8> %A) nounwind { ; CHECK-LABEL: vduplaneQ8: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.8 q8, d16[1] -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: vdup.8 q0, d0[1] ; CHECK-NEXT: mov pc, lr - %tmp1 = load <8 x i8>, ptr %A - %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + %tmp2 = shufflevector <8 x i8> %A, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > ret <16 x i8> %tmp2 } -define <8 x i16> @vduplaneQ16(ptr %A) nounwind { +define arm_aapcs_vfpcc <8 x i16> @vduplaneQ16(<4 x i16> %A) nounwind { ; CHECK-LABEL: vduplaneQ16: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.16 q8, d16[1] -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: vdup.16 q0, d0[1] ; CHECK-NEXT: mov pc, lr - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + %tmp2 = shufflevector <4 x i16> %A, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > ret <8 x i16> %tmp2 } -define <4 x i32> @vduplaneQ32(ptr %A) nounwind { +define arm_aapcs_vfpcc <4 x i32> @vduplaneQ32(<2 x i32> %A) nounwind { ; CHECK-LABEL: vduplaneQ32: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.32 q8, d16[1] -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: vdup.32 q0, d0[1] ; CHECK-NEXT: mov pc, lr - %tmp1 = load <2 x i32>, ptr %A - %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + %tmp2 = shufflevector <2 x i32> %A, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > ret <4 x i32> %tmp2 } -define <4 x float> @vduplaneQfloat(ptr %A) nounwind { +define arm_aapcs_vfpcc <4 x float> @vduplaneQfloat(<2 x float> %A) nounwind { ; CHECK-LABEL: vduplaneQfloat: ; CHECK: @ %bb.0: -; CHECK-NEXT: vldr d16, [r0] -; CHECK-NEXT: vdup.32 q8, d16[1] -; CHECK-NEXT: vmov r0, r1, d16 -; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: @ kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: vdup.32 q0, d0[1] ; CHECK-NEXT: mov pc, lr - %tmp1 = load <2 x float>, ptr %A - %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + %tmp2 = shufflevector <2 x float> %A, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > ret <4 x float> %tmp2 }