diff --git a/llvm/test/CodeGen/AArch64/arm64-trn.ll b/llvm/test/CodeGen/AArch64/arm64-trn.ll
index f73cb8d3095fde..125610ec93dce6 100644
--- a/llvm/test/CodeGen/AArch64/arm64-trn.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-trn.ll
@@ -1,10 +1,26 @@
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s --check-prefixes=CHECKLE
+; RUN: llc < %s -mtriple=aarch64_be-none-eabi | FileCheck %s --check-prefixes=CHECKBE
 
 define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: vtrni8:
-;CHECK: trn1.8b
-;CHECK: trn2.8b
-;CHECK-NEXT: add.8b
+; CHECKLE-LABEL: vtrni8:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr d0, [x0]
+; CHECKLE-NEXT: ldr d1, [x1]
+; CHECKLE-NEXT: trn1 v2.8b, v0.8b, v1.8b
+; CHECKLE-NEXT: trn2 v0.8b, v0.8b, v1.8b
+; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrni8:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
+; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
+; CHECKBE-NEXT: trn1 v2.8b, v0.8b, v1.8b
+; CHECKBE-NEXT: trn2 v0.8b, v0.8b, v1.8b
+; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKBE-NEXT: rev64 v0.8b, v0.8b
+; CHECKBE-NEXT: ret
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
   %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -14,10 +30,24 @@ define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 }
 
 define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: vtrni16:
-;CHECK: trn1.4h
-;CHECK: trn2.4h
-;CHECK-NEXT: add.4h
+; CHECKLE-LABEL: vtrni16:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr d0, [x0]
+; CHECKLE-NEXT: ldr d1, [x1]
+; CHECKLE-NEXT: trn1 v2.4h, v0.4h, v1.4h
+; CHECKLE-NEXT: trn2 v0.4h, v0.4h, v1.4h
+; CHECKLE-NEXT: add v0.4h, v2.4h, v0.4h
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrni16:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.4h }, [x0]
+; CHECKBE-NEXT: ld1 { v1.4h }, [x1]
+; CHECKBE-NEXT: trn1 v2.4h, v0.4h, v1.4h
+; CHECKBE-NEXT: trn2 v0.4h, v0.4h, v1.4h
+; CHECKBE-NEXT: add v0.4h, v2.4h, v0.4h
+; CHECKBE-NEXT: rev64 v0.4h, v0.4h
+; CHECKBE-NEXT: ret
   %tmp1 = load <4 x i16>, <4 x i16>* %A
   %tmp2 = load <4 x i16>, <4 x i16>* %B
   %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -26,12 +56,49 @@ define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
   ret <4 x i16> %tmp5
 }
 
+define <8 x i8> @vtrni16_viabitcast(<4 x i16> *%A, <4 x i16> *%B) nounwind {
+; CHECKLE-LABEL: vtrni16_viabitcast:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr d0, [x0]
+; CHECKLE-NEXT: ldr d1, [x1]
+; CHECKLE-NEXT: trn1 v0.4h, v0.4h, v1.4h
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrni16_viabitcast:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.4h }, [x0]
+; CHECKBE-NEXT: ld1 { v1.4h }, [x1]
+; CHECKBE-NEXT: trn1 v0.4h, v0.4h, v1.4h
+; CHECKBE-NEXT: rev64 v0.4h, v0.4h
+; CHECKBE-NEXT: ret
+  %l1 = load <4 x i16>, <4 x i16> *%A
+  %l2 = load <4 x i16>, <4 x i16> *%B
+  %b1 = bitcast <4 x i16> %l1 to <8 x i8>
+  %b2 = bitcast <4 x i16> %l2 to <8 x i8>
+  %tmp3 = shufflevector <8 x i8> %b1, <8 x i8> %b2, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 12, i32 13>
+  ret <8 x i8> %tmp3
+}
+
 ; 2xi32 TRN is redundant with ZIP
 define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: vtrni32:
-;CHECK: zip1.2s
-;CHECK: zip2.2s
-;CHECK-NEXT: add.2s
+; CHECKLE-LABEL: vtrni32:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr d0, [x0]
+; CHECKLE-NEXT: ldr d1, [x1]
+; CHECKLE-NEXT: zip1 v2.2s, v0.2s, v1.2s
+; CHECKLE-NEXT: zip2 v0.2s, v0.2s, v1.2s
+; CHECKLE-NEXT: add v0.2s, v2.2s, v0.2s
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrni32:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.2s }, [x0]
+; CHECKBE-NEXT: ld1 { v1.2s }, [x1]
+; CHECKBE-NEXT: zip1 v2.2s, v0.2s, v1.2s
+; CHECKBE-NEXT: zip2 v0.2s, v0.2s, v1.2s
+; CHECKBE-NEXT: add v0.2s, v2.2s, v0.2s
+; CHECKBE-NEXT: rev64 v0.2s, v0.2s
+; CHECKBE-NEXT: ret
   %tmp1 = load <2 x i32>, <2 x i32>* %A
   %tmp2 = load <2 x i32>, <2 x i32>* %B
   %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> <i32 0, i32 2>
@@ -41,10 +108,24 @@ define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 }
 
 define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
-;CHECK-LABEL: vtrnf:
-;CHECK: zip1.2s
-;CHECK: zip2.2s
-;CHECK-NEXT: fadd.2s
+; CHECKLE-LABEL: vtrnf:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr d0, [x0]
+; CHECKLE-NEXT: ldr d1, [x1]
+; CHECKLE-NEXT: zip1 v2.2s, v0.2s, v1.2s
+; CHECKLE-NEXT: zip2 v0.2s, v0.2s, v1.2s
+; CHECKLE-NEXT: fadd v0.2s, v2.2s, v0.2s
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrnf:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.2s }, [x0]
+; CHECKBE-NEXT: ld1 { v1.2s }, [x1]
+; CHECKBE-NEXT: zip1 v2.2s, v0.2s, v1.2s
+; CHECKBE-NEXT: zip2 v0.2s, v0.2s, v1.2s
+; CHECKBE-NEXT: fadd v0.2s, v2.2s, v0.2s
+; CHECKBE-NEXT: rev64 v0.2s, v0.2s
+; CHECKBE-NEXT: ret
   %tmp1 = load <2 x float>, <2 x float>* %A
   %tmp2 = load <2 x float>, <2 x float>* %B
   %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> <i32 0, i32 2>
@@ -54,10 +135,25 @@ define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind {
 }
 
 define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: vtrnQi8:
-;CHECK: trn1.16b
-;CHECK: trn2.16b
-;CHECK-NEXT: add.16b
+; CHECKLE-LABEL: vtrnQi8:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr q0, [x0]
+; CHECKLE-NEXT: ldr q1, [x1]
+; CHECKLE-NEXT: trn1 v2.16b, v0.16b, v1.16b
+; CHECKLE-NEXT: trn2 v0.16b, v0.16b, v1.16b
+; CHECKLE-NEXT: add v0.16b, v2.16b, v0.16b
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrnQi8:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.16b }, [x0]
+; CHECKBE-NEXT: ld1 { v1.16b }, [x1]
+; CHECKBE-NEXT: trn1 v2.16b, v0.16b, v1.16b
+; CHECKBE-NEXT: trn2 v0.16b, v0.16b, v1.16b
+; CHECKBE-NEXT: add v0.16b, v2.16b, v0.16b
+; CHECKBE-NEXT: rev64 v0.16b, v0.16b
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: ret
   %tmp1 = load <16 x i8>, <16 x i8>* %A
   %tmp2 = load <16 x i8>, <16 x i8>* %B
   %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -67,10 +163,25 @@ define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind {
 }
 
 define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: vtrnQi16:
-;CHECK: trn1.8h
-;CHECK: trn2.8h
-;CHECK-NEXT: add.8h
+; CHECKLE-LABEL: vtrnQi16:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr q0, [x0]
+; CHECKLE-NEXT: ldr q1, [x1]
+; CHECKLE-NEXT: trn1 v2.8h, v0.8h, v1.8h
+; CHECKLE-NEXT: trn2 v0.8h, v0.8h, v1.8h
+; CHECKLE-NEXT: add v0.8h, v2.8h, v0.8h
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrnQi16:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.8h }, [x0]
+; CHECKBE-NEXT: ld1 { v1.8h }, [x1]
+; CHECKBE-NEXT: trn1 v2.8h, v0.8h, v1.8h
+; CHECKBE-NEXT: trn2 v0.8h, v0.8h, v1.8h
+; CHECKBE-NEXT: add v0.8h, v2.8h, v0.8h
+; CHECKBE-NEXT: rev64 v0.8h, v0.8h
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: ret
   %tmp1 = load <8 x i16>, <8 x i16>* %A
   %tmp2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
@@ -80,10 +191,25 @@ define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 }
 
 define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: vtrnQi32:
-;CHECK: trn1.4s
-;CHECK: trn2.4s
-;CHECK-NEXT: add.4s
+; CHECKLE-LABEL: vtrnQi32:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr q0, [x0]
+; CHECKLE-NEXT: ldr q1, [x1]
+; CHECKLE-NEXT: trn1 v2.4s, v0.4s, v1.4s
+; CHECKLE-NEXT: trn2 v0.4s, v0.4s, v1.4s
+; CHECKLE-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrnQi32:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.4s }, [x0]
+; CHECKBE-NEXT: ld1 { v1.4s }, [x1]
+; CHECKBE-NEXT: trn1 v2.4s, v0.4s, v1.4s
+; CHECKBE-NEXT: trn2 v0.4s, v0.4s, v1.4s
+; CHECKBE-NEXT: add v0.4s, v2.4s, v0.4s
+; CHECKBE-NEXT: rev64 v0.4s, v0.4s
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: ret
   %tmp1 = load <4 x i32>, <4 x i32>* %A
   %tmp2 = load <4 x i32>, <4 x i32>* %B
   %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -93,10 +219,25 @@ define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind {
 }
 
 define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind {
-;CHECK-LABEL: vtrnQf:
-;CHECK: trn1.4s
-;CHECK: trn2.4s
-;CHECK-NEXT: fadd.4s
+; CHECKLE-LABEL: vtrnQf:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr q0, [x0]
+; CHECKLE-NEXT: ldr q1, [x1]
+; CHECKLE-NEXT: trn1 v2.4s, v0.4s, v1.4s
+; CHECKLE-NEXT: trn2 v0.4s, v0.4s, v1.4s
+; CHECKLE-NEXT: fadd v0.4s, v2.4s, v0.4s
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrnQf:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.4s }, [x0]
+; CHECKBE-NEXT: ld1 { v1.4s }, [x1]
+; CHECKBE-NEXT: trn1 v2.4s, v0.4s, v1.4s
+; CHECKBE-NEXT: trn2 v0.4s, v0.4s, v1.4s
+; CHECKBE-NEXT: fadd v0.4s, v2.4s, v0.4s
+; CHECKBE-NEXT: rev64 v0.4s, v0.4s
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: ret
   %tmp1 = load <4 x float>, <4 x float>* %A
   %tmp2 = load <4 x float>, <4 x float>* %B
   %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -108,10 +249,24 @@ define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind {
 ; Undef shuffle indices should not prevent matching to VTRN:
 
 define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: vtrni8_undef:
-;CHECK: trn1.8b
-;CHECK: trn2.8b
-;CHECK-NEXT: add.8b
+; CHECKLE-LABEL: vtrni8_undef:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr d0, [x0]
+; CHECKLE-NEXT: ldr d1, [x1]
+; CHECKLE-NEXT: trn1 v2.8b, v0.8b, v1.8b
+; CHECKLE-NEXT: trn2 v0.8b, v0.8b, v1.8b
+; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrni8_undef:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.8b }, [x0]
+; CHECKBE-NEXT: ld1 { v1.8b }, [x1]
+; CHECKBE-NEXT: trn1 v2.8b, v0.8b, v1.8b
+; CHECKBE-NEXT: trn2 v0.8b, v0.8b, v1.8b
+; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b
+; CHECKBE-NEXT: rev64 v0.8b, v0.8b
+; CHECKBE-NEXT: ret
   %tmp1 = load <8 x i8>, <8 x i8>* %A
   %tmp2 = load <8 x i8>, <8 x i8>* %B
   %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> <i32 0, i32 undef, i32 2, i32 10, i32 undef, i32 12, i32 6, i32 14>
@@ -121,10 +276,25 @@ define <8 x i8> @vtrni8_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 }
 
 define <8 x i16> @vtrnQi16_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: vtrnQi16_undef:
-;CHECK: trn1.8h
-;CHECK: trn2.8h
-;CHECK-NEXT: add.8h
+; CHECKLE-LABEL: vtrnQi16_undef:
+; CHECKLE: // %bb.0:
+; CHECKLE-NEXT: ldr q0, [x0]
+; CHECKLE-NEXT: ldr q1, [x1]
+; CHECKLE-NEXT: trn1 v2.8h, v0.8h, v1.8h
+; CHECKLE-NEXT: trn2 v0.8h, v0.8h, v1.8h
+; CHECKLE-NEXT: add v0.8h, v2.8h, v0.8h
+; CHECKLE-NEXT: ret
+;
+; CHECKBE-LABEL: vtrnQi16_undef:
+; CHECKBE: // %bb.0:
+; CHECKBE-NEXT: ld1 { v0.8h }, [x0]
+; CHECKBE-NEXT: ld1 { v1.8h }, [x1]
+; CHECKBE-NEXT: trn1 v2.8h, v0.8h, v1.8h
+; CHECKBE-NEXT: trn2 v0.8h, v0.8h, v1.8h
+; CHECKBE-NEXT: add v0.8h, v2.8h, v0.8h
+; CHECKBE-NEXT: rev64 v0.8h, v0.8h
+; CHECKBE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; CHECKBE-NEXT: ret
   %tmp1 = load <8 x i16>, <8 x i16>* %A
   %tmp2 = load <8 x i16>, <8 x i16>* %B
   %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> <i32 0, i32 8, i32 undef, i32 undef, i32 4, i32 12, i32 6, i32 14>
diff --git a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
index 5556e2d1033355..f274f331a5073d 100644
--- a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
@@ -1,7 +1,13 @@
-; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=armv7-none-eabi | FileCheck %s
 
 ; PR7158
 define i32 @test_pr7158() nounwind {
+; CHECK-LABEL: test_pr7158:
+; CHECK: @ %bb.0: @ %bb.nph55.bb.nph55.split_crit_edge
+; CHECK-NEXT: .LBB0_1: @ %bb.i19
+; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: b .LBB0_1
 bb.nph55.bb.nph55.split_crit_edge:
   br label %bb3
 
@@ -19,6 +25,8 @@ bb.i19: ; preds = %bb.i19, %bb3
 ; Check that the DAG combiner does not arbitrarily modify BUILD_VECTORs
 ; after legalization.
 define void @test_illegal_build_vector() nounwind {
+; CHECK-LABEL: test_illegal_build_vector:
+; CHECK: @ %bb.0: @ %entry
 entry:
   store <2 x i64> undef, <2 x i64>* undef, align 16
   %0 = load <16 x i8>, <16 x i8>* undef, align 16 ; <<16 x i8>> [#uses=1]
@@ -30,6 +38,8 @@ entry:
 ; PR22678
 ; Check CONCAT_VECTORS DAG combiner pass doesn't introduce illegal types.
 define void @test_pr22678() {
+; CHECK-LABEL: test_pr22678:
+; CHECK: @ %bb.0:
   %1 = fptoui <16 x float> undef to <16 x i8>
   store <16 x i8> %1, <16 x i8>* undef
   ret void
@@ -37,8 +47,20 @@ define void @test_pr22678() {
 
 ; Radar 8407927: Make sure that VMOVRRD gets optimized away when the result is
 ; converted back to be used as a vector type.
-; CHECK-LABEL: test_vmovrrd_combine:
 define <4 x i32> @test_vmovrrd_combine() nounwind {
+; CHECK-LABEL: test_vmovrrd_combine:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: cmp r0, #0
+; CHECK-NEXT: @ implicit-def: $q8
+; CHECK-NEXT: bne .LBB3_2
+; CHECK-NEXT: @ %bb.1: @ %bb1.preheader
+; CHECK-NEXT: vmov.i32 q8, #0x0
+; CHECK-NEXT: vext.8 q8, q8, q8, #4
+; CHECK-NEXT: .LBB3_2: @ %bb2
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
 entry:
   br i1 undef, label %bb1, label %bb2
 
@@ -47,8 +69,6 @@ bb1:
   %1 = extractelement <2 x double> %0, i32 0
   %2 = bitcast double %1 to i64
   %3 = insertelement <1 x i64> undef, i64 %2, i32 0
-; CHECK-NOT: vmov s
-; CHECK: vext.8
   %4 = shufflevector <1 x i64> %3, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
   %tmp2006.3 = bitcast <2 x i64> %4 to <16 x i8>
   %5 = shufflevector <16 x i8> %tmp2006.3, <16 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19>
@@ -63,6 +83,15 @@
 ; Test trying to do a ShiftCombine on illegal types.
 ; The vector should be split first.
 define void @lshrIllegalType(<8 x i32>* %A) nounwind {
+; CHECK-LABEL: lshrIllegalType:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128]
+; CHECK-NEXT: vshr.u32 q8, q8, #3
+; CHECK-NEXT: vst1.32 {d16, d17}, [r0:128]!
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0:128]
+; CHECK-NEXT: vshr.u32 q8, q8, #3
+; CHECK-NEXT: vst1.64 {d16, d17}, [r0:128]
+; CHECK-NEXT: bx lr
   %tmp1 = load <8 x i32>, <8 x i32>* %A
   %tmp2 = lshr <8 x i32> %tmp1, < i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   store <8 x i32> %tmp2, <8 x i32>* %A
@@ -72,6 +101,10 @@
 ; Test folding a binary vector operation with constant BUILD_VECTOR
 ; operands with i16 elements.
 define void @test_i16_constant_fold() nounwind optsize {
+; CHECK-LABEL: test_i16_constant_fold:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.i8 d16, #0x1
+; CHECK-NEXT: vst1.8 {d16}, [r0]
 entry:
   %0 = sext <4 x i1> zeroinitializer to <4 x i16>
   %1 = add <4 x i16> %0, zeroinitializer
@@ -87,8 +120,11 @@ declare void @llvm.arm.neon.vst1.p0i8.v8i8(i8*, <8 x i8>, i32) nounwind
 ; Test that loads and stores of i64 vector elements are handled as f64 values
 ; so they are not split up into i32 values. Radar 8755338.
 define void @i64_buildvector(i64* %ptr, <2 x i64>* %vp) nounwind {
-; CHECK: i64_buildvector
-; CHECK: vldr
+; CHECK-LABEL: i64_buildvector:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: bx lr
   %t0 = load i64, i64* %ptr, align 4
   %t1 = insertelement <2 x i64> undef, i64 %t0, i32 0
   store <2 x i64> %t1, <2 x i64>* %vp
@@ -96,8 +132,12 @@ define void @i64_buildvector(i64* %ptr, <2 x i64>* %vp) nounwind {
 }
 
 define void @i64_insertelement(i64* %ptr, <2 x i64>* %vp) nounwind {
-; CHECK: i64_insertelement
-; CHECK: vldr
+; CHECK-LABEL: i64_insertelement:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
+; CHECK-NEXT: vldr d16, [r0]
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: bx lr
   %t0 = load i64, i64* %ptr, align 4
   %vec = load <2 x i64>, <2 x i64>* %vp
   %t1 = insertelement <2 x i64> %vec, i64 %t0, i32 0
@@ -106,8 +146,11 @@ define void @i64_insertelement(i64* %ptr, <2 x i64>* %vp) nounwind {
 }
 
 define void @i64_extractelement(i64* %ptr, <2 x i64>* %vp) nounwind {
-; CHECK: i64_extractelement
-; CHECK: vstr
+; CHECK-LABEL: i64_extractelement:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vldr d16, [r1]
+; CHECK-NEXT: vstr d16, [r0]
+; CHECK-NEXT: bx lr
   %vec = load <2 x i64>, <2 x i64>* %vp
   %t1 = extractelement <2 x i64> %vec, i32 0
   store i64 %t1, i64* %ptr
@@ -116,6 +159,29 @@
 
 ; Test trying to do a AND Combine on illegal types.
 define void @andVec(<3 x i8>* %A) nounwind {
+; CHECK-LABEL: andVec:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, sp, #8
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: vmov.i16 d17, #0x7
+; CHECK-NEXT: str r1, [sp, #4]
+; CHECK-NEXT: add r1, sp, #4
+; CHECK-NEXT: vld1.32 {d16[0]}, [r1:32]
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: vmovl.u8 q9, d16
+; CHECK-NEXT: vand d16, d18, d17
+; CHECK-NEXT: vorr d17, d16, d16
+; CHECK-NEXT: vuzp.8 d17, d18
+; CHECK-NEXT: vst1.32 {d17[0]}, [r1:32]
+; CHECK-NEXT: vld1.32 {d17[0]}, [r1:32]
+; CHECK-NEXT: vmov.u16 r1, d16[2]
+; CHECK-NEXT: vmovl.u16 q8, d17
+; CHECK-NEXT: vmov.32 r2, d16[0]
+; CHECK-NEXT: strb r1, [r0, #2]
+; CHECK-NEXT: strh r2, [r0]
+; CHECK-NEXT: add sp, sp, #8
+; CHECK-NEXT: bx lr
   %tmp = load <3 x i8>, <3 x i8>* %A, align 4
   %and = and <3 x i8> %tmp, <i8 7, i8 7, i8 7>
   store <3 x i8> %and, <3 x i8>* %A
@@ -125,6 +191,28 @@
 
 ; Test trying to do an OR Combine on illegal types.
 define void @orVec(<3 x i8>* %A) nounwind {
+; CHECK-LABEL: orVec:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: .pad #8
+; CHECK-NEXT: sub sp, sp, #8
+; CHECK-NEXT: ldr r1, [r0]
+; CHECK-NEXT: str r1, [sp, #4]
+; CHECK-NEXT: add r1, sp, #4
+; CHECK-NEXT: vld1.32 {d16[0]}, [r1:32]
+; CHECK-NEXT: mov r1, sp
+; CHECK-NEXT: vmovl.u8 q8, d16
+; CHECK-NEXT: vorr.i16 d16, #0x7
+; CHECK-NEXT: vorr d18, d16, d16
+; CHECK-NEXT: vuzp.8 d18, d19
+; CHECK-NEXT: vst1.32 {d18[0]}, [r1:32]
+; CHECK-NEXT: vld1.32 {d18[0]}, [r1:32]
+; CHECK-NEXT: vmov.u16 r1, d16[2]
+; CHECK-NEXT: vmovl.u16 q8, d18
+; CHECK-NEXT: vmov.32 r2, d16[0]
+; CHECK-NEXT: strb r1, [r0, #2]
+; CHECK-NEXT: strh r2, [r0]
+; CHECK-NEXT: add sp, sp, #8
+; CHECK-NEXT: bx lr
   %tmp = load <3 x i8>, <3 x i8>* %A, align 4
   %or = or <3 x i8> %tmp, <i8 7, i8 7, i8 7>
   store <3 x i8> %or, <3 x i8>* %A
@@ -136,6 +224,10 @@
 ; a BUILD_VECTOR with i32 0 operands, which did not match the i16 operands
 ; of the other BUILD_VECTOR.
 define i16 @foldBuildVectors() {
+; CHECK-LABEL: foldBuildVectors:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: mov r0, #0
+; CHECK-NEXT: bx lr
   %1 = sext <8 x i8> undef to <8 x i16>
   %2 = mul <8 x i16> %1, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
   %3 = extractelement <8 x i16> %2, i32 0
@@ -144,11 +236,15 @@
 
 ; Test that we are generating vrev and vext for reverse shuffles of v8i16
 ; shuffles.
-; CHECK-LABEL: reverse_v8i16:
 define void @reverse_v8i16(<8 x i16>* %loadaddr, <8 x i16>* %storeaddr) {
+; CHECK-LABEL: reverse_v8i16:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vrev64.16 q8, q8
+; CHECK-NEXT: vext.16 q8, q8, q8, #4
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: bx lr
   %v0 = load <8 x i16>, <8 x i16>* %loadaddr
-  ; CHECK: vrev64.16
-  ; CHECK: vext.16
   %v1 = shufflevector <8 x i16> %v0, <8 x i16> undef,
              <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   store <8 x i16> %v1, <8 x i16>* %storeaddr
@@ -157,11 +253,15 @@
 
 ; Test that we are generating vrev and vext for reverse shuffles of v16i8
 ; shuffles.
-; CHECK-LABEL: reverse_v16i8:
 define void @reverse_v16i8(<16 x i8>* %loadaddr, <16 x i8>* %storeaddr) {
+; CHECK-LABEL: reverse_v16i8:
+; CHECK: @ %bb.0:
+; CHECK-NEXT: vld1.64 {d16, d17}, [r0]
+; CHECK-NEXT: vrev64.8 q8, q8
+; CHECK-NEXT: vext.8 q8, q8, q8, #8
+; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
+; CHECK-NEXT: bx lr
   %v0 = load <16 x i8>, <16 x i8>* %loadaddr
-  ; CHECK: vrev64.8
-  ; CHECK: vext.8
   %v1 = shufflevector <16 x i8> %v0, <16 x i8> undef,
       <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   store <16 x i8> %v1, <16 x i8>* %storeaddr
@@ -173,11 +273,15 @@
 ; vldr cannot handle unaligned loads.
 ; Fall back to vld1.32, which can, instead of using the general purpose loads
 ; followed by a costly sequence of instructions to build the vector register.
-; CHECK-LABEL: t3:
-; CHECK: vld1.32 {[[REG:d[0-9]+]][0]}
-; CHECK: vld1.32 {[[REG]][1]}
-; CHECK: vmull.u8 q{{[0-9]+}}, [[REG]], [[REG]]
 define <8 x i16> @t3(i8 zeroext %xf, i8* nocapture %sp0, i8* nocapture %sp1, i32* nocapture %outp) {
+; CHECK-LABEL: t3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vld1.32 {d16[0]}, [r1]
+; CHECK-NEXT: vld1.32 {d16[1]}, [r2]
+; CHECK-NEXT: vmull.u8 q8, d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
 entry:
   %pix_sp0.0.cast = bitcast i8* %sp0 to i32*
   %pix_sp0.0.copyload = load i32, i32* %pix_sp0.0.cast, align 1
@@ -197,7 +301,12 @@ declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>)
 ; Thus, check that scalar_to_vector do not interfer with that.
 define <8 x i16> @t4(i8* nocapture %sp0) {
 ; CHECK-LABEL: t4:
-; CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r0]
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vld1.32 {d16[0]}, [r0]
+; CHECK-NEXT: vmull.u8 q8, d16, d16
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
 entry:
   %pix_sp0.0.cast = bitcast i8* %sp0 to i32*
   %pix_sp0.0.copyload = load i32, i32* %pix_sp0.0.cast, align 1
@@ -210,13 +319,17 @@ entry:
 ; Make sure vector load is used for all three loads.
 ; Lowering to build vector was breaking the single use property of the load of
 ; %pix_sp0.0.copyload.
-; CHECK-LABEL: t5:
-; CHECK: vld1.32 {[[REG1:d[0-9]+]][1]}, [r0]
-; CHECK: vorr [[REG2:d[0-9]+]], [[REG1]], [[REG1]]
-; CHECK: vld1.32 {[[REG1]][0]}, [r1]
-; CHECK: vld1.32 {[[REG2]][0]}, [r2]
-; CHECK: vmull.u8 q{{[0-9]+}}, [[REG1]], [[REG2]]
 define <8 x i16> @t5(i8* nocapture %sp0, i8* nocapture %sp1, i8* nocapture %sp2) {
+; CHECK-LABEL: t5:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vld1.32 {d16[1]}, [r0]
+; CHECK-NEXT: vorr d17, d16, d16
+; CHECK-NEXT: vld1.32 {d16[0]}, [r1]
+; CHECK-NEXT: vld1.32 {d17[0]}, [r2]
+; CHECK-NEXT: vmull.u8 q8, d16, d17
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: vmov r2, r3, d17
+; CHECK-NEXT: bx lr
 entry:
   %pix_sp0.0.cast = bitcast i8* %sp0 to i32*
   %pix_sp0.0.copyload = load i32, i32* %pix_sp0.0.cast, align 1
@@ -237,10 +350,12 @@ entry:
 ; illegal type to a legal type.
 define <2 x i8> @test_truncate(<2 x i128> %in) {
 ; CHECK-LABEL: test_truncate:
-; CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-; CHECK-NEXT: mov [[BASE:r[0-9]+]], sp
-; CHECK-NEXT: vld1.32 {[[REG]][1]}, [[[BASE]]:32]
-; CHECK-NEXT: vmov r0, r1, [[REG]]
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.32 d16[0], r0
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vld1.32 {d16[1]}, [r0:32]
+; CHECK-NEXT: vmov r0, r1, d16
+; CHECK-NEXT: bx lr
 entry:
   %res = trunc <2 x i128> %in to <2 x i8>
   ret <2 x i8> %res
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
index f2d9593f26418c..2369bd8e468c00 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovn.ll
@@ -59,6 +59,56 @@ entry:
   ret <8 x i16> %out
 }
 
+define arm_aapcs_vfpcc <8 x i16> @vmovn32_trunc1_viabitcast(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LABEL: vmovn32_trunc1_viabitcast:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: vmov.f32 s8, s2
+; CHECK-NEXT: mov r0, sp
+; CHECK-NEXT: vmov.f32 s9, s6
+; CHECK-NEXT: vmov.f32 s10, s3
+; CHECK-NEXT: vmov.f32 s11, s7
+; CHECK-NEXT: vstrh.32 q2, [r0, #8]
+; CHECK-NEXT: vmov.f32 s8, s0
+; CHECK-NEXT: vmov.f32 s9, s4
+; CHECK-NEXT: vmov.f32 s10, s1
+; CHECK-NEXT: vmov.f32 s11, s5
+; CHECK-NEXT: vstrh.32 q2, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: add sp, #16
+; CHECK-NEXT: bx lr
+;
+; CHECKBE-LABEL: vmovn32_trunc1_viabitcast:
+; CHECKBE: @ %bb.0: @ %entry
+; CHECKBE-NEXT: .pad #16
+; CHECKBE-NEXT: sub sp, #16
+; CHECKBE-NEXT: vrev64.32 q2, q1
+; CHECKBE-NEXT: vrev64.32 q1, q0
+; CHECKBE-NEXT: vmov.f32 s0, s6
+; CHECKBE-NEXT: mov r0, sp
+; CHECKBE-NEXT: vmov.f32 s1, s10
+; CHECKBE-NEXT: vmov.f32 s2, s7
+; CHECKBE-NEXT: vmov.f32 s3, s11
+; CHECKBE-NEXT: vstrh.32 q0, [r0, #8]
+; CHECKBE-NEXT: vmov.f32 s0, s4
+; CHECKBE-NEXT: vmov.f32 s1, s8
+; CHECKBE-NEXT: vmov.f32 s2, s5
+; CHECKBE-NEXT: vmov.f32 s3, s9
+; CHECKBE-NEXT: vstrh.32 q0, [r0]
+; CHECKBE-NEXT: vldrb.u8 q1, [r0]
+; CHECKBE-NEXT: vrev64.8 q0, q1
+; CHECKBE-NEXT: add sp, #16
+; CHECKBE-NEXT: bx lr
+entry:
+  %b1 = bitcast <4 x i32> %src1 to <8 x i16>
+  %b2 = bitcast <4 x i32> %src2 to <8 x i16>
+  %s = shufflevector <8 x i16> %b1, <8 x i16> %b2, <16 x i32> <i32 0, i32 1, i32 8, i32 9, i32 2, i32 3, i32 10, i32 11, i32 4, i32 5, i32 12, i32 13, i32 6, i32 7, i32 14, i32 15>
+  %b3 = bitcast <16 x i16> %s to <8 x i32>
+  %out = trunc <8 x i32> %b3 to <8 x i16>
+  ret <8 x i16> %out
+}
+
 define arm_aapcs_vfpcc <16 x i8> @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src2) {
 ; CHECK-LABEL: vmovn16_trunc1:
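
The regenerated assertions in this patch come from the script named in each file's NOTE line. As a rough sketch of how they would be refreshed after a codegen change (assuming a freshly built llc is reachable on PATH; the script can also be pointed at a specific binary):

  llvm/utils/update_llc_test_checks.py \
      llvm/test/CodeGen/AArch64/arm64-trn.ll \
      llvm/test/CodeGen/ARM/vector-DAGCombine.ll \
      llvm/test/CodeGen/Thumb2/mve-vmovn.ll

The script re-runs the llc invocation from every RUN line and rewrites the matching CHECK/CHECKLE/CHECKBE blocks from the actual output, which is how the separate little-endian and big-endian expectations above are kept in sync instead of being maintained by hand.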