diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll index 69e805d9ca2ee..014eaff57ce81 100644 --- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll +++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -33,6 +33,16 @@ define @splice_nxv16i8_last_idx( %a, %res } +define @splice_nxv16i8_first_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16i8_first_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv16i8( %b, %b, i32 1) + ret %res +} + define @splice_nxv8i16_first_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8i16_first_idx: ; CHECK: // %bb.0: @@ -42,6 +52,16 @@ define @splice_nxv8i16_first_idx( %a, %res } +define @splice_nxv8i16_first_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8i16_first_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #2 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv8i16( %b, %b, i32 1) + ret %res +} + define @splice_nxv4i32_first_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4i32_first_idx: ; CHECK: // %bb.0: @@ -60,6 +80,16 @@ define @splice_nxv4i32_last_idx( %a, %res } +define @splice_nxv4i32_first_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4i32_first_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4i32( %b, %b, i32 1) + ret %res +} + define @splice_nxv2i64_first_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2i64_first_idx: ; CHECK: // %bb.0: @@ -78,6 +108,16 @@ define @splice_nxv2i64_last_idx( %a, %res } +define @splice_nxv2i64_first_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2i64_first_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2i64( %b, %b, i32 1) + ret %res +} + define @splice_nxv2f16_neg_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f16_neg_idx: ; CHECK: // %bb.0: @@ -100,6 +140,18 @@ define @splice_nxv2f16_neg2_idx( %a, %res } +define @splice_nxv2f16_neg_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2f16_neg_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: rev p0.d, p0.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2f16( %b, %b, i32 -1) + ret %res +} + define @splice_nxv2f16_first_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f16_first_idx: ; CHECK: // %bb.0: @@ -118,6 +170,16 @@ define @splice_nxv2f16_last_idx( %a, %res } +define @splice_nxv2f16_first_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2f16_first_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2f16( %b, %b, i32 1) + ret %res +} + define @splice_nxv4f16_neg_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4f16_neg_idx: ; CHECK: // %bb.0: @@ -140,6 +202,18 @@ define @splice_nxv4f16_neg3_idx( %a, %res } +define @splice_nxv4f16_neg_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4f16_neg_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s, vl1 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: rev p0.s, p0.s +; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4f16( %b, %b, i32 -1) + ret %res +} + define @splice_nxv4f16_first_idx( %a, %b) #0 { ; CHECK-LABEL: 
splice_nxv4f16_first_idx: ; CHECK: // %bb.0: @@ -158,6 +232,16 @@ define @splice_nxv4f16_last_idx( %a, %res } +define @splice_nxv4f16_first_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4f16_first_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4f16( %b, %b, i32 1) + ret %res +} + define @splice_nxv8f16_first_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv8f16_first_idx: ; CHECK: // %bb.0: @@ -176,6 +260,16 @@ define @splice_nxv8f16_last_idx( %a, %res } +define @splice_nxv8f16_first_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8f16_first_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #2 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv8f16( %b, %b, i32 1) + ret %res +} + define @splice_nxv2f32_neg_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f32_neg_idx: ; CHECK: // %bb.0: @@ -198,6 +292,18 @@ define @splice_nxv2f32_neg2_idx( %a, %res } +define @splice_nxv2f32_neg_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2f32_neg_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl1 +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: rev p0.d, p0.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2f32( %b, %b, i32 -1) + ret %res +} + define @splice_nxv2f32_first_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f32_first_idx: ; CHECK: // %bb.0: @@ -216,6 +322,16 @@ define @splice_nxv2f32_last_idx( %a, %res } +define @splice_nxv2f32_first_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2f32_first_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2f32( %b, %b, i32 1) + ret %res +} + define @splice_nxv4f32_first_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv4f32_first_idx: ; CHECK: // %bb.0: @@ -234,6 +350,16 @@ define @splice_nxv4f32_last_idx( %a, %res } +define @splice_nxv4f32_first_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4f32_first_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #4 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4f32( %b, %b, i32 1) + ret %res +} + define @splice_nxv2f64_first_idx( %a, %b) #0 { ; CHECK-LABEL: splice_nxv2f64_first_idx: ; CHECK: // %bb.0: @@ -252,6 +378,16 @@ define @splice_nxv2f64_last_idx( %a, ret %res } +define @splice_nxv2f64_first_idx_unary( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2f64_first_idx_unary: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ext z0.b, z0.b, z1.b, #8 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2f64( %b, %b, i32 1) + ret %res +} + ; Ensure predicate based splice is promoted to use ZPRs. 
 define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1_idx:
@@ -710,6 +846,18 @@ define <vscale x 2 x bfloat> @splice_nxv2bf16_neg2_idx(<vscale x 2 x bfloat> %a,
   ret <vscale x 2 x bfloat> %res
 }
 
+define <vscale x 2 x bfloat> @splice_nxv2bf16_neg_idx_unary(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_neg_idx_unary:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d, vl1
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    rev p0.d, p0.d
+; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %b, i32 -1)
+  ret <vscale x 2 x bfloat> %res
+}
+
 define <vscale x 2 x bfloat> @splice_nxv2bf16_first_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
 ; CHECK-LABEL: splice_nxv2bf16_first_idx:
 ; CHECK:       // %bb.0:
@@ -728,6 +876,16 @@ define <vscale x 2 x bfloat> @splice_nxv2bf16_last_idx(<vscale x 2 x bfloat> %a,
   ret <vscale x 2 x bfloat> %res
 }
 
+define <vscale x 2 x bfloat> @splice_nxv2bf16_first_idx_unary(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv2bf16_first_idx_unary:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %b, <vscale x 2 x bfloat> %b, i32 1)
+  ret <vscale x 2 x bfloat> %res
+}
+
 define <vscale x 4 x bfloat> @splice_nxv4bf16_neg_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
 ; CHECK-LABEL: splice_nxv4bf16_neg_idx:
 ; CHECK:       // %bb.0:
@@ -750,6 +908,18 @@ define <vscale x 4 x bfloat> @splice_nxv4bf16_neg3_idx(<vscale x 4 x bfloat> %a,
   ret <vscale x 4 x bfloat> %res
 }
 
+define <vscale x 4 x bfloat> @splice_nxv4bf16_neg_idx_unary(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_neg_idx_unary:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s, vl1
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    rev p0.s, p0.s
+; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %b, i32 -1)
+  ret <vscale x 4 x bfloat> %res
+}
+
 define <vscale x 4 x bfloat> @splice_nxv4bf16_first_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
 ; CHECK-LABEL: splice_nxv4bf16_first_idx:
 ; CHECK:       // %bb.0:
@@ -768,6 +938,16 @@ define <vscale x 4 x bfloat> @splice_nxv4bf16_last_idx(<vscale x 4 x bfloat> %a,
   ret <vscale x 4 x bfloat> %res
 }
 
+define <vscale x 4 x bfloat> @splice_nxv4bf16_first_idx_unary(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv4bf16_first_idx_unary:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #4
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %b, <vscale x 4 x bfloat> %b, i32 1)
+  ret <vscale x 4 x bfloat> %res
+}
+
 define <vscale x 8 x bfloat> @splice_nxv8bf16_first_idx(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
 ; CHECK-LABEL: splice_nxv8bf16_first_idx:
 ; CHECK:       // %bb.0:
@@ -786,6 +966,16 @@ define <vscale x 8 x bfloat> @splice_nxv8bf16_last_idx(<vscale x 8 x bfloat> %a,
   ret <vscale x 8 x bfloat> %res
 }
 
+define <vscale x 8 x bfloat> @splice_nxv8bf16_first_idx_unary(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
+; CHECK-LABEL: splice_nxv8bf16_first_idx_unary:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #2
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %b, i32 1)
+  ret <vscale x 8 x bfloat> %res
+}
+
 ; Ensure predicate based splice is promoted to use ZPRs.
 define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
 ; CHECK-LABEL: splice_nxv2i1:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
index 00002dd3269a2..800f95d97af4c 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-subvector.ll
@@ -5,6 +5,12 @@
 
 target triple = "aarch64-unknown-linux-gnu"
 
+; Note that both the vector.extract intrinsics and SK_ExtractSubvector
+; shufflevector instructions get detected as an extract_subvector ISD node in
+; SelectionDAG. We'll test both cases for the sake of completeness, even though
+; vector.extract intrinsics should get lowered into shufflevector by the time we
+; reach the backend.
+
 ; i8
 
 ; Don't use SVE for 64-bit vectors.
@@ -40,6 +46,67 @@ define void @extract_subvector_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ret void } +define void @extract_v32i8_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(2,2) { +; CHECK-LABEL: extract_v32i8_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <32 x i8>, ptr %in + %hi = shufflevector <32 x i8> %b, <32 x i8> poison, <16 x i32> + store <16 x i8> %hi, ptr %out + %lo = shufflevector <32 x i8> %b, <32 x i8> poison, <16 x i32> + store <16 x i8> %lo, ptr %out2 + ret void +} + +define void @extract_v32i8_half_unaligned(ptr %in, ptr %out) #0 vscale_range(2,2) { +; CHECK-LABEL: extract_v32i8_half_unaligned: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #4 +; CHECK-NEXT: str q0, [x1] +; CHECK-NEXT: ret +entry: + %b = load <32 x i8>, ptr %in + %d = shufflevector <32 x i8> %b, <32 x i8> poison, <16 x i32> + store <16 x i8> %d, ptr %out + ret void +} + +define void @extract_v32i8_quarters(ptr %in, ptr %out, ptr %out2, ptr %out3, ptr %out4) #0 vscale_range(2,2) { +; CHECK-LABEL: extract_v32i8_quarters: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: ext z2.b, z2.b, z0.b, #24 +; CHECK-NEXT: str d1, [x1] +; CHECK-NEXT: str d2, [x2] +; CHECK-NEXT: str d0, [x3] +; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8 +; CHECK-NEXT: str d0, [x4] +; CHECK-NEXT: ret +entry: + %b = load <32 x i8>, ptr %in + %hilo = shufflevector <32 x i8> %b, <32 x i8> poison, <8 x i32> + store <8 x i8> %hilo, ptr %out + %hihi = shufflevector <32 x i8> %b, <32 x i8> poison, <8 x i32> + store <8 x i8> %hihi, ptr %out2 + %lolo = shufflevector <32 x i8> %b, <32 x i8> poison, <8 x i32> + store <8 x i8> %lolo, ptr %out3 + %lohi = shufflevector <32 x i8> %b, <32 x i8> poison, <8 x i32> + store <8 x i8> %lohi, ptr %out4 + ret void +} + define void @extract_subvector_v64i8(ptr %a, ptr %b) #0 { ; CHECK-LABEL: extract_subvector_v64i8: ; CHECK: // %bb.0: @@ -54,6 +121,25 @@ define void @extract_subvector_v64i8(ptr %a, ptr %b) #0 { ret void } +define void @extract_v64i8_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(4,4) { +; CHECK-LABEL: extract_v64i8_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 +; CHECK-NEXT: st1b { z1.b }, p0, [x1] +; CHECK-NEXT: st1b { z0.b }, p0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <64 x i8>, ptr %in + %hi = shufflevector <64 x i8> %b, <64 x i8> poison, <32 x i32> + store <32 x i8> %hi, ptr %out + %lo = shufflevector <64 x i8> %b, <64 x i8> poison, <32 x i32> + store <32 x i8> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v128i8: ; CHECK: // %bb.0: @@ -117,6 +203,24 @@ define void @extract_subvector_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ret void } +define void @extract_v16i16_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(2,2) { +; CHECK-LABEL: extract_v16i16_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: 
str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <16 x i16>, ptr %in + %hi = shufflevector <16 x i16> %b, <16 x i16> poison, <8 x i32> + store <8 x i16> %hi, ptr %out + %lo = shufflevector <16 x i16> %b, <16 x i16> poison, <8 x i32> + store <8 x i16> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v32i16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: extract_subvector_v32i16: ; CHECK: // %bb.0: @@ -131,6 +235,25 @@ define void @extract_subvector_v32i16(ptr %a, ptr %b) #0 { ret void } +define void @extract_v32i16_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(4,4) { +; CHECK-LABEL: extract_v32i16_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: st1h { z0.h }, p0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <32 x i16>, ptr %in + %hi = shufflevector <32 x i16> %b, <32 x i16> poison, <16 x i32> + store <16 x i16> %hi, ptr %out + %lo = shufflevector <32 x i16> %b, <32 x i16> poison, <16 x i32> + store <16 x i16> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v64i16: ; CHECK: // %bb.0: @@ -195,6 +318,24 @@ define void @extract_subvector_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ret void } +define void @extract_v8i32_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(2,2) { +; CHECK-LABEL: extract_v8i32_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <8 x i32>, ptr %in + %hi = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> + store <4 x i32> %hi, ptr %out + %lo = shufflevector <8 x i32> %b, <8 x i32> poison, <4 x i32> + store <4 x i32> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v16i32(ptr %a, ptr %b) #0 { ; CHECK-LABEL: extract_subvector_v16i32: ; CHECK: // %bb.0: @@ -209,6 +350,25 @@ define void @extract_subvector_v16i32(ptr %a, ptr %b) #0 { ret void } +define void @extract_v16i32_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(4,4) { +; CHECK-LABEL: extract_v16i32_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 +; CHECK-NEXT: st1w { z1.s }, p0, [x1] +; CHECK-NEXT: st1w { z0.s }, p0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <16 x i32>, ptr %in + %hi = shufflevector <16 x i32> %b, <16 x i32> poison, <8 x i32> + store <8 x i32> %hi, ptr %out + %lo = shufflevector <16 x i32> %b, <16 x i32> poison, <8 x i32> + store <8 x i32> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v32i32: ; CHECK: // %bb.0: @@ -262,6 +422,24 @@ define void @extract_subvector_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ret void } +define void @extract_v4i64_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(2,2) { +; CHECK-LABEL: extract_v4i64_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <4 x i64>, ptr %in + %hi = shufflevector <4 x i64> %b, <4 x i64> poison, <2 x i32> + store <2 x i64> %hi, ptr %out + %lo = shufflevector <4 x i64> %b, 
<4 x i64> poison, <2 x i32> + store <2 x i64> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v8i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: extract_subvector_v8i64: ; CHECK: // %bb.0: @@ -276,6 +454,25 @@ define void @extract_subvector_v8i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ret void } +define void @extract_v8i64_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(4,4) { +; CHECK-LABEL: extract_v8i64_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 +; CHECK-NEXT: st1d { z1.d }, p0, [x1] +; CHECK-NEXT: st1d { z0.d }, p0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <8 x i64>, ptr %in + %hi = shufflevector <8 x i64> %b, <8 x i64> poison, <4 x i32> + store <4 x i64> %hi, ptr %out + %lo = shufflevector <8 x i64> %b, <8 x i64> poison, <4 x i32> + store <4 x i64> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v16i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: extract_subvector_v16i64: ; VBITS_GE_256: // %bb.0: @@ -352,6 +549,24 @@ define void @extract_subvector_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ret void } +define void @extract_v16half_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(2,2) { +; CHECK-LABEL: extract_v16half_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <16 x half>, ptr %in + %hi = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + store <8 x half> %hi, ptr %out + %lo = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> + store <8 x half> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v32f16(ptr %a, ptr %b) #0 { ; CHECK-LABEL: extract_subvector_v32f16: ; CHECK: // %bb.0: @@ -366,6 +581,25 @@ define void @extract_subvector_v32f16(ptr %a, ptr %b) #0 { ret void } +define void @extract_v32half_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(4,4) { +; CHECK-LABEL: extract_v32half_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: st1h { z0.h }, p0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <32 x half>, ptr %in + %hi = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + store <16 x half> %hi, ptr %out + %lo = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> + store <16 x half> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v64f16: ; CHECK: // %bb.0: @@ -430,6 +664,24 @@ define void @extract_subvector_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ret void } +define void @extract_v8float_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(2,2) { +; CHECK-LABEL: extract_v8float_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <8 x float>, ptr %in + %hi = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + store <4 x float> %hi, ptr %out + %lo = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> + store <4 x float> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v16f32(ptr %a, ptr %b) #0 { ; 
CHECK-LABEL: extract_subvector_v16f32: ; CHECK: // %bb.0: @@ -444,6 +696,25 @@ define void @extract_subvector_v16f32(ptr %a, ptr %b) #0 { ret void } +define void @extract_v16float_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(4,4) { +; CHECK-LABEL: extract_v16float_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 +; CHECK-NEXT: st1w { z1.s }, p0, [x1] +; CHECK-NEXT: st1w { z0.s }, p0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <16 x float>, ptr %in + %hi = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + store <8 x float> %hi, ptr %out + %lo = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> + store <8 x float> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v32f32: ; CHECK: // %bb.0: @@ -497,6 +768,24 @@ define void @extract_subvector_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ret void } +define void @extract_v4double_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(2,2) { +; CHECK-LABEL: extract_v4double_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16 +; CHECK-NEXT: str q1, [x1] +; CHECK-NEXT: str q0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <4 x double>, ptr %in + %hi = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + store <2 x double> %hi, ptr %out + %lo = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> + store <2 x double> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v8f64(ptr %a, ptr %b) #0 { ; CHECK-LABEL: extract_subvector_v8f64: ; CHECK: // %bb.0: @@ -511,6 +800,25 @@ define void @extract_subvector_v8f64(ptr %a, ptr %b) #0 { ret void } +define void @extract_v8double_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(4,4) { +; CHECK-LABEL: extract_v8double_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ext z1.b, z1.b, z0.b, #32 +; CHECK-NEXT: st1d { z1.d }, p0, [x1] +; CHECK-NEXT: st1d { z0.d }, p0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <8 x double>, ptr %in + %hi = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + store <4 x double> %hi, ptr %out + %lo = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> + store <4 x double> %lo, ptr %out2 + ret void +} + define void @extract_subvector_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: extract_subvector_v16f64: ; CHECK: // %bb.0: @@ -539,13 +847,65 @@ define void @extract_subvector_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ret void } +; bf16 + +define void @extract_v8bfloat_halves(ptr %in, ptr %out, ptr %out2) #0 { +; CHECK-LABEL: extract_v8bfloat_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: str d1, [x1] +; CHECK-NEXT: str d0, [x2] +; CHECK-NEXT: ret +entry: + %b = load <8 x bfloat>, ptr %in + %hi = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <4 x i32> + store <4 x bfloat> %hi, ptr %out + %lo = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <4 x i32> + store <4 x bfloat> %lo, ptr %out2 + ret void +} + +define void @extract_v16bfloat_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(2,2) { +; CHECK-LABEL: extract_v16bfloat_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: str 
q0, [x1] +; CHECK-NEXT: str q1, [x2] +; CHECK-NEXT: ret +entry: + %b = load <16 x bfloat>, ptr %in + %hi = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <8 x i32> + store <8 x bfloat> %hi, ptr %out + %lo = shufflevector <16 x bfloat> %b, <16 x bfloat> poison, <8 x i32> + store <8 x bfloat> %lo, ptr %out2 + ret void +} + +define void @extract_v32bfloat_halves(ptr %in, ptr %out, ptr %out2) #0 vscale_range(4,4) { +; CHECK-LABEL: extract_v32bfloat_halves: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: stp q0, q1, [x1] +; CHECK-NEXT: stp q3, q2, [x2] +; CHECK-NEXT: ret +entry: + %b = load <32 x bfloat>, ptr %in + %hi = shufflevector <32 x bfloat> %b, <32 x bfloat> poison, <16 x i32> + store <16 x bfloat> %hi, ptr %out + %lo = shufflevector <32 x bfloat> %b, <32 x bfloat> poison, <16 x i32> + store <16 x bfloat> %lo, ptr %out2 + ret void +} + ; Test for infinite loop due to fold: ; extract_subvector(insert_subvector(x,y,c1),c2)--> extract_subvector(y,c2-c1) define void @extract_subvector_legalization_v8i32() vscale_range(2,2) #0 { ; CHECK-LABEL: extract_subvector_legalization_v8i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: adrp x8, .LCPI40_0 -; CHECK-NEXT: add x8, x8, :lo12:.LCPI40_0 +; CHECK-NEXT: adrp x8, .LCPI59_0 +; CHECK-NEXT: add x8, x8, :lo12:.LCPI59_0 ; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ldr z0, [x8] ; CHECK-NEXT: mov z1.d, z0.d @@ -556,11 +916,11 @@ define void @extract_subvector_legalization_v8i32() vscale_range(2,2) #0 { ; CHECK-NEXT: sunpklo z1.d, z1.s ; CHECK-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: .LBB40_1: // %body +; CHECK-NEXT: .LBB59_1: // %body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: st1d { z0.d }, p1, [x8] ; CHECK-NEXT: st1d { z0.d }, p0, [x8] -; CHECK-NEXT: b .LBB40_1 +; CHECK-NEXT: b .LBB59_1 entry: %splat = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer br label %body