[ARM] Select VEXT for bfloat NEON vectors and add bf16 shuffle tests

Summary of this patch:
  * ARMInstrNEON.td: adds two selection patterns so that NEONvext nodes of
    type v4bf16 / v8bf16 are matched to VEXTd16 / VEXTq16, next to the
    existing v4f16 / v8f16 patterns and under the same HasNEON predicate.
  * bf16-shuffle.ll (new): llc/FileCheck coverage for vbsl, vmov_n/vdup_n,
    vdup_lane, vext, vld1-dup and a 4->8 element concatenating shuffle on
    bfloat vectors. The vzip/vuzp/vtrn/vrev64 tests are present in the file
    but commented out.

NOTE(review): the shufflevector mask constants in the live tests follow from
the CHECK lines (vdup lane 3, vext #2 / #5, d0 copied into d1). The masks in
the commented-out vzip/vuzp/vtrn/vrev64 tests use the canonical NEON
permutation index patterns; the mask of test_shufflevector8xbfloat is written
as the identity concatenation 0..7. Confirm both against the upstream test.
NOTE(review): leading whitespace on the TableGen context/continuation lines
is assumed to be ten spaces (aligned under the pattern operand); confirm
against the target tree if the hunks do not apply cleanly.

diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index cdad8e106de61..1f4d16a67055b 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -7084,6 +7084,8 @@ def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> {
 let Predicates = [HasNEON] in {
 def : Pat<(v4f16 (NEONvext (v4f16 DPR:$Vn), (v4f16 DPR:$Vm), (i32 imm:$index))),
           (VEXTd16 DPR:$Vn, DPR:$Vm, imm:$index)>;
+def : Pat<(v4bf16 (NEONvext (v4bf16 DPR:$Vn), (v4bf16 DPR:$Vm), (i32 imm:$index))),
+          (VEXTd16 DPR:$Vn, DPR:$Vm, imm:$index)>;
 }
 
 def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> {
@@ -7105,6 +7107,8 @@ def VEXTq16 : VEXTq<"vext", "16", v8i16, imm0_7> {
 let Predicates = [HasNEON] in {
 def : Pat<(v8f16 (NEONvext (v8f16 QPR:$Vn), (v8f16 QPR:$Vm), (i32 imm:$index))),
           (VEXTq16 QPR:$Vn, QPR:$Vm, imm:$index)>;
+def : Pat<(v8bf16 (NEONvext (v8bf16 QPR:$Vn), (v8bf16 QPR:$Vm), (i32 imm:$index))),
+          (VEXTq16 QPR:$Vn, QPR:$Vm, imm:$index)>;
 }
 
 def VEXTq32 : VEXTq<"vext", "32", v4i32, imm0_3> {
diff --git a/llvm/test/CodeGen/ARM/bf16-shuffle.ll b/llvm/test/CodeGen/ARM/bf16-shuffle.ll
new file mode 100644
index 0000000000000..726eb75332c36
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/bf16-shuffle.ll
@@ -0,0 +1,286 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=arm-eabi -mattr=+v8.6a,+neon -float-abi=hard < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
+; RUN: llc -mtriple=arm-eabi -mattr=+v8.6a,+neon,+bf16 -float-abi=hard < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
+; RUN: llc -mtriple=arm-eabi -mattr=+v8.6a,+neon,+fullfp16,+bf16 -float-abi=hard < %s | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
+
+%struct.float16x4x2_t = type { [2 x <4 x bfloat>] }
+%struct.float16x8x2_t = type { [2 x <8 x bfloat>] }
+
+define dso_local <4 x bfloat> @test_vbsl_bf16(<4 x i16> %a, <4 x bfloat> %b, <4 x bfloat> %c) {
+; CHECK-LABEL: test_vbsl_bf16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vbsl d0, d1, d2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <4 x i16> %a to <8 x i8>
+  %1 = bitcast <4 x bfloat> %b to <8 x i8>
+  %2 = bitcast <4 x bfloat> %c to <8 x i8>
+  %vbsl_v.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %0, <8 x i8> %1, <8 x i8> %2)
+  %3 = bitcast <8 x i8> %vbsl_v.i to <4 x bfloat>
+  ret <4 x bfloat> %3
+}
+
+define dso_local <8 x bfloat> @test_vbslq_bf16(<8 x i16> %a, <8 x bfloat> %b, <8 x bfloat> %c) {
+; CHECK-LABEL: test_vbslq_bf16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vbsl q0, q1, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = bitcast <8 x i16> %a to <16 x i8>
+  %1 = bitcast <8 x bfloat> %b to <16 x i8>
+  %2 = bitcast <8 x bfloat> %c to <16 x i8>
+  %vbslq_v.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2)
+  %3 = bitcast <16 x i8> %vbslq_v.i to <8 x bfloat>
+  ret <8 x bfloat> %3
+}
+
+;define dso_local %struct.float16x4x2_t @test_vzip_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+;entry:
+;  %vzip.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+;  %vzip1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+;  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vzip.i, 0, 0
+;  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vzip1.i, 0, 1
+;  ret %struct.float16x4x2_t %.fca.0.1.insert
+;}
+
+;define dso_local %struct.float16x8x2_t @test_vzipq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+;entry:
+;  %vzip.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+;  %vzip1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+;  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vzip.i, 0, 0
+;  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vzip1.i, 0, 1
+;  ret %struct.float16x8x2_t %.fca.0.1.insert
+;}
+
+;define dso_local %struct.float16x4x2_t @test_vuzp_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+;entry:
+;  %vuzp.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+;  %vuzp1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+;  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vuzp.i, 0, 0
+;  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vuzp1.i, 0, 1
+;  ret %struct.float16x4x2_t %.fca.0.1.insert
+;}
+
+;define dso_local %struct.float16x8x2_t @test_vuzpq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+;entry:
+;  %vuzp.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+;  %vuzp1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+;  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vuzp.i, 0, 0
+;  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vuzp1.i, 0, 1
+;  ret %struct.float16x8x2_t %.fca.0.1.insert
+;}
+
+;define dso_local %struct.float16x4x2_t @test_vtrn_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+;entry:
+;  %vtrn.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+;  %vtrn1.i = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+;  %.fca.0.0.insert = insertvalue %struct.float16x4x2_t undef, <4 x bfloat> %vtrn.i, 0, 0
+;  %.fca.0.1.insert = insertvalue %struct.float16x4x2_t %.fca.0.0.insert, <4 x bfloat> %vtrn1.i, 0, 1
+;  ret %struct.float16x4x2_t %.fca.0.1.insert
+;}
+
+;define dso_local %struct.float16x8x2_t @test_vtrnq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+;entry:
+;  %vtrn.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+;  %vtrn1.i = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+;  %.fca.0.0.insert = insertvalue %struct.float16x8x2_t undef, <8 x bfloat> %vtrn.i, 0, 0
+;  %.fca.0.1.insert = insertvalue %struct.float16x8x2_t %.fca.0.0.insert, <8 x bfloat> %vtrn1.i, 0, 1
+;  ret %struct.float16x8x2_t %.fca.0.1.insert
+;}
+
+define dso_local <4 x bfloat> @test_vmov_n_bf16(float %a.coerce) {
+; CHECK-NOFP16-LABEL: test_vmov_n_bf16:
+; CHECK-NOFP16:       @ %bb.0: @ %entry
+; CHECK-NOFP16-NEXT:    .pad #4
+; CHECK-NOFP16-NEXT:    sub sp, sp, #4
+; CHECK-NOFP16-NEXT:    vmov r0, s0
+; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
+; CHECK-NOFP16-NEXT:    add r0, sp, #2
+; CHECK-NOFP16-NEXT:    vld1.16 {d0[]}, [r0:16]
+; CHECK-NOFP16-NEXT:    add sp, sp, #4
+; CHECK-NOFP16-NEXT:    bx lr
+;
+; CHECK-FP16-LABEL: test_vmov_n_bf16:
+; CHECK-FP16:       @ %bb.0: @ %entry
+; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
+; CHECK-FP16-NEXT:    vdup.16 d0, d0[0]
+; CHECK-FP16-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
+  %vecinit = insertelement <4 x bfloat> undef, bfloat %1, i32 0
+  %vecinit4 = shufflevector <4 x bfloat> %vecinit, <4 x bfloat> undef, <4 x i32> zeroinitializer
+  ret <4 x bfloat> %vecinit4
+}
+
+define dso_local <8 x bfloat> @test_vmovq_n_bf16(float %a.coerce) {
+; CHECK-NOFP16-LABEL: test_vmovq_n_bf16:
+; CHECK-NOFP16:       @ %bb.0: @ %entry
+; CHECK-NOFP16-NEXT:    .pad #4
+; CHECK-NOFP16-NEXT:    sub sp, sp, #4
+; CHECK-NOFP16-NEXT:    vmov r0, s0
+; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
+; CHECK-NOFP16-NEXT:    add r0, sp, #2
+; CHECK-NOFP16-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
+; CHECK-NOFP16-NEXT:    add sp, sp, #4
+; CHECK-NOFP16-NEXT:    bx lr
+;
+; CHECK-FP16-LABEL: test_vmovq_n_bf16:
+; CHECK-FP16:       @ %bb.0: @ %entry
+; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
+; CHECK-FP16-NEXT:    vdup.16 q0, d0[0]
+; CHECK-FP16-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
+  %vecinit = insertelement <8 x bfloat> undef, bfloat %1, i32 0
+  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  ret <8 x bfloat> %vecinit8
+}
+
+define dso_local <4 x bfloat> @test_vdup_n_bf16(float %a.coerce) {
+; CHECK-NOFP16-LABEL: test_vdup_n_bf16:
+; CHECK-NOFP16:       @ %bb.0: @ %entry
+; CHECK-NOFP16-NEXT:    .pad #4
+; CHECK-NOFP16-NEXT:    sub sp, sp, #4
+; CHECK-NOFP16-NEXT:    vmov r0, s0
+; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
+; CHECK-NOFP16-NEXT:    add r0, sp, #2
+; CHECK-NOFP16-NEXT:    vld1.16 {d0[]}, [r0:16]
+; CHECK-NOFP16-NEXT:    add sp, sp, #4
+; CHECK-NOFP16-NEXT:    bx lr
+;
+; CHECK-FP16-LABEL: test_vdup_n_bf16:
+; CHECK-FP16:       @ %bb.0: @ %entry
+; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
+; CHECK-FP16-NEXT:    vdup.16 d0, d0[0]
+; CHECK-FP16-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
+  %vecinit = insertelement <4 x bfloat> undef, bfloat %1, i32 0
+  %vecinit4 = shufflevector <4 x bfloat> %vecinit, <4 x bfloat> undef, <4 x i32> zeroinitializer
+  ret <4 x bfloat> %vecinit4
+}
+
+define dso_local <8 x bfloat> @test_vdupq_n_bf16(float %a.coerce) {
+; CHECK-NOFP16-LABEL: test_vdupq_n_bf16:
+; CHECK-NOFP16:       @ %bb.0: @ %entry
+; CHECK-NOFP16-NEXT:    .pad #4
+; CHECK-NOFP16-NEXT:    sub sp, sp, #4
+; CHECK-NOFP16-NEXT:    vmov r0, s0
+; CHECK-NOFP16-NEXT:    strh r0, [sp, #2]
+; CHECK-NOFP16-NEXT:    add r0, sp, #2
+; CHECK-NOFP16-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
+; CHECK-NOFP16-NEXT:    add sp, sp, #4
+; CHECK-NOFP16-NEXT:    bx lr
+;
+; CHECK-FP16-LABEL: test_vdupq_n_bf16:
+; CHECK-FP16:       @ %bb.0: @ %entry
+; CHECK-FP16-NEXT:    @ kill: def $s0 killed $s0 def $d0
+; CHECK-FP16-NEXT:    vdup.16 q0, d0[0]
+; CHECK-FP16-NEXT:    bx lr
+entry:
+  %0 = bitcast float %a.coerce to i32
+  %tmp.0.extract.trunc = trunc i32 %0 to i16
+  %1 = bitcast i16 %tmp.0.extract.trunc to bfloat
+  %vecinit = insertelement <8 x bfloat> undef, bfloat %1, i32 0
+  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  ret <8 x bfloat> %vecinit8
+}
+
+define dso_local <4 x bfloat> @test_vdup_lane_bf16(<4 x bfloat> %a) {
+; CHECK-LABEL: test_vdup_lane_bf16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdup.16 d0, d0[3]
+; CHECK-NEXT:    bx lr
+entry:
+  %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  ret <4 x bfloat> %shuffle
+}
+
+define dso_local <8 x bfloat> @test_vdupq_lane_bf16(<4 x bfloat> %a) {
+; CHECK-LABEL: test_vdupq_lane_bf16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    vdup.16 q0, d0[3]
+; CHECK-NEXT:    bx lr
+entry:
+  %shuffle = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret <8 x bfloat> %shuffle
+}
+
+define dso_local <4 x bfloat> @test_vext_bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-LABEL: test_vext_bf16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vext.16 d0, d0, d1, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %vext = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x bfloat> %vext
+}
+
+define dso_local <8 x bfloat> @test_vextq_bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-LABEL: test_vextq_bf16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vext.16 q0, q0, q1, #5
+; CHECK-NEXT:    bx lr
+entry:
+  %vext = shufflevector <8 x bfloat> %a, <8 x bfloat> %b, <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+  ret <8 x bfloat> %vext
+}
+
+;define dso_local <4 x bfloat> @test_vrev64_bf16(<4 x bfloat> %a) {
+;entry:
+;  %shuffle.i = shufflevector <4 x bfloat> %a, <4 x bfloat> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+;  ret <4 x bfloat> %shuffle.i
+;}
+
+;define dso_local <8 x bfloat> @test_vrev64q_bf16(<8 x bfloat> %a) {
+;entry:
+;  %shuffle.i = shufflevector <8 x bfloat> %a, <8 x bfloat> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+;  ret <8 x bfloat> %shuffle.i
+;}
+
+define <4 x bfloat> @test_vld_dup1_4xbfloat(bfloat* %b) {
+; CHECK-LABEL: test_vld_dup1_4xbfloat:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vld1.16 {d0[]}, [r0:16]
+; CHECK-NEXT:    bx lr
+entry:
+  %b1 = load bfloat, bfloat* %b, align 2
+  %vecinit = insertelement <4 x bfloat> undef, bfloat %b1, i32 0
+  %vecinit2 = insertelement <4 x bfloat> %vecinit, bfloat %b1, i32 1
+  %vecinit3 = insertelement <4 x bfloat> %vecinit2, bfloat %b1, i32 2
+  %vecinit4 = insertelement <4 x bfloat> %vecinit3, bfloat %b1, i32 3
+  ret <4 x bfloat> %vecinit4
+}
+
+define <8 x bfloat> @test_vld_dup1_8xbfloat(bfloat* %b) local_unnamed_addr {
+; CHECK-LABEL: test_vld_dup1_8xbfloat:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vld1.16 {d0[], d1[]}, [r0:16]
+; CHECK-NEXT:    bx lr
+entry:
+  %b1 = load bfloat, bfloat* %b, align 2
+  %vecinit = insertelement <8 x bfloat> undef, bfloat %b1, i32 0
+  %vecinit8 = shufflevector <8 x bfloat> %vecinit, <8 x bfloat> undef, <8 x i32> zeroinitializer
+  ret <8 x bfloat> %vecinit8
+}
+
+define <8 x bfloat> @test_shufflevector8xbfloat(<4 x bfloat> %a) {
+; CHECK-LABEL: test_shufflevector8xbfloat:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    vmov.f64 d1, d0
+; CHECK-NEXT:    bx lr
+entry:
+  %r = shufflevector <4 x bfloat> %a, <4 x bfloat> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x bfloat> %r
+}
+
+declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>)
+declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)