-
Notifications
You must be signed in to change notification settings - Fork 12k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[AArch64] Add test showing incorrect register usage of FMLAL. NFC
See D156296 (cherry picked from commit e012c5c)
- Loading branch information
1 parent
cd2570a
commit 1993d2f
Showing
1 changed file
with
123 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 | ||
; RUN: llc -mtriple=aarch64 -mattr=+fp16fml -verify-machineinstrs %s -o - 2>&1 | FileCheck %s | ||
|
||
; This tests that the by-element (indexed) fmlal/fmlal2 instructions only accept | ||
; the low registers (v0-v15) for the index operand, using inline asm clobbers to constrain which registers are available. | ||
|
||
; @test: loads one <8 x half> vector from each pointer argument, broadcasts | ||
; lane 0 of the %lhs_panel load, and feeds the result to fmlal and then fmlal2 | ||
; (fmlal2 accumulates onto the fmlal result). Returns the fadd of both results. | ||
; The accumulator %a is first routed through an inline asm statement that lists | ||
; q0-q7 as clobbered. | ||
; NOTE(review): the expected output below uses v16 as the indexed operand | ||
; (v16.h[0]), while the comment at the top of this file says only lo registers | ||
; are acceptable there; the commit title describes this test as showing | ||
; incorrect register usage - confirm the assertions are meant to record the | ||
; current (wrong) behaviour. | ||
define <4 x float> @test(ptr %lhs_panel, ptr %rhs_panel, <4 x float> %a) { | ||
; CHECK-LABEL: test: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: fmov x8, d0 | ||
; CHECK-NEXT: ldr q16, [x0] | ||
; CHECK-NEXT: ldr q17, [x1] | ||
; CHECK-NEXT: lsr x9, x8, #32 | ||
; CHECK-NEXT: //APP | ||
; CHECK-NEXT: nop | ||
; CHECK-NEXT: //NO_APP | ||
; CHECK-NEXT: mov w8, w8 | ||
; CHECK-NEXT: orr x8, x8, x9, lsl #32 | ||
; CHECK-NEXT: fmov d0, x8 | ||
; CHECK-NEXT: fmlal v0.4s, v17.4h, v16.h[0] | ||
; CHECK-NEXT: mov v1.16b, v0.16b | ||
; CHECK-NEXT: fmlal2 v1.4s, v17.4h, v16.h[0] | ||
; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s | ||
; CHECK-NEXT: ret | ||
entry: | ||
%0 = load <8 x half>, ptr %lhs_panel, align 2 | ||
%1 = load <8 x half>, ptr %rhs_panel, align 2 | ||
; An all-zero shuffle mask copies element 0 of %0 into every lane. | ||
%vecinit91 = shufflevector <8 x half> %0, <8 x half> undef, <8 x i32> zeroinitializer | ||
; "nop" asm with a register output (=r) and register input (r); the ~{q0}..~{q7} | ||
; entries mark q0-q7 as clobbered while the two loaded vectors are still live. | ||
%b = call <4 x float> asm sideeffect "nop", "=r,r,~{q0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7}"(<4 x float> %a) nounwind | ||
; Both intrinsics take %1 and the lane-0 splat as their <8 x half> operands. | ||
%vfmlal_low3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %b, <8 x half> %1, <8 x half> %vecinit91) | ||
%vfmlal_high3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %vfmlal_low3.i, <8 x half> %1, <8 x half> %vecinit91) | ||
%z = fadd <4 x float> %vfmlal_low3.i, %vfmlal_high3.i | ||
ret <4 x float> %z | ||
} | ||
|
||
; @loop: single-block loop with two zero-initialised <4 x float> accumulators. | ||
; Each iteration loads <8 x half> from %lhs_panel and %rhs_panel at an offset of | ||
; %indvars.iv half elements, broadcasts lane 0 of the lhs load, and accumulates | ||
; with fmlal into one accumulator and fmlal2 into the other. The loop exits when | ||
; the incremented induction variable equals %K zero-extended to i64; the two | ||
; accumulators are then stored to %out_tile and %out_tile + 4 floats. | ||
; %flags is not referenced in the body. | ||
; In the expected output the lane-0 broadcast is folded into the indexed operand | ||
; form (v2.h[0]) of both instructions. | ||
define void @loop(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K, i32 noundef %flags) { | ||
; CHECK-LABEL: loop: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: movi v0.2d, #0000000000000000 | ||
; CHECK-NEXT: mov w8, w3 | ||
; CHECK-NEXT: movi v1.2d, #0000000000000000 | ||
; CHECK-NEXT: .LBB1_1: // %for.body | ||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: ldr q2, [x1], #2 | ||
; CHECK-NEXT: ldr q3, [x2], #2 | ||
; CHECK-NEXT: subs x8, x8, #1 | ||
; CHECK-NEXT: fmlal v0.4s, v3.4h, v2.h[0] | ||
; CHECK-NEXT: fmlal2 v1.4s, v3.4h, v2.h[0] | ||
; CHECK-NEXT: b.ne .LBB1_1 | ||
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup | ||
; CHECK-NEXT: stp q0, q1, [x0] | ||
; CHECK-NEXT: ret | ||
entry: | ||
%wide.trip.count = zext i32 %K to i64 | ||
br label %for.body | ||
|
||
; Exit block: write both accumulators to consecutive 4-float slots of %out_tile. | ||
for.cond.cleanup: ; preds = %for.body | ||
store <4 x float> %vfmlal_low3.i, ptr %out_tile, align 4 | ||
%add.ptr1399 = getelementptr inbounds float, ptr %out_tile, i64 4 | ||
store <4 x float> %vfmlal_high3.i, ptr %add.ptr1399, align 4 | ||
ret void | ||
|
||
for.body: ; preds = %entry, %for.body | ||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] | ||
%acc0.01714 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_low3.i, %for.body ] | ||
%acc1.01713 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_high3.i, %for.body ] | ||
%add.ptr = getelementptr inbounds half, ptr %lhs_panel, i64 %indvars.iv | ||
%0 = load <8 x half>, ptr %add.ptr, align 2 | ||
%add.ptr19 = getelementptr inbounds half, ptr %rhs_panel, i64 %indvars.iv | ||
%1 = load <8 x half>, ptr %add.ptr19, align 2 | ||
; An all-zero shuffle mask copies element 0 of the lhs load into every lane. | ||
%vecinit93 = shufflevector <8 x half> %0, <8 x half> undef, <8 x i32> zeroinitializer | ||
%vfmlal_low3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %acc0.01714, <8 x half> %1, <8 x half> %vecinit93) | ||
%vfmlal_high3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %acc1.01713, <8 x half> %1, <8 x half> %vecinit93) | ||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | ||
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count | ||
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body | ||
} | ||
|
||
; @sink: same accumulate-and-store shape as a two-accumulator fmlal/fmlal2 loop, | ||
; but the lane-0 broadcast is taken from the %lhs vector argument and computed | ||
; once in the entry block, outside the loop. Each iteration loads <8 x half> | ||
; only from %rhs_panel (offset by %indvars.iv half elements). The loop exits | ||
; when the incremented induction variable equals %K zero-extended to i64, and | ||
; the accumulators are stored to %out_tile and %out_tile + 4 floats. | ||
; %lhs_panel and %flags are not referenced in the body. | ||
; In the expected output the broadcast stays as a dup ahead of the loop and both | ||
; instructions use the plain vector operand form (v0.4h) rather than an indexed | ||
; one. | ||
; NOTE(review): the function name suggests this covers sinking the broadcast | ||
; into the loop to form indexed operands - confirm the intended outcome. | ||
define void @sink(ptr %out_tile, ptr %lhs_panel, ptr %rhs_panel, i32 noundef %K, i32 noundef %flags, <8 x half> noundef %lhs) { | ||
; CHECK-LABEL: sink: | ||
; CHECK: // %bb.0: // %entry | ||
; CHECK-NEXT: movi v1.2d, #0000000000000000 | ||
; CHECK-NEXT: mov w8, w3 | ||
; CHECK-NEXT: movi v2.2d, #0000000000000000 | ||
; CHECK-NEXT: dup v0.8h, v0.h[0] | ||
; CHECK-NEXT: .LBB2_1: // %for.body | ||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: ldr q3, [x2], #2 | ||
; CHECK-NEXT: subs x8, x8, #1 | ||
; CHECK-NEXT: fmlal v1.4s, v3.4h, v0.4h | ||
; CHECK-NEXT: fmlal2 v2.4s, v3.4h, v0.4h | ||
; CHECK-NEXT: b.ne .LBB2_1 | ||
; CHECK-NEXT: // %bb.2: // %for.cond.cleanup | ||
; CHECK-NEXT: stp q1, q2, [x0] | ||
; CHECK-NEXT: ret | ||
entry: | ||
; Loop-invariant: an all-zero shuffle mask copies element 0 of %lhs into every lane. | ||
%vecinit89 = shufflevector <8 x half> %lhs, <8 x half> undef, <8 x i32> zeroinitializer | ||
%wide.trip.count = zext i32 %K to i64 | ||
br label %for.body | ||
|
||
; Exit block: write both accumulators to consecutive 4-float slots of %out_tile. | ||
for.cond.cleanup: ; preds = %for.body | ||
store <4 x float> %vfmlal_low3.i, ptr %out_tile, align 4 | ||
%add.ptr1395 = getelementptr inbounds float, ptr %out_tile, i64 4 | ||
store <4 x float> %vfmlal_high3.i, ptr %add.ptr1395, align 4 | ||
ret void | ||
|
||
for.body: ; preds = %entry, %for.body | ||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] | ||
%acc0.01702 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_low3.i, %for.body ] | ||
%acc1.01701 = phi <4 x float> [ zeroinitializer, %entry ], [ %vfmlal_high3.i, %for.body ] | ||
%add.ptr = getelementptr inbounds half, ptr %rhs_panel, i64 %indvars.iv | ||
%0 = load <8 x half>, ptr %add.ptr, align 2 | ||
%vfmlal_low3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %acc0.01702, <8 x half> %0, <8 x half> %vecinit89) | ||
%vfmlal_high3.i = tail call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %acc1.01701, <8 x half> %0, <8 x half> %vecinit89) | ||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 | ||
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count | ||
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body | ||
} | ||
|
||
|
||
; Declarations of the AArch64 NEON fmlal/fmlal2 intrinsics called by the | ||
; functions in this file: a <4 x float> accumulator plus two <8 x half> | ||
; operands, returning <4 x float>. | ||
; NOTE(review): attribute group #2 is referenced here but no matching | ||
; "attributes #2" definition is visible in this file - confirm this is intended. | ||
declare <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) #2 | ||
declare <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float>, <8 x half>, <8 x half>) #2 | ||
|