Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ARM] Add a complex dotprod test case.
- Loading branch information
1 parent
309fccd
commit e6df795
Showing 1 changed file with 203 additions and 0 deletions.
There are no files selected for viewing
203 changes: 203 additions & 0 deletions
203
llvm/test/CodeGen/Thumb2/LowOverheadLoops/arm_cmplx_dot_prod_f32.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s | ||
|
||
; Test input: a complex dot product over two float arrays, written with the | ||
; MVE vcmlaq intrinsics. The IR has two paths selected on %numSamples: | ||
;   * %numSamples > 7 (unsigned): a loop (%while.body) that consumes two | ||
;     <4 x float> vectors per source per iteration using plain loads, then | ||
;     up to two vctp32-predicated remainder blocks (%while.body9 and | ||
;     %while.body9.1). | ||
;   * otherwise: a single vctp32-predicated loop (%do.body). | ||
; Both paths meet in %if.end, which adds accumulator lanes 0+2 and lanes 1+3 | ||
; and stores the two sums through %realResult and %imagResult respectively. | ||
; The lane count handed to vctp32 is %numSamples shifted left by one, i.e. | ||
; two floats per sample (presumably interleaved real/imag pairs - confirm | ||
; against the C source this was derived from). | ||
; The autogenerated assertions below expect %do.body to be emitted as a | ||
; dlstp.32/letp loop and %while.body to be closed by an le instruction. | ||
define void @arm_cmplx_dot_prod_f32(float* %pSrcA, float* %pSrcB, i32 %numSamples, float* nocapture %realResult, float* nocapture %imagResult) { | ||
; CHECK-LABEL: arm_cmplx_dot_prod_f32: | ||
; CHECK: @ %bb.0: @ %entry | ||
; CHECK-NEXT: .save {r4, r5, r7, lr} | ||
; CHECK-NEXT: push {r4, r5, r7, lr} | ||
; CHECK-NEXT: .vsave {d8, d9} | ||
; CHECK-NEXT: vpush {d8, d9} | ||
; CHECK-NEXT: ldr.w r12, [sp, #32] | ||
; CHECK-NEXT: cmp r2, #8 | ||
; CHECK-NEXT: blo .LBB0_6 | ||
; CHECK-NEXT: @ %bb.1: @ %while.body.preheader | ||
; CHECK-NEXT: lsrs r4, r2, #2 | ||
; CHECK-NEXT: mov.w lr, #2 | ||
; CHECK-NEXT: cmp r4, #2 | ||
; CHECK-NEXT: it lt | ||
; CHECK-NEXT: lsrlt.w lr, r2, #2 | ||
; CHECK-NEXT: rsb r4, lr, r2, lsr #2 | ||
; CHECK-NEXT: vldrw.u32 q2, [r1], #32 | ||
; CHECK-NEXT: add.w lr, r4, #1 | ||
; CHECK-NEXT: vldrw.u32 q1, [r0], #32 | ||
; CHECK-NEXT: vmov.i32 q0, #0x0 | ||
; CHECK-NEXT: .LBB0_2: @ %while.body | ||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: vcmla.f32 q0, q1, q2, #0 | ||
; CHECK-NEXT: vldrw.u32 q3, [r1, #-16] | ||
; CHECK-NEXT: vldrw.u32 q4, [r0, #-16] | ||
; CHECK-NEXT: vcmla.f32 q0, q1, q2, #90 | ||
; CHECK-NEXT: vldrw.u32 q1, [r0], #32 | ||
; CHECK-NEXT: vldrw.u32 q2, [r1], #32 | ||
; CHECK-NEXT: vcmla.f32 q0, q4, q3, #0 | ||
; CHECK-NEXT: vcmla.f32 q0, q4, q3, #90 | ||
; CHECK-NEXT: le lr, .LBB0_2 | ||
; CHECK-NEXT: @ %bb.3: @ %while.end | ||
; CHECK-NEXT: vcmla.f32 q0, q1, q2, #0 | ||
; CHECK-NEXT: movs r4, #6 | ||
; CHECK-NEXT: vcmla.f32 q0, q1, q2, #90 | ||
; CHECK-NEXT: vldrw.u32 q1, [r1, #-16] | ||
; CHECK-NEXT: vldrw.u32 q2, [r0, #-16] | ||
; CHECK-NEXT: and.w r2, r4, r2, lsl #1 | ||
; CHECK-NEXT: vcmla.f32 q0, q2, q1, #0 | ||
; CHECK-NEXT: vcmla.f32 q0, q2, q1, #90 | ||
; CHECK-NEXT: cbz r2, .LBB0_8 | ||
; CHECK-NEXT: @ %bb.4: @ %while.body9 | ||
; CHECK-NEXT: cmp r2, #4 | ||
; CHECK-NEXT: vctp.32 r2 | ||
; CHECK-NEXT: vpstttt | ||
; CHECK-NEXT: vldrwt.u32 q1, [r1] | ||
; CHECK-NEXT: vldrwt.u32 q2, [r0] | ||
; CHECK-NEXT: vcmlat.f32 q0, q2, q1, #0 | ||
; CHECK-NEXT: vcmlat.f32 q0, q2, q1, #90 | ||
; CHECK-NEXT: bls .LBB0_8 | ||
; CHECK-NEXT: @ %bb.5: @ %while.body9.1 | ||
; CHECK-NEXT: subs r2, #4 | ||
; CHECK-NEXT: vctp.32 r2 | ||
; CHECK-NEXT: vpstttt | ||
; CHECK-NEXT: vldrwt.u32 q1, [r1, #16] | ||
; CHECK-NEXT: vldrwt.u32 q2, [r0, #16] | ||
; CHECK-NEXT: vcmlat.f32 q0, q2, q1, #0 | ||
; CHECK-NEXT: vcmlat.f32 q0, q2, q1, #90 | ||
; CHECK-NEXT: b .LBB0_8 | ||
; CHECK-NEXT: .LBB0_6: @ %if.else | ||
; CHECK-NEXT: lsls r4, r2, #1 | ||
; CHECK-NEXT: vmov.i32 q0, #0x0 | ||
; CHECK-NEXT: dlstp.32 lr, r4 | ||
; CHECK-NEXT: .LBB0_7: @ %do.body | ||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: vldrw.u32 q1, [r1], #16 | ||
; CHECK-NEXT: vldrw.u32 q2, [r0], #16 | ||
; CHECK-NEXT: vcmla.f32 q0, q2, q1, #0 | ||
; CHECK-NEXT: vcmla.f32 q0, q2, q1, #90 | ||
; CHECK-NEXT: letp lr, .LBB0_7 | ||
; CHECK-NEXT: .LBB0_8: @ %if.end | ||
; CHECK-NEXT: vadd.f32 s0, s0, s2 | ||
; CHECK-NEXT: vadd.f32 s2, s1, s3 | ||
; CHECK-NEXT: vstr s0, [r3] | ||
; CHECK-NEXT: vstr s2, [r12] | ||
; CHECK-NEXT: vpop {d8, d9} | ||
; CHECK-NEXT: pop {r4, r5, r7, pc} | ||
entry: | ||
; Take the two-vectors-per-iteration path only when %numSamples >= 8 | ||
; (unsigned); smaller counts go to the fully predicated loop in %if.else. | ||
%cmp = icmp ugt i32 %numSamples, 7 | ||
br i1 %cmp, label %while.body.preheader, label %if.else | ||
|
||
while.body.preheader: ; preds = %entry | ||
; Load the first <4 x float> of each source before entering the loop and form | ||
; the addresses of the second vector (+4 floats). %shr (%numSamples >> 2) | ||
; is the initial value of the loop counter phi. | ||
%vecSrcA.0.in108 = bitcast float* %pSrcA to <4 x float>* | ||
%vecSrcA.0109 = load <4 x float>, <4 x float>* %vecSrcA.0.in108, align 4 | ||
%vecSrcB.0.in106 = bitcast float* %pSrcB to <4 x float>* | ||
%vecSrcB.0107 = load <4 x float>, <4 x float>* %vecSrcB.0.in106, align 4 | ||
%pSrcB.addr.0105 = getelementptr inbounds float, float* %pSrcB, i32 4 | ||
%pSrcA.addr.0104 = getelementptr inbounds float, float* %pSrcA, i32 4 | ||
%shr = lshr i32 %numSamples, 2 | ||
br label %while.body | ||
|
||
while.body: ; preds = %while.body.preheader, %while.body | ||
; %vecSrcA.0118 / %vecSrcB.0117 carry the vectors loaded at the end of the | ||
; previous iteration (or in the preheader). %vecSrcA.0.in.in112 and | ||
; %vecSrcB.0.in.in113 are the per-iteration base pointers, which advance by | ||
; 8 floats each time round via %add.ptr3 / %add.ptr4. | ||
%vecSrcA.0118 = phi <4 x float> [ %vecSrcA.0, %while.body ], [ %vecSrcA.0109, %while.body.preheader ] | ||
%vecSrcB.0117 = phi <4 x float> [ %vecSrcB.0, %while.body ], [ %vecSrcB.0107, %while.body.preheader ] | ||
%pSrcB.addr.0116 = phi float* [ %pSrcB.addr.0, %while.body ], [ %pSrcB.addr.0105, %while.body.preheader ] | ||
%pSrcA.addr.0115 = phi float* [ %pSrcA.addr.0, %while.body ], [ %pSrcA.addr.0104, %while.body.preheader ] | ||
%vec_acc.0114 = phi <4 x float> [ %7, %while.body ], [ zeroinitializer, %while.body.preheader ] | ||
%vecSrcB.0.in.in113 = phi float* [ %add.ptr4, %while.body ], [ %pSrcB, %while.body.preheader ] | ||
%vecSrcA.0.in.in112 = phi float* [ %add.ptr3, %while.body ], [ %pSrcA, %while.body.preheader ] | ||
%blkCnt.0.in111 = phi i32 [ %blkCnt.0, %while.body ], [ %shr, %while.body.preheader ] | ||
%blkCnt.0 = add nsw i32 %blkCnt.0.in111, -1 | ||
; Every accumulate step is a pair of vcmlaq calls on the same operands with | ||
; the leading i32 selector set to 0 and then 1; the expected assembly shows | ||
; these as vcmla.f32 with #0 and #90. The loads of the second vector pair | ||
; are interleaved between the first pair of calls. | ||
%0 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.v4f32(i32 0, <4 x float> %vec_acc.0114, <4 x float> %vecSrcA.0118, <4 x float> %vecSrcB.0117) | ||
%1 = bitcast float* %pSrcA.addr.0115 to <4 x float>* | ||
%2 = load <4 x float>, <4 x float>* %1, align 4 | ||
%add.ptr3 = getelementptr inbounds float, float* %vecSrcA.0.in.in112, i32 8 | ||
%3 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.v4f32(i32 1, <4 x float> %0, <4 x float> %vecSrcA.0118, <4 x float> %vecSrcB.0117) | ||
%4 = bitcast float* %pSrcB.addr.0116 to <4 x float>* | ||
%5 = load <4 x float>, <4 x float>* %4, align 4 | ||
%add.ptr4 = getelementptr inbounds float, float* %vecSrcB.0.in.in113, i32 8 | ||
%6 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.v4f32(i32 0, <4 x float> %3, <4 x float> %2, <4 x float> %5) | ||
%7 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.v4f32(i32 1, <4 x float> %6, <4 x float> %2, <4 x float> %5) | ||
; The +12 float addresses are the second-vector pointers for the next | ||
; iteration; the vectors at +8 floats are loaded now and carried round the | ||
; back edge through the phis at the top of the block. | ||
%pSrcA.addr.0 = getelementptr inbounds float, float* %vecSrcA.0.in.in112, i32 12 | ||
%pSrcB.addr.0 = getelementptr inbounds float, float* %vecSrcB.0.in.in113, i32 12 | ||
%vecSrcB.0.in = bitcast float* %add.ptr4 to <4 x float>* | ||
%vecSrcB.0 = load <4 x float>, <4 x float>* %vecSrcB.0.in, align 4 | ||
%vecSrcA.0.in = bitcast float* %add.ptr3 to <4 x float>* | ||
%vecSrcA.0 = load <4 x float>, <4 x float>* %vecSrcA.0.in, align 4 | ||
; Loop while the counter value from before the decrement is > 2 (signed). | ||
%cmp2 = icmp sgt i32 %blkCnt.0.in111, 2 | ||
br i1 %cmp2, label %while.body, label %while.end | ||
|
||
while.end: ; preds = %while.body | ||
; Accumulate the vector pair carried out of the loop, plus one further | ||
; vector per source loaded from %pSrcA.addr.0 / %pSrcB.addr.0. | ||
%8 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.v4f32(i32 0, <4 x float> %7, <4 x float> %vecSrcA.0, <4 x float> %vecSrcB.0) | ||
%9 = bitcast float* %pSrcA.addr.0 to <4 x float>* | ||
%10 = load <4 x float>, <4 x float>* %9, align 4 | ||
%11 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.v4f32(i32 1, <4 x float> %8, <4 x float> %vecSrcA.0, <4 x float> %vecSrcB.0) | ||
%12 = bitcast float* %pSrcB.addr.0 to <4 x float>* | ||
%13 = load <4 x float>, <4 x float>* %12, align 4 | ||
%14 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.v4f32(i32 0, <4 x float> %11, <4 x float> %10, <4 x float> %13) | ||
%15 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.v4f32(i32 1, <4 x float> %14, <4 x float> %10, <4 x float> %13) | ||
; %mul = (%numSamples << 1) & 6 is the lane count given to vctp32 in the | ||
; remainder blocks; when it is zero they are skipped entirely. | ||
%and = shl i32 %numSamples, 1 | ||
%mul = and i32 %and, 6 | ||
%cmp8123.not = icmp eq i32 %mul, 0 | ||
br i1 %cmp8123.not, label %if.end, label %while.body9 | ||
|
||
while.body9: ; preds = %while.end | ||
; First remainder block: predicate from vctp32(%mul), masked loads at | ||
; +16 floats from the base pointers of the final loop iteration, then a | ||
; predicated vcmlaq pair. A second block runs only when %mul > 4. | ||
%16 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %mul) | ||
%add.ptr10 = getelementptr inbounds float, float* %vecSrcA.0.in.in112, i32 16 | ||
%add.ptr11 = getelementptr inbounds float, float* %vecSrcB.0.in.in113, i32 16 | ||
%17 = bitcast float* %add.ptr10 to <4 x float>* | ||
%18 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* nonnull %17, i32 4, <4 x i1> %16, <4 x float> zeroinitializer) | ||
%19 = bitcast float* %add.ptr11 to <4 x float>* | ||
%20 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* nonnull %19, i32 4, <4 x i1> %16, <4 x float> zeroinitializer) | ||
%21 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.predicated.v4f32.v4i1(i32 0, <4 x float> %15, <4 x float> %18, <4 x float> %20, <4 x i1> %16) | ||
%22 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.predicated.v4f32.v4i1(i32 1, <4 x float> %21, <4 x float> %18, <4 x float> %20, <4 x i1> %16) | ||
%cmp8 = icmp ugt i32 %mul, 4 | ||
br i1 %cmp8, label %while.body9.1, label %if.end | ||
|
||
if.else: ; preds = %entry | ||
; Initial lane count for the predicated loop: %numSamples << 1. | ||
%mul14 = shl nuw nsw i32 %numSamples, 1 | ||
br label %do.body | ||
|
||
do.body: ; preds = %do.body, %if.else | ||
; Per iteration: build the predicate with vctp32(%blkCnt.2), do masked loads | ||
; of one vector per source, apply the predicated vcmlaq pair (selectors 0 | ||
; and 1), then advance both pointers by 4 floats and subtract 4 from the | ||
; count. The loop repeats while the count before the subtraction is > 4 | ||
; (signed). | ||
%blkCnt.2 = phi i32 [ %mul14, %if.else ], [ %sub18, %do.body ] | ||
%vec_acc.2 = phi <4 x float> [ zeroinitializer, %if.else ], [ %29, %do.body ] | ||
%pSrcB.addr.2 = phi float* [ %pSrcB, %if.else ], [ %add.ptr17, %do.body ] | ||
%pSrcA.addr.2 = phi float* [ %pSrcA, %if.else ], [ %add.ptr16, %do.body ] | ||
%23 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %blkCnt.2) | ||
%24 = bitcast float* %pSrcA.addr.2 to <4 x float>* | ||
%25 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %24, i32 4, <4 x i1> %23, <4 x float> zeroinitializer) | ||
%26 = bitcast float* %pSrcB.addr.2 to <4 x float>* | ||
%27 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %26, i32 4, <4 x i1> %23, <4 x float> zeroinitializer) | ||
%28 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.predicated.v4f32.v4i1(i32 0, <4 x float> %vec_acc.2, <4 x float> %25, <4 x float> %27, <4 x i1> %23) | ||
%29 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.predicated.v4f32.v4i1(i32 1, <4 x float> %28, <4 x float> %25, <4 x float> %27, <4 x i1> %23) | ||
%add.ptr16 = getelementptr inbounds float, float* %pSrcA.addr.2, i32 4 | ||
%add.ptr17 = getelementptr inbounds float, float* %pSrcB.addr.2, i32 4 | ||
%sub18 = add nsw i32 %blkCnt.2, -4 | ||
%cmp19 = icmp sgt i32 %blkCnt.2, 4 | ||
br i1 %cmp19, label %do.body, label %if.end | ||
|
||
if.end: ; preds = %do.body, %while.body9, %while.body9.1, %while.end | ||
; Merge the accumulator from whichever path ran, then reduce it: lanes 0 and | ||
; 2 are summed and stored to %realResult, lanes 1 and 3 are summed and | ||
; stored to %imagResult. | ||
%vec_acc.3 = phi <4 x float> [ %15, %while.end ], [ %22, %while.body9 ], [ %40, %while.body9.1 ], [ %29, %do.body ] | ||
%30 = extractelement <4 x float> %vec_acc.3, i32 0 | ||
%31 = extractelement <4 x float> %vec_acc.3, i32 2 | ||
%add = fadd fast float %30, %31 | ||
%32 = extractelement <4 x float> %vec_acc.3, i32 1 | ||
%33 = extractelement <4 x float> %vec_acc.3, i32 3 | ||
%add20 = fadd fast float %32, %33 | ||
store float %add, float* %realResult, align 4 | ||
store float %add20, float* %imagResult, align 4 | ||
ret void | ||
|
||
while.body9.1: ; preds = %while.body9 | ||
; Second remainder block: same shape as %while.body9, with lane count | ||
; %mul - 4 and masked loads at +20 floats from the same base pointers. | ||
%sub12 = add nsw i32 %mul, -4 | ||
%34 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %sub12) | ||
%add.ptr10.1 = getelementptr inbounds float, float* %vecSrcA.0.in.in112, i32 20 | ||
%add.ptr11.1 = getelementptr inbounds float, float* %vecSrcB.0.in.in113, i32 20 | ||
%35 = bitcast float* %add.ptr10.1 to <4 x float>* | ||
%36 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* nonnull %35, i32 4, <4 x i1> %34, <4 x float> zeroinitializer) | ||
%37 = bitcast float* %add.ptr11.1 to <4 x float>* | ||
%38 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* nonnull %37, i32 4, <4 x i1> %34, <4 x float> zeroinitializer) | ||
%39 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.predicated.v4f32.v4i1(i32 0, <4 x float> %22, <4 x float> %36, <4 x float> %38, <4 x i1> %34) | ||
%40 = tail call fast <4 x float> @llvm.arm.mve.vcmlaq.predicated.v4f32.v4i1(i32 1, <4 x float> %39, <4 x float> %36, <4 x float> %38, <4 x i1> %34) | ||
br label %if.end | ||
} | ||
|
||
; Declarations of the intrinsics called by @arm_cmplx_dot_prod_f32. | ||
; NOTE(review): attribute groups #1 and #2 are referenced here but no | ||
; matching "attributes #N = { ... }" definition is visible in this excerpt - | ||
; confirm against the full file. | ||
; vcmlaq: (i32 selector, accumulator, operand, operand) -> <4 x float>. | ||
; Only selector values 0 and 1 are used in this file. | ||
declare <4 x float> @llvm.arm.mve.vcmlaq.v4f32(i32, <4 x float>, <4 x float>, <4 x float>) #1 | ||
; vctp32: i32 lane count -> <4 x i1> predicate. | ||
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1 | ||
; masked.load: (pointer, immediate i32 - the alignment per the LLVM LangRef, | ||
; <4 x i1> mask, pass-through vector) -> <4 x float>. Every call in this | ||
; file passes 4 and a zeroinitializer pass-through. | ||
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2 | ||
; Predicated vcmlaq: same operands as vcmlaq plus a trailing <4 x i1> | ||
; predicate. | ||
declare <4 x float> @llvm.arm.mve.vcmlaq.predicated.v4f32.v4i1(i32, <4 x float>, <4 x float>, <4 x float>, <4 x i1>) #1 | ||