-
Notifications
You must be signed in to change notification settings - Fork 11.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ARM][LowOverheadLoops] Change predicate inspection
Use the already provided helper function to get the operand type so that we can detect whether the vpr is being used as a predicate or not. Also use existing helpers to get the predicate indices when converting the vpt blocks. This enables us to support both types of vpr predicate operand. Differential Revision: https://reviews.llvm.org/D72504
- Loading branch information
1 parent
d94d079
commit bad6032
Showing
2 changed files
with
257 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
230 changes: 230 additions & 0 deletions
230
llvm/test/CodeGen/Thumb2/LowOverheadLoops/vmaxmin_vpred_r.mir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py | ||
# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s | ||
|
||
--- | | ||
; Test input IR: a loop that handles four i32 lanes per iteration under a | ||
; vctp32 lane mask. Per lane it computes | ||
;   (load1 + input_1_offset) * (load2 + input_2_offset) + out_offset | ||
; then applies a predicated max followed by a predicated min and stores the | ||
; result through a masked store. The function always returns 0. | ||
define hidden i32 @arm_elementwise_mul_s8(i8* %input_1_vect, i8* %input_2_vect, i32 %input_1_offset, i32 %input_2_offset, i8* %output, i32 %out_offset, i32 %out_mult, i32 %out_shift, i32 %out_activation_min, i32 %out_activation_max, i32 %block_size) local_unnamed_addr #0 { | ||
entry: | ||
; Iteration count = (block_size + 3) >> 2, i.e. block_size / 4 rounded up. | ||
%add = add i32 %block_size, 3 | ||
%div = lshr i32 %add, 2 | ||
; The i1 result picks between entering the loop and going straight to the exit. | ||
%0 = call i1 @llvm.test.set.loop.iterations.i32(i32 %div) | ||
br i1 %0, label %for.body.lr.ph, label %for.cond.cleanup | ||
|
||
for.body.lr.ph: ; preds = %entry | ||
; Broadcast out_activation_min (%.splat.i42) and out_activation_max (%.splat.i) | ||
; to all four lanes once, before the loop. | ||
%.splatinsert.i41 = insertelement <4 x i32> undef, i32 %out_activation_min, i32 0 | ||
%.splat.i42 = shufflevector <4 x i32> %.splatinsert.i41, <4 x i32> undef, <4 x i32> zeroinitializer | ||
%.splatinsert.i = insertelement <4 x i32> undef, i32 %out_activation_max, i32 0 | ||
%.splat.i = shufflevector <4 x i32> %.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer | ||
br label %for.body | ||
|
||
for.cond.cleanup: ; preds = %for.body, %entry | ||
ret i32 0 | ||
|
||
for.body: ; preds = %for.body, %for.body.lr.ph | ||
; Loop-carried state: the three byte pointers, the number of elements still | ||
; to process (%num_elements.049) and the iteration counter (%iv). | ||
%input_1_vect.addr.052 = phi i8* [ %input_1_vect, %for.body.lr.ph ], [ %add.ptr, %for.body ] | ||
%input_2_vect.addr.051 = phi i8* [ %input_2_vect, %for.body.lr.ph ], [ %add.ptr14, %for.body ] | ||
%output.addr.050 = phi i8* [ %output, %for.body.lr.ph ], [ %add.ptr15, %for.body ] | ||
%num_elements.049 = phi i32 [ %block_size, %for.body.lr.ph ], [ %sub, %for.body ] | ||
%iv = phi i32 [ %div, %for.body.lr.ph ], [ %iv.next, %for.body ] | ||
%output_cast = bitcast i8* %output.addr.050 to <4 x i32>* | ||
%input_2_cast = bitcast i8* %input_2_vect.addr.051 to <4 x i32>* | ||
%input_1_cast = bitcast i8* %input_1_vect.addr.052 to <4 x i32>* | ||
; Lane mask for this iteration, derived from the remaining element count. | ||
; It predicates both loads, the max, the min and the store below. | ||
%pred = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %num_elements.049) | ||
%load.1 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %input_1_cast, i32 4, <4 x i1> %pred, <4 x i32> undef) | ||
%insert.input_1_offset = insertelement <4 x i32> undef, i32 %input_1_offset, i32 0 | ||
%splat.input_1_offset = shufflevector <4 x i32> %insert.input_1_offset, <4 x i32> undef, <4 x i32> zeroinitializer | ||
%insert.input_2_offset = insertelement <4 x i32> undef, i32 %input_2_offset, i32 0 | ||
%splat.input_2_offset = shufflevector <4 x i32> %insert.input_2_offset, <4 x i32> undef, <4 x i32> zeroinitializer | ||
%add.1 = add <4 x i32> %load.1, %splat.input_1_offset | ||
%load.2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %input_2_cast, i32 4, <4 x i1> %pred, <4 x i32> undef) | ||
%add.2 = add <4 x i32> %load.2, %splat.input_2_offset | ||
%mul = mul <4 x i32> %add.1, %add.2 | ||
%insert.output = insertelement <4 x i32> undef, i32 %out_offset, i32 0 | ||
%splat.output = shufflevector <4 x i32> %insert.output, <4 x i32> undef, <4 x i32> zeroinitializer | ||
%add7 = add <4 x i32> %mul, %splat.output | ||
; Max against the out_activation_min splat, then min against the | ||
; out_activation_max splat; inactive lanes take undef. | ||
; NOTE(review): the "i32 1" argument presumably selects the unsigned variant | ||
; (the MIR in this file uses MVE_VMAXu32 / MVE_VMINu32) - confirm. | ||
%max = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> %add7, <4 x i32> %.splat.i42, i32 1, <4 x i1> %pred, <4 x i32> undef) | ||
%min = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %max, <4 x i32> %.splat.i, i32 1, <4 x i1> %pred, <4 x i32> undef) | ||
tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %min, <4 x i32>* %output_cast, i32 4, <4 x i1> %pred) | ||
; Each of the three pointers advances by 4 bytes per iteration (i8 GEP by 4). | ||
%add.ptr = getelementptr inbounds i8, i8* %input_1_vect.addr.052, i32 4 | ||
%add.ptr14 = getelementptr inbounds i8, i8* %input_2_vect.addr.051, i32 4 | ||
%add.ptr15 = getelementptr inbounds i8, i8* %output.addr.050, i32 4 | ||
; Four fewer elements remain; the loop counter drops by 1 and the loop | ||
; repeats while it is non-zero. | ||
%sub = add i32 %num_elements.049, -4 | ||
%iv.next = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %iv, i32 1) | ||
%cmp = icmp ne i32 %iv.next, 0 | ||
br i1 %cmp, label %for.body, label %for.cond.cleanup | ||
} | ||
declare <4 x i1> @llvm.arm.mve.vctp32(i32) #1 | ||
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2 | ||
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 | ||
declare <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 | ||
declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>) #1 | ||
declare i1 @llvm.test.set.loop.iterations.i32(i32) #4 | ||
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #4 | ||
declare void @llvm.stackprotector(i8*, i8**) #5 | ||
... | ||
--- | ||
name: arm_elementwise_mul_s8 | ||
alignment: 2 | ||
exposesReturnsTwice: false | ||
legalized: false | ||
regBankSelected: false | ||
selected: false | ||
failedISel: false | ||
tracksRegLiveness: true | ||
hasWinCFI: false | ||
registers: [] | ||
liveins: | ||
- { reg: '$r0', virtual-reg: '' } | ||
- { reg: '$r1', virtual-reg: '' } | ||
- { reg: '$r2', virtual-reg: '' } | ||
- { reg: '$r3', virtual-reg: '' } | ||
frameInfo: | ||
isFrameAddressTaken: false | ||
isReturnAddressTaken: false | ||
hasStackMap: false | ||
hasPatchPoint: false | ||
stackSize: 20 | ||
offsetAdjustment: 0 | ||
maxAlignment: 4 | ||
adjustsStack: false | ||
hasCalls: false | ||
stackProtector: '' | ||
maxCallFrameSize: 0 | ||
cvBytesOfCalleeSavedRegisters: 0 | ||
hasOpaqueSPAdjustment: false | ||
hasVAStart: false | ||
hasMustTailInVarArgFunc: false | ||
localFrameSize: 0 | ||
savePoint: '' | ||
restorePoint: '' | ||
fixedStack: | ||
- { id: 0, type: default, offset: 24, size: 4, alignment: 8, stack-id: default, | ||
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
- { id: 1, type: default, offset: 20, size: 4, alignment: 4, stack-id: default, | ||
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
- { id: 2, type: default, offset: 16, size: 4, alignment: 8, stack-id: default, | ||
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
- { id: 3, type: default, offset: 12, size: 4, alignment: 4, stack-id: default, | ||
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
- { id: 4, type: default, offset: 8, size: 4, alignment: 8, stack-id: default, | ||
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
- { id: 5, type: default, offset: 4, size: 4, alignment: 4, stack-id: default, | ||
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
- { id: 6, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, | ||
isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
stack: | ||
- { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, | ||
stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
- { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, | ||
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
- { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, | ||
stack-id: default, callee-saved-register: '$r6', callee-saved-restored: true, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
- { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, | ||
stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
- { id: 4, name: '', type: spill-slot, offset: -20, size: 4, alignment: 4, | ||
stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, | ||
debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } | ||
callSites: [] | ||
constants: [] | ||
machineFunctionInfo: {} | ||
body: | | ||
; CHECK-LABEL: name: arm_elementwise_mul_s8 | ||
; CHECK: bb.0.entry: | ||
; CHECK: successors: %bb.1(0x40000000), %bb.3(0x40000000) | ||
; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $lr | ||
; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $lr, implicit-def $sp, implicit $sp | ||
; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 20 | ||
; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 | ||
; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 | ||
; CHECK: frame-setup CFI_INSTRUCTION offset $r6, -12 | ||
; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -16 | ||
; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -20 | ||
; CHECK: renamable $r12 = t2LDRi12 $sp, 44, 14, $noreg :: (load 4 from %fixed-stack.6, align 8) | ||
; CHECK: $lr = MVE_WLSTP_32 renamable $r12, %bb.3 | ||
; CHECK: bb.1.for.body.lr.ph: | ||
; CHECK: successors: %bb.2(0x80000000) | ||
; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 | ||
; CHECK: $r7, $r6 = t2LDRDi8 $sp, 36, 14, $noreg :: (load 4 from %fixed-stack.4, align 8), (load 4 from %fixed-stack.5) | ||
; CHECK: $r5, $r4 = t2LDRDi8 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.0, align 8), (load 4 from %fixed-stack.1) | ||
; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r6, 0, $noreg, undef renamable $q0 | ||
; CHECK: renamable $q1 = MVE_VDUP32 killed renamable $r7, 0, $noreg, undef renamable $q1 | ||
; CHECK: bb.2.for.body: | ||
; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) | ||
; CHECK: liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r5, $r12 | ||
; CHECK: renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 4, 0, $noreg :: (load 16 from %ir.input_2_cast, align 4) | ||
; CHECK: renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 4, 0, $noreg :: (load 16 from %ir.input_1_cast, align 4) | ||
; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r3, 0, $noreg, undef renamable $q2 | ||
; CHECK: renamable $q3 = MVE_VADD_qr_i32 killed renamable $q3, renamable $r2, 0, $noreg, undef renamable $q3 | ||
; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg | ||
; CHECK: renamable $q2 = MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2 | ||
; CHECK: renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r4, 0, $noreg, undef renamable $q2 | ||
; CHECK: renamable $q2 = MVE_VMAXu32 killed renamable $q2, renamable $q1, 0, $noreg, undef renamable $q2 | ||
; CHECK: renamable $q2 = MVE_VMINu32 killed renamable $q2, renamable $q0, 0, $noreg, undef renamable $q2 | ||
; CHECK: renamable $r5 = MVE_VSTRWU32_post killed renamable $q2, killed renamable $r5, 4, 0, killed $noreg :: (store 16 into %ir.output_cast, align 4) | ||
; CHECK: $lr = MVE_LETP renamable $lr, %bb.2 | ||
; CHECK: bb.3.for.cond.cleanup: | ||
; CHECK: $r0, dead $cpsr = tMOVi8 0, 14, $noreg | ||
; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r6, def $r7, def $pc, implicit killed $r0 | ||
; Prologue plus loop-entry test, as fed to the pass. The expected output | ||
; earlier in this file keeps the push, the CFI directives and the t2LDRi12, and | ||
; shows a single MVE_WLSTP_32 on $r12 in place of the remaining instructions. | ||
bb.0.entry: | ||
successors: %bb.1(0x40000000), %bb.3(0x40000000) | ||
liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $lr | ||
; Save $r4-$r7 and $lr (5 registers, matching def_cfa_offset 20). | ||
frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $lr, implicit-def $sp, implicit $sp | ||
frame-setup CFI_INSTRUCTION def_cfa_offset 20 | ||
frame-setup CFI_INSTRUCTION offset $lr, -4 | ||
frame-setup CFI_INSTRUCTION offset $r7, -8 | ||
frame-setup CFI_INSTRUCTION offset $r6, -12 | ||
frame-setup CFI_INSTRUCTION offset $r5, -16 | ||
frame-setup CFI_INSTRUCTION offset $r4, -20 | ||
; $r12 = 4-byte value at $sp + 44 (fixed stack object 0). | ||
; NOTE(review): presumably the %block_size argument of the IR function - confirm. | ||
renamable $r12 = t2LDRi12 $sp, 44, 14, $noreg :: (load 4 from %fixed-stack.0, align 8) | ||
; $lr = ($r12 + 3) >> 2, the value handed to the loop-start pseudo. | ||
renamable $r5 = t2ADDri renamable $r12, 3, 14, $noreg, $noreg | ||
renamable $lr = t2LSRri killed renamable $r5, 2, 14, $noreg, $noreg | ||
; Loop-start pseudo naming %bb.3 (the exit block) as its target, followed by | ||
; an explicit branch to the preheader. | ||
t2WhileLoopStart renamable $lr, %bb.3, implicit-def dead $cpsr | ||
tB %bb.1, 14, $noreg | ||
; Loop preheader: loads four stack-resident values and builds the two | ||
; loop-invariant vectors used by the VMAX/VMIN in the loop body. | ||
bb.1.for.body.lr.ph: | ||
successors: %bb.2(0x80000000) | ||
liveins: $lr, $r0, $r1, $r2, $r3, $r12 | ||
; Two doubleword loads: $r7/$r6 from $sp + 36, $r5/$r4 from $sp + 20. | ||
$r7, $r6 = t2LDRDi8 $sp, 36, 14, $noreg :: (load 4 from %fixed-stack.2, align 8), (load 4 from %fixed-stack.1) | ||
$r5, $r4 = t2LDRDi8 $sp, 20, 14, $noreg :: (load 4 from %fixed-stack.6, align 8), (load 4 from %fixed-stack.5) | ||
; Broadcast $r6 into $q0 and $r7 into $q1 (unpredicated: "0, $noreg"). | ||
renamable $q0 = MVE_VDUP32 killed renamable $r6, 0, $noreg, undef renamable $q0 | ||
renamable $q1 = MVE_VDUP32 killed renamable $r7, 0, $noreg, undef renamable $q1 | ||
; Loop body as it enters the pass, with explicit predication: MVE_VCTP32 | ||
; writes $vpr from $r12, and each MVE_VPST opens a run of instructions whose | ||
; predicate operands are "1, renamable $vpr". Unpredicated instructions carry | ||
; "0, $noreg" instead. | ||
; In the expected output earlier in this file the VCTP and the VPSTs are gone, | ||
; every predicate operand is 0 with $noreg, and the t2LoopDec / t2LoopEnd / tB | ||
; tail is replaced by a single MVE_LETP. | ||
bb.2.for.body: | ||
successors: %bb.2(0x7c000000), %bb.3(0x04000000) | ||
liveins: $lr, $q0, $q1, $r0, $r1, $r2, $r3, $r4, $r5, $r12 | ||
renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg | ||
; VPST with mask 8: here it precedes a run of one predicated instruction. | ||
MVE_VPST 8, implicit $vpr | ||
renamable $r1, renamable $q2 = MVE_VLDRWU32_post killed renamable $r1, 4, 1, renamable $vpr :: (load 16 from %ir.input_2_cast, align 4) | ||
MVE_VPST 8, implicit $vpr | ||
renamable $r0, renamable $q3 = MVE_VLDRWU32_post killed renamable $r0, 4, 1, renamable $vpr :: (load 16 from %ir.input_1_cast, align 4) | ||
; Unpredicated arithmetic: add scalar $r3 / $r2 to the loaded vectors, | ||
; multiply them, then add scalar $r4. | ||
renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r3, 0, $noreg, undef renamable $q2 | ||
renamable $q3 = MVE_VADD_qr_i32 killed renamable $q3, renamable $r2, 0, $noreg, undef renamable $q3 | ||
; $r12 -= 4; $r12 is the VCTP input, so this feeds the next iteration's mask. | ||
renamable $r12 = t2SUBri killed renamable $r12, 4, 14, $noreg, $noreg | ||
renamable $q2 = MVE_VMULi32 killed renamable $q3, killed renamable $q2, 0, $noreg, undef renamable $q2 | ||
renamable $q2 = MVE_VADD_qr_i32 killed renamable $q2, renamable $r4, 0, $noreg, undef renamable $q2 | ||
; VPST with mask 2: here it precedes a run of three predicated instructions | ||
; (VMAX, VMIN, store). | ||
; NOTE(review): VMAX/VMIN have one more operand after the predicate pair | ||
; ("undef renamable $q2") than the load/store do - presumably the vpred_r | ||
; operand form named in the test filename; confirm. | ||
MVE_VPST 2, implicit $vpr | ||
renamable $q2 = MVE_VMAXu32 killed renamable $q2, renamable $q1, 1, renamable $vpr, undef renamable $q2 | ||
renamable $q2 = MVE_VMINu32 killed renamable $q2, renamable $q0, 1, renamable $vpr, undef renamable $q2 | ||
renamable $r5 = MVE_VSTRWU32_post killed renamable $q2, killed renamable $r5, 4, 1, killed renamable $vpr :: (store 16 into %ir.output_cast, align 4) | ||
; Loop tail: decrement $lr by 1, loop-end pseudo targeting %bb.2, then an | ||
; explicit branch to the exit block. | ||
renamable $lr = t2LoopDec killed renamable $lr, 1 | ||
t2LoopEnd renamable $lr, %bb.2, implicit-def dead $cpsr | ||
tB %bb.3, 14, $noreg | ||
; Exit block: put 0 in $r0, then pop $r4-$r7 and $pc to return. | ||
bb.3.for.cond.cleanup: | ||
$r0, dead $cpsr = tMOVi8 0, 14, $noreg | ||
tPOP_RET 14, $noreg, def $r4, def $r5, def $r6, def $r7, def $pc, implicit killed $r0 | ||
... |