[AArch64][SVE] Optimize ptrue predicate pattern with known SVE register width.

For fixed-length vectors whose size is exactly getMaxSVEVectorSizeInBits, just use
AArch64SVEPredPattern::all, which both simplifies the emitted ptrue and can enable the use of
unpredicated variants of instructions when available.

Test Plan: check-llvm

Differential Revision: https://reviews.llvm.org/D108706
junparser committed Aug 27, 2021
1 parent 8c47103 commit 15b2a8e
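
The change itself is a single guard added to getPredicateForFixedLengthVector (first hunk below). The following is a minimal standalone sketch of that decision, not the LLVM code: pickPredPattern and its main() driver are illustrative stand-ins, while getMinSVEVectorSizeInBits/getMaxSVEVectorSizeInBits, AArch64SVEPredPattern::all and the -aarch64-sve-vector-bits-min/max flags are the names actually used by the patch and its tests.

// Standalone sketch of the new guard (illustrative names, plain C++).
#include <cstdio>

// If the SVE register width is pinned (min == max, both known) and the
// fixed-length vector exactly fills one register, the "all" ptrue pattern
// enables the same lanes as the element-count (vl<N>) pattern, so prefer it.
unsigned pickPredPattern(unsigned VLPattern, unsigned FixedVecSizeInBits,
                         unsigned MinSVESizeInBits, unsigned MaxSVESizeInBits) {
  const unsigned PatternAll = 31; // encoding of AArch64SVEPredPattern::all
  if (MaxSVESizeInBits && MinSVESizeInBits == MaxSVESizeInBits &&
      MaxSVESizeInBits == FixedVecSizeInBits)
    return PatternAll;
  return VLPattern;
}

int main() {
  // <16 x i32> is 512 bits wide. With -aarch64-sve-vector-bits-min/max=512,
  // as in the new test file below, vl16 (encoding 9) is widened to all (31).
  std::printf("%u\n", pickPredPattern(/*vl16=*/9, 512, 512, 512)); // 31
  // With only a lower bound on the register size, the vl pattern is kept.
  std::printf("%u\n", pickPredPattern(/*vl16=*/9, 512, 512, 0));   // 9
  return 0;
}

The design point is that the widening is only safe when the register width is known exactly: with a merely bounded width, ptrue(all) could enable lanes past the end of the fixed-length vector.
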
Showing 6 changed files with 135 additions and 14 deletions.
11 changes: 9 additions & 2 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18026,9 +18026,16 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
       getSVEPredPatternFromNumElements(VT.getVectorNumElements());
   assert(PgPattern && "Unexpected element count for SVE predicate");

-  // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
-  // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
+  // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
+  // AArch64SVEPredPattern::all, which can enable the use of unpredicated
   // variants of instructions when available.
+  const auto &Subtarget =
+      static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+  unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
+  unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
+  if (MaxSVESize && MinSVESize == MaxSVESize &&
+      MaxSVESize == VT.getSizeInBits())
+    PgPattern = AArch64SVEPredPattern::all;

   MVT MaskVT;
   switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
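
For readers without the surrounding file open: the first context line of the hunk calls getSVEPredPatternFromNumElements, which maps a fixed element count onto an SVE ptrue pattern immediate; the new code merely overrides its result. A rough sketch of that mapping follows (not the LLVM source, hence the Sketch suffix), using the architectural ptrue pattern encodings.

#include <optional>

// Rough sketch of the helper named in the context line above: map a fixed
// element count to an SVE ptrue pattern immediate, or nothing if no exact
// pattern exists for that count.
std::optional<unsigned> getSVEPredPatternFromNumElementsSketch(unsigned NumElts) {
  switch (NumElts) {
  case 1: case 2: case 3: case 4:
  case 5: case 6: case 7: case 8:
    return NumElts;               // vl1 .. vl8 encode as 1 .. 8
  case 16:  return 9;             // vl16
  case 32:  return 10;            // vl32
  case 64:  return 11;            // vl64
  case 128: return 12;            // vl128
  case 256: return 13;            // vl256
  default:  return std::nullopt;  // no exact ptrue pattern
  }
}

The assert in the hunk relies on fixed-length lowering only ever producing element counts that have such a pattern.
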
@@ -226,18 +226,17 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(<vsca
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: ptrue p1.s, vl8
 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
 ; CHECK-NEXT: cntd x8
+; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1]
 ; CHECK-NEXT: subs x8, x8, #8
 ; CHECK-NEXT: csel x8, xzr, x8, lo
 ; CHECK-NEXT: mov w9, #8
 ; CHECK-NEXT: cmp x8, #8
-; CHECK-NEXT: ptrue p1.d, vl8
 ; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: uunpklo z0.d, z1.s
 ; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: st1d { z0.d }, p1, [x9, x8, lsl #3]
+; CHECK-NEXT: st1d { z0.d }, p0, [x9, x8, lsl #3]
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AArch64/sve-extract-vector.ll
@@ -186,12 +186,11 @@ define <4 x i64> @extract_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec) nounwind
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: mov w10, #4
 ; CHECK-NEXT: cmp x9, #4
-; CHECK-NEXT: ptrue p1.d, vl4
 ; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: csel x9, x9, x10, lo
 ; CHECK-NEXT: mov x10, sp
-; CHECK-NEXT: ld1d { z0.d }, p1/z, [x10, x9, lsl #3]
-; CHECK-NEXT: st1d { z0.d }, p1, [x8]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x10, x9, lsl #3]
+; CHECK-NEXT: st1d { z0.d }, p0, [x8]
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
117 changes: 117 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-fixed-length-optimize-ptrue.ll
@@ -0,0 +1,117 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=512 -aarch64-sve-vector-bits-max=512 < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: add_v64i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT: add z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b
%res = add <64 x i8> %op1, %op2
store <64 x i8> %res, <64 x i8>* %a
ret void
}

define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b, <32 x i16>* %c) #0 {
; CHECK-LABEL: add_v32i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: add z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b
%res = add <32 x i16> %op1, %op2
store <32 x i16> %res, <32 x i16>* %a
ret void
}

define void @abs_v16i32(<16 x i32>* %a) #0 {
; CHECK-LABEL: abs_v16i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: abs z0.s, p0/m, z0.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x i32>, <16 x i32>* %a
%res = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %op1, i1 false)
store <16 x i32> %res, <16 x i32>* %a
ret void
}

define void @abs_v8i64(<8 x i64>* %a) #0 {
; CHECK-LABEL: abs_v8i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: abs z0.d, p0/m, z0.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x i64>, <8 x i64>* %a
%res = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %op1, i1 false)
store <8 x i64> %res, <8 x i64>* %a
ret void
}

define void @fadd_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; CHECK-LABEL: fadd_v32f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT: fadd z0.h, z0.h, z1.h
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <32 x half>, <32 x half>* %a
%op2 = load <32 x half>, <32 x half>* %b
%res = fadd <32 x half> %op1, %op2
store <32 x half> %res, <32 x half>* %a
ret void
}

define void @fadd_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: fadd_v16f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT: fadd z0.s, z0.s, z1.s
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <16 x float>, <16 x float>* %a
%op2 = load <16 x float>, <16 x float>* %b
%res = fadd <16 x float> %op1, %op2
store <16 x float> %res, <16 x float>* %a
ret void
}

define void @fadd_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; CHECK-LABEL: fadd_v8f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT: fadd z0.d, z0.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: ret
%op1 = load <8 x double>, <8 x double>* %a
%op2 = load <8 x double>, <8 x double>* %b
%res = fadd <8 x double> %op1, %op2
store <8 x double> %res, <8 x double>* %a
ret void
}

declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)

attributes #0 = { "target-features"="+sve" }
7 changes: 3 additions & 4 deletions llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -329,19 +329,18 @@ define <vscale x 2 x i64> @insert_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, <
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: ptrue p0.d, vl4
+; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: cntd x8
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
 ; CHECK-NEXT: subs x8, x8, #4
 ; CHECK-NEXT: csel x8, xzr, x8, lo
 ; CHECK-NEXT: mov w9, #4
 ; CHECK-NEXT: cmp x8, #4
-; CHECK-NEXT: ptrue p1.d
 ; CHECK-NEXT: csel x8, x8, x9, lo
 ; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: st1d { z0.d }, p1, [sp]
+; CHECK-NEXT: st1d { z0.d }, p0, [sp]
 ; CHECK-NEXT: st1d { z1.d }, p0, [x9, x8, lsl #3]
-; CHECK-NEXT: ld1d { z0.d }, p1/z, [sp]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/sve-vscale-attr.ll
@@ -63,7 +63,7 @@ define void @func_vscale2_2(<16 x i32>* %a, <16 x i32>* %b) #2 {
 ; CHECK-LABEL: func_vscale2_2:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: mov x8, #8
-; CHECK-NEXT: ptrue p0.s, vl8
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
 ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
@@ -108,7 +108,7 @@ attributes #3 = { "target-features"="+sve" vscale_range(2,4) }
 define void @func_vscale4_4(<16 x i32>* %a, <16 x i32>* %b) #4 {
 ; CHECK-LABEL: func_vscale4_4:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.s, vl16
+; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
 ; CHECK-NEXT: add z0.s, p0/m, z0.s, z1.s
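
Tying these last two functions back to the new condition in AArch64ISelLowering.cpp: judging by their names and the vscale_range(2,4) attribute visible above, func_vscale2_2 and func_vscale4_4 presumably carry vscale_range(2,2) and vscale_range(4,4) respectively, which pin the SVE register width. A small worked example of the arithmetic, assuming only the 128-bit SVE granule:

// Worked example for the two functions above (attribute values as assumed in
// the lead-in): SVE register width = vscale * 128 bits.
constexpr unsigned GranuleBits = 128;
constexpr unsigned V16I32Bits  = 16 * 32; // each <16 x i32> operand, 512 bits

// func_vscale2_2: vscale pinned at 2 -> 256-bit registers. The 512-bit operand
// is handled as two <8 x i32> halves; each half exactly fills a register, so
// each half's vl8 predicate becomes a plain ptrue p0.s (all).
static_assert(V16I32Bits == 2 * (2 * GranuleBits), "two whole-register halves");

// func_vscale4_4: vscale pinned at 4 -> 512-bit registers. The whole operand
// fills one register, so the single vl16 predicate likewise becomes all.
static_assert(V16I32Bits == 4 * GranuleBits, "one whole register");
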
