[AArch64][SVE] Add first-faulting load intrinsic
Summary:
Implements the llvm.aarch64.sve.ldff1 intrinsic and DAG
combine rules for first-faulting loads with sign and zero extends.
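
For example, a first-faulting load whose result is sign-extended is
combined into a single sign-extending LDFF1S node, so it selects the
sign-extending form of the instruction. A sketch mirroring the tests
added below:

  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ; selects: ldff1sh { z0.s }, p0/z, [x0]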

Reviewers: sdesmalen, efriedma, andwar, dancgr, rengolin

Reviewed By: sdesmalen

Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, cameron.mcinally, cfe-commits, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73025
kmclaughlin-arm committed Jan 23, 2020
1 parent 4321c6a commit aa0f37e
Showing 7 changed files with 284 additions and 10 deletions.
1 change: 1 addition & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1176,6 +1176,7 @@ class AdvSIMD_ScatterStore_VectorBase_Intrinsic
def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;

def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;

//
// Stores
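The new intrinsic reuses the non-faulting load's signature class: an
overloaded scalable-vector result, a governing predicate, and a pointer
to the element type. For instance, the 16 x i8 instantiation declared in
the new test file:

  declare <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1>, i8*)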
15 changes: 12 additions & 3 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1375,6 +1375,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE";
case AArch64ISD::LDNF1: return "AArch64ISD::LDNF1";
case AArch64ISD::LDNF1S: return "AArch64ISD::LDNF1S";
case AArch64ISD::LDFF1: return "AArch64ISD::LDFF1";
case AArch64ISD::LDFF1S: return "AArch64ISD::LDFF1S";
case AArch64ISD::GLD1: return "AArch64ISD::GLD1";
case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED";
case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW";
@@ -10237,6 +10239,7 @@ static SDValue performSVEAndCombine(SDNode *N,
// perfect candidates for combining.
switch (Src->getOpcode()) {
case AArch64ISD::LDNF1:
case AArch64ISD::LDFF1:
MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
break;
case AArch64ISD::GLD1:
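By the time this combine runs, a zero extend of a predicated load
survives as an AND with the narrow element type's mask, which is why
LDFF1 joins LDNF1 as a candidate here. In IR terms (a sketch taken from
the tests added below):

  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
  %res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ; selects: ldff1b { z0.s }, p0/z, [x0]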
@@ -11298,7 +11301,7 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
ISD::UNINDEXED, false, false);
}

static SDValue performLDNF1Combine(SDNode *N, SelectionDAG &DAG) {
static SDValue performLDNF1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
SDLoc DL(N);
EVT VT = N->getValueType(0);

@@ -11315,7 +11318,7 @@ static SDValue performLDNF1Combine(SDNode *N, SelectionDAG &DAG) {
N->getOperand(3), // Base
DAG.getValueType(VT) };

SDValue Load = DAG.getNode(AArch64ISD::LDNF1, DL, VTs, Ops);
SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
SDValue LoadChain = SDValue(Load.getNode(), 1);

if (ContainerVT.isInteger() && (VT != ContainerVT))
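Threading the opcode through lets the first-faulting intrinsic reuse the
existing non-faulting lowering unchanged, including the promotion of
unpacked integer types to their container type. A case that needs no
fixup at all, from the tests added below:

  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.nxv4f32(<vscale x 4 x i1> %pg, float* %a)
  ; selects: ldff1w { z0.s }, p0/z, [x0]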
@@ -12571,6 +12574,10 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
NewOpc = AArch64ISD::LDNF1S;
MemVTOpNum = 3;
break;
case AArch64ISD::LDFF1:
NewOpc = AArch64ISD::LDFF1S;
MemVTOpNum = 3;
break;
case AArch64ISD::GLD1:
NewOpc = AArch64ISD::GLD1S;
break;
@@ -12706,7 +12713,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::aarch64_sve_ldnt1:
return performLDNT1Combine(N, DAG);
case Intrinsic::aarch64_sve_ldnf1:
return performLDNF1Combine(N, DAG);
return performLDNF1Combine(N, DAG, AArch64ISD::LDNF1);
case Intrinsic::aarch64_sve_ldff1:
return performLDNF1Combine(N, DAG, AArch64ISD::LDFF1);
case Intrinsic::aarch64_sve_stnt1:
return performSTNT1Combine(N, DAG);
case Intrinsic::aarch64_sve_ld1_gather:
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -217,6 +217,8 @@ enum NodeType : unsigned {

LDNF1,
LDNF1S,
LDFF1,
LDFF1S,

// Unsigned gather loads.
GLD1,
7 changes: 0 additions & 7 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -549,13 +549,6 @@ def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDN

def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;

def SDT_AArch64_LDNF1 : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;

def AArch64ldnf1 : SDNode<"AArch64ISD::LDNF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;

//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
42 changes: 42 additions & 0 deletions llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -10,6 +10,11 @@
//
//===----------------------------------------------------------------------===//

def SDT_AArch64_LDNF1 : SDTypeProfile<1, 3, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;

def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
@@ -38,6 +43,8 @@ def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED",
def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;

def AArch64ldnf1 : SDNode<"AArch64ISD::LDNF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ldff1 : SDNode<"AArch64ISD::LDFF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
@@ -58,6 +65,7 @@ def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">;
def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">;

def AArch64ldnf1s : SDNode<"AArch64ISD::LDNF1S", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ldff1s : SDNode<"AArch64ISD::LDFF1S", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1s_gather : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
def AArch64ld1s_gather_uxtw : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
@@ -1340,6 +1348,40 @@ let Predicates = [HasSVE] in {
// 16-element contiguous non-faulting loads
defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1, nxv16i1, nxv16i8>;

multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
// Add more complex addressing modes here as required.
// Base
def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
(I PPR:$gp, GPR64sp:$base, XZR)>;
}

// 2-element contiguous first-faulting loads
defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i8>;
defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i8>;
defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i16>;
defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i16>;
defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i32>;
defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i32>;
defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i64>;
defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1, nxv2i1, nxv2f32>;
defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1, nxv2i1, nxv2f64>;

// 4-element contiguous first-faulting loads
defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i8>;
defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i8>;
defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i16>;
defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i16>;
defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1, nxv4i1, nxv4i32>;
defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1, nxv4i1, nxv4f32>;

// 8-element contiguous first-faulting loads
defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i8>;
defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s, nxv8i1, nxv8i8>;
defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i16>;
defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1, nxv8i1, nxv8f16>;

// 16-element contiguous first-faulting loads
defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1, nxv16i1, nxv16i8>;
}
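
As the "Base" comment notes, only base-register addressing is matched so
far; the pattern passes XZR as the scalar offset, selecting the
register+register form of each instruction with a zero offset. An
end-to-end sketch, taken from the tests added below:

  %load = call <vscale x 8 x half> @llvm.aarch64.sve.ldff1.nxv8f16(<vscale x 8 x i1> %pg, half* %a)
  ; selects: ldff1h { z0.h }, p0/z, [x0]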

let Predicates = [HasSVE2] in {
7 changes: 7 additions & 0 deletions llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5780,6 +5780,13 @@ multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty,

def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;

// We need a layer of indirection because early machine code passes balk at
// physical register (i.e. FFR) uses that have no previous definition.
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), []>,
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm)>;
}
}

multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
220 changes: 220 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-intrinsics-loads-ff.ll
@@ -0,0 +1,220 @@
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

;
; LDFF1B
;

define <vscale x 16 x i8> @ldff1b(<vscale x 16 x i1> %pg, i8* %a) {
; CHECK-LABEL: ldff1b:
; CHECK: ldff1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1> %pg, i8* %a)
ret <vscale x 16 x i8> %load
}

define <vscale x 8 x i16> @ldff1b_h(<vscale x 8 x i1> %pg, i8* %a) {
; CHECK-LABEL: ldff1b_h:
; CHECK: ldff1b { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
%res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @ldff1b_s(<vscale x 4 x i1> %pg, i8* %a) {
; CHECK-LABEL: ldff1b_s:
; CHECK: ldff1b { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
%res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @ldff1b_d(<vscale x 2 x i1> %pg, i8* %a) {
; CHECK-LABEL: ldff1b_d:
; CHECK: ldff1b { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
%res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}

;
; LDFF1SB
;

define <vscale x 8 x i16> @ldff1sb_h(<vscale x 8 x i1> %pg, i8* %a) {
; CHECK-LABEL: ldff1sb_h:
; CHECK: ldff1sb { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
%res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
ret <vscale x 8 x i16> %res
}

define <vscale x 4 x i32> @ldff1sb_s(<vscale x 4 x i1> %pg, i8* %a) {
; CHECK-LABEL: ldff1sb_s:
; CHECK: ldff1sb { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
%res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @ldff1sb_d(<vscale x 2 x i1> %pg, i8* %a) {
; CHECK-LABEL: ldff1sb_d:
; CHECK: ldff1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
%res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}

;
; LDFF1H
;

define <vscale x 8 x i16> @ldff1h(<vscale x 8 x i1> %pg, i16* %a) {
; CHECK-LABEL: ldff1h:
; CHECK: ldff1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1> %pg, i16* %a)
ret <vscale x 8 x i16> %load
}

define <vscale x 4 x i32> @ldff1h_s(<vscale x 4 x i1> %pg, i16* %a) {
; CHECK-LABEL: ldff1h_s:
; CHECK: ldff1h { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
%res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @ldff1h_d(<vscale x 2 x i1> %pg, i16* %a) {
; CHECK-LABEL: ldff1h_d:
; CHECK: ldff1h { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
%res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}

define <vscale x 8 x half> @ldff1h_f16(<vscale x 8 x i1> %pg, half* %a) {
; CHECK-LABEL: ldff1h_f16:
; CHECK: ldff1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 8 x half> @llvm.aarch64.sve.ldff1.nxv8f16(<vscale x 8 x i1> %pg, half* %a)
ret <vscale x 8 x half> %load
}

;
; LDFF1SH
;

define <vscale x 4 x i32> @ldff1sh_s(<vscale x 4 x i1> %pg, i16* %a) {
; CHECK-LABEL: ldff1sh_s:
; CHECK: ldff1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
%res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
ret <vscale x 4 x i32> %res
}

define <vscale x 2 x i64> @ldff1sh_d(<vscale x 2 x i1> %pg, i16* %a) {
; CHECK-LABEL: ldff1sh_d:
; CHECK: ldff1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
%res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}

;
; LDFF1W
;

define <vscale x 4 x i32> @ldff1w(<vscale x 4 x i1> %pg, i32* %a) {
; CHECK-LABEL: ldff1w:
; CHECK: ldff1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.nxv4i32(<vscale x 4 x i1> %pg, i32* %a)
ret <vscale x 4 x i32> %load
}

define <vscale x 2 x i64> @ldff1w_d(<vscale x 2 x i1> %pg, i32* %a) {
; CHECK-LABEL: ldff1w_d:
; CHECK: ldff1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
%res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}

define <vscale x 4 x float> @ldff1w_f32(<vscale x 4 x i1> %pg, float* %a) {
; CHECK-LABEL: ldff1w_f32:
; CHECK: ldff1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.nxv4f32(<vscale x 4 x i1> %pg, float* %a)
ret <vscale x 4 x float> %load
}

define <vscale x 2 x float> @ldff1w_2f32(<vscale x 2 x i1> %pg, float* %a) {
; CHECK-LABEL: ldff1w_2f32:
; CHECK: ldff1w { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 2 x float> @llvm.aarch64.sve.ldff1.nxv2f32(<vscale x 2 x i1> %pg, float* %a)
ret <vscale x 2 x float> %load
}

;
; LDFF1SW
;

define <vscale x 2 x i64> @ldff1sw_d(<vscale x 2 x i1> %pg, i32* %a) {
; CHECK-LABEL: ldff1sw_d:
; CHECK: ldff1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
%res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
ret <vscale x 2 x i64> %res
}

;
; LDFF1D
;

define <vscale x 2 x i64> @ldff1d(<vscale x 2 x i1> %pg, i64* %a) {
; CHECK-LABEL: ldff1d:
; CHECK: ldff1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.nxv2i64(<vscale x 2 x i1> %pg, i64* %a)
ret <vscale x 2 x i64> %load
}


define <vscale x 2 x double> @ldff1d_f64(<vscale x 2 x i1> %pg, double* %a) {
; CHECK-LABEL: ldff1d_f64:
; CHECK: ldff1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
%load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.nxv2f64(<vscale x 2 x i1> %pg, double* %a)
ret <vscale x 2 x double> %load
}

declare <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1>, i8*)

declare <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1>, i8*)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1>, i16*)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldff1.nxv8f16(<vscale x 8 x i1>, half*)

declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1>, i8*)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1>, i16*)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.nxv4i32(<vscale x 4 x i1>, i32*)
declare <vscale x 2 x float> @llvm.aarch64.sve.ldff1.nxv2f32(<vscale x 2 x i1>, float*)
declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.nxv4f32(<vscale x 4 x i1>, float*)

declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.nxv2i8(<vscale x 2 x i1>, i8*)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.nxv2i16(<vscale x 2 x i1>, i16*)
declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.nxv2i32(<vscale x 2 x i1>, i32*)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.nxv2i64(<vscale x 2 x i1>, i64*)
declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.nxv2f64(<vscale x 2 x i1>, double*)
