61 changes: 61 additions & 0 deletions llvm/lib/Target/VE/VVPISelLowering.cpp
@@ -46,6 +46,13 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {

  // The representative and legalized vector type of this operation.
  VECustomDAG CDAG(DAG, Op);
  // Dispatch to complex lowering functions.
  switch (VVPOpcode) {
  case VEISD::VVP_LOAD:
  case VEISD::VVP_STORE:
    return lowerVVP_LOAD_STORE(Op, CDAG);
  }

  EVT OpVecVT = Op.getValueType();
  EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);
  auto Packing = getTypePacking(LegalVecVT.getSimpleVT());
@@ -89,6 +96,60 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
  llvm_unreachable("lowerToVVP called for unexpected SDNode.");
}

SDValue VETargetLowering::lowerVVP_LOAD_STORE(SDValue Op,
                                              VECustomDAG &CDAG) const {
  auto VVPOpc = *getVVPOpcode(Op->getOpcode());
  const bool IsLoad = (VVPOpc == VEISD::VVP_LOAD);

  // Shared operands.
  SDValue BasePtr = getMemoryPtr(Op);
  SDValue Mask = getNodeMask(Op);
  SDValue Chain = getNodeChain(Op);
  SDValue AVL = getNodeAVL(Op);
  // Store specific.
  SDValue Data = getStoredValue(Op);
  // Load specific.
  SDValue PassThru = getNodePassthru(Op);

  auto DataVT = *getIdiomaticVectorType(Op.getNode());
  auto Packing = getTypePacking(DataVT);

  assert(Packing == Packing::Normal && "TODO Packed load store isel");

  // TODO: Infer lower AVL from mask.
  if (!AVL)
    AVL = CDAG.getConstant(DataVT.getVectorNumElements(), MVT::i32);

  // Default to the all-true mask.
  if (!Mask)
    Mask = CDAG.getConstantMask(Packing, true);

  SDValue StrideV = getLoadStoreStride(Op, CDAG);
  if (IsLoad) {
    MVT LegalDataVT = getLegalVectorType(
        Packing, DataVT.getVectorElementType().getSimpleVT());

    auto NewLoadV = CDAG.getNode(VEISD::VVP_LOAD, {LegalDataVT, MVT::Other},
                                 {Chain, BasePtr, StrideV, Mask, AVL});

    if (!PassThru || PassThru->isUndef())
      return NewLoadV;

    // Convert passthru to an explicit select node.
    SDValue DataV = CDAG.getNode(VEISD::VVP_SELECT, DataVT,
                                 {NewLoadV, PassThru, Mask, AVL});
    SDValue NewLoadChainV = SDValue(NewLoadV.getNode(), 1);

    // Merge them back into one node.
    return CDAG.getMergeValues({DataV, NewLoadChainV});
  }

  // VVP_STORE
  assert(VVPOpc == VEISD::VVP_STORE);
  return CDAG.getNode(VEISD::VVP_STORE, Op.getNode()->getVTList(),
                      {Chain, Data, BasePtr, StrideV, Mask, AVL});
}
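
// A minimal sketch of the stride computation assumed above: for contiguous
// (masked) loads and stores the stride is the element size in bytes, which is
// why the tests below use a stride of 8 for v256f64 and 4 for 32-bit element
// types. The in-tree helper is getLoadStoreStride() (one of the VECustomDAG
// helpers used above); the name and body below are illustrative only, not the
// actual implementation.
static SDValue exampleLoadStoreStride(SDValue Op, VECustomDAG &CDAG) {
  // Contiguous accesses advance by one element per lane.
  if (auto *MemN = dyn_cast<MemSDNode>(Op.getNode())) {
    uint64_t ElemBytes =
        MemN->getMemoryVT().getVectorElementType().getFixedSizeInBits() / 8;
    return CDAG.getConstant(ElemBytes, MVT::i64);
  }
  return SDValue(); // Not a memory node; the caller must provide the stride.
}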

SDValue VETargetLowering::legalizeInternalVectorOp(SDValue Op,
                                                   SelectionDAG &DAG) const {
  VECustomDAG CDAG(DAG, Op);
24 changes: 24 additions & 0 deletions llvm/lib/Target/VE/VVPInstrInfo.td
@@ -18,6 +18,24 @@
// TODO explain how VVP nodes relate to VP SDNodes once VP ISel is upstream.
//===----------------------------------------------------------------------===//

// vvp_load(ptr, stride, mask, avl)
def SDTLoadVVP : SDTypeProfile<1, 4, [
  SDTCisVec<0>,
  SDTCisPtrTy<1>,
  SDTCisInt<2>,
  SDTCisVec<3>,
  IsVLVT<4>
]>;

// vvp_store(data, ptr, stride, mask, avl)
def SDTStoreVVP : SDTypeProfile<0, 5, [
  SDTCisVec<0>,
  SDTCisPtrTy<1>,
  SDTCisInt<2>,
  SDTCisVec<3>,
  IsVLVT<4>
]>;
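// Note that SDTypeProfile constraint indices count results before operands:
// in SDTLoadVVP, slot 0 is the loaded vector result and slots 1-4 are
// (ptr, stride, mask, avl); in SDTStoreVVP, slots 0-4 are all operands
// (data, ptr, stride, mask, avl).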

// Binary Operators {

// BinaryOp(x,y,mask,vl)
@@ -102,6 +120,12 @@ def vvp_fdiv : SDNode<"VEISD::VVP_FDIV", SDTFPBinOpVVP>;

// } Binary Operators
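
// The vvp_load / vvp_store nodes below are chained and reference memory
// through a MachineMemOperand (SDNPHasChain + SDNPMemOperand), and they are
// flagged SDNPMayLoad / SDNPMayStore so alias analysis and scheduling treat
// them as real memory accesses.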

def vvp_load : SDNode<"VEISD::VVP_LOAD", SDTLoadVVP,
                      [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def vvp_store : SDNode<"VEISD::VVP_STORE", SDTStoreVVP,
                       [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;

def vvp_select : SDNode<"VEISD::VVP_SELECT", SDTSelectVVP>;

// setcc (lhs, rhs, cc, mask, vl)
79 changes: 79 additions & 0 deletions llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -17,6 +17,85 @@
//===----------------------------------------------------------------------===//
include "VVPInstrInfo.td"

multiclass VectorStore<ValueType DataVT,
                       ValueType PtrVT, ValueType MaskVT,
                       string STWithMask, string STNoMask> {
  // Unmasked (imm stride).
  def : Pat<(vvp_store
                DataVT:$val, PtrVT:$addr,
                (i64 simm7:$stride), (MaskVT true_mask), i32:$avl),
            (!cast<Instruction>(STNoMask#"irvl")
                (LO7 $stride), $addr, $val, $avl)>;
  // Unmasked.
  def : Pat<(vvp_store
                DataVT:$val, PtrVT:$addr,
                i64:$stride, (MaskVT true_mask), i32:$avl),
            (!cast<Instruction>(STNoMask#"rrvl")
                $stride, $addr, $val, $avl)>;
  // Masked (imm stride).
  def : Pat<(vvp_store
                DataVT:$val, PtrVT:$addr,
                (i64 simm7:$stride), MaskVT:$mask, i32:$avl),
            (!cast<Instruction>(STWithMask#"irvml")
                (LO7 $stride), $addr, $val, $mask, $avl)>;
  // Masked.
  def : Pat<(vvp_store
                DataVT:$val, PtrVT:$addr,
                i64:$stride, MaskVT:$mask, i32:$avl),
            (!cast<Instruction>(STWithMask#"rrvml")
                $stride, $addr, $val, $mask, $avl)>;
}

defm : VectorStore<v256f64, i64, v256i1, "VST", "VST">;
defm : VectorStore<v256i64, i64, v256i1, "VST", "VST">;
defm : VectorStore<v256f32, i64, v256i1, "VSTU", "VSTU">;
defm : VectorStore<v256i32, i64, v256i1, "VSTL", "VSTL">;
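
// For illustration, the first defm above instantiates the unmasked,
// immediate-stride pattern for v256f64 roughly as follows (a sketch of the
// multiclass expansion, not literal in-tree code):
//
//   def : Pat<(vvp_store v256f64:$val, i64:$addr,
//                        (i64 simm7:$stride), (v256i1 true_mask), i32:$avl),
//             (VSTirvl (LO7 $stride), $addr, $val, $avl)>;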

multiclass VectorLoad<ValueType DataVT,
                      ValueType PtrVT, ValueType MaskVT,
                      string GTWithMask, string LDNoMask> {
  // Unmasked (imm stride).
  def : Pat<(DataVT (vvp_load
                PtrVT:$addr, (i64 simm7:$stride),
                (MaskVT true_mask), i32:$avl)),
            (!cast<Instruction>(LDNoMask#"irl")
                (LO7 $stride), $addr, $avl)>;
  // Unmasked.
  def : Pat<(DataVT (vvp_load
                PtrVT:$addr, i64:$stride,
                (MaskVT true_mask), i32:$avl)),
            (!cast<Instruction>(LDNoMask#"rrl")
                $stride, PtrVT:$addr, $avl)>;
  // Masked (imm stride).
  def : Pat<(DataVT (vvp_load
                PtrVT:$addr, (i64 simm7:$stride),
                MaskVT:$mask, i32:$avl)),
            (!cast<Instruction>(GTWithMask#"vizml")
                (VADDULrvml $addr,
                   (VMULULivml (LO7 $stride), (VSEQl $avl), $mask, $avl),
                   $mask, $avl),
                0, 0,
                $mask,
                $avl)>;
  // Masked.
  def : Pat<(DataVT (vvp_load
                PtrVT:$addr, i64:$stride, MaskVT:$mask, i32:$avl)),
            (!cast<Instruction>(GTWithMask#"vizml")
                (VADDULrvml $addr,
                   (VMULULrvml $stride, (VSEQl $avl), $mask, $avl),
                   $mask, $avl),
                0, 0,
                $mask,
                $avl)>;
}

defm : VectorLoad<v256f64, i64, v256i1, "VGT", "VLD">;
defm : VectorLoad<v256i64, i64, v256i1, "VGT", "VLD">;
defm : VectorLoad<v256f32, i64, v256i1, "VGTU", "VLDU">;
defm : VectorLoad<v256i32, i64, v256i1, "VGTLZX", "VLDLZX">;
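
// Unlike stores, masked loads cannot simply use the VLD* instructions, which
// take no mask operand. The masked patterns above therefore materialize the
// per-lane addresses as addr + stride * lane-index (VSEQ) and issue a masked
// gather (VGT*); this is the vseq/vmulu/vaddu/vgt sequence visible in the
// vec_load.ll test.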



multiclass Binary_rv<SDPatternOperator OpNode,
                     ValueType ScalarVT, ValueType DataVT,
                     ValueType MaskVT, string OpBaseName> {
3 changes: 3 additions & 0 deletions llvm/lib/Target/VE/VVPNodes.def
@@ -44,6 +44,9 @@
#define REGISTER_PACKED(OPC)
#endif

ADD_VVP_OP(VVP_LOAD,LOAD) HANDLE_VP_TO_VVP(VP_LOAD, VVP_LOAD) REGISTER_PACKED(VVP_LOAD)
ADD_VVP_OP(VVP_STORE,STORE) HANDLE_VP_TO_VVP(VP_STORE, VVP_STORE) REGISTER_PACKED(VVP_STORE)
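
// Sketch of how these entries are typically consumed; the exact macro bodies
// are supplied by each includer, so the expansions below are illustrative
// only:
//
//   #define ADD_VVP_OP(VVP_NAME, NATIVE_NAME) VVP_NAME,
//   #include "VVPNodes.def"   // Append VVP_LOAD/VVP_STORE to the VEISD enum.
//
//   #define HANDLE_VP_TO_VVP(VP_OPC, VVP_NAME)                                \
//     case ISD::VP_OPC:                                                        \
//       return VEISD::VVP_NAME;
//   #include "VVPNodes.def"   // Map ISD::VP_LOAD/VP_STORE to VVP opcodes.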

// Integer arithmetic.
ADD_BINARY_VVP_OP_COMPACT(ADD) REGISTER_PACKED(VVP_ADD)
ADD_BINARY_VVP_OP_COMPACT(SUB) REGISTER_PACKED(VVP_SUB)
127 changes: 127 additions & 0 deletions llvm/test/CodeGen/VE/Vector/vec_load.ll
@@ -0,0 +1,127 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s

declare <128 x double> @llvm.masked.load.v128f64.p0v128f64(<128 x double>* %0, i32 immarg %1, <128 x i1> %2, <128 x double> %3) #0

; TODO: Custom widen by lowering to vvp_load in ReplaceNodeResults
; Function Attrs: nounwind
; define fastcc <128 x double> @vec_mload_v128f64(<128 x double>* %P, <128 x i1> %M) {
; %r = call <128 x double> @llvm.masked.load.v128f64.p0v128f64(<128 x double>* %P, i32 16, <128 x i1> %M, <128 x double> undef)
; ret <128 x double> %r
; }


declare <256 x double> @llvm.masked.load.v256f64.p0v256f64(<256 x double>* %0, i32 immarg %1, <256 x i1> %2, <256 x double> %3) #0

; Function Attrs: nounwind
define fastcc <256 x double> @vec_mload_v256f64(<256 x double>* %P, <256 x i1> %M) {
; CHECK-LABEL: vec_mload_v256f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vseq %v0
; CHECK-NEXT: vmulu.l %v0, 8, %v0, %vm1
; CHECK-NEXT: vaddu.l %v0, %s0, %v0, %vm1
; CHECK-NEXT: vgt %v0, %v0, 0, 0, %vm1
; CHECK-NEXT: b.l.t (, %s10)
%r = call <256 x double> @llvm.masked.load.v256f64.p0v256f64(<256 x double>* %P, i32 16, <256 x i1> %M, <256 x double> undef)
ret <256 x double> %r
}

; Function Attrs: nounwind
define fastcc <256 x double> @vec_load_v256f64(<256 x double>* %P) {
; CHECK-LABEL: vec_load_v256f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vld %v0, 8, %s0
; CHECK-NEXT: b.l.t (, %s10)
%r = load <256 x double>, <256 x double>* %P, align 4
ret <256 x double> %r
}

; Function Attrs: nounwind
define fastcc <256 x double> @vec_mload_pt_v256f64(<256 x double>* %P, <256 x double> %PT, <256 x i1> %M) {
; CHECK-LABEL: vec_mload_pt_v256f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vseq %v1
; CHECK-NEXT: vmulu.l %v1, 8, %v1, %vm1
; CHECK-NEXT: vaddu.l %v1, %s0, %v1, %vm1
; CHECK-NEXT: vgt %v1, %v1, 0, 0, %vm1
; CHECK-NEXT: vmrg %v0, %v0, %v1, %vm1
; CHECK-NEXT: b.l.t (, %s10)
%r = call <256 x double> @llvm.masked.load.v256f64.p0v256f64(<256 x double>* %P, i32 16, <256 x i1> %M, <256 x double> %PT)
ret <256 x double> %r
}


declare <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>* %0, i32 immarg %1, <256 x i1> %2, <256 x float> %3) #0

; Function Attrs: nounwind
define fastcc <256 x float> @vec_mload_v256f32(<256 x float>* %P, <256 x i1> %M) {
; CHECK-LABEL: vec_mload_v256f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vseq %v0
; CHECK-NEXT: vmulu.l %v0, 4, %v0, %vm1
; CHECK-NEXT: vaddu.l %v0, %s0, %v0, %vm1
; CHECK-NEXT: vgtu %v0, %v0, 0, 0, %vm1
; CHECK-NEXT: b.l.t (, %s10)
%r = call <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>* %P, i32 16, <256 x i1> %M, <256 x float> undef)
ret <256 x float> %r
}

; Function Attrs: nounwind
define fastcc <256 x float> @vec_mload_pt_v256f32(<256 x float>* %P, <256 x float> %PT, <256 x i1> %M) {
; CHECK-LABEL: vec_mload_pt_v256f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vseq %v1
; CHECK-NEXT: vmulu.l %v1, 4, %v1, %vm1
; CHECK-NEXT: vaddu.l %v1, %s0, %v1, %vm1
; CHECK-NEXT: vgtu %v1, %v1, 0, 0, %vm1
; CHECK-NEXT: vmrg %v0, %v0, %v1, %vm1
; CHECK-NEXT: b.l.t (, %s10)
%r = call <256 x float> @llvm.masked.load.v256f32.p0v256f32(<256 x float>* %P, i32 16, <256 x i1> %M, <256 x float> %PT)
ret <256 x float> %r
}


declare <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32>* %0, i32 immarg %1, <256 x i1> %2, <256 x i32> %3) #0

; Function Attrs: nounwind
define fastcc <256 x i32> @vec_mload_v256i32(<256 x i32>* %P, <256 x i1> %M) {
; CHECK-LABEL: vec_mload_v256i32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vseq %v0
; CHECK-NEXT: vmulu.l %v0, 4, %v0, %vm1
; CHECK-NEXT: vaddu.l %v0, %s0, %v0, %vm1
; CHECK-NEXT: vgtl.zx %v0, %v0, 0, 0, %vm1
; CHECK-NEXT: b.l.t (, %s10)
%r = call <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32>* %P, i32 16, <256 x i1> %M, <256 x i32> undef)
ret <256 x i32> %r
}

; Function Attrs: nounwind
define fastcc <256 x i32> @vec_mload_pt_v256i32(<256 x i32>* %P, <256 x i32> %PT, <256 x i1> %M) {
; CHECK-LABEL: vec_mload_pt_v256i32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vseq %v1
; CHECK-NEXT: vmulu.l %v1, 4, %v1, %vm1
; CHECK-NEXT: vaddu.l %v1, %s0, %v1, %vm1
; CHECK-NEXT: vgtl.zx %v1, %v1, 0, 0, %vm1
; CHECK-NEXT: vmrg %v0, %v0, %v1, %vm1
; CHECK-NEXT: b.l.t (, %s10)
%r = call <256 x i32> @llvm.masked.load.v256i32.p0v256i32(<256 x i32>* %P, i32 16, <256 x i1> %M, <256 x i32> %PT)
ret <256 x i32> %r
}

attributes #0 = { argmemonly nounwind readonly willreturn }
43 changes: 43 additions & 0 deletions llvm/test/CodeGen/VE/Vector/vec_store.ll
@@ -0,0 +1,43 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=+vpu | FileCheck %s

declare void @llvm.masked.store.v256f64.p0v256f64(<256 x double>, <256 x double>*, i32 immarg, <256 x i1>)

define fastcc void @vec_mstore_v256f64(<256 x double>* %P, <256 x double> %V, <256 x i1> %M) {
; CHECK-LABEL: vec_mstore_v256f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vst %v0, 8, %s0
; CHECK-NEXT: b.l.t (, %s10)
call void @llvm.masked.store.v256f64.p0v256f64(<256 x double> %V, <256 x double>* %P, i32 16, <256 x i1> %M)
ret void
}


declare void @llvm.masked.store.v256f32.p0v256f32(<256 x float>, <256 x float>*, i32 immarg, <256 x i1>)

define fastcc void @vec_mstore_v256f32(<256 x float>* %P, <256 x float> %V, <256 x i1> %M) {
; CHECK-LABEL: vec_mstore_v256f32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vstu %v0, 4, %s0
; CHECK-NEXT: b.l.t (, %s10)
call void @llvm.masked.store.v256f32.p0v256f32(<256 x float> %V, <256 x float>* %P, i32 16, <256 x i1> %M)
ret void
}


declare void @llvm.masked.store.v256i32.p0v256i32(<256 x i32>, <256 x i32>*, i32 immarg, <256 x i1>)

define fastcc void @vec_mstore_v256i32(<256 x i32>* %P, <256 x i32> %V, <256 x i1> %M) {
; CHECK-LABEL: vec_mstore_v256i32:
; CHECK: # %bb.0:
; CHECK-NEXT: lea %s1, 256
; CHECK-NEXT: lvl %s1
; CHECK-NEXT: vstl %v0, 4, %s0
; CHECK-NEXT: b.l.t (, %s10)
call void @llvm.masked.store.v256i32.p0v256i32(<256 x i32> %V, <256 x i32>* %P, i32 16, <256 x i1> %M)
ret void
}