19 changes: 19 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp
@@ -138,6 +138,25 @@ bool LoongArchDAGToDAGISel::SelectBaseAddr(SDValue Addr, SDValue &Base) {
return true;
}

// Fold constant addresses.
bool LoongArchDAGToDAGISel::SelectAddrConstant(SDValue Addr, SDValue &Base,
SDValue &Offset) {
SDLoc DL(Addr);
MVT VT = Addr.getSimpleValueType();

if (!isa<ConstantSDNode>(Addr))
return false;

// If the constant is a simm12, we can fold the whole constant and use R0 as
// the base.
int64_t CVal = cast<ConstantSDNode>(Addr)->getSExtValue();
if (!isInt<12>(CVal))
return false;
Base = CurDAG->getRegister(LoongArch::R0, VT);
Offset = CurDAG->getTargetConstant(SignExtend64<12>(CVal), DL, VT);
return true;
}

bool LoongArchDAGToDAGISel::selectNonFIBaseAddr(SDValue Addr, SDValue &Base) {
// If this is FrameIndex, don't select it.
if (isa<FrameIndexSDNode>(Addr))
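A quick model of the selection rule above: a constant address folds into the (R0 + simm12) form exactly when it passes a signed 12-bit range test, matching the `isInt<12>(CVal)` check. A minimal standalone sketch of that predicate (plain C++, not LLVM code; the function name is made up for illustration):

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors isInt<12>(CVal): true iff C is encodable as a simm12 offset
// from the hard-wired zero register R0.
static bool fitsSimm12(int64_t C) { return C >= -2048 && C <= 2047; }

int main() {
  // 40 folds (e.g. `ld.w $a0, $zero, 40`); 4096 is out of range and the
  // address must still be materialized in a base register.
  std::printf("%d %d\n", fitsSimm12(40), fitsSimm12(4096));
  return 0;
}
```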
1 change: 1 addition & 0 deletions llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.h
@@ -42,6 +42,7 @@ class LoongArchDAGToDAGISel : public SelectionDAGISel {
std::vector<SDValue> &OutOps) override;

bool SelectBaseAddr(SDValue Addr, SDValue &Base);
bool SelectAddrConstant(SDValue Addr, SDValue &Base, SDValue &Offset);
bool selectNonFIBaseAddr(SDValue Addr, SDValue &Base);

bool selectShiftMask(SDValue N, unsigned ShiftWidth, SDValue &ShAmt);
5 changes: 5 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -364,6 +364,7 @@ def ImmSubFrom32 : SDNodeXForm<imm, [{
}]>;

def BaseAddr : ComplexPattern<iPTR, 1, "SelectBaseAddr">;
def AddrConstant : ComplexPattern<iPTR, 2, "SelectAddrConstant">;
def NonFIBaseAddr : ComplexPattern<iPTR, 1, "selectNonFIBaseAddr">;

def fma_nsz : PatFrag<(ops node:$fj, node:$fk, node:$fa),
@@ -1219,6 +1220,8 @@ def : Pat<(bitreverse (bswap GPR:$rj)), (BITREV_8B GPR:$rj)>;

multiclass LdPat<PatFrag LoadOp, LAInst Inst, ValueType vt = GRLenVT> {
def : Pat<(vt (LoadOp BaseAddr:$rj)), (Inst BaseAddr:$rj, 0)>;
def : Pat<(vt (LoadOp (AddrConstant GPR:$rj, simm12:$imm12))),
(Inst GPR:$rj, simm12:$imm12)>;
def : Pat<(vt (LoadOp (AddLike BaseAddr:$rj, simm12:$imm12))),
(Inst BaseAddr:$rj, simm12:$imm12)>;
}
@@ -1261,6 +1264,8 @@ multiclass StPat<PatFrag StoreOp, LAInst Inst, RegisterClass StTy,
ValueType vt> {
def : Pat<(StoreOp (vt StTy:$rd), BaseAddr:$rj),
(Inst StTy:$rd, BaseAddr:$rj, 0)>;
def : Pat<(StoreOp (vt StTy:$rs2), (AddrConstant GPR:$rj, simm12:$imm12)),
(Inst StTy:$rs2, GPR:$rj, simm12:$imm12)>;
def : Pat<(StoreOp (vt StTy:$rd), (AddLike BaseAddr:$rj, simm12:$imm12)),
(Inst StTy:$rd, BaseAddr:$rj, simm12:$imm12)>;
}
45 changes: 29 additions & 16 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1256,7 +1256,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) {
for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
setOperationAction(ISD::ABDS, VT, Custom);
setOperationAction(ISD::ABDU, VT, Custom);
}
@@ -1396,14 +1396,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// In the customized shift lowering, the legal v8i32/v4i64 cases
// in AVX2 will be recognized.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::ABDS, VT, Custom);
setOperationAction(ISD::ABDU, VT, Custom);
if (VT == MVT::v4i64) continue;
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}

// These types need custom splitting if their input is a 128-bit vector.
@@ -1499,8 +1501,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
setOperationAction(ISD::ABDS, VT, Custom);
setOperationAction(ISD::ABDU, VT, Custom);
}

for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
@@ -1968,8 +1968,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
setOperationAction(ISD::ABDS, VT, Custom);
setOperationAction(ISD::ABDU, VT, Custom);
}

for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
@@ -29659,15 +29657,30 @@ static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
return splitVectorIntBinary(Op, DAG);

// Default to expand: sub(smax(lhs,rhs),smin(lhs,rhs))
// TODO: Add TargetLowering expandABD() support.
SDLoc dl(Op);
bool IsSigned = Op.getOpcode() == ISD::ABDS;
SDValue LHS = DAG.getFreeze(Op.getOperand(0));
SDValue RHS = DAG.getFreeze(Op.getOperand(1));
SDValue Max = DAG.getNode(IsSigned ? ISD::SMAX : ISD::UMAX, dl, VT, LHS, RHS);
SDValue Min = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, dl, VT, LHS, RHS);
return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();

// abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs))
// abdu(lhs, rhs) -> sub(umax(lhs,rhs), umin(lhs,rhs))
unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX;
unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN;
if (TLI.isOperationLegal(MaxOpc, VT) && TLI.isOperationLegal(MinOpc, VT)) {
SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS);
SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS);
return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
}

// abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
// abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS),
DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
}

static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
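For reference, the two expansions of LowerABD above compute the same value. A minimal scalar model of the signed case (plain C++ sketch, not LLVM code; vector lanes wrap on overflow, hence the unsigned subtractions):

```cpp
#include <algorithm>
#include <cstdint>

// abds via smax/smin, used when MIN/MAX are legal for the type.
static uint64_t abds_minmax(int64_t A, int64_t B) {
  return static_cast<uint64_t>(std::max(A, B)) -
         static_cast<uint64_t>(std::min(A, B));
}

// abds via cmp+select, used otherwise (e.g. v2i64 before AVX512).
static uint64_t abds_select(int64_t A, int64_t B) {
  return A > B ? static_cast<uint64_t>(A) - static_cast<uint64_t>(B)
               : static_cast<uint64_t>(B) - static_cast<uint64_t>(A);
}
```

Both return |A - B| modulo 2^64, so either expansion is a correct lowering for ISD::ABDS.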
llvm/lib/Transforms/Instrumentation/SanitizerBinaryMetadata.cpp
@@ -215,9 +215,12 @@ bool SanitizerBinaryMetadata::run() {
Constant *CtorData = nullptr;
Constant *DtorData = nullptr;
if (TargetTriple.supportsCOMDAT()) {
// Use COMDAT to deduplicate constructor/destructor function.
// Use COMDAT to deduplicate constructor/destructor functions. The COMDAT
// key needs to have non-local linkage.
Ctor->setComdat(Mod.getOrInsertComdat(Ctor->getName()));
Dtor->setComdat(Mod.getOrInsertComdat(Dtor->getName()));
Ctor->setLinkage(GlobalValue::ExternalLinkage);
Dtor->setLinkage(GlobalValue::ExternalLinkage);
CtorData = Ctor;
DtorData = Dtor;
}
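Why the new setLinkage calls: a COMDAT group is keyed by a symbol, and the key must be visible outside its translation unit for the linker to deduplicate the group across objects. A minimal sketch of the pattern (assumes the llvm/IR headers; the helper name is made up, not from this diff):

```cpp
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"

// Key the function's COMDAT group by its own name and give it non-local
// linkage so the COMDAT key is valid.
static void addToComdat(llvm::Function &F, llvm::Module &M) {
  F.setComdat(M.getOrInsertComdat(F.getName()));
  F.setLinkage(llvm::GlobalValue::ExternalLinkage);
}
```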
97 changes: 35 additions & 62 deletions llvm/test/CodeGen/LoongArch/load-store-offset.ll
@@ -2,185 +2,158 @@
; RUN: llc --mtriple=loongarch32 < %s | FileCheck %s --check-prefix=LA32
; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s --check-prefix=LA64

;; TODO: When the address offset fits in a signed 12-bit immediate, fold it
;; into the offset field of the load/store instruction.

define i8 @load_i8() nounwind {
; LA32-LABEL: load_i8:
; LA32: # %bb.0:
; LA32-NEXT: ori $a0, $zero, 40
; LA32-NEXT: ld.b $a0, $a0, 0
; LA32-NEXT: ld.b $a0, $zero, 40
; LA32-NEXT: ret
;
; LA64-LABEL: load_i8:
; LA64: # %bb.0:
; LA64-NEXT: ori $a0, $zero, 40
; LA64-NEXT: ld.b $a0, $a0, 0
; LA64-NEXT: ld.b $a0, $zero, 40
; LA64-NEXT: ret
%a = load i8, i8* inttoptr (i64 40 to i8*), align 8
%a = load i8, ptr inttoptr (i64 40 to ptr), align 8
ret i8 %a
}

define signext i8 @load_i8_sext() nounwind {
; LA32-LABEL: load_i8_sext:
; LA32: # %bb.0:
; LA32-NEXT: ori $a0, $zero, 40
; LA32-NEXT: ld.b $a0, $a0, 0
; LA32-NEXT: ld.b $a0, $zero, 40
; LA32-NEXT: ret
;
; LA64-LABEL: load_i8_sext:
; LA64: # %bb.0:
; LA64-NEXT: ori $a0, $zero, 40
; LA64-NEXT: ld.b $a0, $a0, 0
; LA64-NEXT: ld.b $a0, $zero, 40
; LA64-NEXT: ret
%a = load i8, i8* inttoptr (i64 40 to i8*), align 8
%a = load i8, ptr inttoptr (i64 40 to ptr), align 8
ret i8 %a
}

define i16 @load_i16() nounwind {
; LA32-LABEL: load_i16:
; LA32: # %bb.0:
; LA32-NEXT: ori $a0, $zero, 40
; LA32-NEXT: ld.h $a0, $a0, 0
; LA32-NEXT: ld.h $a0, $zero, 40
; LA32-NEXT: ret
;
; LA64-LABEL: load_i16:
; LA64: # %bb.0:
; LA64-NEXT: ori $a0, $zero, 40
; LA64-NEXT: ld.h $a0, $a0, 0
; LA64-NEXT: ld.h $a0, $zero, 40
; LA64-NEXT: ret
%a = load i16, i16* inttoptr (i64 40 to i16*), align 8
%a = load i16, ptr inttoptr (i64 40 to ptr), align 8
ret i16 %a
}

define signext i16 @load_i16_sext() nounwind {
; LA32-LABEL: load_i16_sext:
; LA32: # %bb.0:
; LA32-NEXT: ori $a0, $zero, 40
; LA32-NEXT: ld.h $a0, $a0, 0
; LA32-NEXT: ld.h $a0, $zero, 40
; LA32-NEXT: ret
;
; LA64-LABEL: load_i16_sext:
; LA64: # %bb.0:
; LA64-NEXT: ori $a0, $zero, 40
; LA64-NEXT: ld.h $a0, $a0, 0
; LA64-NEXT: ld.h $a0, $zero, 40
; LA64-NEXT: ret
%a = load i16, i16* inttoptr (i64 40 to i16*), align 8
%a = load i16, ptr inttoptr (i64 40 to ptr), align 8
ret i16 %a
}

define i32 @load_i32() nounwind {
; LA32-LABEL: load_i32:
; LA32: # %bb.0:
; LA32-NEXT: ori $a0, $zero, 40
; LA32-NEXT: ld.w $a0, $a0, 0
; LA32-NEXT: ld.w $a0, $zero, 40
; LA32-NEXT: ret
;
; LA64-LABEL: load_i32:
; LA64: # %bb.0:
; LA64-NEXT: ori $a0, $zero, 40
; LA64-NEXT: ld.w $a0, $a0, 0
; LA64-NEXT: ld.w $a0, $zero, 40
; LA64-NEXT: ret
%a = load i32, i32* inttoptr (i64 40 to i32*), align 8
%a = load i32, ptr inttoptr (i64 40 to ptr), align 8
ret i32 %a
}

define signext i32 @load_i32_sext() nounwind {
; LA32-LABEL: load_i32_sext:
; LA32: # %bb.0:
; LA32-NEXT: ori $a0, $zero, 40
; LA32-NEXT: ld.w $a0, $a0, 0
; LA32-NEXT: ld.w $a0, $zero, 40
; LA32-NEXT: ret
;
; LA64-LABEL: load_i32_sext:
; LA64: # %bb.0:
; LA64-NEXT: ori $a0, $zero, 40
; LA64-NEXT: ld.w $a0, $a0, 0
; LA64-NEXT: ld.w $a0, $zero, 40
; LA64-NEXT: ret
%a = load i32, i32* inttoptr (i64 40 to i32*), align 8
%a = load i32, ptr inttoptr (i64 40 to ptr), align 8
ret i32 %a
}

define i64 @load_i64() nounwind {
; LA32-LABEL: load_i64:
; LA32: # %bb.0:
; LA32-NEXT: ori $a0, $zero, 40
; LA32-NEXT: ld.w $a0, $a0, 0
; LA32-NEXT: ori $a1, $zero, 44
; LA32-NEXT: ld.w $a1, $a1, 0
; LA32-NEXT: ld.w $a0, $zero, 40
; LA32-NEXT: ld.w $a1, $zero, 44
; LA32-NEXT: ret
;
; LA64-LABEL: load_i64:
; LA64: # %bb.0:
; LA64-NEXT: ori $a0, $zero, 40
; LA64-NEXT: ld.d $a0, $a0, 0
; LA64-NEXT: ld.d $a0, $zero, 40
; LA64-NEXT: ret
%a = load i64, i64* inttoptr (i64 40 to i64*), align 8
%a = load i64, ptr inttoptr (i64 40 to ptr), align 8
ret i64 %a
}

define void @store_i8(i8 %v) nounwind {
; LA32-LABEL: store_i8:
; LA32: # %bb.0:
; LA32-NEXT: ori $a1, $zero, 40
; LA32-NEXT: st.b $a0, $a1, 0
; LA32-NEXT: st.b $a0, $zero, 40
; LA32-NEXT: ret
;
; LA64-LABEL: store_i8:
; LA64: # %bb.0:
; LA64-NEXT: ori $a1, $zero, 40
; LA64-NEXT: st.b $a0, $a1, 0
; LA64-NEXT: st.b $a0, $zero, 40
; LA64-NEXT: ret
store i8 %v, i8* inttoptr (i64 40 to i8*), align 8
store i8 %v, ptr inttoptr (i64 40 to ptr), align 8
ret void
}

define void @store_i16(i16 %v) nounwind {
; LA32-LABEL: store_i16:
; LA32: # %bb.0:
; LA32-NEXT: ori $a1, $zero, 40
; LA32-NEXT: st.h $a0, $a1, 0
; LA32-NEXT: st.h $a0, $zero, 40
; LA32-NEXT: ret
;
; LA64-LABEL: store_i16:
; LA64: # %bb.0:
; LA64-NEXT: ori $a1, $zero, 40
; LA64-NEXT: st.h $a0, $a1, 0
; LA64-NEXT: st.h $a0, $zero, 40
; LA64-NEXT: ret
store i16 %v, i16* inttoptr (i64 40 to i16*), align 8
store i16 %v, ptr inttoptr (i64 40 to ptr), align 8
ret void
}

define void @store_i32(i32 %v) nounwind {
; LA32-LABEL: store_i32:
; LA32: # %bb.0:
; LA32-NEXT: ori $a1, $zero, 40
; LA32-NEXT: st.w $a0, $a1, 0
; LA32-NEXT: st.w $a0, $zero, 40
; LA32-NEXT: ret
;
; LA64-LABEL: store_i32:
; LA64: # %bb.0:
; LA64-NEXT: ori $a1, $zero, 40
; LA64-NEXT: st.w $a0, $a1, 0
; LA64-NEXT: st.w $a0, $zero, 40
; LA64-NEXT: ret
store i32 %v, i32* inttoptr (i64 40 to i32*), align 8
store i32 %v, ptr inttoptr (i64 40 to ptr), align 8
ret void
}

define void @store_i64(i64 %v) nounwind {
; LA32-LABEL: store_i64:
; LA32: # %bb.0:
; LA32-NEXT: ori $a2, $zero, 44
; LA32-NEXT: st.w $a1, $a2, 0
; LA32-NEXT: ori $a1, $zero, 40
; LA32-NEXT: st.w $a0, $a1, 0
; LA32-NEXT: st.w $a1, $zero, 44
; LA32-NEXT: st.w $a0, $zero, 40
; LA32-NEXT: ret
;
; LA64-LABEL: store_i64:
; LA64: # %bb.0:
; LA64-NEXT: ori $a1, $zero, 40
; LA64-NEXT: st.d $a0, $a1, 0
; LA64-NEXT: st.d $a0, $zero, 40
; LA64-NEXT: ret
store i64 %v, i64* inttoptr (i64 40 to i64*), align 8
store i64 %v, ptr inttoptr (i64 40 to ptr), align 8
ret void
}
207 changes: 45 additions & 162 deletions llvm/test/CodeGen/X86/abds-vector-128.ll
@@ -501,89 +501,30 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; SSE42-LABEL: abd_ext_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %xmm0, %rax
; SSE42-NEXT: movq %rax, %rcx
; SSE42-NEXT: sarq $63, %rcx
; SSE42-NEXT: pextrq $1, %xmm0, %rdx
; SSE42-NEXT: movq %rdx, %rsi
; SSE42-NEXT: sarq $63, %rsi
; SSE42-NEXT: movq %xmm1, %rdi
; SSE42-NEXT: movq %rdi, %r8
; SSE42-NEXT: sarq $63, %r8
; SSE42-NEXT: pextrq $1, %xmm1, %r9
; SSE42-NEXT: movq %r9, %r10
; SSE42-NEXT: sarq $63, %r10
; SSE42-NEXT: subq %r9, %rdx
; SSE42-NEXT: sbbq %r10, %rsi
; SSE42-NEXT: subq %rdi, %rax
; SSE42-NEXT: sbbq %r8, %rcx
; SSE42-NEXT: sarq $63, %rcx
; SSE42-NEXT: xorq %rcx, %rax
; SSE42-NEXT: subq %rcx, %rax
; SSE42-NEXT: sarq $63, %rsi
; SSE42-NEXT: xorq %rsi, %rdx
; SSE42-NEXT: subq %rsi, %rdx
; SSE42-NEXT: movq %rdx, %xmm1
; SSE42-NEXT: movq %rax, %xmm0
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: psubq %xmm1, %xmm3
; SSE42-NEXT: psubq %xmm0, %xmm1
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: sarq $63, %rcx
; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
; AVX1-NEXT: movq %rdx, %rsi
; AVX1-NEXT: sarq $63, %rsi
; AVX1-NEXT: vmovq %xmm1, %rdi
; AVX1-NEXT: movq %rdi, %r8
; AVX1-NEXT: sarq $63, %r8
; AVX1-NEXT: vpextrq $1, %xmm1, %r9
; AVX1-NEXT: movq %r9, %r10
; AVX1-NEXT: sarq $63, %r10
; AVX1-NEXT: subq %r9, %rdx
; AVX1-NEXT: sbbq %r10, %rsi
; AVX1-NEXT: subq %rdi, %rax
; AVX1-NEXT: sbbq %r8, %rcx
; AVX1-NEXT: sarq $63, %rcx
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: subq %rcx, %rax
; AVX1-NEXT: sarq $63, %rsi
; AVX1-NEXT: xorq %rsi, %rdx
; AVX1-NEXT: subq %rsi, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
; AVX2-NEXT: movq %rdx, %rsi
; AVX2-NEXT: sarq $63, %rsi
; AVX2-NEXT: vmovq %xmm1, %rdi
; AVX2-NEXT: movq %rdi, %r8
; AVX2-NEXT: sarq $63, %r8
; AVX2-NEXT: vpextrq $1, %xmm1, %r9
; AVX2-NEXT: movq %r9, %r10
; AVX2-NEXT: sarq $63, %r10
; AVX2-NEXT: subq %r9, %rdx
; AVX2-NEXT: sbbq %r10, %rsi
; AVX2-NEXT: subq %rdi, %rax
; AVX2-NEXT: sbbq %r8, %rcx
; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: subq %rcx, %rax
; AVX2-NEXT: sarq $63, %rsi
; AVX2-NEXT: xorq %rsi, %rdx
; AVX2-NEXT: subq %rsi, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64:
Expand Down Expand Up @@ -634,89 +575,30 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; SSE42-LABEL: abd_ext_v2i64_undef:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %xmm0, %rax
; SSE42-NEXT: movq %rax, %rcx
; SSE42-NEXT: sarq $63, %rcx
; SSE42-NEXT: pextrq $1, %xmm0, %rdx
; SSE42-NEXT: movq %rdx, %rsi
; SSE42-NEXT: sarq $63, %rsi
; SSE42-NEXT: movq %xmm1, %rdi
; SSE42-NEXT: movq %rdi, %r8
; SSE42-NEXT: sarq $63, %r8
; SSE42-NEXT: pextrq $1, %xmm1, %r9
; SSE42-NEXT: movq %r9, %r10
; SSE42-NEXT: sarq $63, %r10
; SSE42-NEXT: subq %r9, %rdx
; SSE42-NEXT: sbbq %r10, %rsi
; SSE42-NEXT: subq %rdi, %rax
; SSE42-NEXT: sbbq %r8, %rcx
; SSE42-NEXT: sarq $63, %rcx
; SSE42-NEXT: xorq %rcx, %rax
; SSE42-NEXT: subq %rcx, %rax
; SSE42-NEXT: sarq $63, %rsi
; SSE42-NEXT: xorq %rsi, %rdx
; SSE42-NEXT: subq %rsi, %rdx
; SSE42-NEXT: movq %rdx, %xmm1
; SSE42-NEXT: movq %rax, %xmm0
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: psubq %xmm1, %xmm3
; SSE42-NEXT: psubq %xmm0, %xmm1
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: sarq $63, %rcx
; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
; AVX1-NEXT: movq %rdx, %rsi
; AVX1-NEXT: sarq $63, %rsi
; AVX1-NEXT: vmovq %xmm1, %rdi
; AVX1-NEXT: movq %rdi, %r8
; AVX1-NEXT: sarq $63, %r8
; AVX1-NEXT: vpextrq $1, %xmm1, %r9
; AVX1-NEXT: movq %r9, %r10
; AVX1-NEXT: sarq $63, %r10
; AVX1-NEXT: subq %r9, %rdx
; AVX1-NEXT: sbbq %r10, %rsi
; AVX1-NEXT: subq %rdi, %rax
; AVX1-NEXT: sbbq %r8, %rcx
; AVX1-NEXT: sarq $63, %rcx
; AVX1-NEXT: xorq %rcx, %rax
; AVX1-NEXT: subq %rcx, %rax
; AVX1-NEXT: sarq $63, %rsi
; AVX1-NEXT: xorq %rsi, %rdx
; AVX1-NEXT: subq %rsi, %rdx
; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
; AVX2-NEXT: movq %rdx, %rsi
; AVX2-NEXT: sarq $63, %rsi
; AVX2-NEXT: vmovq %xmm1, %rdi
; AVX2-NEXT: movq %rdi, %r8
; AVX2-NEXT: sarq $63, %r8
; AVX2-NEXT: vpextrq $1, %xmm1, %r9
; AVX2-NEXT: movq %r9, %r10
; AVX2-NEXT: sarq $63, %r10
; AVX2-NEXT: subq %r9, %rdx
; AVX2-NEXT: sbbq %r10, %rsi
; AVX2-NEXT: subq %rdi, %rax
; AVX2-NEXT: sbbq %r8, %rcx
; AVX2-NEXT: sarq $63, %rcx
; AVX2-NEXT: xorq %rcx, %rax
; AVX2-NEXT: subq %rcx, %rax
; AVX2-NEXT: sarq $63, %rsi
; AVX2-NEXT: xorq %rsi, %rdx
; AVX2-NEXT: subq %rsi, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64_undef:
Expand Down Expand Up @@ -866,28 +748,29 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; SSE42-LABEL: abd_minmax_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: movdqa %xmm2, %xmm3
; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE42-NEXT: psubq %xmm3, %xmm1
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: pcmpgtq %xmm1, %xmm2
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: psubq %xmm1, %xmm3
; SSE42-NEXT: psubq %xmm0, %xmm1
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_minmax_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_minmax_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v2i64:
318 changes: 44 additions & 274 deletions llvm/test/CodeGen/X86/abds-vector-256.ll
@@ -220,140 +220,25 @@ define <8 x i32> @abd_ext_v8i32_undef(<8 x i32> %a, <8 x i32> %b) nounwind {
define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_ext_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: vmovq %xmm0, %r11
; AVX1-NEXT: movq %r11, %r10
; AVX1-NEXT: sarq $63, %r10
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %r9
; AVX1-NEXT: sarq $63, %r9
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rdx
; AVX1-NEXT: movq %rdx, %r8
; AVX1-NEXT: sarq $63, %r8
; AVX1-NEXT: vpextrq $1, %xmm0, %rsi
; AVX1-NEXT: movq %rsi, %rdi
; AVX1-NEXT: sarq $63, %rdi
; AVX1-NEXT: vmovq %xmm1, %rbx
; AVX1-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: sarq $63, %rbx
; AVX1-NEXT: vpextrq $1, %xmm1, %r14
; AVX1-NEXT: movq %r14, %r15
; AVX1-NEXT: sarq $63, %r15
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %r12
; AVX1-NEXT: movq %r12, %r13
; AVX1-NEXT: sarq $63, %r13
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movq %rax, %rbp
; AVX1-NEXT: sarq $63, %rbp
; AVX1-NEXT: subq %rax, %rsi
; AVX1-NEXT: sbbq %rbp, %rdi
; AVX1-NEXT: subq %r12, %rdx
; AVX1-NEXT: sbbq %r13, %r8
; AVX1-NEXT: subq %r14, %rcx
; AVX1-NEXT: sbbq %r15, %r9
; AVX1-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; AVX1-NEXT: sbbq %rbx, %r10
; AVX1-NEXT: sarq $63, %r10
; AVX1-NEXT: xorq %r10, %r11
; AVX1-NEXT: subq %r10, %r11
; AVX1-NEXT: sarq $63, %r9
; AVX1-NEXT: xorq %r9, %rcx
; AVX1-NEXT: subq %r9, %rcx
; AVX1-NEXT: sarq $63, %r8
; AVX1-NEXT: xorq %r8, %rdx
; AVX1-NEXT: subq %r8, %rdx
; AVX1-NEXT: sarq $63, %rdi
; AVX1-NEXT: xorq %rdi, %rsi
; AVX1-NEXT: subq %rdi, %rsi
; AVX1-NEXT: vmovq %rsi, %xmm0
; AVX1-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: vmovq %r11, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX1-NEXT: popq %r13
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5
; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: vmovq %xmm0, %r11
; AVX2-NEXT: movq %r11, %r10
; AVX2-NEXT: sarq $63, %r10
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %r9
; AVX2-NEXT: sarq $63, %r9
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rdx
; AVX2-NEXT: movq %rdx, %r8
; AVX2-NEXT: sarq $63, %r8
; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
; AVX2-NEXT: movq %rsi, %rdi
; AVX2-NEXT: sarq $63, %rdi
; AVX2-NEXT: vmovq %xmm1, %rbx
; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: sarq $63, %rbx
; AVX2-NEXT: vpextrq $1, %xmm1, %r14
; AVX2-NEXT: movq %r14, %r15
; AVX2-NEXT: sarq $63, %r15
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %r12
; AVX2-NEXT: movq %r12, %r13
; AVX2-NEXT: sarq $63, %r13
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: movq %rax, %rbp
; AVX2-NEXT: sarq $63, %rbp
; AVX2-NEXT: subq %rax, %rsi
; AVX2-NEXT: sbbq %rbp, %rdi
; AVX2-NEXT: subq %r12, %rdx
; AVX2-NEXT: sbbq %r13, %r8
; AVX2-NEXT: subq %r14, %rcx
; AVX2-NEXT: sbbq %r15, %r9
; AVX2-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; AVX2-NEXT: sbbq %rbx, %r10
; AVX2-NEXT: sarq $63, %r10
; AVX2-NEXT: xorq %r10, %r11
; AVX2-NEXT: subq %r10, %r11
; AVX2-NEXT: sarq $63, %r9
; AVX2-NEXT: xorq %r9, %rcx
; AVX2-NEXT: subq %r9, %rcx
; AVX2-NEXT: sarq $63, %r8
; AVX2-NEXT: xorq %r8, %rdx
; AVX2-NEXT: subq %r8, %rdx
; AVX2-NEXT: sarq $63, %rdi
; AVX2-NEXT: xorq %rdi, %rsi
; AVX2-NEXT: subq %rdi, %rsi
; AVX2-NEXT: vmovq %rsi, %xmm0
; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vmovq %rcx, %xmm2
; AVX2-NEXT: vmovq %r11, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64:
Expand All @@ -373,140 +258,25 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_ext_v4i64_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: pushq %rbp
; AVX1-NEXT: pushq %r15
; AVX1-NEXT: pushq %r14
; AVX1-NEXT: pushq %r13
; AVX1-NEXT: pushq %r12
; AVX1-NEXT: pushq %rbx
; AVX1-NEXT: vmovq %xmm0, %r11
; AVX1-NEXT: movq %r11, %r10
; AVX1-NEXT: sarq $63, %r10
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %r9
; AVX1-NEXT: sarq $63, %r9
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rdx
; AVX1-NEXT: movq %rdx, %r8
; AVX1-NEXT: sarq $63, %r8
; AVX1-NEXT: vpextrq $1, %xmm0, %rsi
; AVX1-NEXT: movq %rsi, %rdi
; AVX1-NEXT: sarq $63, %rdi
; AVX1-NEXT: vmovq %xmm1, %rbx
; AVX1-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX1-NEXT: sarq $63, %rbx
; AVX1-NEXT: vpextrq $1, %xmm1, %r14
; AVX1-NEXT: movq %r14, %r15
; AVX1-NEXT: sarq $63, %r15
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %r12
; AVX1-NEXT: movq %r12, %r13
; AVX1-NEXT: sarq $63, %r13
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: movq %rax, %rbp
; AVX1-NEXT: sarq $63, %rbp
; AVX1-NEXT: subq %rax, %rsi
; AVX1-NEXT: sbbq %rbp, %rdi
; AVX1-NEXT: subq %r12, %rdx
; AVX1-NEXT: sbbq %r13, %r8
; AVX1-NEXT: subq %r14, %rcx
; AVX1-NEXT: sbbq %r15, %r9
; AVX1-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; AVX1-NEXT: sbbq %rbx, %r10
; AVX1-NEXT: sarq $63, %r10
; AVX1-NEXT: xorq %r10, %r11
; AVX1-NEXT: subq %r10, %r11
; AVX1-NEXT: sarq $63, %r9
; AVX1-NEXT: xorq %r9, %rcx
; AVX1-NEXT: subq %r9, %rcx
; AVX1-NEXT: sarq $63, %r8
; AVX1-NEXT: xorq %r8, %rdx
; AVX1-NEXT: subq %r8, %rdx
; AVX1-NEXT: sarq $63, %rdi
; AVX1-NEXT: xorq %rdi, %rsi
; AVX1-NEXT: subq %rdi, %rsi
; AVX1-NEXT: vmovq %rsi, %xmm0
; AVX1-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vmovq %rcx, %xmm2
; AVX1-NEXT: vmovq %r11, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: popq %rbx
; AVX1-NEXT: popq %r12
; AVX1-NEXT: popq %r13
; AVX1-NEXT: popq %r14
; AVX1-NEXT: popq %r15
; AVX1-NEXT: popq %rbp
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5
; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v4i64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: vmovq %xmm0, %r11
; AVX2-NEXT: movq %r11, %r10
; AVX2-NEXT: sarq $63, %r10
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %r9
; AVX2-NEXT: sarq $63, %r9
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rdx
; AVX2-NEXT: movq %rdx, %r8
; AVX2-NEXT: sarq $63, %r8
; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
; AVX2-NEXT: movq %rsi, %rdi
; AVX2-NEXT: sarq $63, %rdi
; AVX2-NEXT: vmovq %xmm1, %rbx
; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: sarq $63, %rbx
; AVX2-NEXT: vpextrq $1, %xmm1, %r14
; AVX2-NEXT: movq %r14, %r15
; AVX2-NEXT: sarq $63, %r15
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %r12
; AVX2-NEXT: movq %r12, %r13
; AVX2-NEXT: sarq $63, %r13
; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: movq %rax, %rbp
; AVX2-NEXT: sarq $63, %rbp
; AVX2-NEXT: subq %rax, %rsi
; AVX2-NEXT: sbbq %rbp, %rdi
; AVX2-NEXT: subq %r12, %rdx
; AVX2-NEXT: sbbq %r13, %r8
; AVX2-NEXT: subq %r14, %rcx
; AVX2-NEXT: sbbq %r15, %r9
; AVX2-NEXT: subq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
; AVX2-NEXT: sbbq %rbx, %r10
; AVX2-NEXT: sarq $63, %r10
; AVX2-NEXT: xorq %r10, %r11
; AVX2-NEXT: subq %r10, %r11
; AVX2-NEXT: sarq $63, %r9
; AVX2-NEXT: xorq %r9, %rcx
; AVX2-NEXT: subq %r9, %rcx
; AVX2-NEXT: sarq $63, %r8
; AVX2-NEXT: xorq %r8, %rdx
; AVX2-NEXT: subq %r8, %rdx
; AVX2-NEXT: sarq $63, %rdi
; AVX2-NEXT: xorq %rdi, %rsi
; AVX2-NEXT: subq %rdi, %rsi
; AVX2-NEXT: vmovq %rsi, %xmm0
; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vmovq %rcx, %xmm2
; AVX2-NEXT: vmovq %r11, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64_undef:
Expand Down Expand Up @@ -629,25 +399,25 @@ define <8 x i32> @abd_minmax_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_minmax_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm7
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm1
; AVX1-NEXT: vpsubq %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm5
; AVX1-NEXT: vpsubq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_minmax_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsubq %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v4i64:
213 changes: 72 additions & 141 deletions llvm/test/CodeGen/X86/abdu-vector-128.ll
@@ -339,71 +339,39 @@ define <2 x i64> @abd_ext_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; SSE42-LABEL: abd_ext_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %xmm0, %rax
; SSE42-NEXT: pextrq $1, %xmm0, %rcx
; SSE42-NEXT: movq %xmm1, %rdx
; SSE42-NEXT: pextrq $1, %xmm1, %rsi
; SSE42-NEXT: xorl %edi, %edi
; SSE42-NEXT: subq %rsi, %rcx
; SSE42-NEXT: movl $0, %esi
; SSE42-NEXT: sbbq %rsi, %rsi
; SSE42-NEXT: subq %rdx, %rax
; SSE42-NEXT: sbbq %rdi, %rdi
; SSE42-NEXT: sarq $63, %rdi
; SSE42-NEXT: xorq %rdi, %rax
; SSE42-NEXT: subq %rdi, %rax
; SSE42-NEXT: sarq $63, %rsi
; SSE42-NEXT: xorq %rsi, %rcx
; SSE42-NEXT: subq %rsi, %rcx
; SSE42-NEXT: movq %rcx, %xmm1
; SSE42-NEXT: movq %rax, %xmm0
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm1, %xmm3
; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: psubq %xmm1, %xmm3
; SSE42-NEXT: psubq %xmm0, %xmm1
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: vmovq %xmm1, %rdx
; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
; AVX1-NEXT: xorl %edi, %edi
; AVX1-NEXT: subq %rsi, %rcx
; AVX1-NEXT: movl $0, %esi
; AVX1-NEXT: sbbq %rsi, %rsi
; AVX1-NEXT: subq %rdx, %rax
; AVX1-NEXT: sbbq %rdi, %rdi
; AVX1-NEXT: sarq $63, %rdi
; AVX1-NEXT: xorq %rdi, %rax
; AVX1-NEXT: subq %rdi, %rax
; AVX1-NEXT: sarq $63, %rsi
; AVX1-NEXT: xorq %rsi, %rcx
; AVX1-NEXT: subq %rsi, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: vmovq %xmm1, %rdx
; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
; AVX2-NEXT: xorl %edi, %edi
; AVX2-NEXT: subq %rsi, %rcx
; AVX2-NEXT: movl $0, %esi
; AVX2-NEXT: sbbq %rsi, %rsi
; AVX2-NEXT: subq %rdx, %rax
; AVX2-NEXT: sbbq %rdi, %rdi
; AVX2-NEXT: sarq $63, %rdi
; AVX2-NEXT: xorq %rdi, %rax
; AVX2-NEXT: subq %rdi, %rax
; AVX2-NEXT: sarq $63, %rsi
; AVX2-NEXT: xorq %rsi, %rcx
; AVX2-NEXT: subq %rsi, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64:
Expand Down Expand Up @@ -448,71 +416,39 @@ define <2 x i64> @abd_ext_v2i64_undef(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; SSE42-LABEL: abd_ext_v2i64_undef:
; SSE42: # %bb.0:
; SSE42-NEXT: movq %xmm0, %rax
; SSE42-NEXT: pextrq $1, %xmm0, %rcx
; SSE42-NEXT: movq %xmm1, %rdx
; SSE42-NEXT: pextrq $1, %xmm1, %rsi
; SSE42-NEXT: xorl %edi, %edi
; SSE42-NEXT: subq %rsi, %rcx
; SSE42-NEXT: movl $0, %esi
; SSE42-NEXT: sbbq %rsi, %rsi
; SSE42-NEXT: subq %rdx, %rax
; SSE42-NEXT: sbbq %rdi, %rdi
; SSE42-NEXT: sarq $63, %rdi
; SSE42-NEXT: xorq %rdi, %rax
; SSE42-NEXT: subq %rdi, %rax
; SSE42-NEXT: sarq $63, %rsi
; SSE42-NEXT: xorq %rsi, %rcx
; SSE42-NEXT: subq %rsi, %rcx
; SSE42-NEXT: movq %rcx, %xmm1
; SSE42-NEXT: movq %rax, %xmm0
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm1, %xmm3
; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: psubq %xmm1, %xmm3
; SSE42-NEXT: psubq %xmm0, %xmm1
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_ext_v2i64_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: vmovq %xmm1, %rdx
; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
; AVX1-NEXT: xorl %edi, %edi
; AVX1-NEXT: subq %rsi, %rcx
; AVX1-NEXT: movl $0, %esi
; AVX1-NEXT: sbbq %rsi, %rsi
; AVX1-NEXT: subq %rdx, %rax
; AVX1-NEXT: sbbq %rdi, %rdi
; AVX1-NEXT: sarq $63, %rdi
; AVX1-NEXT: xorq %rdi, %rax
; AVX1-NEXT: subq %rdi, %rax
; AVX1-NEXT: sarq $63, %rsi
; AVX1-NEXT: xorq %rsi, %rcx
; AVX1-NEXT: subq %rsi, %rcx
; AVX1-NEXT: vmovq %rcx, %xmm0
; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v2i64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: vmovq %xmm1, %rdx
; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
; AVX2-NEXT: xorl %edi, %edi
; AVX2-NEXT: subq %rsi, %rcx
; AVX2-NEXT: movl $0, %esi
; AVX2-NEXT: sbbq %rsi, %rsi
; AVX2-NEXT: subq %rdx, %rax
; AVX2-NEXT: sbbq %rdi, %rdi
; AVX2-NEXT: sarq $63, %rdi
; AVX2-NEXT: xorq %rdi, %rax
; AVX2-NEXT: subq %rdi, %rax
; AVX2-NEXT: sarq $63, %rsi
; AVX2-NEXT: xorq %rsi, %rcx
; AVX2-NEXT: subq %rsi, %rcx
; AVX2-NEXT: vmovq %rcx, %xmm0
; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v2i64_undef:
Expand Down Expand Up @@ -659,44 +595,39 @@ define <2 x i64> @abd_minmax_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; SSE42-LABEL: abd_minmax_v2i64:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm2
; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE42-NEXT: movdqa %xmm1, %xmm3
; SSE42-NEXT: pxor %xmm2, %xmm3
; SSE42-NEXT: pxor %xmm0, %xmm2
; SSE42-NEXT: pcmpgtq %xmm3, %xmm2
; SSE42-NEXT: movdqa %xmm0, %xmm3
; SSE42-NEXT: pxor %xmm4, %xmm3
; SSE42-NEXT: pxor %xmm1, %xmm4
; SSE42-NEXT: movdqa %xmm4, %xmm0
; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
; SSE42-NEXT: movdqa %xmm1, %xmm5
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5
; SSE42-NEXT: pcmpgtq %xmm4, %xmm3
; SSE42-NEXT: movdqa %xmm3, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE42-NEXT: psubq %xmm5, %xmm1
; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: psubq %xmm1, %xmm3
; SSE42-NEXT: psubq %xmm0, %xmm1
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
; SSE42-NEXT: movapd %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: abd_minmax_v2i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_minmax_v2i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm4
; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vpsubq %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vblendvpd %xmm2, %xmm3, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v2i64:
255 changes: 67 additions & 188 deletions llvm/test/CodeGen/X86/abdu-vector-256.ll
Expand Up @@ -220,92 +220,33 @@ define <8 x i32> @abd_ext_v8i32_undef(<8 x i32> %a, <8 x i32> %b) nounwind {
define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_ext_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rdx
; AVX1-NEXT: vpextrq $1, %xmm0, %rsi
; AVX1-NEXT: vmovq %xmm1, %r8
; AVX1-NEXT: vpextrq $1, %xmm1, %r9
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %r10
; AVX1-NEXT: vpextrq $1, %xmm0, %rdi
; AVX1-NEXT: xorl %r11d, %r11d
; AVX1-NEXT: subq %rdi, %rsi
; AVX1-NEXT: movl $0, %edi
; AVX1-NEXT: sbbq %rdi, %rdi
; AVX1-NEXT: subq %r10, %rdx
; AVX1-NEXT: movl $0, %r10d
; AVX1-NEXT: sbbq %r10, %r10
; AVX1-NEXT: subq %r9, %rcx
; AVX1-NEXT: movl $0, %r9d
; AVX1-NEXT: sbbq %r9, %r9
; AVX1-NEXT: subq %r8, %rax
; AVX1-NEXT: sbbq %r11, %r11
; AVX1-NEXT: sarq $63, %r11
; AVX1-NEXT: xorq %r11, %rax
; AVX1-NEXT: subq %r11, %rax
; AVX1-NEXT: sarq $63, %r9
; AVX1-NEXT: xorq %r9, %rcx
; AVX1-NEXT: subq %r9, %rcx
; AVX1-NEXT: sarq $63, %r10
; AVX1-NEXT: xorq %r10, %rdx
; AVX1-NEXT: subq %r10, %rdx
; AVX1-NEXT: sarq $63, %rdi
; AVX1-NEXT: xorq %rdi, %rsi
; AVX1-NEXT: subq %rdi, %rsi
; AVX1-NEXT: vmovq %rsi, %xmm0
; AVX1-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovq %rcx, %xmm1
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6
; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rdx
; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
; AVX2-NEXT: vmovq %xmm1, %r8
; AVX2-NEXT: vpextrq $1, %xmm1, %r9
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %r10
; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
; AVX2-NEXT: xorl %r11d, %r11d
; AVX2-NEXT: subq %rdi, %rsi
; AVX2-NEXT: movl $0, %edi
; AVX2-NEXT: sbbq %rdi, %rdi
; AVX2-NEXT: subq %r10, %rdx
; AVX2-NEXT: movl $0, %r10d
; AVX2-NEXT: sbbq %r10, %r10
; AVX2-NEXT: subq %r9, %rcx
; AVX2-NEXT: movl $0, %r9d
; AVX2-NEXT: sbbq %r9, %r9
; AVX2-NEXT: subq %r8, %rax
; AVX2-NEXT: sbbq %r11, %r11
; AVX2-NEXT: sarq $63, %r11
; AVX2-NEXT: xorq %r11, %rax
; AVX2-NEXT: subq %r11, %rax
; AVX2-NEXT: sarq $63, %r9
; AVX2-NEXT: xorq %r9, %rcx
; AVX2-NEXT: subq %r9, %rcx
; AVX2-NEXT: sarq $63, %r10
; AVX2-NEXT: xorq %r10, %rdx
; AVX2-NEXT: subq %r10, %rdx
; AVX2-NEXT: sarq $63, %rdi
; AVX2-NEXT: xorq %rdi, %rsi
; AVX2-NEXT: subq %rdi, %rsi
; AVX2-NEXT: vmovq %rsi, %xmm0
; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64:
Expand All @@ -325,92 +266,33 @@ define <4 x i64> @abd_ext_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
define <4 x i64> @abd_ext_v4i64_undef(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_ext_v4i64_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rdx
; AVX1-NEXT: vpextrq $1, %xmm0, %rsi
; AVX1-NEXT: vmovq %xmm1, %r8
; AVX1-NEXT: vpextrq $1, %xmm1, %r9
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX1-NEXT: vmovq %xmm0, %r10
; AVX1-NEXT: vpextrq $1, %xmm0, %rdi
; AVX1-NEXT: xorl %r11d, %r11d
; AVX1-NEXT: subq %rdi, %rsi
; AVX1-NEXT: movl $0, %edi
; AVX1-NEXT: sbbq %rdi, %rdi
; AVX1-NEXT: subq %r10, %rdx
; AVX1-NEXT: movl $0, %r10d
; AVX1-NEXT: sbbq %r10, %r10
; AVX1-NEXT: subq %r9, %rcx
; AVX1-NEXT: movl $0, %r9d
; AVX1-NEXT: sbbq %r9, %r9
; AVX1-NEXT: subq %r8, %rax
; AVX1-NEXT: sbbq %r11, %r11
; AVX1-NEXT: sarq $63, %r11
; AVX1-NEXT: xorq %r11, %rax
; AVX1-NEXT: subq %r11, %rax
; AVX1-NEXT: sarq $63, %r9
; AVX1-NEXT: xorq %r9, %rcx
; AVX1-NEXT: subq %r9, %rcx
; AVX1-NEXT: sarq $63, %r10
; AVX1-NEXT: xorq %r10, %rdx
; AVX1-NEXT: subq %r10, %rdx
; AVX1-NEXT: sarq $63, %rdi
; AVX1-NEXT: xorq %rdi, %rsi
; AVX1-NEXT: subq %rdi, %rsi
; AVX1-NEXT: vmovq %rsi, %xmm0
; AVX1-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vmovq %rcx, %xmm1
; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6
; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_ext_v4i64_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rdx
; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
; AVX2-NEXT: vmovq %xmm1, %r8
; AVX2-NEXT: vpextrq $1, %xmm1, %r9
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovq %xmm0, %r10
; AVX2-NEXT: vpextrq $1, %xmm0, %rdi
; AVX2-NEXT: xorl %r11d, %r11d
; AVX2-NEXT: subq %rdi, %rsi
; AVX2-NEXT: movl $0, %edi
; AVX2-NEXT: sbbq %rdi, %rdi
; AVX2-NEXT: subq %r10, %rdx
; AVX2-NEXT: movl $0, %r10d
; AVX2-NEXT: sbbq %r10, %r10
; AVX2-NEXT: subq %r9, %rcx
; AVX2-NEXT: movl $0, %r9d
; AVX2-NEXT: sbbq %r9, %r9
; AVX2-NEXT: subq %r8, %rax
; AVX2-NEXT: sbbq %r11, %r11
; AVX2-NEXT: sarq $63, %r11
; AVX2-NEXT: xorq %r11, %rax
; AVX2-NEXT: subq %r11, %rax
; AVX2-NEXT: sarq $63, %r9
; AVX2-NEXT: xorq %r9, %rcx
; AVX2-NEXT: subq %r9, %rcx
; AVX2-NEXT: sarq $63, %r10
; AVX2-NEXT: xorq %r10, %rdx
; AVX2-NEXT: subq %r10, %rdx
; AVX2-NEXT: sarq $63, %rdi
; AVX2-NEXT: xorq %rdi, %rsi
; AVX2-NEXT: subq %rdi, %rsi
; AVX2-NEXT: vmovq %rsi, %xmm0
; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vmovq %rcx, %xmm1
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_ext_v4i64_undef:
Expand Down Expand Up @@ -533,36 +415,33 @@ define <8 x i32> @abd_minmax_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <4 x i64> @abd_minmax_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: abd_minmax_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpxor %xmm2, %xmm6, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
; AVX1-NEXT: vpxor %xmm2, %xmm8, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9
; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm8, %xmm9
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm6
; AVX1-NEXT: vpsubq %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpsubq %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm1
; AVX1-NEXT: vblendvpd %xmm1, %xmm6, %xmm8, %xmm1
; AVX1-NEXT: vpsubq %xmm9, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4
; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: abd_minmax_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm1, %ymm4
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm3
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: abd_minmax_v4i64:
407 changes: 188 additions & 219 deletions llvm/test/CodeGen/X86/midpoint-int-vec-128.ll

Large diffs are not rendered by default.

827 changes: 402 additions & 425 deletions llvm/test/CodeGen/X86/midpoint-int-vec-256.ll

Large diffs are not rendered by default.

22 changes: 22 additions & 0 deletions llvm/test/Instrumentation/SanitizerBinaryMetadata/ctor.ll
@@ -0,0 +1,22 @@
; RUN: opt < %s -passes='module(sanmd-module)' -sanitizer-metadata-atomics -S | FileCheck %s

; CHECK: $__sanitizer_metadata_atomics.module_ctor = comdat any
; CHECK: $__sanitizer_metadata_atomics.module_dtor = comdat any
; CHECK: $__sanitizer_metadata_covered.module_ctor = comdat any
; CHECK: $__sanitizer_metadata_covered.module_dtor = comdat any

; CHECK: @llvm.used = appending global [4 x ptr] [ptr @__sanitizer_metadata_atomics.module_ctor, ptr @__sanitizer_metadata_atomics.module_dtor, ptr @__sanitizer_metadata_covered.module_ctor, ptr @__sanitizer_metadata_covered.module_dtor], section "llvm.metadata"
; CHECK: @llvm.global_ctors = appending global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_atomics.module_ctor, ptr @__sanitizer_metadata_atomics.module_ctor }, { i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_covered.module_ctor, ptr @__sanitizer_metadata_covered.module_ctor }]
; CHECK: @llvm.global_dtors = appending global [2 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_atomics.module_dtor, ptr @__sanitizer_metadata_atomics.module_dtor }, { i32, ptr, ptr } { i32 2, ptr @__sanitizer_metadata_covered.module_dtor, ptr @__sanitizer_metadata_covered.module_dtor }]

; CHECK: define dso_local void @__sanitizer_metadata_covered.module_ctor() #1 comdat {
; CHECK: define dso_local void @__sanitizer_metadata_covered.module_dtor() #1 comdat {

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define i8 @foo(ptr %a) nounwind uwtable {
entry:
%0 = load atomic i8, ptr %a unordered, align 1
ret i8 %0
}
1 change: 1 addition & 0 deletions llvm/utils/lit/lit/TestingConfig.py
Expand Up @@ -30,6 +30,7 @@ def fromdefaults(litConfig):
'LLDB',
'LD_PRELOAD',
'LLVM_SYMBOLIZER_PATH',
'LLVM_PROFILE_FILE',
'ASAN_SYMBOLIZER_PATH',
'HWASAN_SYMBOLIZER_PATH',
'LSAN_SYMBOLIZER_PATH',
1 change: 1 addition & 0 deletions mlir/include/mlir/Conversion/LLVMCommon/LoweringOptions.h
@@ -33,6 +33,7 @@ class LowerToLLVMOptions {
LowerToLLVMOptions(MLIRContext *ctx, const DataLayout &dl);

bool useBarePtrCallConv = false;
bool useOpaquePointers = false;

enum class AllocLowering {
/// Use malloc for heap allocations.
19 changes: 12 additions & 7 deletions mlir/include/mlir/Conversion/LLVMCommon/MemRefBuilder.h
@@ -111,6 +111,8 @@ class MemRefDescriptor : public StructBuilder {
static unsigned getNumUnpackedValues(MemRefType type);

private:
bool useOpaquePointers();

// Cached index type.
Type indexType;
};
Expand Down Expand Up @@ -194,36 +196,39 @@ class UnrankedMemRefDescriptor : public StructBuilder {

/// Builds IR extracting the allocated pointer from the descriptor.
static Value allocatedPtr(OpBuilder &builder, Location loc,
Value memRefDescPtr, Type elemPtrPtrType);
Value memRefDescPtr,
LLVM::LLVMPointerType elemPtrType);
/// Builds IR inserting the allocated pointer into the descriptor.
static void setAllocatedPtr(OpBuilder &builder, Location loc,
Value memRefDescPtr, Type elemPtrPtrType,
Value memRefDescPtr,
LLVM::LLVMPointerType elemPtrType,
Value allocatedPtr);

/// Builds IR extracting the aligned pointer from the descriptor.
static Value alignedPtr(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter, Value memRefDescPtr,
Type elemPtrPtrType);
LLVM::LLVMPointerType elemPtrType);
/// Builds IR inserting the aligned pointer into the descriptor.
static void setAlignedPtr(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value memRefDescPtr, Type elemPtrPtrType,
Value memRefDescPtr,
LLVM::LLVMPointerType elemPtrType,
Value alignedPtr);

/// Builds IR extracting the offset from the descriptor.
static Value offset(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter, Value memRefDescPtr,
Type elemPtrPtrType);
LLVM::LLVMPointerType elemPtrType);
/// Builds IR inserting the offset into the descriptor.
static void setOffset(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter, Value memRefDescPtr,
Type elemPtrPtrType, Value offset);
LLVM::LLVMPointerType elemPtrType, Value offset);

/// Builds IR extracting the pointer to the first element of the size array.
static Value sizeBasePtr(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value memRefDescPtr,
LLVM::LLVMPointerType elemPtrPtrType);
LLVM::LLVMPointerType elemPtrType);
/// Builds IR extracting the size[index] from the descriptor.
static Value size(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter, Value sizeBasePtr,
Expand Down
12 changes: 12 additions & 0 deletions mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class LowerToLLVMOptions;

namespace LLVM {
class LLVMDialect;
class LLVMPointerType;
} // namespace LLVM

/// Conversion from types to the LLVM IR dialect.
Expand Down Expand Up @@ -119,6 +120,17 @@ class LLVMTypeConverter : public TypeConverter {
/// integer type with the size configured for this type converter.
Type getIndexType();

/// Returns true if using opaque pointers was enabled in the lowering options.
bool useOpaquePointers() const { return getOptions().useOpaquePointers; }

/// Creates an LLVM pointer type with the given element type and address
/// space.
/// This function is meant to be used in code supporting both typed and opaque
/// pointers, as it will create an opaque pointer with the given address space
/// if opaque pointers are enabled in the lowering options.
LLVM::LLVMPointerType getPointerType(Type elementType,
unsigned addressSpace = 0);

/// Gets the bitwidth of the index type when converted to LLVM.
unsigned getIndexTypeBitwidth() { return options.getIndexBitwidth(); }

Expand Down
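A short sketch of what `getPointerType` returns under each mode, assuming a `typeConverter` is in scope:

    // Hedged sketch: one call site, two possible result types.
    Type f32 = Float32Type::get(&typeConverter.getContext());
    LLVM::LLVMPointerType ptrTy = typeConverter.getPointerType(f32);
    // useOpaquePointers == true  -> !llvm.ptr      (element type dropped)
    // useOpaquePointers == false -> !llvm.ptr<f32>

Keeping the return type `LLVMPointerType` either way lets call sites stay mode-agnostic.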
5 changes: 4 additions & 1 deletion mlir/include/mlir/Conversion/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -582,7 +582,10 @@ def FinalizeMemRefToLLVMConversionPass :
"bool",
/*default=*/"false",
"Use generic allocation and deallocation functions instead of the "
"classic 'malloc', 'aligned_alloc' and 'free' functions">
"classic 'malloc', 'aligned_alloc' and 'free' functions">,
Option<"useOpaquePointers", "use-opaque-pointers", "bool",
/*default=*/"false", "Generate LLVM IR using opaque pointers "
"instead of typed pointers">
];
}

Expand Down
193 changes: 79 additions & 114 deletions mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
#include <string>

namespace mlir {
class DominanceInfo;

namespace bufferization {

struct OneShotBufferizationOptions;
class BufferizationAliasInfo;
struct BufferizationStatistics;
class OneShotAnalysisState;

Expand All @@ -40,108 +41,11 @@ struct OneShotBufferizationOptions : public BufferizationOptions {
llvm::ArrayRef<std::string> noAnalysisFuncFilter;
};

/// The BufferizationAliasInfo class maintains a list of buffer aliases and
/// equivalence classes to support bufferization.
class BufferizationAliasInfo {
public:
explicit BufferizationAliasInfo(Operation *rootOp);

// BufferizationAliasInfo should be passed as a reference.
BufferizationAliasInfo(const BufferizationAliasInfo &) = delete;

/// Add a new entry for `v` in the `aliasInfo` and `equivalentInfo`. In the
/// beginning the alias and equivalence sets only contain `v` itself.
void createAliasInfoEntry(Value v);

/// Insert an info entry for `newValue` and merge its alias set with that of
/// `alias`.
void insertNewBufferAlias(Value newValue, Value alias);

/// Insert an info entry for `newValue` and merge its alias set with that of
/// `alias`. Additionally, merge their equivalence classes.
void insertNewBufferEquivalence(Value newValue, Value alias);

/// Set the inPlace bufferization spec to true.
/// Merge result's and operand's aliasing sets and iterate to a fixed point.
void bufferizeInPlace(OpOperand &operand, AnalysisState &state);

/// Set the inPlace bufferization spec to false.
void bufferizeOutOfPlace(OpOperand &operand);

/// Return true if `v1` and `v2` may bufferize to aliasing buffers.
bool areAliasingBufferizedValues(Value v1, Value v2) const {
return aliasInfo.isEquivalent(v1, v2);
}

/// Return true if `v1` and `v2` bufferize to equivalent buffers.
bool areEquivalentBufferizedValues(Value v1, Value v2) const {
return equivalentInfo.isEquivalent(v1, v2);
}

/// Union the alias sets of `v1` and `v2`.
void unionAliasSets(Value v1, Value v2) { aliasInfo.unionSets(v1, v2); }

/// Union the equivalence classes of `v1` and `v2`.
void unionEquivalenceClasses(Value v1, Value v2) {
equivalentInfo.unionSets(v1, v2);
}

/// Apply `fun` to all the members of the equivalence class of `v`.
void applyOnEquivalenceClass(Value v, function_ref<void(Value)> fun) const;

/// Apply `fun` to all aliases of `v`.
void applyOnAliases(Value v, function_ref<void(Value)> fun) const;

/// Mark a value as in-place bufferized.
void markInPlace(OpOperand &o) { inplaceBufferized.insert(&o); }

/// Return `true` if a value was marked as in-place bufferized.
bool isInPlace(OpOperand &opOperand) const;

int64_t getStatNumTensorOutOfPlace() const { return statNumTensorOutOfPlace; }
int64_t getStatNumTensorInPlace() const { return statNumTensorInPlace; }

private:
/// llvm::EquivalenceClasses wants comparable elements. This comparator uses
/// pointer comparison on the defining op. This is a poor man's
/// comparison but it's not like UnionFind needs ordering anyway.
struct ValueComparator {
bool operator()(const Value &lhs, const Value &rhs) const {
return lhs.getImpl() < rhs.getImpl();
}
};

using EquivalenceClassRangeType = llvm::iterator_range<
llvm::EquivalenceClasses<Value, ValueComparator>::member_iterator>;
/// Check that aliasInfo for `v` exists and return a reference to it.
EquivalenceClassRangeType getAliases(Value v) const;

/// Set of all OpResults that were decided to bufferize in-place.
llvm::DenseSet<OpOperand *> inplaceBufferized;

/// Auxiliary structure to store all the values a given value may alias with.
/// Alias information is "may be" conservative: In the presence of branches, a
/// value may alias with one of multiple other values. The concrete aliasing
/// value may not even be known at compile time. All such values are
/// considered to be aliases.
llvm::EquivalenceClasses<Value, ValueComparator> aliasInfo;

/// Auxiliary structure to store all the equivalent buffer classes. Equivalent
/// buffer information is "must be" conservative: Only if two values are
/// guaranteed to be equivalent at runtime are they said to be equivalent. It is
/// possible that, in the presence of branches, it cannot be determined
/// statically if two values are equivalent. In that case, the values are
/// considered to be not equivalent.
llvm::EquivalenceClasses<Value, ValueComparator> equivalentInfo;

// Bufferization statistics.
int64_t statNumTensorOutOfPlace = 0;
int64_t statNumTensorInPlace = 0;
};

/// State for analysis-enabled bufferization. This class keeps track of alias
/// (via BufferizationAliasInfo) to decide if tensor OpOperands should bufferize
/// in-place.
/// sets, equivalence sets, in-place OpOperands and other things.
///
/// Note: Modifying the IR generally invalidates the result of the analysis.
/// Adding new operations is safe if they are analyzed subsequently.
class OneShotAnalysisState : public AnalysisState {
public:
OneShotAnalysisState(Operation *op,
Expand All @@ -161,24 +65,34 @@ class OneShotAnalysisState : public AnalysisState {
AnalysisState::getOptions());
}

/// Return a reference to the BufferizationAliasInfo.
BufferizationAliasInfo &getAliasInfo() { return aliasInfo; }
/// Analyze the given op and its nested ops.
LogicalResult analyzeOp(Operation *op, const DominanceInfo &domInfo);

/// Return `true` if the given OpResult has been decided to bufferize inplace.
bool isInPlace(OpOperand &opOperand) const override;
/// Analyze a single op (without nested ops).
LogicalResult analyzeSingleOp(Operation *op, const DominanceInfo &domInfo);

/// Apply `fun` to all the members of the equivalence class of `v`.
void applyOnEquivalenceClass(Value v, function_ref<void(Value)> fun) const;

/// Apply `fun` to all aliases of `v`.
void applyOnAliases(Value v, function_ref<void(Value)> fun) const;

/// Return true if `v1` and `v2` bufferize to equivalent buffers.
bool areEquivalentBufferizedValues(Value v1, Value v2) const override;

/// Return true if `v1` and `v2` may bufferize to aliasing buffers.
bool areAliasingBufferizedValues(Value v1, Value v2) const override;

/// Return `true` if the given tensor has undefined contents.
bool hasUndefinedContents(OpOperand *opOperand) const override;
/// Mark the given OpOperand as in-place and merge the results' and operand's
/// aliasing sets.
void bufferizeInPlace(OpOperand &operand);

/// Return true if the given tensor (or an aliasing tensor) is yielded from
/// the containing block. Also include all aliasing tensors in the same block.
bool isTensorYielded(Value tensor) const override;
/// Mark the given OpOperand as out-of-place.
void bufferizeOutOfPlace(OpOperand &operand);

/// Add a new entry for `v` in the `aliasInfo` and `equivalentInfo`. In the
/// beginning the alias and equivalence sets only contain `v` itself.
void createAliasInfoEntry(Value v);

/// Find all tensor values in the given operation that have undefined contents
/// and store them in `undefinedTensorUses`.
Expand All @@ -188,13 +102,32 @@ class OneShotAnalysisState : public AnalysisState {
/// `yieldedTensors`. Also include all aliasing tensors in the same block.
void gatherYieldedTensors(Operation *op);

int64_t getStatNumTensorOutOfPlace() const { return statNumTensorOutOfPlace; }
int64_t getStatNumTensorInPlace() const { return statNumTensorInPlace; }

/// Return `true` if the given tensor has undefined contents.
bool hasUndefinedContents(OpOperand *opOperand) const override;

/// Return `true` if the given OpResult has been decided to bufferize inplace.
bool isInPlace(OpOperand &opOperand) const override;

/// Return true if the given tensor (or an aliasing tensor) is yielded from
/// the containing block. Also include all aliasing tensors in the same block.
bool isTensorYielded(Value tensor) const override;

/// Return true if the buffer of the given tensor value is written to. Must
/// not be called for values inside not yet analyzed functions.
bool isValueWritten(Value value) const;

/// Return true if the buffer of the given tensor value is writable.
bool isWritable(Value value) const;

/// Union the alias sets of `v1` and `v2`.
void unionAliasSets(Value v1, Value v2);

/// Union the equivalence classes of `v1` and `v2`.
void unionEquivalenceClasses(Value v1, Value v2);

/// Base class for OneShotAnalysisState extensions that allow
/// OneShotAnalysisState to contain user-specified information in the state
/// object. Clients are expected to derive this class, add the desired fields,
Expand Down Expand Up @@ -279,9 +212,41 @@ class OneShotAnalysisState : public AnalysisState {
}

private:
/// `aliasInfo` keeps track of aliasing and equivalent values. Only internal
/// functions and `runOneShotBufferize` may access this object.
BufferizationAliasInfo aliasInfo;
/// llvm::EquivalenceClasses wants comparable elements. This comparator uses
/// pointer comparison on the defining op. This is a poor man's comparison
/// but it's not like UnionFind needs ordering anyway.
struct ValueComparator {
bool operator()(const Value &lhs, const Value &rhs) const {
return lhs.getImpl() < rhs.getImpl();
}
};

using EquivalenceClassRangeType = llvm::iterator_range<
llvm::EquivalenceClasses<Value, ValueComparator>::member_iterator>;
/// Check that aliasInfo for `v` exists and return a reference to it.
EquivalenceClassRangeType getAliases(Value v) const;

/// Set of all OpResults that were decided to bufferize in-place.
llvm::DenseSet<OpOperand *> inplaceBufferized;

/// Auxiliary structure to store all the values a given value may alias with.
/// Alias information is "may be" conservative: In the presence of branches, a
/// value may alias with one of multiple other values. The concrete aliasing
/// value may not even be known at compile time. All such values are
/// considered to be aliases.
llvm::EquivalenceClasses<Value, ValueComparator> aliasInfo;

/// Auxiliary structure to store all the equivalent buffer classes. Equivalent
/// buffer information is "must be" conservative: Only if two values are
/// guaranteed to be equivalent at runtime are they said to be equivalent. It is
/// possible that, in the presence of branches, it cannot be determined
/// statically if two values are equivalent. In that case, the values are
/// considered to be not equivalent.
llvm::EquivalenceClasses<Value, ValueComparator> equivalentInfo;

// Bufferization statistics.
int64_t statNumTensorOutOfPlace = 0;
int64_t statNumTensorInPlace = 0;

/// A set of all tensors (and maybe aliasing tensors) that yielded from a
/// block.
Expand Down
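With `BufferizationAliasInfo` folded into the state, callers query the state directly. A hedged sketch, assuming `op`, `domInfo`, `options`, `v1`, and `v2` are in scope:

    // Hedged sketch of the merged API surface.
    OneShotAnalysisState state(op, options);
    if (failed(state.analyzeOp(op, domInfo)))
      return failure();
    // Alias/equivalence queries that previously lived on the alias info.
    bool equivalent = state.areEquivalentBufferizedValues(v1, v2);
    state.applyOnAliases(v1, [](Value alias) {
      // Visit every value that may alias v1.
    });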
22 changes: 13 additions & 9 deletions mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,24 @@ LLVM::LLVMFuncOp lookupOrCreatePrintI64Fn(ModuleOp moduleOp);
LLVM::LLVMFuncOp lookupOrCreatePrintU64Fn(ModuleOp moduleOp);
LLVM::LLVMFuncOp lookupOrCreatePrintF32Fn(ModuleOp moduleOp);
LLVM::LLVMFuncOp lookupOrCreatePrintF64Fn(ModuleOp moduleOp);
LLVM::LLVMFuncOp lookupOrCreatePrintStrFn(ModuleOp moduleOp);
LLVM::LLVMFuncOp lookupOrCreatePrintStrFn(ModuleOp moduleOp,
bool opaquePointers);
LLVM::LLVMFuncOp lookupOrCreatePrintOpenFn(ModuleOp moduleOp);
LLVM::LLVMFuncOp lookupOrCreatePrintCloseFn(ModuleOp moduleOp);
LLVM::LLVMFuncOp lookupOrCreatePrintCommaFn(ModuleOp moduleOp);
LLVM::LLVMFuncOp lookupOrCreatePrintNewlineFn(ModuleOp moduleOp);
LLVM::LLVMFuncOp lookupOrCreateMallocFn(ModuleOp moduleOp, Type indexType);
LLVM::LLVMFuncOp lookupOrCreateAlignedAllocFn(ModuleOp moduleOp,
Type indexType);
LLVM::LLVMFuncOp lookupOrCreateFreeFn(ModuleOp moduleOp);
LLVM::LLVMFuncOp lookupOrCreateGenericAllocFn(ModuleOp moduleOp,
Type indexType);
LLVM::LLVMFuncOp lookupOrCreateMallocFn(ModuleOp moduleOp, Type indexType,
bool opaquePointers);
LLVM::LLVMFuncOp lookupOrCreateAlignedAllocFn(ModuleOp moduleOp, Type indexType,
bool opaquePointers);
LLVM::LLVMFuncOp lookupOrCreateFreeFn(ModuleOp moduleOp, bool opaquePointers);
LLVM::LLVMFuncOp lookupOrCreateGenericAllocFn(ModuleOp moduleOp, Type indexType,
bool opaquePointers);
LLVM::LLVMFuncOp lookupOrCreateGenericAlignedAllocFn(ModuleOp moduleOp,
Type indexType);
LLVM::LLVMFuncOp lookupOrCreateGenericFreeFn(ModuleOp moduleOp);
Type indexType,
bool opaquePointers);
LLVM::LLVMFuncOp lookupOrCreateGenericFreeFn(ModuleOp moduleOp,
bool opaquePointers);
LLVM::LLVMFuncOp lookupOrCreateMemRefCopyFn(ModuleOp moduleOp, Type indexType,
Type unrankedDescriptorType);

Expand Down
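Call sites now pass the pointer mode explicitly when declaring the runtime helpers. A hedged sketch, assuming `op` and a `typeConverter` in scope:

    // Hedged sketch: declare malloc/free with the pointer mode spelled out.
    ModuleOp module = op->getParentOfType<ModuleOp>();
    LLVM::LLVMFuncOp mallocFn = LLVM::lookupOrCreateMallocFn(
        module, typeConverter.getIndexType(), typeConverter.useOpaquePointers());
    LLVM::LLVMFuncOp freeFn =
        LLVM::lookupOrCreateFreeFn(module, typeConverter.useOpaquePointers());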
7 changes: 4 additions & 3 deletions mlir/lib/Conversion/AsyncToLLVM/AsyncToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,8 @@ class CoroBeginOpConversion : public OpConversionPattern<CoroBeginOp> {

// Allocate memory for the coroutine frame.
auto allocFuncOp = LLVM::lookupOrCreateAlignedAllocFn(
op->getParentOfType<ModuleOp>(), rewriter.getI64Type());
op->getParentOfType<ModuleOp>(), rewriter.getI64Type(),
/*TODO: opaquePointers=*/false);
auto coroAlloc = rewriter.create<LLVM::CallOp>(
loc, allocFuncOp, ValueRange{coroAlign, coroSize});

Expand Down Expand Up @@ -412,8 +413,8 @@ class CoroFreeOpConversion : public OpConversionPattern<CoroFreeOp> {
rewriter.create<LLVM::CoroFreeOp>(loc, i8Ptr, adaptor.getOperands());

// Free the memory.
auto freeFuncOp =
LLVM::lookupOrCreateFreeFn(op->getParentOfType<ModuleOp>());
auto freeFuncOp = LLVM::lookupOrCreateFreeFn(
op->getParentOfType<ModuleOp>(), /*TODO: opaquePointers=*/false);
rewriter.replaceOpWithNewOp<LLVM::CallOp>(op, freeFuncOp,
ValueRange(coroMem.getResult()));

Expand Down
3 changes: 2 additions & 1 deletion mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ static void createPrintMsg(OpBuilder &builder, Location loc, ModuleOp moduleOp,
SmallVector<LLVM::GEPArg> indices(1, 0);
Value gep = builder.create<LLVM::GEPOp>(
loc, LLVM::LLVMPointerType::get(builder.getI8Type()), msgAddr, indices);
Operation *printer = LLVM::lookupOrCreatePrintStrFn(moduleOp);
Operation *printer =
LLVM::lookupOrCreatePrintStrFn(moduleOp, /*TODO: opaquePointers=*/false);
builder.create<LLVM::CallOp>(loc, TypeRange(), SymbolRefAttr::get(printer),
gep);
}
Expand Down
215 changes: 144 additions & 71 deletions mlir/lib/Conversion/LLVMCommon/MemRefBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,22 +132,30 @@ Value MemRefDescriptor::size(OpBuilder &builder, Location loc, unsigned pos) {

Value MemRefDescriptor::size(OpBuilder &builder, Location loc, Value pos,
int64_t rank) {
auto indexPtrTy = LLVM::LLVMPointerType::get(indexType);
auto arrayTy = LLVM::LLVMArrayType::get(indexType, rank);
auto arrayPtrTy = LLVM::LLVMPointerType::get(arrayTy);

LLVM::LLVMPointerType indexPtrTy;
LLVM::LLVMPointerType arrayPtrTy;

if (useOpaquePointers()) {
arrayPtrTy = indexPtrTy = LLVM::LLVMPointerType::get(builder.getContext());
} else {
indexPtrTy = LLVM::LLVMPointerType::get(indexType);
arrayPtrTy = LLVM::LLVMPointerType::get(arrayTy);
}

// Copy size values to stack-allocated memory.
auto one = createIndexAttrConstant(builder, loc, indexType, 1);
auto sizes = builder.create<LLVM::ExtractValueOp>(
loc, value, llvm::ArrayRef<int64_t>({kSizePosInMemRefDescriptor}));
auto sizesPtr =
builder.create<LLVM::AllocaOp>(loc, arrayPtrTy, one, /*alignment=*/0);
auto sizesPtr = builder.create<LLVM::AllocaOp>(loc, arrayPtrTy, arrayTy, one,
/*alignment=*/0);
builder.create<LLVM::StoreOp>(loc, sizes, sizesPtr);

// Load and return the size value of interest.
auto resultPtr = builder.create<LLVM::GEPOp>(loc, indexPtrTy, sizesPtr,
ArrayRef<LLVM::GEPArg>{0, pos});
return builder.create<LLVM::LoadOp>(loc, resultPtr);
auto resultPtr = builder.create<LLVM::GEPOp>(
loc, indexPtrTy, arrayTy, sizesPtr, ArrayRef<LLVM::GEPArg>{0, pos});
return builder.create<LLVM::LoadOp>(loc, indexType, resultPtr);
}

/// Builds IR inserting the pos-th size into the descriptor
Expand Down Expand Up @@ -242,6 +250,10 @@ unsigned MemRefDescriptor::getNumUnpackedValues(MemRefType type) {
return 3 + 2 * type.getRank();
}

bool MemRefDescriptor::useOpaquePointers() {
return getElementPtrType().isOpaque();
}

//===----------------------------------------------------------------------===//
// MemRefDescriptorView implementation.
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -372,134 +384,195 @@ void UnrankedMemRefDescriptor::computeSizes(
}
}

Value UnrankedMemRefDescriptor::allocatedPtr(OpBuilder &builder, Location loc,
Value memRefDescPtr,
Type elemPtrPtrType) {
Value UnrankedMemRefDescriptor::allocatedPtr(
OpBuilder &builder, Location loc, Value memRefDescPtr,
LLVM::LLVMPointerType elemPtrType) {

Value elementPtrPtr;
if (elemPtrType.isOpaque())
elementPtrPtr = memRefDescPtr;
else
elementPtrPtr = builder.create<LLVM::BitcastOp>(
loc, LLVM::LLVMPointerType::get(elemPtrType), memRefDescPtr);

Value elementPtrPtr =
builder.create<LLVM::BitcastOp>(loc, elemPtrPtrType, memRefDescPtr);
return builder.create<LLVM::LoadOp>(loc, elementPtrPtr);
return builder.create<LLVM::LoadOp>(loc, elemPtrType, elementPtrPtr);
}

void UnrankedMemRefDescriptor::setAllocatedPtr(OpBuilder &builder, Location loc,
Value memRefDescPtr,
Type elemPtrPtrType,
Value allocatedPtr) {
Value elementPtrPtr =
builder.create<LLVM::BitcastOp>(loc, elemPtrPtrType, memRefDescPtr);
void UnrankedMemRefDescriptor::setAllocatedPtr(
OpBuilder &builder, Location loc, Value memRefDescPtr,
LLVM::LLVMPointerType elemPtrType, Value allocatedPtr) {
Value elementPtrPtr;
if (elemPtrType.isOpaque())
elementPtrPtr = memRefDescPtr;
else
elementPtrPtr = builder.create<LLVM::BitcastOp>(
loc, LLVM::LLVMPointerType::get(elemPtrType), memRefDescPtr);

builder.create<LLVM::StoreOp>(loc, allocatedPtr, elementPtrPtr);
}

static std::pair<Value, Type>
castToElemPtrPtr(OpBuilder &builder, Location loc, Value memRefDescPtr,
LLVM::LLVMPointerType elemPtrType) {
Value elementPtrPtr;
Type elemPtrPtrType;
if (elemPtrType.isOpaque()) {
elementPtrPtr = memRefDescPtr;
elemPtrPtrType = LLVM::LLVMPointerType::get(builder.getContext());
} else {
elemPtrPtrType = LLVM::LLVMPointerType::get(elemPtrType);
elementPtrPtr =
builder.create<LLVM::BitcastOp>(loc, elemPtrPtrType, memRefDescPtr);
}
return {elementPtrPtr, elemPtrPtrType};
}

Value UnrankedMemRefDescriptor::alignedPtr(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value memRefDescPtr,
Type elemPtrPtrType) {
Value elementPtrPtr =
builder.create<LLVM::BitcastOp>(loc, elemPtrPtrType, memRefDescPtr);
LLVM::LLVMPointerType elemPtrType) {
auto [elementPtrPtr, elemPtrPtrType] =
castToElemPtrPtr(builder, loc, memRefDescPtr, elemPtrType);

Value alignedGep = builder.create<LLVM::GEPOp>(
loc, elemPtrPtrType, elementPtrPtr, ArrayRef<LLVM::GEPArg>{1});
return builder.create<LLVM::LoadOp>(loc, alignedGep);
Value alignedGep =
builder.create<LLVM::GEPOp>(loc, elemPtrPtrType, elemPtrType,
elementPtrPtr, ArrayRef<LLVM::GEPArg>{1});
return builder.create<LLVM::LoadOp>(loc, elemPtrType, alignedGep);
}

void UnrankedMemRefDescriptor::setAlignedPtr(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value memRefDescPtr,
Type elemPtrPtrType,
LLVM::LLVMPointerType elemPtrType,
Value alignedPtr) {
Value elementPtrPtr =
builder.create<LLVM::BitcastOp>(loc, elemPtrPtrType, memRefDescPtr);
auto [elementPtrPtr, elemPtrPtrType] =
castToElemPtrPtr(builder, loc, memRefDescPtr, elemPtrType);

Value alignedGep = builder.create<LLVM::GEPOp>(
loc, elemPtrPtrType, elementPtrPtr, ArrayRef<LLVM::GEPArg>{1});
Value alignedGep =
builder.create<LLVM::GEPOp>(loc, elemPtrPtrType, elemPtrType,
elementPtrPtr, ArrayRef<LLVM::GEPArg>{1});
builder.create<LLVM::StoreOp>(loc, alignedPtr, alignedGep);
}

Value UnrankedMemRefDescriptor::offset(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value memRefDescPtr,
Type elemPtrPtrType) {
Value elementPtrPtr =
builder.create<LLVM::BitcastOp>(loc, elemPtrPtrType, memRefDescPtr);
LLVM::LLVMPointerType elemPtrType) {
auto [elementPtrPtr, elemPtrPtrType] =
castToElemPtrPtr(builder, loc, memRefDescPtr, elemPtrType);

Value offsetGep =
builder.create<LLVM::GEPOp>(loc, elemPtrPtrType, elemPtrType,
elementPtrPtr, ArrayRef<LLVM::GEPArg>{2});

if (!elemPtrType.isOpaque()) {
offsetGep = builder.create<LLVM::BitcastOp>(
loc, LLVM::LLVMPointerType::get(typeConverter.getIndexType()),
offsetGep);
}

Value offsetGep = builder.create<LLVM::GEPOp>(
loc, elemPtrPtrType, elementPtrPtr, ArrayRef<LLVM::GEPArg>{2});
offsetGep = builder.create<LLVM::BitcastOp>(
loc, LLVM::LLVMPointerType::get(typeConverter.getIndexType()), offsetGep);
return builder.create<LLVM::LoadOp>(loc, offsetGep);
return builder.create<LLVM::LoadOp>(loc, typeConverter.getIndexType(),
offsetGep);
}

void UnrankedMemRefDescriptor::setOffset(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value memRefDescPtr,
Type elemPtrPtrType, Value offset) {
Value elementPtrPtr =
builder.create<LLVM::BitcastOp>(loc, elemPtrPtrType, memRefDescPtr);

Value offsetGep = builder.create<LLVM::GEPOp>(
loc, elemPtrPtrType, elementPtrPtr, ArrayRef<LLVM::GEPArg>{2});
offsetGep = builder.create<LLVM::BitcastOp>(
loc, LLVM::LLVMPointerType::get(typeConverter.getIndexType()), offsetGep);
LLVM::LLVMPointerType elemPtrType,
Value offset) {
auto [elementPtrPtr, elemPtrPtrType] =
castToElemPtrPtr(builder, loc, memRefDescPtr, elemPtrType);

Value offsetGep =
builder.create<LLVM::GEPOp>(loc, elemPtrPtrType, elemPtrType,
elementPtrPtr, ArrayRef<LLVM::GEPArg>{2});

if (!elemPtrType.isOpaque()) {
offsetGep = builder.create<LLVM::BitcastOp>(
loc, LLVM::LLVMPointerType::get(typeConverter.getIndexType()),
offsetGep);
}

builder.create<LLVM::StoreOp>(loc, offset, offsetGep);
}

Value UnrankedMemRefDescriptor::sizeBasePtr(
OpBuilder &builder, Location loc, LLVMTypeConverter &typeConverter,
Value memRefDescPtr, LLVM::LLVMPointerType elemPtrPtrType) {
Type elemPtrTy = elemPtrPtrType.getElementType();
Value UnrankedMemRefDescriptor::sizeBasePtr(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value memRefDescPtr,
LLVM::LLVMPointerType elemPtrType) {
Type indexTy = typeConverter.getIndexType();
Type structPtrTy =
LLVM::LLVMPointerType::get(LLVM::LLVMStructType::getLiteral(
indexTy.getContext(), {elemPtrTy, elemPtrTy, indexTy, indexTy}));
Value structPtr =
builder.create<LLVM::BitcastOp>(loc, structPtrTy, memRefDescPtr);
Type structTy = LLVM::LLVMStructType::getLiteral(
indexTy.getContext(), {elemPtrType, elemPtrType, indexTy, indexTy});
Value structPtr;
if (elemPtrType.isOpaque()) {
structPtr = memRefDescPtr;
} else {
Type structPtrTy = LLVM::LLVMPointerType::get(structTy);
structPtr =
builder.create<LLVM::BitcastOp>(loc, structPtrTy, memRefDescPtr);
}

return builder.create<LLVM::GEPOp>(loc, LLVM::LLVMPointerType::get(indexTy),
structPtr, ArrayRef<LLVM::GEPArg>{0, 3});
auto resultType = elemPtrType.isOpaque()
? LLVM::LLVMPointerType::get(indexTy.getContext())
: LLVM::LLVMPointerType::get(indexTy);
return builder.create<LLVM::GEPOp>(loc, resultType, structTy, structPtr,
ArrayRef<LLVM::GEPArg>{0, 3});
}

Value UnrankedMemRefDescriptor::size(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value sizeBasePtr, Value index) {
Type indexPtrTy = LLVM::LLVMPointerType::get(typeConverter.getIndexType());

Type indexTy = typeConverter.getIndexType();
Type indexPtrTy = typeConverter.getPointerType(indexTy);

Value sizeStoreGep =
builder.create<LLVM::GEPOp>(loc, indexPtrTy, sizeBasePtr, index);
return builder.create<LLVM::LoadOp>(loc, sizeStoreGep);
builder.create<LLVM::GEPOp>(loc, indexPtrTy, indexTy, sizeBasePtr, index);
return builder.create<LLVM::LoadOp>(loc, indexTy, sizeStoreGep);
}

void UnrankedMemRefDescriptor::setSize(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value sizeBasePtr, Value index,
Value size) {
Type indexPtrTy = LLVM::LLVMPointerType::get(typeConverter.getIndexType());
Type indexTy = typeConverter.getIndexType();
Type indexPtrTy = typeConverter.getPointerType(indexTy);

Value sizeStoreGep =
builder.create<LLVM::GEPOp>(loc, indexPtrTy, sizeBasePtr, index);
builder.create<LLVM::GEPOp>(loc, indexPtrTy, indexTy, sizeBasePtr, index);
builder.create<LLVM::StoreOp>(loc, size, sizeStoreGep);
}

Value UnrankedMemRefDescriptor::strideBasePtr(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value sizeBasePtr, Value rank) {
Type indexPtrTy = LLVM::LLVMPointerType::get(typeConverter.getIndexType());
return builder.create<LLVM::GEPOp>(loc, indexPtrTy, sizeBasePtr, rank);
Type indexTy = typeConverter.getIndexType();
Type indexPtrTy = typeConverter.getPointerType(indexTy);

return builder.create<LLVM::GEPOp>(loc, indexPtrTy, indexTy, sizeBasePtr,
rank);
}

Value UnrankedMemRefDescriptor::stride(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value strideBasePtr, Value index,
Value stride) {
Type indexPtrTy = LLVM::LLVMPointerType::get(typeConverter.getIndexType());
Value strideStoreGep =
builder.create<LLVM::GEPOp>(loc, indexPtrTy, strideBasePtr, index);
return builder.create<LLVM::LoadOp>(loc, strideStoreGep);
Type indexTy = typeConverter.getIndexType();
Type indexPtrTy = typeConverter.getPointerType(indexTy);

Value strideStoreGep = builder.create<LLVM::GEPOp>(loc, indexPtrTy, indexTy,
strideBasePtr, index);
return builder.create<LLVM::LoadOp>(loc, indexTy, strideStoreGep);
}

void UnrankedMemRefDescriptor::setStride(OpBuilder &builder, Location loc,
LLVMTypeConverter &typeConverter,
Value strideBasePtr, Value index,
Value stride) {
Type indexPtrTy = LLVM::LLVMPointerType::get(typeConverter.getIndexType());
Value strideStoreGep =
builder.create<LLVM::GEPOp>(loc, indexPtrTy, strideBasePtr, index);
Type indexTy = typeConverter.getIndexType();
Type indexPtrTy = typeConverter.getPointerType(indexTy);

Value strideStoreGep = builder.create<LLVM::GEPOp>(loc, indexPtrTy, indexTy,
strideBasePtr, index);
builder.create<LLVM::StoreOp>(loc, stride, strideStoreGep);
}
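The recurring shape of these rewrites: bitcast only in typed-pointer mode, and pass element types explicitly to GEP/load so the IR is valid in both modes. A condensed, hedged sketch with `builder`, `loc`, `memRefDescPtr`, and `elemPtrType` assumed in scope:

    // Hedged sketch of the typed/opaque split used throughout this file.
    Value ptr = memRefDescPtr;
    if (!elemPtrType.isOpaque())
      ptr = builder.create<LLVM::BitcastOp>(
          loc, LLVM::LLVMPointerType::get(elemPtrType), memRefDescPtr);
    // Opaque pointers carry no element type, so loads state it explicitly.
    Value elementPtr = builder.create<LLVM::LoadOp>(loc, elemPtrType, ptr);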
36 changes: 21 additions & 15 deletions mlir/lib/Conversion/LLVMCommon/Pattern.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ Type ConvertToLLVMPattern::getVoidType() const {
}

Type ConvertToLLVMPattern::getVoidPtrType() const {
return LLVM::LLVMPointerType::get(
return getTypeConverter()->getPointerType(
IntegerType::get(&getTypeConverter()->getContext(), 8));
}

Expand Down Expand Up @@ -93,7 +93,10 @@ Value ConvertToLLVMPattern::getStridedElementPtr(
}

Type elementPtrType = memRefDescriptor.getElementPtrType();
return index ? rewriter.create<LLVM::GEPOp>(loc, elementPtrType, base, index)
return index ? rewriter.create<LLVM::GEPOp>(
loc, elementPtrType,
getTypeConverter()->convertType(type.getElementType()),
base, index)
: base;
}

Expand All @@ -109,8 +112,8 @@ bool ConvertToLLVMPattern::isConvertibleAndHasIdentityMaps(
Type ConvertToLLVMPattern::getElementPtrType(MemRefType type) const {
auto elementType = type.getElementType();
auto structElementType = typeConverter->convertType(elementType);
return LLVM::LLVMPointerType::get(structElementType,
type.getMemorySpaceAsInt());
return getTypeConverter()->getPointerType(structElementType,
type.getMemorySpaceAsInt());
}

void ConvertToLLVMPattern::getMemRefDescriptorSizes(
Expand Down Expand Up @@ -157,10 +160,11 @@ void ConvertToLLVMPattern::getMemRefDescriptorSizes(
}

// Buffer size in bytes.
Type elementPtrType = getElementPtrType(memRefType);
Type elementType = typeConverter->convertType(memRefType.getElementType());
Type elementPtrType = getTypeConverter()->getPointerType(elementType);
Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
Value gepPtr =
rewriter.create<LLVM::GEPOp>(loc, elementPtrType, nullPtr, runningStride);
Value gepPtr = rewriter.create<LLVM::GEPOp>(loc, elementPtrType, elementType,
nullPtr, runningStride);
sizeBytes = rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);
}

Expand All @@ -171,11 +175,11 @@ Value ConvertToLLVMPattern::getSizeInBytes(
// %0 = getelementptr %elementType* null, %indexType 1
// %1 = ptrtoint %elementType* %0 to %indexType
// which is a common pattern of getting the size of a type in bytes.
auto convertedPtrType =
LLVM::LLVMPointerType::get(typeConverter->convertType(type));
Type llvmType = typeConverter->convertType(type);
auto convertedPtrType = getTypeConverter()->getPointerType(llvmType);
auto nullPtr = rewriter.create<LLVM::NullOp>(loc, convertedPtrType);
auto gep = rewriter.create<LLVM::GEPOp>(loc, convertedPtrType, nullPtr,
ArrayRef<LLVM::GEPArg>{1});
auto gep = rewriter.create<LLVM::GEPOp>(loc, convertedPtrType, llvmType,
nullPtr, ArrayRef<LLVM::GEPArg>{1});
return rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gep);
}

Expand Down Expand Up @@ -241,17 +245,18 @@ LogicalResult ConvertToLLVMPattern::copyUnrankedDescriptors(

// Get frequently used types.
MLIRContext *context = builder.getContext();
Type voidPtrType = LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
auto i1Type = IntegerType::get(context, 1);
Type indexType = getTypeConverter()->getIndexType();

// Find the malloc and free, or declare them if necessary.
auto module = builder.getInsertionPoint()->getParentOfType<ModuleOp>();
LLVM::LLVMFuncOp freeFunc, mallocFunc;
if (toDynamic)
mallocFunc = LLVM::lookupOrCreateMallocFn(module, indexType);
mallocFunc = LLVM::lookupOrCreateMallocFn(
module, indexType, getTypeConverter()->useOpaquePointers());
if (!toDynamic)
freeFunc = LLVM::lookupOrCreateFreeFn(module);
freeFunc = LLVM::lookupOrCreateFreeFn(
module, getTypeConverter()->useOpaquePointers());

// Initialize shared constants.
Value zero =
Expand All @@ -270,7 +275,8 @@ LogicalResult ConvertToLLVMPattern::copyUnrankedDescriptors(
toDynamic
? builder.create<LLVM::CallOp>(loc, mallocFunc, allocationSize)
.getResult()
: builder.create<LLVM::AllocaOp>(loc, voidPtrType, allocationSize,
: builder.create<LLVM::AllocaOp>(loc, getVoidPtrType(),
getVoidType(), allocationSize,
/*alignment=*/0);
Value source = desc.memRefDescPtr(builder, loc);
builder.create<LLVM::MemcpyOp>(loc, memory, source, allocationSize, zero);
Expand Down
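The `getSizeInBytes` hunk keeps the classic null-GEP sizeof idiom but now names the element type on the GEP. A hedged sketch, with `type`, `rewriter`, `loc`, and the converter assumed in scope:

    // Hedged sketch: sizeof(llvmType) computed as GEP #1 off a null pointer.
    Type llvmType = typeConverter.convertType(type);
    auto ptrTy = typeConverter.getPointerType(llvmType);
    Value nullPtr = rewriter.create<LLVM::NullOp>(loc, ptrTy);
    Value gep = rewriter.create<LLVM::GEPOp>(loc, ptrTy, llvmType, nullPtr,
                                             ArrayRef<LLVM::GEPArg>{1});
    Value sizeInBytes = rewriter.create<LLVM::PtrToIntOp>(
        loc, typeConverter.getIndexType(), gep);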
23 changes: 15 additions & 8 deletions mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,13 @@ Type LLVMTypeConverter::getIndexType() {
return IntegerType::get(&getContext(), getIndexTypeBitwidth());
}

LLVM::LLVMPointerType
LLVMTypeConverter::getPointerType(Type elementType, unsigned int addressSpace) {
if (useOpaquePointers())
return LLVM::LLVMPointerType::get(&getContext(), addressSpace);
return LLVM::LLVMPointerType::get(elementType, addressSpace);
}

unsigned LLVMTypeConverter::getPointerBitwidth(unsigned addressSpace) {
return options.dataLayout.getPointerSizeInBits(addressSpace);
}
Expand Down Expand Up @@ -201,7 +208,7 @@ Type LLVMTypeConverter::convertFunctionType(FunctionType type) {
convertFunctionSignature(type, /*isVariadic=*/false, conversion);
if (!converted)
return {};
return LLVM::LLVMPointerType::get(converted);
return getPointerType(converted);
}

// Function types are converted to LLVM Function types by recursively converting
Expand Down Expand Up @@ -311,8 +318,9 @@ LLVMTypeConverter::getMemRefDescriptorFields(MemRefType type,
Type elementType = convertType(type.getElementType());
if (!elementType)
return {};
auto ptrTy =
LLVM::LLVMPointerType::get(elementType, type.getMemorySpaceAsInt());

LLVM::LLVMPointerType ptrTy =
getPointerType(elementType, type.getMemorySpaceAsInt());
auto indexTy = getIndexType();

SmallVector<Type, 5> results = {ptrTy, ptrTy, indexTy};
Expand Down Expand Up @@ -355,8 +363,7 @@ Type LLVMTypeConverter::convertMemRefType(MemRefType type) {
/// stack allocated (alloca) copy of a MemRef descriptor that got casted to
/// be unranked.
SmallVector<Type, 2> LLVMTypeConverter::getUnrankedMemRefDescriptorFields() {
return {getIndexType(),
LLVM::LLVMPointerType::get(IntegerType::get(&getContext(), 8))};
return {getIndexType(), getPointerType(IntegerType::get(&getContext(), 8))};
}

unsigned
Expand Down Expand Up @@ -406,7 +413,7 @@ Type LLVMTypeConverter::convertMemRefToBarePtr(BaseMemRefType type) {
Type elementType = convertType(type.getElementType());
if (!elementType)
return {};
return LLVM::LLVMPointerType::get(elementType, type.getMemorySpaceAsInt());
return getPointerType(elementType, type.getMemorySpaceAsInt());
}

/// Convert an n-D vector type to an LLVM vector type:
Expand Down Expand Up @@ -483,11 +490,11 @@ Value LLVMTypeConverter::promoteOneMemRefDescriptor(Location loc, Value operand,
OpBuilder &builder) {
// Alloca with proper alignment. We do not expect optimizations of this
// alloca op and so we omit allocating at the entry block.
auto ptrType = LLVM::LLVMPointerType::get(operand.getType());
auto ptrType = getPointerType(operand.getType());
Value one = builder.create<LLVM::ConstantOp>(loc, builder.getI64Type(),
builder.getIndexAttr(1));
Value allocated =
builder.create<LLVM::AllocaOp>(loc, ptrType, one, /*alignment=*/0);
builder.create<LLVM::AllocaOp>(loc, ptrType, operand.getType(), one);
// Store into the alloca'ed descriptor.
builder.create<LLVM::StoreOp>(loc, operand, allocated);
return allocated;
Expand Down
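The net effect on descriptor types, as a hedged example for a rank-1 memref (illustrative; not a test from this patch):

    // memref<?xf32> lowers to the same descriptor struct in both modes,
    // differing only in how its two pointer fields are printed:
    //   typed:  !llvm.struct<(ptr<f32>, ptr<f32>, i64,
    //                         array<1 x i64>, array<1 x i64>)>
    //   opaque: !llvm.struct<(ptr, ptr, i64,
    //                         array<1 x i64>, array<1 x i64>)>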