diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 52248b9da85fe..ea152bd130e9a 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -2394,6 +2394,41 @@ def atomic_store_128 : let MemoryVT = i128; } +// Matches store or atomic_store, no alignment requirement. +def any_store : PatFrags<(ops node:$val, node:$ptr), + [(store node:$val, node:$ptr), + (atomic_store node:$val, node:$ptr)]>; + +def any_store_8 : PatFrags<(ops node:$val, node:$ptr), + [(store node:$val, node:$ptr), + (atomic_store node:$val, node:$ptr)]> { + let MemoryVT = i8; +} + +def any_store_16 : PatFrags<(ops node:$val, node:$ptr), + [(store node:$val, node:$ptr), + (atomic_store node:$val, node:$ptr)]> { + let MemoryVT = i16; +} + +def any_store_32 : PatFrags<(ops node:$val, node:$ptr), + [(store node:$val, node:$ptr), + (atomic_store node:$val, node:$ptr)]> { + let MemoryVT = i32; +} + +def any_store_64 : PatFrags<(ops node:$val, node:$ptr), + [(store node:$val, node:$ptr), + (atomic_store node:$val, node:$ptr)]> { + let MemoryVT = i64; +} + +def any_store_128 : PatFrags<(ops node:$val, node:$ptr), + [(store node:$val, node:$ptr), + (atomic_store node:$val, node:$ptr)]> { + let MemoryVT = i128; +} + //===----------------------------------------------------------------------===// // Selection DAG Pattern Support. 
// diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 9c37eb8065ba5..450eba435cc0b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -986,6 +986,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SplitVecOp_ExtVecInRegOp(SDNode *N); SDValue SplitVecOp_FAKE_USE(SDNode *N); SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo); + SDValue SplitVecOp_ATOMIC_STORE(AtomicSDNode *N); SDValue SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_VP_STRIDED_STORE(VPStridedStoreSDNode *N, unsigned OpNo); SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo); @@ -1104,6 +1105,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecOp_EXTEND_VECTOR_INREG(SDNode *N); SDValue WidenVecOp_FAKE_USE(SDNode *N); SDValue WidenVecOp_STORE(SDNode* N); + SDValue WidenVecOp_ATOMIC_STORE(AtomicSDNode *ST); SDValue WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo); SDValue WidenVecOp_VP_STRIDED_STORE(SDNode *N, unsigned OpNo); SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 02e25e5855577..389a6e1285fae 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -3769,6 +3769,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::STORE: Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo); break; + case ISD::ATOMIC_STORE: + Res = SplitVecOp_ATOMIC_STORE(cast<AtomicSDNode>(N)); + break; case ISD::VP_STORE: Res = SplitVecOp_VP_STORE(cast<VPStoreSDNode>(N), OpNo); break; @@ -4706,6 +4709,23 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } +SDValue DAGTypeLegalizer::SplitVecOp_ATOMIC_STORE(AtomicSDNode *N) { + SDLoc DL(N); + SDValue StVal = 
N->getVal(); + EVT VT = StVal.getValueType(); + + // Issue a single atomic store of an integer that spans the full memory + // width. Bitcasting the (illegal) vector value to that integer lets the + // type legalizer further legalize the BITCAST input as needed, while the + // ATOMIC_STORE itself carries only a scalar integer type. + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + EVT MemIntVT = + EVT::getIntegerVT(*DAG.getContext(), N->getMemoryVT().getSizeInBits()); + SDValue AsInt = DAG.getBitcast(IntVT, StVal); + return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MemIntVT, N->getChain(), AsInt, + N->getBasePtr(), N->getMemOperand()); +} + SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) { SDLoc DL(N); @@ -6611,6 +6631,23 @@ static SDValue coerceLoadedValue(SDValue LdOp, EVT FirstVT, EVT WidenVT, return LdOp; } +/// Inverse of coerceLoadedValue: pull a FirstVT-sized scalar/vector out of the +/// widened value so it can be issued in a single atomic store. +static SDValue coerceStoredValue(SDValue StVal, EVT FirstVT, EVT WidenVT, + TypeSize FirstVTWidth, const SDLoc &dl, + SelectionDAG &DAG) { + TypeSize WidenWidth = WidenVT.getSizeInBits(); + if (!FirstVT.isVector()) { + unsigned NumElts = + WidenWidth.getFixedValue() / FirstVTWidth.getFixedValue(); + EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), FirstVT, NumElts); + SDValue VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, StVal); + return DAG.getExtractVectorElt(dl, FirstVT, VecOp, 0); + } + assert(FirstVT == WidenVT && "First value type must equal widen value type"); + return StVal; +} + static std::optional<EVT> findMemType(SelectionDAG &DAG, const TargetLowering &TLI, unsigned Width, EVT WidenVT, unsigned Align, @@ -7448,6 +7485,9 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) { case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break; case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break; case ISD::STORE: Res = 
WidenVecOp_STORE(N); break; + case ISD::ATOMIC_STORE: + Res = WidenVecOp_ATOMIC_STORE(cast<AtomicSDNode>(N)); + break; case ISD::VP_STORE: Res = WidenVecOp_VP_STORE(N, OpNo); break; case ISD::EXPERIMENTAL_VP_STRIDED_STORE: Res = WidenVecOp_VP_STRIDED_STORE(N, OpNo); @@ -8064,6 +8104,42 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) { report_fatal_error("Unable to widen vector store"); } +SDValue DAGTypeLegalizer::WidenVecOp_ATOMIC_STORE(AtomicSDNode *ST) { + EVT StVT = ST->getMemoryVT(); + SDLoc dl(ST); + assert(StVT.isVector() && "Expected vector"); + + SDValue StVal = GetWidenedVector(ST->getVal()); + EVT WidenVT = StVal.getValueType(); + assert(WidenVT.isVector() && "Expected vector"); + assert(StVT.isScalableVector() == WidenVT.isScalableVector() && + "Must be scalable"); + assert(StVT.getVectorElementType() == WidenVT.getVectorElementType() && + "Expected equivalent element types"); + + TypeSize StWidth = StVT.getSizeInBits(); + TypeSize WidenWidth = WidenVT.getSizeInBits(); + TypeSize WidthDiff = WidenWidth - StWidth; + + // Find the memory type that can store the original memory width in one + // atomic operation. Pass StAlign=0 (like atomic loads); a real align would + // let findMemType widen the access past the value (e.g. <2 x i8> at align 4 + // implies a 4-byte movl, writing undef bytes past its object). 
+ std::optional<EVT> FirstVT = + findMemType(DAG, TLI, StWidth.getKnownMinValue(), WidenVT, /*StAlign=*/0, + WidthDiff.getKnownMinValue()); + if (!FirstVT) + report_fatal_error("Unable to widen atomic vector store"); + + TypeSize FirstVTWidth = FirstVT->getSizeInBits(); + + SDValue StOp = + coerceStoredValue(StVal, *FirstVT, WidenVT, FirstVTWidth, dl, DAG); + + return DAG.getAtomic(ISD::ATOMIC_STORE, dl, *FirstVT, ST->getChain(), StOp, + ST->getBasePtr(), ST->getMemOperand()); +} + SDValue DAGTypeLegalizer::WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo) { assert((OpNo == 1 || OpNo == 3) && "Can widen only data or mask operand of vp_store"); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 6694441468bb4..8268ab04713e2 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2945,7 +2945,7 @@ bool X86::mayFoldIntoStore(SDValue Op) { return false; User = *User->user_begin(); } - return ISD::isNormalStore(User); + return ISD::isNormalStore(User) || User->getOpcode() == ISD::ATOMIC_STORE; } bool X86::mayFoldIntoZeroExtend(SDValue Op) { diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 95c75165ed4eb..d8367412be587 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3869,8 +3869,8 @@ def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$s def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", - [(store (i32 (extractelt (v4i32 VR128X:$src), - (iPTR 0))), addr:$dst)]>, + [(any_store_32 (i32 (extractelt (v4i32 VR128X:$src), + (iPTR 0))), addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteVecStore]>; } // ExeDomain = SSEPackedInt @@ -3893,8 +3893,8 @@ def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src), def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", 
- [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)), - addr:$dst)]>, + [(any_store_64 (extractelt (v2i64 VR128X:$src), (iPTR 0)), + addr:$dst)]>, EVEX, TB, PD, REX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteVecStore]>, Requires<[HasAVX512]>; @@ -11476,8 +11476,8 @@ multiclass avx512_extract_elt_bw_m opc, string OpcodeStr, SDNode OpNode, def mri : AVX512Ii8, + [(any_store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), timm:$src2))), + addr:$dst)]>, EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteVecExtractSt]>; } diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 203344cd81aab..e93b3c91b7861 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -4206,8 +4206,8 @@ def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (extractelt (v4i32 VR128:$src), - (iPTR 0))), addr:$dst)]>, + [(any_store_32 (i32 (extractelt (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)]>, VEX, Sched<[WriteVecStore]>; def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), "movd\t{$src, $dst|$dst, $src}", @@ -4216,8 +4216,8 @@ def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src), Sched<[WriteVecMoveToGpr]>; def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src), "movd\t{$src, $dst|$dst, $src}", - [(store (i32 (extractelt (v4i32 VR128:$src), - (iPTR 0))), addr:$dst)]>, + [(any_store_32 (i32 (extractelt (v4i32 VR128:$src), + (iPTR 0))), addr:$dst)]>, Sched<[WriteVecStore]>; } // ExeDomain = SSEPackedInt @@ -4346,13 +4346,13 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in { def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (extractelt (v2i64 VR128:$src), - 
(iPTR 0))), addr:$dst)]>, + [(any_store_64 (i64 (extractelt (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)]>, VEX, WIG; def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src), "movq\t{$src, $dst|$dst, $src}", - [(store (i64 (extractelt (v2i64 VR128:$src), - (iPTR 0))), addr:$dst)]>; + [(any_store_64 (i64 (extractelt (v2i64 VR128:$src), + (iPTR 0))), addr:$dst)]>; } // ExeDomain, SchedRW // For disassembler only @@ -5280,8 +5280,8 @@ multiclass SS41I_extract16 opc, string OpcodeStr> { (ins i16mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))), - addr:$dst)]>, Sched<[WriteVecExtractSt]>; + [(any_store_16 (i16 (trunc (X86pextrw (v8i16 VR128:$src1), timm:$src2))), + addr:$dst)]>, Sched<[WriteVecExtractSt]>; } let Predicates = [HasAVX, NoBWI] in diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 8619386fe3c88..b2d2eab1e8a47 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -352,6 +352,187 @@ define void @store_atomic_vec1_double_align(ptr %x, <1 x double> %v) nounwind { ret void } +define void @store_atomic_vec2_i8(ptr %x, <2 x i8> %v) { +; CHECK-SSE2-O3-LABEL: store_atomic_vec2_i8: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: movd %xmm0, %eax +; CHECK-SSE2-O3-NEXT: movw %ax, (%rdi) +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: store_atomic_vec2_i8: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, (%rdi) +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec2_i8: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vpextrw $0, %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: store_atomic_vec2_i8: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: movd %xmm0, %eax +; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE2-O0-NEXT: movw %ax, 
(%rdi) +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: store_atomic_vec2_i8: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm0, (%rdi) +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec2_i8: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <2 x i8> %v, ptr %x release, align 4 + ret void +} + +define void @store_atomic_vec2_i16(ptr %x, <2 x i16> %v) { +; CHECK-SSE-O3-LABEL: store_atomic_vec2_i16: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movss %xmm0, (%rdi) +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec2_i16: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: store_atomic_vec2_i16: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movd %xmm0, (%rdi) +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec2_i16: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovd %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <2 x i16> %v, ptr %x release, align 4 + ret void +} + +define void @store_atomic_vec2_ptr270(ptr %x, <2 x ptr addrspace(270)> %v) { +; CHECK-SSE-O3-LABEL: store_atomic_vec2_ptr270: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movlps %xmm0, (%rdi) +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec2_ptr270: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: store_atomic_vec2_ptr270: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq %xmm0, (%rdi) +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec2_ptr270: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovq %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <2 x ptr addrspace(270)> %v, ptr %x release, align 8 + ret void +} + +define void @store_atomic_vec2_i32_align(ptr %x, <2 x i32> %v) { +; CHECK-SSE-O3-LABEL: store_atomic_vec2_i32_align: +; CHECK-SSE-O3: # %bb.0: 
+; CHECK-SSE-O3-NEXT: movlps %xmm0, (%rdi) +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec2_i32_align: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: store_atomic_vec2_i32_align: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq %xmm0, (%rdi) +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec2_i32_align: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovq %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <2 x i32> %v, ptr %x release, align 8 + ret void +} + +define void @store_atomic_vec2_float_align(ptr %x, <2 x float> %v) { +; CHECK-SSE-O3-LABEL: store_atomic_vec2_float_align: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movlps %xmm0, (%rdi) +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec2_float_align: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: store_atomic_vec2_float_align: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq %xmm0, (%rdi) +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec2_float_align: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovq %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <2 x float> %v, ptr %x release, align 8 + ret void +} + +define void @store_atomic_vec4_i8(ptr %x, <4 x i8> %v) nounwind { +; CHECK-SSE-O3-LABEL: store_atomic_vec4_i8: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movss %xmm0, (%rdi) +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec4_i8: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: store_atomic_vec4_i8: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movd %xmm0, (%rdi) +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec4_i8: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovd %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <4 x i8> 
%v, ptr %x release, align 4 + ret void +} + +define void @store_atomic_vec4_i16(ptr %x, <4 x i16> %v) nounwind { +; CHECK-SSE-O3-LABEL: store_atomic_vec4_i16: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movlps %xmm0, (%rdi) +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec4_i16: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: store_atomic_vec4_i16: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq %xmm0, (%rdi) +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec4_i16: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovq %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <4 x i16> %v, ptr %x release, align 8 + ret void +} + define <2 x i8> @atomic_vec2_i8(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec2_i8: ; CHECK-SSE-O3: # %bb.0: @@ -570,6 +751,446 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) { ret <2 x float> %ret } +define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) { +; CHECK-SSE-O3-LABEL: store_atomic_vec2_half: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %eax +; CHECK-SSE-O3-NEXT: psrld $16, %xmm0 +; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-SSE-O3-NEXT: shll $16, %ecx +; CHECK-SSE-O3-NEXT: movzwl %ax, %eax +; CHECK-SSE-O3-NEXT: orl %ecx, %eax +; CHECK-SSE-O3-NEXT: movl %eax, (%rdi) +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec2_half: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: store_atomic_vec2_half: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE-O0-NEXT: movaps %xmm1, %xmm0 +; CHECK-SSE-O0-NEXT: psrld $16, %xmm1 +; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE-O0-NEXT: movw %ax, %cx +; CHECK-SSE-O0-NEXT: shll $16, %ecx +; CHECK-SSE-O0-NEXT: 
pextrw $0, %xmm0, %eax +; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE-O0-NEXT: movzwl %ax, %eax +; CHECK-SSE-O0-NEXT: orl %ecx, %eax +; CHECK-SSE-O0-NEXT: movl %eax, (%rdi) +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec2_half: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovd %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <2 x half> %v, ptr %x release, align 4 + ret void +} + +define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) nounwind { +; CHECK-SSE-O3-LABEL: store_atomic_vec2_bfloat: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %eax +; CHECK-SSE-O3-NEXT: psrld $16, %xmm0 +; CHECK-SSE-O3-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-SSE-O3-NEXT: shll $16, %ecx +; CHECK-SSE-O3-NEXT: movzwl %ax, %eax +; CHECK-SSE-O3-NEXT: orl %ecx, %eax +; CHECK-SSE-O3-NEXT: movl %eax, (%rdi) +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec2_bfloat: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: store_atomic_vec2_bfloat: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: subq $24, %rsp +; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-SSE-O0-NEXT: pextrw $1, %xmm1, %eax +; CHECK-SSE-O0-NEXT: shll $16, %eax +; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 +; CHECK-SSE-O0-NEXT: movd %xmm1, %eax +; CHECK-SSE-O0-NEXT: shll $16, %eax +; CHECK-SSE-O0-NEXT: movd %eax, %xmm1 +; CHECK-SSE-O0-NEXT: movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE-O0-NEXT: movw %ax, %cx +; CHECK-SSE-O0-NEXT: # implicit-def: $eax +; CHECK-SSE-O0-NEXT: movw %cx, %ax +; CHECK-SSE-O0-NEXT: shll $16, 
%eax +; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; CHECK-SSE-O0-NEXT: pextrw $0, %xmm0, %eax +; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE-O0-NEXT: movzwl %ax, %eax +; CHECK-SSE-O0-NEXT: orl %ecx, %eax +; CHECK-SSE-O0-NEXT: movl %eax, (%rdi) +; CHECK-SSE-O0-NEXT: addq $24, %rsp +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec2_bfloat: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: subq $24, %rsp +; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 +; CHECK-AVX-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm1, %eax +; CHECK-AVX-O0-NEXT: shll $16, %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-O0-NEXT: vmovd %xmm1, %eax +; CHECK-AVX-O0-NEXT: shll $16, %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX-O0-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 +; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax +; CHECK-AVX-O0-NEXT: movw %ax, %cx +; CHECK-AVX-O0-NEXT: # implicit-def: $eax +; CHECK-AVX-O0-NEXT: movw %cx, %ax +; CHECK-AVX-O0-NEXT: shll $16, %eax +; CHECK-AVX-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-AVX-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-AVX-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, %eax +; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-AVX-O0-NEXT: movzwl %ax, %eax +; CHECK-AVX-O0-NEXT: orl %ecx, %eax +; CHECK-AVX-O0-NEXT: movl 
%eax, (%rdi) +; CHECK-AVX-O0-NEXT: addq $24, %rsp +; CHECK-AVX-O0-NEXT: retq + store atomic <2 x bfloat> %v, ptr %x release, align 4 + ret void +} + +define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind { +; CHECK-SSE2-O3-LABEL: store_atomic_vec4_half: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-O3-NEXT: psrld $16, %xmm1 +; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-SSE2-O3-NEXT: shll $16, %eax +; CHECK-SSE2-O3-NEXT: movzwl %cx, %ecx +; CHECK-SSE2-O3-NEXT: orl %eax, %ecx +; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-O3-NEXT: psrlq $48, %xmm1 +; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE2-O3-NEXT: shll $16, %eax +; CHECK-SSE2-O3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %edx +; CHECK-SSE2-O3-NEXT: movzwl %dx, %edx +; CHECK-SSE2-O3-NEXT: orl %eax, %edx +; CHECK-SSE2-O3-NEXT: shlq $32, %rdx +; CHECK-SSE2-O3-NEXT: orq %rcx, %rdx +; CHECK-SSE2-O3-NEXT: movq %rdx, (%rdi) +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: store_atomic_vec4_half: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE4-O3-NEXT: psrld $16, %xmm1 +; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE4-O3-NEXT: shll $16, %eax +; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-SSE4-O3-NEXT: movzwl %cx, %ecx +; CHECK-SSE4-O3-NEXT: orl %eax, %ecx +; CHECK-SSE4-O3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE4-O3-NEXT: psrlq $48, %xmm0 +; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %eax +; CHECK-SSE4-O3-NEXT: shll $16, %eax +; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %edx +; CHECK-SSE4-O3-NEXT: movzwl %dx, %edx +; CHECK-SSE4-O3-NEXT: orl %eax, %edx +; CHECK-SSE4-O3-NEXT: shlq $32, %rdx +; CHECK-SSE4-O3-NEXT: orq %rcx, %rdx +; CHECK-SSE4-O3-NEXT: movq %rdx, (%rdi) +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec4_half: +; CHECK-AVX-O3: # %bb.0: +; 
CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: store_atomic_vec4_half: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: movaps %xmm0, %xmm3 +; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm2 +; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm1 +; CHECK-SSE2-O0-NEXT: psrlq $48, %xmm1 +; CHECK-SSE2-O0-NEXT: movaps %xmm3, %xmm0 +; CHECK-SSE2-O0-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-SSE2-O0-NEXT: psrld $16, %xmm3 +; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm3, %eax +; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE2-O0-NEXT: movw %ax, %cx +; CHECK-SSE2-O0-NEXT: shll $16, %ecx +; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm2, %eax +; CHECK-SSE2-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE2-O0-NEXT: movzwl %ax, %eax +; CHECK-SSE2-O0-NEXT: orl %ecx, %eax +; CHECK-SSE2-O0-NEXT: # kill: def $rax killed $eax +; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm1, %ecx +; CHECK-SSE2-O0-NEXT: movw %cx, %dx +; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE2-O0-NEXT: movw %dx, %cx +; CHECK-SSE2-O0-NEXT: shll $16, %ecx +; CHECK-SSE2-O0-NEXT: pextrw $0, %xmm0, %edx +; CHECK-SSE2-O0-NEXT: # kill: def $dx killed $dx killed $edx +; CHECK-SSE2-O0-NEXT: movzwl %dx, %edx +; CHECK-SSE2-O0-NEXT: orl %ecx, %edx +; CHECK-SSE2-O0-NEXT: # implicit-def: $rcx +; CHECK-SSE2-O0-NEXT: movl %edx, %ecx +; CHECK-SSE2-O0-NEXT: shlq $32, %rcx +; CHECK-SSE2-O0-NEXT: orq %rcx, %rax +; CHECK-SSE2-O0-NEXT: movq %rax, (%rdi) +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: store_atomic_vec4_half: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: movaps %xmm0, %xmm3 +; CHECK-SSE4-O0-NEXT: movaps %xmm3, %xmm2 +; CHECK-SSE4-O0-NEXT: movaps %xmm3, %xmm1 +; CHECK-SSE4-O0-NEXT: psrlq $48, %xmm1 +; CHECK-SSE4-O0-NEXT: movshdup {{.*#+}} xmm0 = xmm3[1,1,3,3] +; CHECK-SSE4-O0-NEXT: psrld $16, %xmm3 +; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm3, %eax +; CHECK-SSE4-O0-NEXT: # kill: def $ax killed $ax killed $eax +; 
CHECK-SSE4-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE4-O0-NEXT: movw %ax, %cx +; CHECK-SSE4-O0-NEXT: shll $16, %ecx +; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm2, %eax +; CHECK-SSE4-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE4-O0-NEXT: movzwl %ax, %eax +; CHECK-SSE4-O0-NEXT: orl %ecx, %eax +; CHECK-SSE4-O0-NEXT: # kill: def $rax killed $eax +; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm1, %ecx +; CHECK-SSE4-O0-NEXT: movw %cx, %dx +; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE4-O0-NEXT: movw %dx, %cx +; CHECK-SSE4-O0-NEXT: shll $16, %ecx +; CHECK-SSE4-O0-NEXT: pextrw $0, %xmm0, %edx +; CHECK-SSE4-O0-NEXT: # kill: def $dx killed $dx killed $edx +; CHECK-SSE4-O0-NEXT: movzwl %dx, %edx +; CHECK-SSE4-O0-NEXT: orl %ecx, %edx +; CHECK-SSE4-O0-NEXT: # implicit-def: $rcx +; CHECK-SSE4-O0-NEXT: movl %edx, %ecx +; CHECK-SSE4-O0-NEXT: shlq $32, %rcx +; CHECK-SSE4-O0-NEXT: orq %rcx, %rax +; CHECK-SSE4-O0-NEXT: movq %rax, (%rdi) +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec4_half: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovq %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <4 x half> %v, ptr %x release, align 8 + ret void +} + +define void @store_atomic_vec4_bfloat(ptr %x, <4 x bfloat> %v) nounwind { +; CHECK-SSE2-O3-LABEL: store_atomic_vec4_bfloat: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-O3-NEXT: psrld $16, %xmm1 +; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-SSE2-O3-NEXT: shll $16, %eax +; CHECK-SSE2-O3-NEXT: movzwl %cx, %ecx +; CHECK-SSE2-O3-NEXT: orl %eax, %ecx +; CHECK-SSE2-O3-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-O3-NEXT: psrlq $48, %xmm1 +; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE2-O3-NEXT: shll $16, %eax +; CHECK-SSE2-O3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-SSE2-O3-NEXT: pextrw $0, %xmm0, %edx +; CHECK-SSE2-O3-NEXT: movzwl %dx, %edx +; CHECK-SSE2-O3-NEXT: orl %eax, %edx +; 
CHECK-SSE2-O3-NEXT: shlq $32, %rdx +; CHECK-SSE2-O3-NEXT: orq %rcx, %rdx +; CHECK-SSE2-O3-NEXT: movq %rdx, (%rdi) +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: store_atomic_vec4_bfloat: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE4-O3-NEXT: psrld $16, %xmm1 +; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE4-O3-NEXT: shll $16, %eax +; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %ecx +; CHECK-SSE4-O3-NEXT: movzwl %cx, %ecx +; CHECK-SSE4-O3-NEXT: orl %eax, %ecx +; CHECK-SSE4-O3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; CHECK-SSE4-O3-NEXT: psrlq $48, %xmm0 +; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm0, %eax +; CHECK-SSE4-O3-NEXT: shll $16, %eax +; CHECK-SSE4-O3-NEXT: pextrw $0, %xmm1, %edx +; CHECK-SSE4-O3-NEXT: movzwl %dx, %edx +; CHECK-SSE4-O3-NEXT: orl %eax, %edx +; CHECK-SSE4-O3-NEXT: shlq $32, %rdx +; CHECK-SSE4-O3-NEXT: orq %rcx, %rdx +; CHECK-SSE4-O3-NEXT: movq %rdx, (%rdi) +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec4_bfloat: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: store_atomic_vec4_bfloat: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: subq $40, %rsp +; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-SSE-O0-NEXT: pextrw $3, %xmm1, %eax +; CHECK-SSE-O0-NEXT: shll $16, %eax +; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 +; CHECK-SSE-O0-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE-O0-NEXT: pextrw $2, %xmm1, %eax +; CHECK-SSE-O0-NEXT: shll $16, %eax +; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 +; CHECK-SSE-O0-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE-O0-NEXT: pextrw $1, %xmm1, %eax +; CHECK-SSE-O0-NEXT: shll $16, %eax +; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 +; CHECK-SSE-O0-NEXT: movd %xmm1, %eax +; CHECK-SSE-O0-NEXT: shll $16, %eax +; CHECK-SSE-O0-NEXT: movd %eax, %xmm1 +; CHECK-SSE-O0-NEXT: 
movss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE-O0-NEXT: movw %ax, %cx +; CHECK-SSE-O0-NEXT: # implicit-def: $eax +; CHECK-SSE-O0-NEXT: movw %cx, %ax +; CHECK-SSE-O0-NEXT: shll $16, %eax +; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-SSE-O0-NEXT: movzwl %ax, %eax +; CHECK-SSE-O0-NEXT: orl %ecx, %eax +; CHECK-SSE-O0-NEXT: # kill: def $rax killed $eax +; CHECK-SSE-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE-O0-NEXT: movaps %xmm0, %xmm1 +; CHECK-SSE-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-SSE-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-O0-NEXT: pextrw $0, %xmm1, %eax +; CHECK-SSE-O0-NEXT: movw %ax, %cx +; CHECK-SSE-O0-NEXT: # implicit-def: $eax +; CHECK-SSE-O0-NEXT: movw %cx, %ax +; CHECK-SSE-O0-NEXT: shll $16, %eax +; CHECK-SSE-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-SSE-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-SSE-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; CHECK-SSE-O0-NEXT: pextrw $0, %xmm0, %edx +; CHECK-SSE-O0-NEXT: # kill: def $dx killed $dx killed $edx +; 
CHECK-SSE-O0-NEXT: movzwl %dx, %edx +; CHECK-SSE-O0-NEXT: orl %ecx, %edx +; CHECK-SSE-O0-NEXT: # implicit-def: $rcx +; CHECK-SSE-O0-NEXT: movl %edx, %ecx +; CHECK-SSE-O0-NEXT: shlq $32, %rcx +; CHECK-SSE-O0-NEXT: orq %rcx, %rax +; CHECK-SSE-O0-NEXT: movq %rax, (%rdi) +; CHECK-SSE-O0-NEXT: addq $40, %rsp +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec4_bfloat: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: subq $40, %rsp +; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 +; CHECK-AVX-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-AVX-O0-NEXT: vpextrw $3, %xmm1, %eax +; CHECK-AVX-O0-NEXT: shll $16, %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-O0-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: vpextrw $2, %xmm1, %eax +; CHECK-AVX-O0-NEXT: shll $16, %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-O0-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: vpextrw $1, %xmm1, %eax +; CHECK-AVX-O0-NEXT: shll $16, %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 +; CHECK-AVX-O0-NEXT: vmovd %xmm1, %eax +; CHECK-AVX-O0-NEXT: shll $16, %eax +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm1 +; CHECK-AVX-O0-NEXT: vmovss %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 +; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax +; CHECK-AVX-O0-NEXT: movw %ax, %cx +; CHECK-AVX-O0-NEXT: # implicit-def: $eax +; CHECK-AVX-O0-NEXT: movw %cx, %ax +; CHECK-AVX-O0-NEXT: shll $16, %eax +; CHECK-AVX-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-AVX-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 +; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte 
Reload +; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax +; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-AVX-O0-NEXT: movzwl %ax, %eax +; CHECK-AVX-O0-NEXT: orl %ecx, %eax +; CHECK-AVX-O0-NEXT: # kill: def $rax killed $eax +; CHECK-AVX-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-AVX-O0-NEXT: vmovaps %xmm0, %xmm1 +; CHECK-AVX-O0-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-AVX-O0-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm1, %eax +; CHECK-AVX-O0-NEXT: movw %ax, %cx +; CHECK-AVX-O0-NEXT: # implicit-def: $eax +; CHECK-AVX-O0-NEXT: movw %cx, %ax +; CHECK-AVX-O0-NEXT: shll $16, %eax +; CHECK-AVX-O0-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-AVX-O0-NEXT: callq __truncsfbf2@PLT +; CHECK-AVX-O0-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Reload +; CHECK-AVX-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; CHECK-AVX-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; CHECK-AVX-O0-NEXT: vpextrw $0, %xmm0, %edx +; CHECK-AVX-O0-NEXT: # kill: def $dx killed $dx killed $edx +; CHECK-AVX-O0-NEXT: movzwl %dx, %edx +; CHECK-AVX-O0-NEXT: orl %ecx, %edx +; CHECK-AVX-O0-NEXT: # implicit-def: $rcx +; CHECK-AVX-O0-NEXT: movl %edx, %ecx +; CHECK-AVX-O0-NEXT: shlq $32, %rcx +; CHECK-AVX-O0-NEXT: orq %rcx, %rax +; CHECK-AVX-O0-NEXT: movq %rax, (%rdi) +; CHECK-AVX-O0-NEXT: addq $40, %rsp +; CHECK-AVX-O0-NEXT: retq + store atomic <4 x bfloat> %v, ptr %x release, align 8 + ret void +} + define <2 x half> @atomic_vec2_half(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec2_half: ; CHECK-SSE-O3: # %bb.0: diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index 7946204865ab6..edfdec37150c2 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -348,18 
+348,16 @@ define void @store_i256(ptr %ptr, i256 %v) { define void @vec_store(ptr %p0, <2 x i32> %vec) { ; CHECK-O0-LABEL: vec_store: ; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: vmovd %xmm0, %ecx ; CHECK-O0-NEXT: vpextrd $1, %xmm0, %eax -; CHECK-O0-NEXT: movl %ecx, (%rdi) +; CHECK-O0-NEXT: vmovd %xmm0, (%rdi) ; CHECK-O0-NEXT: movl %eax, 4(%rdi) ; CHECK-O0-NEXT: retq ; ; CHECK-O3-LABEL: vec_store: ; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: vmovd %xmm0, %eax -; CHECK-O3-NEXT: vpextrd $1, %xmm0, %ecx -; CHECK-O3-NEXT: movl %eax, (%rdi) -; CHECK-O3-NEXT: movl %ecx, 4(%rdi) +; CHECK-O3-NEXT: vextractps $1, %xmm0, %eax +; CHECK-O3-NEXT: vmovss %xmm0, (%rdi) +; CHECK-O3-NEXT: movl %eax, 4(%rdi) ; CHECK-O3-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 @@ -373,18 +371,16 @@ define void @vec_store(ptr %p0, <2 x i32> %vec) { define void @vec_store_unaligned(ptr %p0, <2 x i32> %vec) { ; CHECK-O0-LABEL: vec_store_unaligned: ; CHECK-O0: # %bb.0: -; CHECK-O0-NEXT: vmovd %xmm0, %ecx ; CHECK-O0-NEXT: vpextrd $1, %xmm0, %eax -; CHECK-O0-NEXT: movl %ecx, (%rdi) +; CHECK-O0-NEXT: vmovd %xmm0, (%rdi) ; CHECK-O0-NEXT: movl %eax, 4(%rdi) ; CHECK-O0-NEXT: retq ; ; CHECK-O3-LABEL: vec_store_unaligned: ; CHECK-O3: # %bb.0: -; CHECK-O3-NEXT: vmovd %xmm0, %eax -; CHECK-O3-NEXT: vpextrd $1, %xmm0, %ecx -; CHECK-O3-NEXT: movl %eax, (%rdi) -; CHECK-O3-NEXT: movl %ecx, 4(%rdi) +; CHECK-O3-NEXT: vextractps $1, %xmm0, %eax +; CHECK-O3-NEXT: vmovss %xmm0, (%rdi) +; CHECK-O3-NEXT: movl %eax, 4(%rdi) ; CHECK-O3-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %v2 = extractelement <2 x i32> %vec, i32 1 @@ -399,12 +395,17 @@ define void @vec_store_unaligned(ptr %p0, <2 x i32> %vec) { ; Legal if wider type is also atomic (TODO) ; Also, can avoid register move from xmm to eax (TODO) define void @widen_broadcast2(ptr %p0, <2 x i32> %vec) { -; CHECK-LABEL: widen_broadcast2: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movl %eax, (%rdi) -; 
CHECK-NEXT: movl %eax, 4(%rdi) -; CHECK-NEXT: retq +; CHECK-O0-LABEL: widen_broadcast2: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: vmovd %xmm0, (%rdi) +; CHECK-O0-NEXT: vmovd %xmm0, 4(%rdi) +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: widen_broadcast2: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: vmovss %xmm0, (%rdi) +; CHECK-O3-NEXT: vmovss %xmm0, 4(%rdi) +; CHECK-O3-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %p1 = getelementptr i32, ptr %p0, i64 1 store atomic i32 %v1, ptr %p0 unordered, align 8 @@ -414,12 +415,17 @@ define void @widen_broadcast2(ptr %p0, <2 x i32> %vec) { ; Not legal to widen due to alignment restriction define void @widen_broadcast2_unaligned(ptr %p0, <2 x i32> %vec) { -; CHECK-LABEL: widen_broadcast2_unaligned: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: movl %eax, (%rdi) -; CHECK-NEXT: movl %eax, 4(%rdi) -; CHECK-NEXT: retq +; CHECK-O0-LABEL: widen_broadcast2_unaligned: +; CHECK-O0: # %bb.0: +; CHECK-O0-NEXT: vmovd %xmm0, (%rdi) +; CHECK-O0-NEXT: vmovd %xmm0, 4(%rdi) +; CHECK-O0-NEXT: retq +; +; CHECK-O3-LABEL: widen_broadcast2_unaligned: +; CHECK-O3: # %bb.0: +; CHECK-O3-NEXT: vmovss %xmm0, (%rdi) +; CHECK-O3-NEXT: vmovss %xmm0, 4(%rdi) +; CHECK-O3-NEXT: retq %v1 = extractelement <2 x i32> %vec, i32 0 %p1 = getelementptr i32, ptr %p0, i64 1 store atomic i32 %v1, ptr %p0 unordered, align 4