diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 318763113fb42..357f765136a20 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2452,6 +2452,12 @@ class LLVM_ABI TargetLoweringBase {
     return AtomicExpansionKind::None;
   }
 
+  /// Returns true if the target can lower the given under-aligned (atomic)
+  /// load natively, so the IR-level AtomicExpand pass need not emit a libcall.
+  virtual bool supportsUnalignedAtomicLoadInIR(const LoadInst *LI) const {
+    return SupportsUnalignedAtomics;
+  }
+
   /// Returns how the given (atomic) load should be cast by the IR-level
   /// AtomicExpand pass.
   virtual AtomicExpansionKind shouldCastAtomicLoadInIR(LoadInst *LI) const {
@@ -2467,6 +2473,12 @@ class LLVM_ABI TargetLoweringBase {
     return AtomicExpansionKind::None;
   }
 
+  /// Returns true if the target can lower the given under-aligned (atomic)
+  /// store natively, so the IR-level AtomicExpand pass need not emit a libcall.
+  virtual bool supportsUnalignedAtomicStoreInIR(const StoreInst *SI) const {
+    return SupportsUnalignedAtomics;
+  }
+
   /// Returns how the given (atomic) store should be cast by the IR-level
   /// AtomicExpand pass into. For instance AtomicExpansionKind::CastToInteger
   /// will try to cast the operands to integer values.
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 7327290f62970..113021e4880b5 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -338,8 +338,12 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
       return false;
 
     if (!atomicSizeSupported(TLI, LI)) {
-      expandAtomicLoadToLibcall(LI);
-      return true;
+      unsigned Size = getAtomicOpSize(LI);
+      unsigned MaxSize = TLI->getMaxAtomicSizeInBitsSupported() / 8;
+      if (Size > MaxSize || !TLI->supportsUnalignedAtomicLoadInIR(LI)) {
+        expandAtomicLoadToLibcall(LI);
+        return true;
+      }
     }
 
     bool MadeChange = false;
@@ -361,8 +365,12 @@ bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
      return false;
 
    if (!atomicSizeSupported(TLI, SI)) {
-      expandAtomicStoreToLibcall(SI);
-      return true;
+      unsigned Size = getAtomicOpSize(SI);
+      unsigned MaxSize = TLI->getMaxAtomicSizeInBitsSupported() / 8;
+      if (Size > MaxSize || !TLI->supportsUnalignedAtomicStoreInIR(SI)) {
+        expandAtomicStoreToLibcall(SI);
+        return true;
+      }
    }
 
    bool MadeChange = false;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 20436ce2a6a8c..21669131c9a9c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5321,7 +5321,8 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
   EVT MemVT = TLI.getMemValueType(DAG.getDataLayout(), I.getType());
 
   if (!TLI.supportsUnalignedAtomics() &&
-      I.getAlign().value() < MemVT.getSizeInBits() / 8)
+      I.getAlign().value() < MemVT.getSizeInBits() / 8 &&
+      !TLI.supportsUnalignedAtomicLoadInIR(&I))
     report_fatal_error("Cannot generate unaligned atomic load");
 
   auto Flags = TLI.getLoadMemOperandFlags(I, DAG.getDataLayout(), AC, LibInfo);
@@ -5358,7 +5359,8 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
   EVT MemVT =
       TLI.getMemValueType(DAG.getDataLayout(), I.getValueOperand()->getType());
   if (!TLI.supportsUnalignedAtomics() &&
-      I.getAlign().value() < MemVT.getSizeInBits() / 8)
+      I.getAlign().value() < MemVT.getSizeInBits() / 8 &&
+      !TLI.supportsUnalignedAtomicStoreInIR(&I))
     report_fatal_error("Cannot generate unaligned atomic store");
 
   auto Flags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
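With the three pieces above in place, an atomic load or store whose size is supported but whose alignment is below the type's natural alignment is no longer unconditionally expanded to a libcall (or rejected by the DAG builder); the target is consulted first through the new hooks, which default to the existing SupportsUnalignedAtomics flag. A minimal IR sketch of the kind of operation affected (a hypothetical example, not taken from this patch's tests):

define <2 x float> @underaligned_atomic_load(ptr addrspace(1) %p) {
  ; An 8-byte atomic access with only 4-byte alignment: previously AtomicExpand
  ; always turned this into an __atomic_load libcall; it is now kept as a single
  ; atomic load when supportsUnalignedAtomicLoadInIR() returns true.
  %v = load atomic <2 x float>, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
  ret <2 x float> %v
}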
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 89596149529ad..58afa8b258d8c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -20341,6 +20341,50 @@ SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
              : AtomicExpansionKind::None;
 }
 
+bool SITargetLowering::supportsUnalignedAtomicLoadInIR(
+    const LoadInst *LI) const {
+  unsigned AS = LI->getPointerAddressSpace();
+  // LDS uses ds_read2_b32 for 4-byte-aligned 8-byte accesses (two separate
+  // loads), but that technique breaks atomicity. ds_load_b64 requires 8-byte
+  // alignment. Reject all sub-naturally-aligned atomic LDS/GDS loads.
+  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
+    return false;
+
+  // Atomic operations require at least DWORD alignment. Unaligned access mode
+  // only enables non-atomic split accesses, which would break atomicity for
+  // sub-DWORD-aligned atomic loads.
+  if (LI->getAlign() < Align(4))
+    return false;
+
+  unsigned Size =
+      LI->getModule()->getDataLayout().getTypeSizeInBits(LI->getType());
+  return allowsMisalignedMemoryAccessesImpl(Size, AS, LI->getAlign(),
+                                            MachineMemOperand::MOLoad,
+                                            /*IsFast=*/nullptr);
+}
+
+bool SITargetLowering::supportsUnalignedAtomicStoreInIR(
+    const StoreInst *SI) const {
+  unsigned AS = SI->getPointerAddressSpace();
+  // LDS uses ds_write2_b32 for 4-byte-aligned 8-byte accesses (two separate
+  // stores), but that technique breaks atomicity. ds_store_b64 requires
+  // 8-byte alignment. Reject all sub-naturally-aligned atomic LDS/GDS stores.
+  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
+    return false;
+
+  // Atomic operations require at least DWORD alignment. Unaligned access mode
+  // only enables non-atomic split accesses, which would break atomicity for
+  // sub-DWORD-aligned atomic stores.
+  if (SI->getAlign() < Align(4))
+    return false;
+
+  unsigned Size = SI->getModule()->getDataLayout().getTypeSizeInBits(
+      SI->getValueOperand()->getType());
+  return allowsMisalignedMemoryAccessesImpl(Size, AS, SI->getAlign(),
+                                            MachineMemOperand::MOStore,
+                                            /*IsFast=*/nullptr);
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicCmpXchgInIR(
     const AtomicCmpXchgInst *CmpX) const {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 95ff5bba7cfff..605bfd990e1a9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -580,6 +580,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override;
   AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
   AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
+  bool supportsUnalignedAtomicLoadInIR(const LoadInst *LI) const override;
+  bool supportsUnalignedAtomicStoreInIR(const StoreInst *SI) const override;
   AtomicExpansionKind
   shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override;
 
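On AMDGPU the hook deliberately stays conservative: LDS/GDS is excluded outright, anything below DWORD alignment is excluded, and everything else defers to allowsMisalignedMemoryAccessesImpl(). A hypothetical side-by-side IR sketch of what that policy accepts and rejects (the tests below pin down both sides of this behavior):

; Accepted: 4-byte-aligned 64-bit atomic in the global address space; this is
; lowered as a single 64-bit load.
%ok = load atomic <2 x float>, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
; Rejected: the same access in LDS (addrspace(3)) would be split into two
; 32-bit halves via ds_read2_b32, which is not atomic as a 64-bit access.
%no = load atomic <2 x float>, ptr addrspace(3) %q syncscope("agent") monotonic, align 4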
diff --git a/llvm/test/CodeGen/AMDGPU/load-atomic-flat.ll b/llvm/test/CodeGen/AMDGPU/load-atomic-flat.ll
index 6a9062939d778..8376d66ac800f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-atomic-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-atomic-flat.ll
@@ -553,6 +553,93 @@ define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_max(ptr addrspac
   store i16 %sum, ptr addrspace(1) %out, align 4
   ret void
 }
+
+define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_align4(ptr addrspace(0) %p, ptr addrspace(0) %out) {
+; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_align4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_align4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_align4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    flat_load_b64 v[0:1], v[0:1] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    flat_store_b32 v[2:3], v0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX12-NEXT:    flat_store_b32 v[2:3], v0
+; GFX12-NEXT:    s_endpgm
+  %a0 = load atomic <2 x float>, ptr addrspace(0) %p syncscope("agent") monotonic, align 4
+  %num1 = extractelement <2 x float> %a0, i32 0
+  %num2 = extractelement <2 x float> %a0, i32 1
+  %res = fadd float %num1, %num2
+  store float %res, ptr addrspace(0) %out, align 4
+  ret void
+}
+
+define amdgpu_cs void @atomic_load_f32x2_seq_cst_agent_align4(ptr addrspace(0) %p, ptr addrspace(0) %out) {
+; GFX9-LABEL: atomic_load_f32x2_seq_cst_agent_align4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    flat_store_dword v[2:3], v0
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: atomic_load_f32x2_seq_cst_agent_align4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_load_f32x2_seq_cst_agent_align4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    flat_load_b64 v[0:1], v[0:1] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    flat_store_b32 v[2:3], v0
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_f32x2_seq_cst_agent_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX12-NEXT:    flat_store_b32 v[2:3], v0
+; GFX12-NEXT:    s_endpgm
+  %a0 = load atomic <2 x float>, ptr addrspace(0) %p syncscope("agent") seq_cst, align 4
+  %num1 = extractelement <2 x float> %a0, i32 0
+  %num2 = extractelement <2 x float> %a0, i32 1
+  %res = fadd float %num1, %num2
+  store float %res, ptr addrspace(0) %out, align 4
+  ret void
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
 ; GFX10-GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/load-atomic-global.ll b/llvm/test/CodeGen/AMDGPU/load-atomic-global.ll
index 7e4b3e010c45e..0046b7e398a12 100644
--- a/llvm/test/CodeGen/AMDGPU/load-atomic-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-atomic-global.ll
@@ -1240,6 +1240,93 @@ define amdgpu_cs void @atomic_load_i16x2_monotonic_agent_offset_max(ptr addrspac
   store i16 %sum, ptr addrspace(1) %out, align 4
   ret void
 }
+
+define amdgpu_cs void @atomic_load_f32x2_monotonic_agent_align4(ptr addrspace(1) %p, ptr addrspace(1) %out) {
+; GFX9-LABEL: atomic_load_f32x2_monotonic_agent_align4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: atomic_load_f32x2_monotonic_agent_align4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_load_f32x2_monotonic_agent_align4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_f32x2_monotonic_agent_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX12-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a0 = load atomic <2 x float>, ptr addrspace(1) %p syncscope("agent") monotonic, align 4
+  %num1 = extractelement <2 x float> %a0, i32 0
+  %num2 = extractelement <2 x float> %a0, i32 1
+  %res = fadd float %num1, %num2
+  store float %res, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_cs void @atomic_load_f32x2_seq_cst_agent_align4(ptr addrspace(1) %p, ptr addrspace(1) %out) {
+; GFX9-LABEL: atomic_load_f32x2_seq_cst_agent_align4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_wbinvl1_vol
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-NEXT:    global_store_dword v[2:3], v0, off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: atomic_load_f32x2_seq_cst_agent_align4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    global_store_dword v[2:3], v0, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_load_f32x2_seq_cst_agent_align4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_load_b64 v[0:1], v[0:1], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_load_f32x2_seq_cst_agent_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_load_b64 v[0:1], v[0:1], off scope:SCOPE_DEV
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX12-NEXT:    global_store_b32 v[2:3], v0, off
+; GFX12-NEXT:    s_endpgm
+  %a0 = load atomic <2 x float>, ptr addrspace(1) %p syncscope("agent") seq_cst, align 4
+  %num1 = extractelement <2 x float> %a0, i32 0
+  %num2 = extractelement <2 x float> %a0, i32 1
+  %res = fadd float %num1, %num2
+  store float %res, ptr addrspace(1) %out, align 4
+  ret void
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
 ; GFX9-GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/store-atomic-flat.ll b/llvm/test/CodeGen/AMDGPU/store-atomic-flat.ll
index c2f3128162ab8..a48db7b76dd74 100644
--- a/llvm/test/CodeGen/AMDGPU/store-atomic-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-atomic-flat.ll
@@ -186,6 +186,55 @@ define amdgpu_cs void @atomic_store_i16x4_monotonic_agent_offset_max(<4 x i16> %
   store atomic <4 x i16> %in, ptr addrspace(0) %gep syncscope("agent") monotonic, align 8
   ret void
 }
+
+define amdgpu_cs void @atomic_store_f32x2_monotonic_agent_align4(<2 x float> %in, ptr addrspace(0) %out) {
+; GFX9-LABEL: atomic_store_f32x2_monotonic_agent_align4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: atomic_store_f32x2_monotonic_agent_align4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_store_f32x2_monotonic_agent_align4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_f32x2_monotonic_agent_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1] scope:SCOPE_DEV
+; GFX12-NEXT:    s_endpgm
+  store atomic <2 x float> %in, ptr addrspace(0) %out syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_cs void @atomic_store_f32x2_seq_cst_agent_align4(<2 x float> %in, ptr addrspace(0) %out) {
+; GFX9-LABEL: atomic_store_f32x2_seq_cst_agent_align4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: atomic_store_f32x2_seq_cst_agent_align4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_store_f32x2_seq_cst_agent_align4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    flat_store_b64 v[2:3], v[0:1]
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_f32x2_seq_cst_agent_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    flat_store_b64 v[2:3], v[0:1] scope:SCOPE_DEV
+; GFX12-NEXT:    s_endpgm
+  store atomic <2 x float> %in, ptr addrspace(0) %out syncscope("agent") seq_cst, align 4
+  ret void
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
 ; GFX10-GISEL: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/store-atomic-global.ll b/llvm/test/CodeGen/AMDGPU/store-atomic-global.ll
index 60f8194ea698a..38daccd1e9157 100644
--- a/llvm/test/CodeGen/AMDGPU/store-atomic-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-atomic-global.ll
@@ -428,6 +428,54 @@ define amdgpu_cs void @atomic_store_i16x4_monotonic_agent_offset_max(<4 x i16> %
   store atomic <4 x i16> %in, ptr addrspace(1) %gep syncscope("agent") monotonic, align 8
   ret void
 }
+
+define amdgpu_cs void @atomic_store_f32x2_monotonic_agent_align4(<2 x float> %in, ptr addrspace(1) %out) {
+; GFX9-LABEL: atomic_store_f32x2_monotonic_agent_align4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: atomic_store_f32x2_monotonic_agent_align4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_store_f32x2_monotonic_agent_align4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_f32x2_monotonic_agent_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v[2:3], v[0:1], off scope:SCOPE_DEV
+; GFX12-NEXT:    s_endpgm
+  store atomic <2 x float> %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define amdgpu_cs void @atomic_store_f32x2_seq_cst_agent_align4(<2 x float> %in, ptr addrspace(1) %out) {
+; GFX9-LABEL: atomic_store_f32x2_seq_cst_agent_align4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: atomic_store_f32x2_seq_cst_agent_align4:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: atomic_store_f32x2_seq_cst_agent_align4:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: atomic_store_f32x2_seq_cst_agent_align4:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    global_store_b64 v[2:3], v[0:1], off scope:SCOPE_DEV
+; GFX12-NEXT:    s_endpgm
+  store atomic <2 x float> %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4
+  ret void
+}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
 ; GFX11-GISEL: {{.*}}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll
index 552fb4121166d..3ba8c9ad56429 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-load.ll
@@ -173,6 +173,26 @@ define double @load_atomic_f64_flat_agent(ptr %ptr) {
   ret double %ld
 }
 
+define <2 x float> @load_atomic_v2f32_global_align4_agent(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define <2 x float> @load_atomic_v2f32_global_align4_agent(
+; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    [[LD:%.*]] = load atomic <2 x float>, ptr addrspace(1) [[PTR]] syncscope("agent") monotonic, align 4, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret <2 x float> [[LD]]
+;
+  %ld = load atomic <2 x float>, ptr addrspace(1) %ptr syncscope("agent") monotonic, align 4, !some.unknown.md !0
+  ret <2 x float> %ld
+}
+
+define <2 x float> @load_atomic_v2f32_flat_align4_agent(ptr addrspace(0) %ptr) {
+; CHECK-LABEL: define <2 x float> @load_atomic_v2f32_flat_align4_agent(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    [[LD:%.*]] = load atomic <2 x float>, ptr [[PTR]] syncscope("agent") monotonic, align 4, !some.unknown.md [[META0]]
+; CHECK-NEXT:    ret <2 x float> [[LD]]
+;
+  %ld = load atomic <2 x float>, ptr addrspace(0) %ptr syncscope("agent") monotonic, align 4, !some.unknown.md !0
+  ret <2 x float> %ld
+}
+
 !0 = !{}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-store.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-store.ll
index 79b3f2d6ec87f..777f96d989fab 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-store.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/no-expand-atomic-store.ll
@@ -160,6 +160,26 @@ define void @store_atomic_f64_flat(double %val, ptr %ptr) {
   ret void
 }
 
+define void @store_v2f32_global_align4_agent(<2 x float> %val, ptr addrspace(1) %ptr) {
+; CHECK-LABEL: define void @store_v2f32_global_align4_agent(
+; CHECK-SAME: <2 x float> [[VAL:%.*]], ptr addrspace(1) [[PTR:%.*]]) {
+; CHECK-NEXT:    store atomic <2 x float> [[VAL]], ptr addrspace(1) [[PTR]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic <2 x float> %val, ptr addrspace(1) %ptr syncscope("agent") monotonic, align 4
+  ret void
+}
+
+define void @store_v2f32_flat_align4_agent(<2 x float> %val, ptr addrspace(0) %ptr) {
+; CHECK-LABEL: define void @store_v2f32_flat_align4_agent(
+; CHECK-SAME: <2 x float> [[VAL:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:    store atomic <2 x float> [[VAL]], ptr [[PTR]] syncscope("agent") monotonic, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic <2 x float> %val, ptr addrspace(0) %ptr syncscope("agent") monotonic, align 4
+  ret void
+}
+
 !0 = !{}
 ;.
 ; CHECK: [[META0]] = !{}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll
index 4ad087181968b..e55f2be9dc493 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/unaligned-atomic.ll
@@ -11,3 +11,39 @@ define void @atomic_store_global_align1(ptr addrspace(1) %ptr, i32 %val) {
   store atomic i32 %val, ptr addrspace(1) %ptr monotonic, align 1
   ret void
 }
+
+; CHECK: error: unsupported atomic load: instruction alignment 2 is smaller than the required 4-byte alignment for this atomic operation
+define <2 x half> @atomic_load_v2f16_flat_align2(ptr addrspace(0) %ptr) {
+  %val = load atomic <2 x half>, ptr addrspace(0) %ptr syncscope("agent") monotonic, align 2
+  ret <2 x half> %val
+}
+
+; CHECK: error: unsupported atomic load: instruction alignment 2 is smaller than the required 4-byte alignment for this atomic operation
+define <2 x half> @atomic_load_v2f16_global_align2(ptr addrspace(1) %ptr) {
+  %val = load atomic <2 x half>, ptr addrspace(1) %ptr syncscope("agent") monotonic, align 2
+  ret <2 x half> %val
+}
+
+; CHECK: error: unsupported atomic load: instruction alignment 4 is smaller than the required 8-byte alignment for this atomic operation
+define <2 x float> @atomic_load_v2f32_lds_align4(ptr addrspace(3) %ptr) {
+  %val = load atomic <2 x float>, ptr addrspace(3) %ptr syncscope("agent") monotonic, align 4
+  ret <2 x float> %val
+}
+
+; CHECK: error: unsupported atomic store: instruction alignment 2 is smaller than the required 4-byte alignment for this atomic operation
+define void @atomic_store_v2f16_flat_align2(ptr addrspace(0) %ptr, <2 x half> %val) {
+  store atomic <2 x half> %val, ptr addrspace(0) %ptr syncscope("agent") monotonic, align 2
+  ret void
+}
+
+; CHECK: error: unsupported atomic store: instruction alignment 2 is smaller than the required 4-byte alignment for this atomic operation
+define void @atomic_store_v2f16_global_align2(ptr addrspace(1) %ptr, <2 x half> %val) {
+  store atomic <2 x half> %val, ptr addrspace(1) %ptr syncscope("agent") monotonic, align 2
+  ret void
+}
+
+; CHECK: error: unsupported atomic store: instruction alignment 4 is smaller than the required 8-byte alignment for this atomic operation
+define void @atomic_store_v2f32_lds_align4(ptr addrspace(3) %ptr, <2 x float> %val) {
+  store atomic <2 x float> %val, ptr addrspace(3) %ptr syncscope("agent") monotonic, align 4
+  ret void
+}
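For reference, the new AtomicExpand behavior can be exercised on a standalone module; a minimal sketch follows (the RUN invocation is an assumption, mirroring the harness conventions of the AtomicExpand tests above, and the function name is hypothetical):

; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=atomic-expand -S %s | FileCheck %s

; The 4-byte-aligned 64-bit global atomic survives expansion unchanged
; instead of being rewritten into an __atomic_load libcall.
; CHECK-LABEL: @kept_native(
; CHECK: load atomic <2 x float>
define <2 x float> @kept_native(ptr addrspace(1) %ptr) {
  %ld = load atomic <2 x float>, ptr addrspace(1) %ptr syncscope("agent") monotonic, align 4
  ret <2 x float> %ld
}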