diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 0e0b84f7e3374..a366db1c580ba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -68,13 +68,15 @@ def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets", def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts", "FlatGlobalInsts", "true", - "Have global_* flat memory instructions" + "Have global_* flat memory instructions", + [FeatureFlatAddressSpace] >; def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", "FlatScratchInsts", "true", - "Have scratch_* flat memory instructions" + "Have scratch_* flat memory instructions", + [FeatureFlatAddressSpace] >; def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts", @@ -92,7 +94,8 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode", "FlatGVSMode", "true", - "Have GVS addressing mode with flat_* instructions" + "Have GVS addressing mode with flat_* instructions", + [FeatureFlatAddressSpace] >; def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", @@ -934,13 +937,15 @@ def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-glo def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32", "HasAtomicFMinFMaxF32FlatInsts", "true", - "Has flat memory instructions for atomicrmw fmin/fmax for float" + "Has flat memory instructions for atomicrmw fmin/fmax for float", + [FeatureFlatAddressSpace] >; def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64", "HasAtomicFMinFMaxF64FlatInsts", "true", - "Has flat memory instructions for atomicrmw fmin/fmax for double" + "Has flat memory instructions for atomicrmw fmin/fmax for double", + [FeatureFlatAddressSpace] >; def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts", @@ -992,7 +997,8 @@ def FeatureFlatAtomicFaddF32Inst : SubtargetFeature<"flat-atomic-fadd-f32-inst", "HasFlatAtomicFaddF32Inst", "true", - "Has flat_atomic_add_f32 instruction" + "Has flat_atomic_add_f32 instruction", + [FeatureFlatAddressSpace] >; def FeatureFlatBufferGlobalAtomicFaddF64Inst diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 1617f7954a5ee..0ac5f3d50f1b5 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -297,7 +297,7 @@ multiclass FLAT_Flat_Store_Pseudo_t16 { multiclass FLAT_Global_Load_Pseudo { - let is_flat_global = 1 in { + let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in { def "" : FLAT_Load_Pseudo, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Load_Pseudo, @@ -347,7 +347,7 @@ multiclass FLAT_Global_Load_AddTid_Pseudo { - let is_flat_global = 1 in { + let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in { def "" : FLAT_Store_Pseudo, GlobalSaddrTable<0, opName>; def _SADDR : FLAT_Store_Pseudo, @@ -1043,8 +1043,12 @@ let SubtargetPredicate = HasFlatAtomicFaddF32Inst in { let SubtargetPredicate = isGFX12Plus in { defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPROp_32, i32>; - defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_cond_sub_u32", VGPROp_32, i32>; -} // End SubtargetPredicate = isGFX12Plus + defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>; +} + +let SubtargetPredicate = HasAtomicCSubNoRtnInsts in { + defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_NO_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>; +} defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte">; defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte">; @@ -1296,19 +1300,19 @@ let SubtargetPredicate = isGFX10Plus in { FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", AVLdSt_64, f64, v2f64, AVLdSt_128>; } // End SubtargetPredicate = isGFX10Plus -let OtherPredicates = [HasAtomicFaddNoRtnInsts] in +let SubtargetPredicate = HasAtomicFaddNoRtnInsts in defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_add_f32", AVLdSt_32, f32 >; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN < "global_atomic_pk_add_f16", AVLdSt_32, v2f16 >; -let OtherPredicates = [HasAtomicFaddRtnInsts] in +let SubtargetPredicate = HasAtomicFaddRtnInsts in defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN < "global_atomic_add_f32", AVLdSt_32, f32 >; -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in +let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN < "global_atomic_pk_add_f16", AVLdSt_32, v2f16 >; @@ -1442,8 +1446,10 @@ class FlatStoreSaddrPat : GCNPat < (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)), - (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset, $cpol) ->; + (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset, $cpol)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; +} class GlobalAtomicNoRtnSaddrPat : GCNPat < @@ -1469,19 +1475,24 @@ class FlatStoreSignedAtomicPat .ret:$data, $offset) >; -multiclass FlatAtomicNoRtnPatBase { - + defvar inst = !cast(base_inst_name); + defvar inst_saddr = !cast(inst#"_SADDR"); defvar noRtnNode = !cast(node); let AddedComplexity = 1 in def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (!cast(inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; + } - def : FlatAtomicSaddrPat(inst#"_SADDR"), !cast(node), + def : FlatAtomicSaddrPat(node), GlobalSAddr, vt, data_vt> { let AddedComplexity = 9; - let SubtargetPredicate = HasFlatGVSMode; + let SubtargetPredicate = inst_saddr.SubtargetPredicate; + let OtherPredicates = inst_saddr.OtherPredicates; } } @@ -1494,17 +1505,22 @@ multiclass FlatAtomicNoRtnPat ; -multiclass FlatAtomicRtnPatBase { - + defvar inst = !cast(inst_name#"_RTN"); + defvar inst_saddr = !cast(inst_name#"_SADDR_RTN"); defvar rtnNode = !cast(node); def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (!cast(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; + } - def : FlatAtomicSaddrPat(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt> { + def : FlatAtomicSaddrPat { let AddedComplexity = 8; - let SubtargetPredicate = HasFlatGVSMode; + let SubtargetPredicate = inst_saddr.SubtargetPredicate; + let OtherPredicates = inst_saddr.OtherPredicates; } } @@ -1540,8 +1556,10 @@ multiclass FlatAtomicIntrPat : GCNPat < (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)), - (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset) ->; + (inst VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; +} multiclass FlatSignedAtomicPat { multiclass GlobalFLATLoadPats { def : FlatLoadSignedPat { let AddedComplexity = 10; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } def : FlatLoadSaddrPat(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } } multiclass GlobalFLATLoadPats_M0 { def : FlatLoadSignedPat_M0 { let AddedComplexity = 10; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } def : GlobalLoadSaddrPat_M0(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } } multiclass GlobalFLATLoadPats_CPOL { def : FlatLoadSignedPat_CPOL { let AddedComplexity = 10; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } def : GlobalLoadSaddrPat_CPOL(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; } } @@ -1701,10 +1731,14 @@ multiclass GlobalFLATLoadPats_D16_t16 { def : FlatStoreSignedPat { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; let AddedComplexity = 10; } def : FlatStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { + let SubtargetPredicate = inst.SubtargetPredicate; + let OtherPredicates = inst.OtherPredicates; let AddedComplexity = 11; } } @@ -1849,7 +1883,9 @@ multiclass ScratchFLATLoadPats_D16_t16 { - def : FlatLoadPat ; + def : FlatLoadPat { + let OtherPredicates = [HasFlatAddressSpace]; + } def : FlatLoadSaddrPat(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 9; @@ -1876,7 +1912,9 @@ multiclass FlatLoadPats_D16_t16 { - def : FlatStorePat ; + def : FlatStorePat { + let OtherPredicates = [HasFlatAddressSpace]; + } def : FlatStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 9; @@ -1893,8 +1931,6 @@ multiclass FlatStorePats_t16; defm : FlatLoadPats ; defm : FlatLoadPats ; @@ -2018,12 +2054,7 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; defm : FlatStorePats ; defm : FlatStorePats ; -} // End OtherPredicates = [HasFlatAddressSpace] - -let OtherPredicates = [isGFX12Plus] in defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; - -let OtherPredicates = [isGFX12Plus, HasAtomicCSubNoRtnInsts] in defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; let OtherPredicates = [HasD16LoadStore] in { @@ -2048,8 +2079,6 @@ defm : FlatLoadPats_D16 ; defm : FlatLoadPats_D16 ; } -let OtherPredicates = [HasFlatGlobalInsts] in { - defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; @@ -2063,7 +2092,7 @@ defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in -let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in { +let True16Predicate = p in { defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; @@ -2077,7 +2106,7 @@ defm : GlobalFLATLoadPats ; } -let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { +let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in { defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>; defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>; @@ -2174,7 +2203,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_glo defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>; defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; -let OtherPredicates = [HasAtomicCSubNoRtnInsts] in +let SubtargetPredicate = HasAtomicCSubNoRtnInsts in defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>; @@ -2194,7 +2223,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i let SubtargetPredicate = isGFX12Plus in { defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; - let OtherPredicates = [HasAtomicCSubNoRtnInsts] in + let SubtargetPredicate = HasAtomicCSubNoRtnInsts in defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>; } @@ -2249,62 +2278,38 @@ let OtherPredicates = [isGFX1250Plus] in { defm : GlobalStoreLDSPats ; } -let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>; -} - -let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in { defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>; defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>; -} -let OtherPredicates = [isGFX12Only] in { - // FIXME: Remove these intrinsics - defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>; - defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>; - defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>; - defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>; +// FIXME: Remove these intrinsics +let SubtargetPredicate = isGFX12Only in { +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>; +defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>; +defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>; } -let OtherPredicates = [HasAtomicFaddNoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -} -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in { defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; -} -let OtherPredicates = [HasAtomicFaddRtnInsts] in { defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>; -} -let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; -} -let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>; -} -let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in { defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>; defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>; -} -let OtherPredicates = [HasFlatAtomicFaddF32Inst] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>; -} - -let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>; defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>; -} -let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>; -} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll index 004d3c0c1cf53..3dedf008c917e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -1,8 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s -; Test using saddr addressing mode of flat_* atomic instructions. +; Test using saddr addressing mode of flat_* atomic instructions. Make +; sure these are not incorrectly selected before gfx1250. define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn: @@ -11,6 +14,29 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -25,6 +51,29 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 offset:2047 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 offset:2047 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 2047 @@ -40,6 +89,35 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xfffff800, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048 @@ -55,6 +133,29 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -70,6 +171,29 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn_2048: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:2048 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn_2048: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 offset:2048 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 2048 @@ -86,6 +210,35 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn_neg2048: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn_neg2048: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xfffff800, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048 @@ -128,6 +281,33 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr, ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -164,6 +344,33 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 % ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:42 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 offset:42 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %sbase = load ptr, ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -199,6 +406,33 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %sbase = load ptr, ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -232,6 +466,33 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 ; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 offset:42 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 offset:42 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %sbase = load ptr, ptr addrspace(3) @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset @@ -338,6 +599,86 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB10_5 ; GFX1250-GISEL-NEXT: .LBB10_5: +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB10_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB10_4 +; GFX950-SDAG-NEXT: .LBB10_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB10_5 +; GFX950-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB10_2 +; GFX950-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB10_5 +; GFX950-SDAG-NEXT: .LBB10_5: +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB10_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB10_4 +; GFX950-GISEL-NEXT: .LBB10_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB10_5 +; GFX950-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB10_2 +; GFX950-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[4:5], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB10_5 +; GFX950-GISEL-NEXT: .LBB10_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xchg ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -441,6 +782,92 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX1250-GISEL-NEXT: s_branch .LBB11_5 ; GFX1250-GISEL-NEXT: .LBB11_5: +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB11_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB11_4 +; GFX950-SDAG-NEXT: .LBB11_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB11_5 +; GFX950-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB11_2 +; GFX950-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB11_5 +; GFX950-SDAG-NEXT: .LBB11_5: +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB11_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB11_4 +; GFX950-GISEL-NEXT: .LBB11_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB11_5 +; GFX950-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB11_2 +; GFX950-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[4:5], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB11_5 +; GFX950-GISEL-NEXT: .LBB11_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -522,6 +949,72 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB12_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB12_4 +; GFX950-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB12_2 +; GFX950-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v0, v[2:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB12_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB12_4 +; GFX950-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB12_2 +; GFX950-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v0, v[4:5], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xchg ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -607,6 +1100,78 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB13_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB13_4 +; GFX950-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB13_2 +; GFX950-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v0, v[2:3], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB13_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB13_4 +; GFX950-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB13_2 +; GFX950-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v0, v[4:5], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -626,6 +1191,29 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_add_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_add_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw add ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -641,6 +1229,35 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_add_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_add_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -656,6 +1273,29 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_add_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_add_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw add ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -669,6 +1309,35 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_add_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_add_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -766,6 +1435,90 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB18_5 ; GFX1250-GISEL-NEXT: .LBB18_5: +; +; GFX950-SDAG-LABEL: flat_add_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB18_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB18_4 +; GFX950-SDAG-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB18_5 +; GFX950-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB18_2 +; GFX950-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB18_5 +; GFX950-SDAG-NEXT: .LBB18_5: +; +; GFX950-GISEL-LABEL: flat_add_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB18_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB18_4 +; GFX950-GISEL-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB18_5 +; GFX950-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB18_2 +; GFX950-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB18_5 +; GFX950-GISEL-NEXT: .LBB18_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw add ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -869,6 +1622,96 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB19_5 ; GFX1250-GISEL-NEXT: .LBB19_5: +; +; GFX950-SDAG-LABEL: flat_add_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB19_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB19_4 +; GFX950-SDAG-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB19_5 +; GFX950-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB19_2 +; GFX950-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB19_5 +; GFX950-SDAG-NEXT: .LBB19_5: +; +; GFX950-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB19_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB19_4 +; GFX950-GISEL-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB19_5 +; GFX950-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB19_2 +; GFX950-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB19_5 +; GFX950-GISEL-NEXT: .LBB19_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -956,6 +1799,80 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_add_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB20_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB20_4 +; GFX950-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB20_2 +; GFX950-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_add_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB20_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB20_4 +; GFX950-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB20_2 +; GFX950-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw add ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -1047,6 +1964,86 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_add_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB21_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB21_4 +; GFX950-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB21_2 +; GFX950-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB21_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB21_4 +; GFX950-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB21_2 +; GFX950-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1066,6 +2063,29 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw sub ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1081,6 +2101,35 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1096,6 +2145,29 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw sub ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1109,6 +2181,35 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1206,6 +2307,92 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB26_5 ; GFX1250-GISEL-NEXT: .LBB26_5: +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB26_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB26_4 +; GFX950-SDAG-NEXT: .LBB26_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB26_5 +; GFX950-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB26_2 +; GFX950-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB26_5 +; GFX950-SDAG-NEXT: .LBB26_5: +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB26_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB26_4 +; GFX950-GISEL-NEXT: .LBB26_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB26_5 +; GFX950-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB26_2 +; GFX950-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB26_5 +; GFX950-GISEL-NEXT: .LBB26_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw sub ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -1309,6 +2496,98 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB27_5 ; GFX1250-GISEL-NEXT: .LBB27_5: +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB27_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB27_4 +; GFX950-SDAG-NEXT: .LBB27_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB27_5 +; GFX950-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB27_2 +; GFX950-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB27_5 +; GFX950-SDAG-NEXT: .LBB27_5: +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB27_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB27_4 +; GFX950-GISEL-NEXT: .LBB27_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB27_5 +; GFX950-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB27_2 +; GFX950-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB27_5 +; GFX950-GISEL-NEXT: .LBB27_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1396,6 +2675,82 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB28_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB28_4 +; GFX950-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB28_2 +; GFX950-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB28_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB28_4 +; GFX950-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB28_2 +; GFX950-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw sub ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -1487,6 +2842,88 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_sub_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB29_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB29_4 +; GFX950-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB29_2 +; GFX950-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB29_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB29_4 +; GFX950-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB29_2 +; GFX950-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1506,6 +2943,29 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_and_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_and_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw and ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1521,6 +2981,35 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_and_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_and_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1536,6 +3025,29 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_and_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_and_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw and ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1549,6 +3061,35 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_and_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_and_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1648,6 +3189,90 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB34_5 ; GFX1250-GISEL-NEXT: .LBB34_5: +; +; GFX950-SDAG-LABEL: flat_and_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB34_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB34_4 +; GFX950-SDAG-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB34_5 +; GFX950-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB34_2 +; GFX950-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB34_5 +; GFX950-SDAG-NEXT: .LBB34_5: +; +; GFX950-GISEL-LABEL: flat_and_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB34_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB34_4 +; GFX950-GISEL-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB34_5 +; GFX950-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB34_2 +; GFX950-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_and_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB34_5 +; GFX950-GISEL-NEXT: .LBB34_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw and ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -1753,6 +3378,96 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB35_5 ; GFX1250-GISEL-NEXT: .LBB35_5: +; +; GFX950-SDAG-LABEL: flat_and_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB35_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB35_4 +; GFX950-SDAG-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB35_5 +; GFX950-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB35_2 +; GFX950-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB35_5 +; GFX950-SDAG-NEXT: .LBB35_5: +; +; GFX950-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB35_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB35_4 +; GFX950-GISEL-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB35_5 +; GFX950-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB35_2 +; GFX950-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_and_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB35_5 +; GFX950-GISEL-NEXT: .LBB35_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1842,6 +3557,80 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_and_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB36_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB36_4 +; GFX950-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB36_2 +; GFX950-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_and_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB36_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB36_4 +; GFX950-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB36_2 +; GFX950-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw and ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -1935,6 +3724,86 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_and_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB37_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB37_4 +; GFX950-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB37_2 +; GFX950-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB37_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB37_4 +; GFX950-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB37_2 +; GFX950-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1954,6 +3823,29 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_or_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_or_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw or ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1969,6 +3861,35 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_or_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_or_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1984,6 +3905,29 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_or_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_or_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw or ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -1997,6 +3941,35 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_or_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_or_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2096,6 +4069,90 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB42_5 ; GFX1250-GISEL-NEXT: .LBB42_5: +; +; GFX950-SDAG-LABEL: flat_or_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB42_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB42_4 +; GFX950-SDAG-NEXT: .LBB42_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB42_5 +; GFX950-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB42_2 +; GFX950-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB42_5 +; GFX950-SDAG-NEXT: .LBB42_5: +; +; GFX950-GISEL-LABEL: flat_or_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB42_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB42_4 +; GFX950-GISEL-NEXT: .LBB42_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB42_5 +; GFX950-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB42_2 +; GFX950-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_or_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB42_5 +; GFX950-GISEL-NEXT: .LBB42_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw or ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -2201,6 +4258,96 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB43_5 ; GFX1250-GISEL-NEXT: .LBB43_5: +; +; GFX950-SDAG-LABEL: flat_or_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB43_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB43_4 +; GFX950-SDAG-NEXT: .LBB43_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB43_5 +; GFX950-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB43_2 +; GFX950-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB43_5 +; GFX950-SDAG-NEXT: .LBB43_5: +; +; GFX950-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB43_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB43_4 +; GFX950-GISEL-NEXT: .LBB43_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB43_5 +; GFX950-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB43_2 +; GFX950-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_or_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB43_5 +; GFX950-GISEL-NEXT: .LBB43_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2290,6 +4437,80 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_or_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB44_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB44_4 +; GFX950-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB44_2 +; GFX950-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_or_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB44_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB44_4 +; GFX950-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB44_2 +; GFX950-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw or ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -2383,6 +4604,86 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_or_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB45_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB45_4 +; GFX950-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB45_2 +; GFX950-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB45_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB45_4 +; GFX950-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB45_2 +; GFX950-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2402,6 +4703,29 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xor ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -2417,6 +4741,35 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2432,6 +4785,29 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xor ptr %gep0, i32 %data syncscope("agent") seq_cst @@ -2445,6 +4821,35 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2544,6 +4949,90 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB50_5 ; GFX1250-GISEL-NEXT: .LBB50_5: +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB50_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB50_4 +; GFX950-SDAG-NEXT: .LBB50_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB50_5 +; GFX950-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB50_2 +; GFX950-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB50_5 +; GFX950-SDAG-NEXT: .LBB50_5: +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB50_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB50_4 +; GFX950-GISEL-NEXT: .LBB50_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB50_5 +; GFX950-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB50_2 +; GFX950-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB50_5 +; GFX950-GISEL-NEXT: .LBB50_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw xor ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -2649,6 +5138,96 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB51_5 ; GFX1250-GISEL-NEXT: .LBB51_5: +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB51_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB51_4 +; GFX950-SDAG-NEXT: .LBB51_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB51_5 +; GFX950-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB51_2 +; GFX950-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX950-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB51_5 +; GFX950-SDAG-NEXT: .LBB51_5: +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB51_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB51_4 +; GFX950-GISEL-NEXT: .LBB51_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB51_5 +; GFX950-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB51_2 +; GFX950-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 +; GFX950-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB51_5 +; GFX950-GISEL-NEXT: .LBB51_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2738,6 +5317,80 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB52_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB52_4 +; GFX950-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB52_2 +; GFX950-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB52_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB52_4 +; GFX950-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB52_2 +; GFX950-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw xor ptr %gep0, i64 %data syncscope("agent") seq_cst @@ -2831,6 +5484,86 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_xor_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB53_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB53_4 +; GFX950-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc1 +; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB53_2 +; GFX950-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX950-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB53_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB53_4 +; GFX950-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc1 +; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB53_2 +; GFX950-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2848,6 +5581,25 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_max_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_smax v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_max_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smax v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw max ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2861,6 +5613,31 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_max_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_smax v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_max_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smax v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2875,6 +5652,25 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw max ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -2887,6 +5683,31 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -2986,6 +5807,92 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB58_5 ; GFX1250-GISEL-NEXT: .LBB58_5: +; +; GFX950-SDAG-LABEL: flat_max_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB58_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB58_4 +; GFX950-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB58_5 +; GFX950-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB58_2 +; GFX950-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB58_5 +; GFX950-SDAG-NEXT: .LBB58_5: +; +; GFX950-GISEL-LABEL: flat_max_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB58_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB58_4 +; GFX950-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB58_5 +; GFX950-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB58_2 +; GFX950-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB58_5 +; GFX950-GISEL-NEXT: .LBB58_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw max ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -3091,6 +5998,98 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB59_5 ; GFX1250-GISEL-NEXT: .LBB59_5: +; +; GFX950-SDAG-LABEL: flat_max_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB59_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB59_4 +; GFX950-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB59_5 +; GFX950-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB59_2 +; GFX950-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB59_5 +; GFX950-SDAG-NEXT: .LBB59_5: +; +; GFX950-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB59_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB59_4 +; GFX950-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB59_5 +; GFX950-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB59_2 +; GFX950-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB59_5 +; GFX950-GISEL-NEXT: .LBB59_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3176,6 +6175,80 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_max_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB60_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB60_4 +; GFX950-SDAG-NEXT: .LBB60_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB60_2 +; GFX950-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_max_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB60_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB60_4 +; GFX950-GISEL-NEXT: .LBB60_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB60_2 +; GFX950-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw max ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -3265,6 +6338,86 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_max_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB61_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB61_4 +; GFX950-SDAG-NEXT: .LBB61_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB61_2 +; GFX950-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB61_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB61_4 +; GFX950-GISEL-NEXT: .LBB61_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB61_2 +; GFX950-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3282,6 +6435,25 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_min_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_smin v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_min_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smin v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw min ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -3295,6 +6467,31 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_min_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_smin v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_min_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smin v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3309,6 +6506,25 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw min ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -3321,6 +6537,31 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3420,6 +6661,92 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB66_5 ; GFX1250-GISEL-NEXT: .LBB66_5: +; +; GFX950-SDAG-LABEL: flat_min_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB66_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB66_4 +; GFX950-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB66_5 +; GFX950-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB66_2 +; GFX950-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB66_5 +; GFX950-SDAG-NEXT: .LBB66_5: +; +; GFX950-GISEL-LABEL: flat_min_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB66_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB66_4 +; GFX950-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB66_5 +; GFX950-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB66_2 +; GFX950-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB66_5 +; GFX950-GISEL-NEXT: .LBB66_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw min ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -3525,6 +6852,98 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB67_5 ; GFX1250-GISEL-NEXT: .LBB67_5: +; +; GFX950-SDAG-LABEL: flat_min_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB67_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB67_4 +; GFX950-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB67_5 +; GFX950-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB67_2 +; GFX950-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB67_5 +; GFX950-SDAG-NEXT: .LBB67_5: +; +; GFX950-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB67_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB67_4 +; GFX950-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB67_5 +; GFX950-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB67_2 +; GFX950-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB67_5 +; GFX950-GISEL-NEXT: .LBB67_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3610,6 +7029,80 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_min_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB68_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB68_4 +; GFX950-SDAG-NEXT: .LBB68_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB68_2 +; GFX950-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_min_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB68_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB68_4 +; GFX950-GISEL-NEXT: .LBB68_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB68_2 +; GFX950-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw min ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -3699,6 +7192,86 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_min_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB69_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB69_4 +; GFX950-SDAG-NEXT: .LBB69_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB69_2 +; GFX950-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB69_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB69_4 +; GFX950-GISEL-NEXT: .LBB69_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB69_2 +; GFX950-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3716,6 +7289,25 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_umax v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umax v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -3729,6 +7321,31 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_umax v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umax v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3743,6 +7360,25 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw umax ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -3755,6 +7391,31 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -3854,6 +7515,92 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB74_5 ; GFX1250-GISEL-NEXT: .LBB74_5: +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB74_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB74_4 +; GFX950-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB74_5 +; GFX950-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB74_2 +; GFX950-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB74_5 +; GFX950-SDAG-NEXT: .LBB74_5: +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB74_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB74_4 +; GFX950-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB74_5 +; GFX950-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB74_2 +; GFX950-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB74_5 +; GFX950-GISEL-NEXT: .LBB74_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umax ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -3959,6 +7706,98 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB75_5 ; GFX1250-GISEL-NEXT: .LBB75_5: +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB75_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB75_4 +; GFX950-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB75_5 +; GFX950-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB75_2 +; GFX950-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB75_5 +; GFX950-SDAG-NEXT: .LBB75_5: +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB75_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB75_4 +; GFX950-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB75_5 +; GFX950-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB75_2 +; GFX950-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB75_5 +; GFX950-GISEL-NEXT: .LBB75_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4044,6 +7883,80 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB76_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB76_4 +; GFX950-SDAG-NEXT: .LBB76_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB76_2 +; GFX950-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB76_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB76_4 +; GFX950-GISEL-NEXT: .LBB76_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB76_2 +; GFX950-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw umax ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -4133,6 +8046,86 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umax_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB77_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB77_4 +; GFX950-SDAG-NEXT: .LBB77_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB77_2 +; GFX950-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB77_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB77_4 +; GFX950-GISEL-NEXT: .LBB77_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB77_2 +; GFX950-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4150,6 +8143,25 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_umin v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umin v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -4163,6 +8175,31 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_umin v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umin v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4177,6 +8214,25 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw umin ptr %gep0, i32 %data syncscope("workgroup") seq_cst @@ -4189,6 +8245,31 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128 ; GFX1250-NEXT: s_wait_dscnt 0x0 ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4288,6 +8369,92 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB82_5 ; GFX1250-GISEL-NEXT: .LBB82_5: +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB82_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB82_4 +; GFX950-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB82_5 +; GFX950-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB82_2 +; GFX950-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB82_5 +; GFX950-SDAG-NEXT: .LBB82_5: +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB82_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB82_4 +; GFX950-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB82_5 +; GFX950-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB82_2 +; GFX950-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB82_5 +; GFX950-GISEL-NEXT: .LBB82_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw umin ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -4393,6 +8560,98 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB83_5 ; GFX1250-GISEL-NEXT: .LBB83_5: +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB83_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB83_4 +; GFX950-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB83_5 +; GFX950-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB83_2 +; GFX950-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB83_5 +; GFX950-SDAG-NEXT: .LBB83_5: +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB83_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB83_4 +; GFX950-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB83_5 +; GFX950-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB83_2 +; GFX950-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB83_5 +; GFX950-GISEL-NEXT: .LBB83_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4478,6 +8737,80 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB84_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB84_4 +; GFX950-SDAG-NEXT: .LBB84_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB84_2 +; GFX950-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB84_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB84_4 +; GFX950-GISEL-NEXT: .LBB84_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB84_2 +; GFX950-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw umin ptr %gep0, i64 %data syncscope("workgroup") seq_cst @@ -4567,6 +8900,86 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v ; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5] ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_umin_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB85_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB85_4 +; GFX950-SDAG-NEXT: .LBB85_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB85_2 +; GFX950-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB85_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB85_4 +; GFX950-GISEL-NEXT: .LBB85_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB85_2 +; GFX950-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4589,6 +9002,30 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffse ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %cmpxchg = cmpxchg ptr %gep0, i32 %cmp, i32 %data seq_cst seq_cst @@ -4608,6 +9045,36 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4627,6 +9094,30 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = cmpxchg ptr %gep0, i32 %cmp, i32 %data seq_cst seq_cst @@ -4643,6 +9134,36 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX1250-NEXT: global_inv scope:SCOPE_SYS ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4748,6 +9269,98 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 % ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB90_5 ; GFX1250-GISEL-NEXT: .LBB90_5: +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB90_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB90_4 +; GFX950-SDAG-NEXT: .LBB90_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB90_5 +; GFX950-SDAG-NEXT: .LBB90_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB90_2 +; GFX950-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v8, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v8, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB90_5 +; GFX950-SDAG-NEXT: .LBB90_5: +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB90_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB90_4 +; GFX950-GISEL-NEXT: .LBB90_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB90_5 +; GFX950-GISEL-NEXT: .LBB90_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB90_2 +; GFX950-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB90_5 +; GFX950-GISEL-NEXT: .LBB90_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %cmpxchg = cmpxchg ptr %gep0, i64 %cmp, i64 %data seq_cst seq_cst @@ -4860,6 +9473,104 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB91_5 ; GFX1250-GISEL-NEXT: .LBB91_5: +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB91_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB91_4 +; GFX950-SDAG-NEXT: .LBB91_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_branch .LBB91_5 +; GFX950-SDAG-NEXT: .LBB91_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] sc0 sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB91_2 +; GFX950-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v8, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v8, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB91_5 +; GFX950-SDAG-NEXT: .LBB91_5: +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB91_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB91_4 +; GFX950-GISEL-NEXT: .LBB91_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_branch .LBB91_5 +; GFX950-GISEL-NEXT: .LBB91_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] sc0 sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB91_2 +; GFX950-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v4, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v4, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB91_5 +; GFX950-GISEL-NEXT: .LBB91_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -4956,6 +9667,88 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB92_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB92_4 +; GFX950-SDAG-NEXT: .LBB92_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB92_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:7] sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB92_2 +; GFX950-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB92_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB92_4 +; GFX950-GISEL-NEXT: .LBB92_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB92_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:9] sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB92_2 +; GFX950-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = cmpxchg ptr %gep0, i64 %cmp, i64 %data seq_cst seq_cst @@ -5055,6 +9848,94 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2 +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB93_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB93_4 +; GFX950-SDAG-NEXT: .LBB93_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB93_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:7] sc1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: buffer_inv sc0 sc1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB93_2 +; GFX950-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB93_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB93_4 +; GFX950-GISEL-NEXT: .LBB93_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB93_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:9] sc1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: buffer_inv sc0 sc1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB93_2 +; GFX950-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5072,6 +9953,25 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_inc v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_inc v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw uinc_wrap ptr %gep0, i32 %data syncscope("agent") monotonic @@ -5085,6 +9985,31 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_inc v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_inc v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5098,6 +10023,23 @@ define amdgpu_ps void @flat_inc_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_inc v[0:1], v2 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_inc v[2:3], v1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw uinc_wrap ptr %gep0, i32 %data syncscope("agent") monotonic @@ -5109,6 +10051,29 @@ define amdgpu_ps void @flat_inc_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_inc v[0:1], v2 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_inc v[2:3], v1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5212,6 +10177,96 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB98_5 ; GFX1250-GISEL-NEXT: .LBB98_5: +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB98_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB98_4 +; GFX950-SDAG-NEXT: .LBB98_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB98_5 +; GFX950-SDAG-NEXT: .LBB98_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB98_2 +; GFX950-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB98_5 +; GFX950-SDAG-NEXT: .LBB98_5: +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB98_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB98_4 +; GFX950-GISEL-NEXT: .LBB98_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB98_5 +; GFX950-GISEL-NEXT: .LBB98_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB98_2 +; GFX950-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB98_5 +; GFX950-GISEL-NEXT: .LBB98_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw uinc_wrap ptr %gep0, i64 %data syncscope("agent") monotonic @@ -5321,6 +10376,102 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1250-GISEL-NEXT: s_branch .LBB99_5 ; GFX1250-GISEL-NEXT: .LBB99_5: +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB99_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB99_4 +; GFX950-SDAG-NEXT: .LBB99_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB99_5 +; GFX950-SDAG-NEXT: .LBB99_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB99_2 +; GFX950-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB99_5 +; GFX950-SDAG-NEXT: .LBB99_5: +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB99_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB99_4 +; GFX950-GISEL-NEXT: .LBB99_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB99_5 +; GFX950-GISEL-NEXT: .LBB99_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB99_2 +; GFX950-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB99_5 +; GFX950-GISEL-NEXT: .LBB99_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5410,6 +10561,82 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB100_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB100_4 +; GFX950-SDAG-NEXT: .LBB100_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB100_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB100_2 +; GFX950-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB100_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB100_4 +; GFX950-GISEL-NEXT: .LBB100_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB100_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB100_2 +; GFX950-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, 0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw uinc_wrap ptr %gep0, i64 %data syncscope("agent") monotonic @@ -5503,6 +10730,88 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo ; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_inc_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB101_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB101_4 +; GFX950-SDAG-NEXT: .LBB101_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB101_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB101_2 +; GFX950-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB101_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB101_4 +; GFX950-GISEL-NEXT: .LBB101_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB101_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB101_2 +; GFX950-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, 0, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5521,6 +10830,25 @@ define amdgpu_ps float @flat_dec_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i32_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_dec v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i32_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_dec v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw udec_wrap ptr %gep0, i32 %data syncscope("agent") monotonic @@ -5534,6 +10862,31 @@ define amdgpu_ps float @flat_dec_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i32_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_dec v0, v[0:1], v2 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: ; return to shader part epilog +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i32_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_dec v0, v[2:3], v1 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5547,6 +10900,23 @@ define amdgpu_ps void @flat_dec_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i32_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: flat_atomic_dec v[0:1], v2 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i32_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_dec v[2:3], v1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw udec_wrap ptr %gep0, i32 %data syncscope("agent") monotonic @@ -5558,6 +10928,29 @@ define amdgpu_ps void @flat_dec_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250: ; %bb.0: ; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV ; GFX1250-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i32_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-SDAG-NEXT: flat_atomic_dec v[0:1], v2 +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i32_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc +; GFX950-GISEL-NEXT: flat_atomic_dec v[2:3], v1 +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5665,6 +11058,98 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-GISEL-NEXT: s_branch .LBB106_5 ; GFX1250-GISEL-NEXT: .LBB106_5: +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i64_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB106_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB106_4 +; GFX950-SDAG-NEXT: .LBB106_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB106_5 +; GFX950-SDAG-NEXT: .LBB106_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB106_2 +; GFX950-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB106_5 +; GFX950-SDAG-NEXT: .LBB106_5: +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i64_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB106_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB106_4 +; GFX950-GISEL-NEXT: .LBB106_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB106_5 +; GFX950-GISEL-NEXT: .LBB106_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB106_2 +; GFX950-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB106_5 +; GFX950-GISEL-NEXT: .LBB106_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %rtn = atomicrmw udec_wrap ptr %gep0, i64 %data syncscope("agent") monotonic @@ -5778,6 +11263,104 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3 ; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1250-GISEL-NEXT: s_branch .LBB107_5 ; GFX1250-GISEL-NEXT: .LBB107_5: +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i64_rtn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB107_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB107_4 +; GFX950-SDAG-NEXT: .LBB107_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB107_5 +; GFX950-SDAG-NEXT: .LBB107_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB107_2 +; GFX950-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_branch .LBB107_5 +; GFX950-SDAG-NEXT: .LBB107_5: +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB107_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB107_4 +; GFX950-GISEL-NEXT: .LBB107_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB107_5 +; GFX950-GISEL-NEXT: .LBB107_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[4:5] sc0 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB107_2 +; GFX950-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_branch .LBB107_5 +; GFX950-GISEL-NEXT: .LBB107_5: %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5871,6 +11454,84 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i64_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB108_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB108_4 +; GFX950-SDAG-NEXT: .LBB108_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB108_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB108_2 +; GFX950-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i64_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB108_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB108_4 +; GFX950-GISEL-NEXT: .LBB108_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB108_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB108_2 +; GFX950-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, -1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %unused = atomicrmw udec_wrap ptr %gep0, i64 %data syncscope("agent") monotonic @@ -5968,6 +11629,90 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 ; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE ; GFX1250-GISEL-NEXT: s_endpgm +; +; GFX950-SDAG-LABEL: flat_dec_saddr_i64_nortn_neg128: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX950-SDAG-NEXT: s_mov_b32 s1, -1 +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB109_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB109_4 +; GFX950-SDAG-NEXT: .LBB109_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_endpgm +; GFX950-SDAG-NEXT: .LBB109_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB109_2 +; GFX950-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: s_nop 1 +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3] +; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc +; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off +; GFX950-SDAG-NEXT: s_endpgm +; +; GFX950-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1 +; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB109_3 +; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB109_4 +; GFX950-GISEL-NEXT: .LBB109_2: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_endpgm +; GFX950-GISEL-NEXT: .LBB109_3: ; %atomicrmw.global +; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5] +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execz .LBB109_2 +; GFX950-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, -1, v0 +; GFX950-GISEL-NEXT: s_nop 1 +; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5] +; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc +; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off +; GFX950-GISEL-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -5975,4 +11720,2004 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo ret void } +define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) { +; GFX1250-SDAG-LABEL: flat_atomic_fadd_f64_saddr_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_4 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB110_5 +; GFX1250-SDAG-NEXT: s_branch .LBB110_6 +; GFX1250-SDAG-NEXT: .LBB110_3: +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_branch .LBB110_7 +; GFX1250-SDAG-NEXT: .LBB110_4: +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: .LBB110_5: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[4:5], s2 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: .LBB110_6: ; %Flow1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB110_8 +; GFX1250-SDAG-NEXT: .LBB110_7: ; %atomicrmw.shared +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: .LBB110_8: ; %atomicrmw.end +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fadd_f64_saddr_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_6 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_3 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: .LBB110_3: ; %Flow +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_5 +; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[4:5], s2 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB110_5: ; %Flow1 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: .LBB110_6: ; %Flow2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_8 +; GFX1250-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: .LBB110_8: ; %atomicrmw.end +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_f64_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB110_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB110_4 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB110_5 +; GFX950-SDAG-NEXT: s_branch .LBB110_6 +; GFX950-SDAG-NEXT: .LBB110_3: +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: s_branch .LBB110_7 +; GFX950-SDAG-NEXT: .LBB110_4: +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: .LBB110_5: ; %atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[4:5], s2 +; GFX950-SDAG-NEXT: .LBB110_6: ; %Flow1 +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB110_8 +; GFX950-SDAG-NEXT: .LBB110_7: ; %atomicrmw.shared +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: .LBB110_8: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fadd_f64_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB110_6 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB110_3 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX950-GISEL-NEXT: .LBB110_3: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB110_5 +; GFX950-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[4:5], s2 +; GFX950-GISEL-NEXT: .LBB110_5: ; %Flow1 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX950-GISEL-NEXT: .LBB110_6: ; %Flow2 +; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB110_8 +; GFX950-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: .LBB110_8: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) { +; GFX1250-SDAG-LABEL: flat_atomic_fadd_f64_saddr_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow2 +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_8 +; GFX1250-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_5 +; GFX1250-SDAG-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0 +; GFX1250-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: .LBB111_5: ; %Flow +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_7 +; GFX1250-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_add_f64_e32 v[2:3], v[2:3], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[2:3], s2 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: .LBB111_7: ; %Flow1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB111_2 +; GFX1250-SDAG-NEXT: .LBB111_8: ; %atomicrmw.shared +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-SDAG-NEXT: ds_add_f64 v2, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fadd_f64_saddr_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_3 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: .LBB111_3: ; %Flow +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_5 +; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_add_f64_e32 v[2:3], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[2:3], s2 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB111_5: ; %Flow1 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX1250-GISEL-NEXT: .LBB111_6: ; %Flow2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_8 +; GFX1250-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-GISEL-NEXT: ds_add_f64 v2, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: .LBB111_8: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_f64_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 +; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB111_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow2 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB111_8 +; GFX950-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB111_5 +; GFX950-SDAG-NEXT: ; %bb.4: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: .LBB111_5: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB111_7 +; GFX950-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[2:3], s2 +; GFX950-SDAG-NEXT: .LBB111_7: ; %Flow1 +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB111_2 +; GFX950-SDAG-NEXT: .LBB111_8: ; %atomicrmw.shared +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: ds_add_f64 v2, v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fadd_f64_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB111_6 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private +; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 1 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB111_3 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX950-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX950-GISEL-NEXT: .LBB111_3: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB111_5 +; GFX950-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s2, s0, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s2 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[2:3], s2 +; GFX950-GISEL-NEXT: .LBB111_5: ; %Flow1 +; GFX950-GISEL-NEXT: s_mov_b32 s2, 0 +; GFX950-GISEL-NEXT: .LBB111_6: ; %Flow2 +; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB111_8 +; GFX950-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: ds_add_f64 v2, v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: .LBB111_8: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { +; GFX1250-SDAG-LABEL: flat_atomic_fmax_f64_saddr_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB112_2 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB112_3 +; GFX1250-SDAG-NEXT: s_branch .LBB112_4 +; GFX1250-SDAG-NEXT: .LBB112_2: +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fmax_f64_saddr_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB112_2 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: .LBB112_2: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB112_4 +; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB112_4: ; %atomicrmw.end +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_f64_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB112_2 +; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB112_3 +; GFX950-SDAG-NEXT: s_branch .LBB112_4 +; GFX950-SDAG-NEXT: .LBB112_2: +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_f64_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 +; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB112_2 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] offset:80 sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX950-GISEL-NEXT: .LBB112_2: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB112_4 +; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-GISEL-NEXT: .LBB112_4: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { +; GFX1250-SDAG-LABEL: flat_atomic_fmax_f64_saddr_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB113_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB113_4 +; GFX1250-SDAG-NEXT: .LBB113_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2 +; GFX1250-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fmax_f64_saddr_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB113_2 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: .LBB113_2: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB113_4 +; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB113_4: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_f64_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 +; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB113_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB113_4 +; GFX950-SDAG-NEXT: .LBB113_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_atomic_max_f64 v[2:3], v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB113_2 +; GFX950-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_f64_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 +; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB113_2 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_max_f64 v[2:3], v[0:1] offset:80 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX950-GISEL-NEXT: .LBB113_2: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB113_4 +; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-GISEL-NEXT: .LBB113_4: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { +; GFX1250-SDAG-LABEL: flat_atomic_fmin_f64_saddr_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB114_2 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB114_3 +; GFX1250-SDAG-NEXT: s_branch .LBB114_4 +; GFX1250-SDAG-NEXT: .LBB114_2: +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fmin_f64_saddr_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB114_2 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: .LBB114_2: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB114_4 +; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB114_4: ; %atomicrmw.end +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_f64_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB114_2 +; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0 +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_cbranch_execz .LBB114_3 +; GFX950-SDAG-NEXT: s_branch .LBB114_4 +; GFX950-SDAG-NEXT: .LBB114_2: +; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-SDAG-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v2 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_f64_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 +; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB114_2 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] offset:80 sc0 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX950-GISEL-NEXT: .LBB114_2: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB114_4 +; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-GISEL-NEXT: .LBB114_4: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3 +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret double %result +} + +define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { +; GFX1250-SDAG-LABEL: flat_atomic_fmin_f64_saddr_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2 +; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000 +; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1 +; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB115_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2 +; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB115_4 +; GFX1250-SDAG-NEXT: .LBB115_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; GFX1250-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2 +; GFX1250-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2 +; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: flat_atomic_fmin_f64_saddr_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4 +; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB115_2 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] offset:80 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: .LBB115_2: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB115_4 +; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0 +; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1] +; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE +; GFX1250-GISEL-NEXT: .LBB115_4: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_f64_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50 +; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base +; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3 +; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1 +; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB115_3 +; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB115_4 +; GFX950-SDAG-NEXT: .LBB115_2: ; %atomicrmw.phi +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX950-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_atomic_min_f64 v[2:3], v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB115_2 +; GFX950-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private +; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1 +; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-SDAG-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_f64_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50 +; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0 +; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5 +; GFX950-GISEL-NEXT: s_mov_b32 s4, 1 +; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB115_2 +; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_min_f64 v[2:3], v[0:1] offset:80 +; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX950-GISEL-NEXT: .LBB115_2: ; %Flow +; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1 +; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0 +; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB115_4 +; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private +; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1 +; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0 +; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1] +; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-GISEL-NEXT: .LBB115_4: ; %atomicrmw.phi +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @flat_atomic_fadd_f32_saddr_rtn(ptr inreg %ptr, float %data) { +; GFX1250-LABEL: flat_atomic_fadd_f32_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_f32_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: flat_atomic_add_f32 v0, v[2:3], v0 offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fadd_f32_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_add_f32 v0, v[2:3], v0 offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @flat_atomic_fadd_f32_saddr_nortn(ptr inreg %ptr, float %data) { +; GFX1250-LABEL: flat_atomic_fadd_f32_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_add_f32 v1, v0, s[0:1] offset:40 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_f32_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: flat_atomic_add_f32 v[2:3], v0 offset:40 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fadd_f32_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_add_f32 v[2:3], v0 offset:40 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) { +; GFX1250-LABEL: flat_atomic_fmax_f32_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_f32_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-SDAG-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB118_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_f32_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-GISEL-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB118_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @flat_atomic_fmax_f32_saddr_nortn(ptr inreg %ptr, float %data) { +; GFX1250-LABEL: flat_atomic_fmax_f32_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_max_num_f32 v1, v0, s[0:1] offset:40 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_f32_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v2, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB119_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_f32_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-GISEL-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB119_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) { +; GFX1250-LABEL: flat_atomic_fmin_f32_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_f32_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-SDAG-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX950-SDAG-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB120_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_f32_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX950-GISEL-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v5, v5 +; GFX950-GISEL-NEXT: v_min_f32_e32 v4, v0, v1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB120_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret float %result +} + +define void @flat_atomic_fmin_f32_saddr_nortn(ptr inreg %ptr, float %data) { +; GFX1250-LABEL: flat_atomic_fmin_f32_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_min_num_f32 v1, v0, s[0:1] offset:40 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_f32_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX950-SDAG-NEXT: v_min_f32_e32 v2, v2, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB121_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_f32_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v0 +; GFX950-GISEL-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX950-GISEL-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB121_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fadd_v2f16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_pk_add_f16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_v2f16_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[2:3], v0 offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fadd_v2f16_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[2:3], v0 offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define void @flat_atomic_fadd_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fadd_v2f16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_pk_add_f16 v1, v0, s[0:1] offset:40 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fadd_v2f16_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-SDAG-NEXT: flat_atomic_pk_add_f16 v[2:3], v0 offset:40 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fadd_v2f16_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_atomic_pk_add_f16 v[2:3], v0 offset:40 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-NEXT: flat_load_b32 v0, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v5, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX1250-NEXT: v_pk_max_num_f16 v4, v0, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB124_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-SDAG-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB124_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_v2f16_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB124_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: v_pk_max_num_f16 v3, v0, v0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v3 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB125_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB125_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmax_v2f16_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX950-GISEL-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v4 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB125_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_f16 v1, v1, v1 +; GFX1250-NEXT: flat_load_b32 v0, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v5, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5 +; GFX1250-NEXT: v_pk_min_num_f16 v4, v0, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB126_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-SDAG-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_min_f16 v4, v0, v1 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB126_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_v2f16_saddr_rtn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX950-GISEL-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v5, v5 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_min_f16 v4, v0, v1 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB126_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %result +} + +define void @flat_atomic_fmin_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) { +; GFX1250-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: v_pk_max_num_f16 v3, v0, v0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_min_num_f16 v0, v0, v3 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB127_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-SDAG-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: +; GFX950-SDAG: ; %bb.0: +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40 +; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-SDAG-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3 +; GFX950-SDAG-NEXT: s_nop 0 +; GFX950-SDAG-NEXT: v_pk_min_f16 v2, v2, v4 +; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0 +; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB127_1 +; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX950-GISEL-LABEL: flat_atomic_fmin_v2f16_saddr_nortn: +; GFX950-GISEL: ; %bb.0: +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v0 +; GFX950-GISEL-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v1, v1 +; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v4 +; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB127_1 +; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x bfloat> @flat_atomic_fadd_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: flat_atomic_fadd_v2bf16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_pk_add_bf16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_rtn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[2:3], v0 offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define void @flat_atomic_fadd_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: flat_atomic_fadd_v2bf16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: flat_atomic_pk_add_bf16 v1, v0, s[0:1] offset:40 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_nortn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-NEXT: flat_atomic_pk_add_bf16 v[2:3], v0 offset:40 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x bfloat> @flat_atomic_fmax_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: flat_atomic_fmax_v2bf16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v5, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_bf16 v4, v5, v0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB130_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_rtn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v7, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_max_f32_e32 v5, v5, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v5, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB130_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define void @flat_atomic_fmax_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: flat_atomic_fmax_v2bf16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v3, v1, s[0:1] offset:40 +; GFX1250-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_pk_max_num_bf16 v2, v3, v0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB131_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_nortn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_max_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB131_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + +define <2 x bfloat> @flat_atomic_fmin_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: flat_atomic_fmin_v2bf16_saddr_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40 +; GFX1250-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v5, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_min_num_bf16 v4, v5, v0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB132_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: v_mov_b32_e32 v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_rtn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX950-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v7, v0 +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_min_f32_e32 v5, v5, v1 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v5, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB132_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %result +} + +define void @flat_atomic_fmin_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) { +; GFX1250-LABEL: flat_atomic_fmin_v2bf16_saddr_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0 +; GFX1250-NEXT: flat_load_b32 v3, v1, s[0:1] offset:40 +; GFX1250-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_pk_min_num_bf16 v2, v3, v0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v3, v2 +; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_cbranch_execnz .LBB133_1 +; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_nortn: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40 +; GFX950-NEXT: s_mov_b64 s[2:3], 0 +; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX950-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX950-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX950-NEXT: v_min_f32_e32 v6, v6, v5 +; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_cbranch_execnz .LBB133_1 +; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX950-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX950-NEXT: s_setpc_b64 s[30:31] + %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10 + %unused = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 + ret void +} + attributes #0 = { argmemonly nounwind willreturn } + +!0 = !{}