diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 3965b5dd8c5c3..74632c71f0f95 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1275,8 +1275,8 @@ class FlatLoadSaddrPat (inst $saddr, $voffset, $offset, 0) >; -class GlobalStoreSaddrPat : GCNPat < +class FlatStoreSaddrPat : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)), (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset) >; @@ -1485,7 +1485,7 @@ multiclass GlobalFLATStorePats(!cast(inst)#"_SADDR"), node, vt> { + def : FlatStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1495,7 +1495,7 @@ multiclass GlobalFLATStorePats_D16_t16(inst#"_SADDR_t16"), node, vt> { + def : FlatStoreSaddrPat(inst#"_SADDR_t16"), node, vt> { let AddedComplexity = 11; } } @@ -1655,6 +1655,24 @@ multiclass FlatLoadPats_D16_t16 { + def : FlatStorePat ; + + def : FlatStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats_t16 { + def : FlatStorePat (!cast(inst)#"_t16"), node, vt>; + + def : FlatStoreSaddrPat(!cast(inst)#"_SADDR_t16"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + let OtherPredicates = [HasFlatAddressSpace] in { defm : FlatLoadPats ; @@ -1682,10 +1700,10 @@ let True16Predicate = p in { defm : FlatLoadPats ; defm : FlatLoadPats ; defm : FlatLoadPats ; - def : FlatStorePat ; - def : FlatStorePat ; - def : FlatStorePat ; - def : FlatStorePat ; + defm : FlatStorePats ; + defm : FlatStorePats ; + defm : FlatStorePats ; + defm : FlatStorePats ; } let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in { @@ -1697,8 +1715,8 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi defm : FlatLoadPats_D16_t16; defm : 
FlatLoadPats_D16_t16; defm : FlatLoadPats_D16_t16; - def : FlatStorePat ; - def : FlatStorePat ; + defm : FlatStorePats_t16 ; + defm : FlatStorePats_t16 ; def : FlatStorePat ; def : FlatStorePat ; } // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts @@ -1706,30 +1724,31 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi defm : FlatLoadPats ; defm : FlatLoadPats ; -def : FlatStorePat ; -def : FlatStorePat ; +defm : FlatStorePats ; +defm : FlatStorePats ; foreach vt = Reg32Types.types in { defm : FlatLoadPats ; -def : FlatStorePat ; +defm : FlatStorePats ; } foreach vt = VReg_64.RegTypes in { -def : FlatStorePat ; +defm : FlatStorePats ; def : FlatLoadPat ; } -def : FlatStorePat ; +defm : FlatStorePats ; foreach vt = VReg_128.RegTypes in { defm : FlatLoadPats ; -def : FlatStorePat ; +defm : FlatStorePats ; } -def : FlatStorePat ; -def : FlatStorePat ; -def : FlatStorePat ; -def : FlatStorePat ; +defm : FlatStorePats ; +defm : FlatStorePats ; +defm : FlatStorePats ; +defm : FlatStorePats ; + foreach as = [ "flat", "global" ] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>; @@ -1780,8 +1799,8 @@ let SubtargetPredicate = isGFX12Plus in { } let OtherPredicates = [HasD16LoadStore] in { -def : FlatStorePat ; -def : FlatStorePat ; +defm : FlatStorePats ; +defm : FlatStorePats ; } let OtherPredicates = [D16PreservesUnusedBits] in { diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll new file mode 100644 index 0000000000000..32888d2acf1cd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll @@ -0,0 +1,1118 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck 
-check-prefixes=GFX1250,GFX1250-GISEL %s + +; Test using saddr addressing mode of flat_*store_* instructions. + +define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { +; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] +; GFX1250-NEXT: s_endpgm + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i8 %data, ptr %gep0 + ret void +} + +; Maximum positive offset on gfx10 (folded into the store's immediate offset on gfx1250) +define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_2047(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { +; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr_offset_2047: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:2047 +; GFX1250-NEXT: s_endpgm + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 2047 + store i8 %data, ptr %gep1 + ret void +} + +; Maximum negative offset on gfx10 (folded into the store's immediate offset on gfx1250) +define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_neg2048(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { +; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr_offset_neg2048: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:-2048 +; GFX1250-NEXT: s_endpgm + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048 + store i8 %data, ptr %gep1 + ret void +} + +; 
-------------------------------------------------------------------------------- +; Uniformity edge cases +; -------------------------------------------------------------------------------- + +@ptr.in.lds = internal addrspace(3) global ptr undef + +; Base pointer is uniform, but also in VGPRs +define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_uniform_ptr_in_vgprs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_store_b8 v0, v1, s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_store_b8 v[2:3], v1 +; GFX1250-GISEL-NEXT: s_endpgm + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i8 %data, ptr %gep0 + ret void +} + +; Base pointer is uniform, but also in VGPRs, with imm offset +define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset, i8 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_store_b8 v0, v1, s[0:1] 
offset:-120 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_store_b8 v[2:3], v1 offset:-120 +; GFX1250-GISEL-NEXT: s_endpgm + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -120 + store i8 %data, ptr %gep1 + ret void +} + +; -------------------------------------------------------------------------------- +; Stress stores of various data types +; -------------------------------------------------------------------------------- + +define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, i16 %data) { +; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i16 %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i16 %data) { +; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store i16 %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, half %data) { +; 
GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store half %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, half %data) { +; GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store half %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i32 %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store i32 %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, float %data) { +; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr 
inbounds i8, ptr %sbase, i64 %zext.offset + store float %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, float %data) { +; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store float %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) { +; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store ptr addrspace(3) %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) { +; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store ptr addrspace(3) %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: 
v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i64 %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store i64 %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr(ptr inreg %sbase, i32 %voffset, double %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store double %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, double %data) { +; 
GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store double %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x i32> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; 
GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <2 x i32> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x float> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <2 x float> %data, ptr 
%gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x i16> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <4 x i16> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm 
+; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x half> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <4 x half> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr(ptr inreg %sbase, i32 %voffset, ptr %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store ptr %data, ptr 
%gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, ptr %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store ptr %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <3 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v3i32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <3 x i32> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <3 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v3i32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 
v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <3 x i32> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <3 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v3f32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v3f32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <3 x float> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <3 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v3f32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: 
flat_store_saddr_v3f32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <3 x float> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <6 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6i16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <6 x i16> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <6 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6i16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; 
GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <6 x i16> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <6 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6f16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <6 x half> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <6 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6f16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 
%zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  store <6 x half> %data, ptr %gep1
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x i32> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v4i32_zext_vgpr:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  store <4 x i32> %data, ptr %gep0
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x i32> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v4i32_zext_vgpr_offset_neg128:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr_offset_neg128:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  store <4 x i32> %data, ptr %gep1
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x float> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v4f32_zext_vgpr:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  store <4 x float> %data, ptr %gep0
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x float> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v4f32_zext_vgpr_offset_neg128:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr_offset_neg128:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  store <4 x float> %data, ptr %gep1
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i64> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v2i64_zext_vgpr:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  store <2 x i64> %data, ptr %gep0
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i64> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v2i64_zext_vgpr_offset_neg128:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr_offset_neg128:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  store <2 x i64> %data, ptr %gep1
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x double> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v2f64_zext_vgpr:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  store <2 x double> %data, ptr %gep0
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x double> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v2f64_zext_vgpr_offset_neg128:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr_offset_neg128:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  store <2 x double> %data, ptr %gep1
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <8 x i16> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v8i16_zext_vgpr:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  store <8 x i16> %data, ptr %gep0
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <8 x i16> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v8i16_zext_vgpr_offset_neg128:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr_offset_neg128:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  store <8 x i16> %data, ptr %gep1
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <8 x half> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v8f16_zext_vgpr:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  store <8 x half> %data, ptr %gep0
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <8 x half> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v8f16_zext_vgpr_offset_neg128:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr_offset_neg128:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  store <8 x half> %data, ptr %gep1
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x ptr> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v2p1_zext_vgpr:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  store <2 x ptr> %data, ptr %gep0
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x ptr> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v2p1_zext_vgpr_offset_neg128:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr_offset_neg128:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  store <2 x ptr> %data, ptr %gep1
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v4p3_zext_vgpr:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  store <4 x ptr addrspace(3)> %data, ptr %gep0
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) {
+; GFX1250-SDAG-LABEL: flat_store_saddr_v4p3_zext_vgpr_offset_neg128:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr_offset_neg128:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  store <4 x ptr addrspace(3)> %data, ptr %gep1
+  ret void
+}
+
+; --------------------------------------------------------------------------------
+; Atomic store (seq_cst; checks expect global_wb and s_wait_storecnt ahead of the store)
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, i32 %data) {
+; GFX1250-LABEL: atomic_flat_store_saddr_i32_zext_vgpr:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
+; GFX1250-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  store atomic i32 %data, ptr %gep0 seq_cst, align 4
+  ret void
+}
+
+define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
+; GFX1250-LABEL: atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    flat_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SYS
+; GFX1250-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  store atomic i32 %data, ptr %gep1 seq_cst, align 4
+  ret void
+}
+
+define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, i64 %data) {
+; GFX1250-SDAG-LABEL: atomic_flat_store_saddr_i64_zext_vgpr:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3] scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: atomic_flat_store_saddr_i64_zext_vgpr:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3] scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  store atomic i64 %data, ptr %gep0 seq_cst, align 8
+  ret void
+}
+
+define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
+; GFX1250-SDAG-LABEL: atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128:
+; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-SDAG-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SYS
+; GFX1250-SDAG-NEXT:    s_endpgm
+;
+; GFX1250-GISEL-LABEL: atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128:
+; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT:    global_wb scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_SYS
+; GFX1250-GISEL-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  store atomic i64 %data, ptr %gep1 seq_cst, align 8
+  ret void
+}
+
+; --------------------------------------------------------------------------------
+; D16 HI store (hi 16): the stored value is element 1 of a <2 x i16>
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
+; GFX1250-LABEL: flat_store_saddr_i16_d16hi_zext_vgpr:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    flat_store_d16_hi_b16 v0, v1, s[2:3]
+; GFX1250-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %data.hi = extractelement <2 x i16> %data, i32 1
+  store i16 %data.hi, ptr %gep0
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
+; GFX1250-LABEL: flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    flat_store_d16_hi_b16 v0, v1, s[2:3] offset:-128
+; GFX1250-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %data.hi = extractelement <2 x i16> %data, i32 1
+  store i16 %data.hi, ptr %gep1
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
+; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    flat_store_d16_hi_b8 v0, v1, s[2:3]
+; GFX1250-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %data.hi = extractelement <2 x i16> %data, i32 1
+  %data.hi.trunc = trunc i16 %data.hi to i8
+  store i8 %data.hi.trunc, ptr %gep0
+  ret void
+}
+
+define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
+; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    flat_store_d16_hi_b8 v0, v1, s[2:3] offset:-128
+; GFX1250-NEXT:    s_endpgm
+  %zext.offset = zext i32 %voffset to i64
+  %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
+  %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
+  %data.hi = extractelement <2 x i16> %data, i32 1
+  %data.hi.trunc = trunc i16 %data.hi to i8
+  store i8 %data.hi.trunc, ptr %gep1
+  ret void
+}