@@ -31,12 +31,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(float* %ptr) {
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: buffer_wbl2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_invl2
; GFX940-NEXT: buffer_wbinvl1_vol
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
ret void
@@ -49,12 +48,11 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(float* %ptr) #0 {
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX940-NEXT: buffer_wbl2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_invl2
; GFX940-NEXT: buffer_wbinvl1_vol
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
ret void
@@ -76,12 +74,11 @@ define float @flat_atomic_fadd_f32_rtn_pat(float* %ptr, float %data) {
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: buffer_wbl2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_invl2
; GFX940-NEXT: buffer_wbinvl1_vol
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
%ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst
ret float %ret
@@ -195,12 +192,11 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3)
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NEXT: buffer_wbl2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_pk_add_bf16 v0, v1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_invl2
; GFX940-NEXT: buffer_wbinvl1_vol
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_endpgm
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data)
ret void
@@ -210,12 +206,11 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(3)* %ptr, <2
; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_invl2
; GFX940-NEXT: buffer_wbinvl1_vol
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
%ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data)
ret <2 x i16> %ret

Large diffs are not rendered by default.

Large diffs are not rendered by default.

@@ -5,6 +5,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s

define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
@@ -84,6 +86,32 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_0:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4, !nontemporal !0
@@ -179,6 +207,36 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX940-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_nontemporal_load_1:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX940-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] nt
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -266,6 +324,32 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_0:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4
@@ -361,6 +445,36 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, s1
; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[2:3]
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-NOTTGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX940-NOTTGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: flat_nontemporal_store_1:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, s1
; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[2:3]
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GFX940-TGSPLIT-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s

define amdgpu_kernel void @global_nontemporal_load_0(
; GFX6-LABEL: global_nontemporal_load_0:
@@ -91,6 +93,28 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_0:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_nontemporal_load_0:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
@@ -191,6 +215,28 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_load_1:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_load_dword v0, v0, s[0:1] nt
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_nontemporal_load_1:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_load_dword v0, v0, s[0:1] nt
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -284,6 +330,28 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_0:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_nontemporal_store_0:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -379,6 +447,28 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: global_nontemporal_store_1:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: global_nontemporal_store_1:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s

define amdgpu_kernel void @local_nontemporal_load_0(
; GFX6-LABEL: local_nontemporal_load_0:
@@ -99,6 +101,30 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: local_nontemporal_load_0:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
@@ -201,6 +227,30 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-NOTTGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: local_nontemporal_load_1:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-TGSPLIT-NEXT: ds_read_b32 v0, v0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -299,6 +349,30 @@ define amdgpu_kernel void @local_nontemporal_store_0(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s4
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: local_nontemporal_store_0:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s4
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -398,6 +472,30 @@ define amdgpu_kernel void @local_nontemporal_store_1(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NOTTGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: local_nontemporal_store_1:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

@@ -6,6 +6,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s

define amdgpu_kernel void @private_nontemporal_load_0(
; GFX6-LABEL: private_nontemporal_load_0:
@@ -125,6 +127,28 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_load_0:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, off, s4 nt
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
@@ -253,6 +277,30 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v1, v0, s[0:1]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-NOTTGSPLIT-NEXT: scratch_load_dword v0, v0, off nt
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_load_1:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, 0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-TGSPLIT-NEXT: scratch_load_dword v0, v0, off nt
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -378,6 +426,28 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-TGSPLIT-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword off, v0, s4 nt
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_store_0:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX940-TGSPLIT-NEXT: scratch_store_dword off, v0, s4 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -504,6 +574,30 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX90A-TGSPLIT-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX940-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-NOTTGSPLIT-NEXT: scratch_store_dword v0, v1, off nt
; GFX940-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX940-TGSPLIT-LABEL: private_nontemporal_store_1:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX940-TGSPLIT-NEXT: s_load_dword s4, s[0:1], 0x8
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s4
; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, off nt
; GFX940-TGSPLIT-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()