Skip to content

Commit

Permalink
AMDGPU: Handle non-temporal loads and stores
Browse files Browse the repository at this point in the history
Differential Revision: https://reviews.llvm.org/D36862

llvm-svn: 312729
  • Loading branch information
kzhuravl committed Sep 7, 2017
1 parent 257132a commit 5f5b586
Show file tree
Hide file tree
Showing 6 changed files with 586 additions and 32 deletions.
82 changes: 59 additions & 23 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Expand Up @@ -52,13 +52,15 @@ class SIMemOpInfo final {
SyncScope::ID SSID = SyncScope::System;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
bool IsNonTemporal = false;

/// \brief Constructs info with the given synchronization scope and ordering;
/// FailureOrdering and IsNonTemporal keep their member defaults
/// (NotAtomic / false).
SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering)
: SSID(SSID), Ordering(Ordering) {}

SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering,
AtomicOrdering FailureOrdering)
: SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering) {}
AtomicOrdering FailureOrdering, bool IsNonTemporal = false)
: SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering),
IsNonTemporal(IsNonTemporal) {}

/// \returns Info constructed from \p MI, which has at least machine memory
/// operand.
Expand All @@ -81,6 +83,11 @@ class SIMemOpInfo final {
/// \returns Failure ordering constraint of the machine instruction used to
/// create this SIMemOpInfo.
AtomicOrdering getFailureOrdering() const {
return FailureOrdering;
}
/// \returns True if memory access of the machine instruction used to
/// create this SIMemOpInfo is non-temporal, false otherwise.
bool isNonTemporal() const {
return IsNonTemporal;
}

/// \returns True if ordering constraint of the machine instruction used to
/// create this SIMemOpInfo is unordered or higher, false otherwise.
Expand Down Expand Up @@ -130,6 +137,34 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
/// \brief List of atomic pseudo instructions.
std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

/// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns
/// true if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) const {
const int OpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
// The instruction may simply not carry this bit operand at all.
if (OpIdx < 0)
return false;

MachineOperand &BitOp = MI->getOperand(OpIdx);
// Bit is already enabled -- leave the instruction untouched.
if (BitOp.getImm() != 0)
return false;

BitOp.setImm(1);
return true;
}

/// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise. Thin wrapper over enableNamedBit.
bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit<AMDGPU::OpName::glc>(MI);
}

/// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise. Thin wrapper over enableNamedBit.
bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit<AMDGPU::OpName::slc>(MI);
}

/// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI.
/// Always returns true.
bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
Expand All @@ -139,10 +174,6 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
bool Before = true) const;

/// \brief Sets GLC bit if present in \p MI. Returns true if \p MI is
/// modified, false otherwise.
bool setGLC(const MachineBasicBlock::iterator &MI) const;

/// \brief Removes all processed atomic pseudo instructions from the current
/// function. Returns true if current function is modified, false otherwise.
bool removeAtomicPseudoMIs();
Expand Down Expand Up @@ -199,6 +230,7 @@ Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
SyncScope::ID SSID = SyncScope::SingleThread;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
bool IsNonTemporal = true;

// Validator should check whether or not MMOs cover the entire set of
// locations accessed by the memory instruction.
Expand All @@ -217,9 +249,12 @@ Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
FailureOrdering =
isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
FailureOrdering : MMO->getFailureOrdering();

if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal))
IsNonTemporal = false;
}

return SIMemOpInfo(SSID, Ordering, FailureOrdering);
return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal);
}

/* static */
Expand Down Expand Up @@ -343,19 +378,6 @@ bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
return true;
}

// Sets the glc bit on \p MI if the instruction has one; returns true when the
// instruction was actually modified.
bool SIMemoryLegalizer::setGLC(const MachineBasicBlock::iterator &MI) const {
// Locate the glc operand; not every memory instruction carries one.
const int Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::glc);
if (Idx < 0)
return false;

MachineOperand &GLCOp = MI->getOperand(Idx);
// Bit already set -- nothing to change.
if (GLCOp.getImm() == 1)
return false;

GLCOp.setImm(1);
return true;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
Expand All @@ -378,7 +400,7 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
MOI.getSSID() == MMI->getAgentSSID()) {
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= setGLC(MI);
Changed |= enableGLCBit(MI);

if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= insertWaitcntVmcnt0(MI);
Expand All @@ -401,6 +423,13 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
llvm_unreachable("Unsupported synchronization scope");
}

// Atomic instructions do not have the nontemporal attribute.
if (MOI.isNonTemporal()) {
Changed |= enableGLCBit(MI);
Changed |= enableSLCBit(MI);
return Changed;
}

return Changed;
}

Expand Down Expand Up @@ -429,6 +458,13 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
llvm_unreachable("Unsupported synchronization scope");
}

// Atomic instructions do not have the nontemporal attribute.
if (MOI.isNonTemporal()) {
Changed |= enableGLCBit(MI);
Changed |= enableSLCBit(MI);
return Changed;
}

return Changed;
}

Expand Down Expand Up @@ -499,7 +535,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchg(const SIMemOpInfo &MOI,
if (MOI.getSSID() == SyncScope::SingleThread ||
MOI.getSSID() == MMI->getWorkgroupSSID() ||
MOI.getSSID() == MMI->getWavefrontSSID()) {
Changed |= setGLC(MI);
Changed |= enableGLCBit(MI);
return Changed;
}

Expand Down Expand Up @@ -536,7 +572,7 @@ bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI,
if (MOI.getSSID() == SyncScope::SingleThread ||
MOI.getSSID() == MMI->getWorkgroupSSID() ||
MOI.getSSID() == MMI->getWavefrontSSID()) {
Changed |= setGLC(MI);
Changed |= enableGLCBit(MI);
return Changed;
}

Expand Down
97 changes: 97 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-nontemporal-load.ll
@@ -0,0 +1,97 @@
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s

declare i32 @llvm.amdgcn.workitem.id.x()

; Non-temporal load from private (scratch): both glc and slc are set on the
; buffer access.
; GCN-LABEL: {{^}}nontemporal_load_private_0
; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
define amdgpu_kernel void @nontemporal_load_private_0(
i32* %in, i32 addrspace(4)* %out) {
entry:
%val = load i32, i32* %in, align 4, !nontemporal !0
store i32 %val, i32* %in, align 4
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_global_0
; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}}
define amdgpu_kernel void @nontemporal_load_global_0(
i32 addrspace(1)* %in, i32 addrspace(4)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_global_1
; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}}
define amdgpu_kernel void @nontemporal_load_global_1(
i32 addrspace(1)* %in, i32 addrspace(4)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
%val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_local_0
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_load_local_0(
i32 addrspace(3)* %in, i32 addrspace(4)* %out) {
entry:
%val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_local_1
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_load_local_1(
i32 addrspace(3)* %in, i32 addrspace(4)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
%val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_flat_0
; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
define amdgpu_kernel void @nontemporal_load_flat_0(
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
entry:
%val = load i32, i32 addrspace(4)* %in, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_flat_1
; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
define amdgpu_kernel void @nontemporal_load_flat_1(
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tid
%val = load i32, i32 addrspace(4)* %val.gep, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

!0 = !{i32 1}
97 changes: 97 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-nontemporal-store.ll
@@ -0,0 +1,97 @@
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s

declare i32 @llvm.amdgcn.workitem.id.x()

; GCN-LABEL: {{^}}nontemporal_store_private_0
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_private_0(
i32 addrspace(4)* %in, i32* %out) {
entry:
%val = load i32, i32 addrspace(4)* %in, align 4
store i32 %val, i32* %out, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_private_1
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_private_1(
i32 addrspace(4)* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(4)* %in, align 4
%out.gep = getelementptr inbounds i32, i32* %out, i32 %tid
store i32 %val, i32* %out.gep, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_global_0
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_global_0(
i32 addrspace(4)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(4)* %in, align 4
store i32 %val, i32 addrspace(1)* %out, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_global_1
; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_global_1(
i32 addrspace(4)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(4)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_local_0
; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_store_local_0(
i32 addrspace(4)* %in, i32 addrspace(3)* %out) {
entry:
%val = load i32, i32 addrspace(4)* %in, align 4
store i32 %val, i32 addrspace(3)* %out, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_local_1
; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_store_local_1(
i32 addrspace(4)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(4)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_flat_0
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_flat_0(
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
entry:
%val = load i32, i32 addrspace(4)* %in, align 4
store i32 %val, i32 addrspace(4)* %out, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_flat_1
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_flat_1(
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(4)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(4)* %out, i32 %tid
store i32 %val, i32 addrspace(4)* %out.gep, !nontemporal !0
ret void
}

!0 = !{i32 1}

0 comments on commit 5f5b586

Please sign in to comment.