Skip to content

Commit

Permalink
AMDGPU: Handle non-temporal loads and stores
Browse files Browse the repository at this point in the history
Differential Revision: https://reviews.llvm.org/D36862

llvm-svn: 312729
  • Loading branch information
kzhuravl committed Sep 7, 2017
1 parent 257132a commit 5f5b586
Show file tree
Hide file tree
Showing 6 changed files with 586 additions and 32 deletions.
82 changes: 59 additions & 23 deletions llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
Expand Up @@ -52,13 +52,15 @@ class SIMemOpInfo final {
SyncScope::ID SSID = SyncScope::System;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
bool IsNonTemporal = false;

/// \brief Constructs info with the given synchronization scope and ordering;
/// FailureOrdering and IsNonTemporal keep their member defaults
/// (NotAtomic / false).
SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering)
: SSID(SSID), Ordering(Ordering) {}

SIMemOpInfo(SyncScope::ID SSID, AtomicOrdering Ordering,
AtomicOrdering FailureOrdering)
: SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering) {}
AtomicOrdering FailureOrdering, bool IsNonTemporal = false)
: SSID(SSID), Ordering(Ordering), FailureOrdering(FailureOrdering),
IsNonTemporal(IsNonTemporal) {}

/// \returns Info constructed from \p MI, which has at least machine memory
/// operand.
Expand All @@ -81,6 +83,11 @@ class SIMemOpInfo final {
/// \returns Failure ordering constraint of the machine instruction used to
/// create this SIMemOpInfo.
AtomicOrdering getFailureOrdering() const {
return FailureOrdering;
}
/// \returns True if memory access of the machine instruction used to
/// create this SIMemOpInfo is non-temporal, false otherwise.
bool isNonTemporal() const {
return IsNonTemporal;
}

/// \returns True if ordering constraint of the machine instruction used to
/// create this SIMemOpInfo is unordered or higher, false otherwise.
Expand Down Expand Up @@ -130,6 +137,34 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
/// \brief List of atomic pseudo instructions.
std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;

/// \brief Sets named bit (BitName) to "true" if present in \p MI. Returns
/// true if \p MI is modified, false otherwise.
template <uint16_t BitName>
bool enableNamedBit(const MachineBasicBlock::iterator &MI) const {
const int OpIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), BitName);
// The instruction may simply not carry this bit operand at all.
if (OpIdx < 0)
return false;

MachineOperand &BitOp = MI->getOperand(OpIdx);
// Bit is already enabled -- leave the instruction untouched.
if (BitOp.getImm() != 0)
return false;

BitOp.setImm(1);
return true;
}

/// \brief Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise. Thin wrapper over enableNamedBit.
bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit<AMDGPU::OpName::glc>(MI);
}

/// \brief Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
/// is modified, false otherwise. Thin wrapper over enableNamedBit.
bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
return enableNamedBit<AMDGPU::OpName::slc>(MI);
}

/// \brief Inserts "buffer_wbinvl1_vol" instruction \p Before or after \p MI.
/// Always returns true.
bool insertBufferWbinvl1Vol(MachineBasicBlock::iterator &MI,
Expand All @@ -139,10 +174,6 @@ class SIMemoryLegalizer final : public MachineFunctionPass {
bool insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
bool Before = true) const;

/// \brief Sets GLC bit if present in \p MI. Returns true if \p MI is
/// modified, false otherwise.
bool setGLC(const MachineBasicBlock::iterator &MI) const;

/// \brief Removes all processed atomic pseudo instructions from the current
/// function. Returns true if current function is modified, false otherwise.
bool removeAtomicPseudoMIs();
Expand Down Expand Up @@ -199,6 +230,7 @@ Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
SyncScope::ID SSID = SyncScope::SingleThread;
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
bool IsNonTemporal = true;

// Validator should check whether or not MMOs cover the entire set of
// locations accessed by the memory instruction.
Expand All @@ -217,9 +249,12 @@ Optional<SIMemOpInfo> SIMemOpInfo::constructFromMIWithMMO(
FailureOrdering =
isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
FailureOrdering : MMO->getFailureOrdering();

if (!(MMO->getFlags() & MachineMemOperand::MONonTemporal))
IsNonTemporal = false;
}

return SIMemOpInfo(SSID, Ordering, FailureOrdering);
return SIMemOpInfo(SSID, Ordering, FailureOrdering, IsNonTemporal);
}

/* static */
Expand Down Expand Up @@ -343,19 +378,6 @@ bool SIMemoryLegalizer::insertWaitcntVmcnt0(MachineBasicBlock::iterator &MI,
return true;
}

// Sets the glc bit on \p MI if the instruction has one; returns true when the
// instruction was actually modified.
bool SIMemoryLegalizer::setGLC(const MachineBasicBlock::iterator &MI) const {
// Locate the glc operand; not every memory instruction carries one.
const int Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::glc);
if (Idx < 0)
return false;

MachineOperand &GLCOp = MI->getOperand(Idx);
// Bit already set -- nothing to change.
if (GLCOp.getImm() == 1)
return false;

GLCOp.setImm(1);
return true;
}

bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
Expand All @@ -378,7 +400,7 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
MOI.getSSID() == MMI->getAgentSSID()) {
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= setGLC(MI);
Changed |= enableGLCBit(MI);

if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
Changed |= insertWaitcntVmcnt0(MI);
Expand All @@ -401,6 +423,13 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
llvm_unreachable("Unsupported synchronization scope");
}

// Atomic instructions do not have the nontemporal attribute.
if (MOI.isNonTemporal()) {
Changed |= enableGLCBit(MI);
Changed |= enableSLCBit(MI);
return Changed;
}

return Changed;
}

Expand Down Expand Up @@ -429,6 +458,13 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
llvm_unreachable("Unsupported synchronization scope");
}

// Atomic instructions do not have the nontemporal attribute.
if (MOI.isNonTemporal()) {
Changed |= enableGLCBit(MI);
Changed |= enableSLCBit(MI);
return Changed;
}

return Changed;
}

Expand Down Expand Up @@ -499,7 +535,7 @@ bool SIMemoryLegalizer::expandAtomicCmpxchg(const SIMemOpInfo &MOI,
if (MOI.getSSID() == SyncScope::SingleThread ||
MOI.getSSID() == MMI->getWorkgroupSSID() ||
MOI.getSSID() == MMI->getWavefrontSSID()) {
Changed |= setGLC(MI);
Changed |= enableGLCBit(MI);
return Changed;
}

Expand Down Expand Up @@ -536,7 +572,7 @@ bool SIMemoryLegalizer::expandAtomicRmw(const SIMemOpInfo &MOI,
if (MOI.getSSID() == SyncScope::SingleThread ||
MOI.getSSID() == MMI->getWorkgroupSSID() ||
MOI.getSSID() == MMI->getWavefrontSSID()) {
Changed |= setGLC(MI);
Changed |= enableGLCBit(MI);
return Changed;
}

Expand Down
97 changes: 97 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-nontemporal-load.ll
@@ -0,0 +1,97 @@
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s

declare i32 @llvm.amdgcn.workitem.id.x()

; Non-temporal load from private (scratch): both glc and slc are set on the
; buffer access.
; GCN-LABEL: {{^}}nontemporal_load_private_0
; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
define amdgpu_kernel void @nontemporal_load_private_0(
i32* %in, i32 addrspace(4)* %out) {
entry:
%val = load i32, i32* %in, align 4, !nontemporal !0
store i32 %val, i32* %in, align 4
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_global_0
; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}}
define amdgpu_kernel void @nontemporal_load_global_0(
i32 addrspace(1)* %in, i32 addrspace(4)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_global_1
; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}}
define amdgpu_kernel void @nontemporal_load_global_1(
i32 addrspace(1)* %in, i32 addrspace(4)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
%val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_local_0
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_load_local_0(
i32 addrspace(3)* %in, i32 addrspace(4)* %out) {
entry:
%val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_local_1
; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_load_local_1(
i32 addrspace(3)* %in, i32 addrspace(4)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
%val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_flat_0
; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
define amdgpu_kernel void @nontemporal_load_flat_0(
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
entry:
%val = load i32, i32 addrspace(4)* %in, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

; GCN-LABEL: {{^}}nontemporal_load_flat_1
; GCN: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
define amdgpu_kernel void @nontemporal_load_flat_1(
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val.gep = getelementptr inbounds i32, i32 addrspace(4)* %in, i32 %tid
%val = load i32, i32 addrspace(4)* %val.gep, align 4, !nontemporal !0
store i32 %val, i32 addrspace(4)* %out
ret void
}

!0 = !{i32 1}
97 changes: 97 additions & 0 deletions llvm/test/CodeGen/AMDGPU/memory-legalizer-nontemporal-store.ll
@@ -0,0 +1,97 @@
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=GCN --check-prefix=GFX9 %s

declare i32 @llvm.amdgcn.workitem.id.x()

; GCN-LABEL: {{^}}nontemporal_store_private_0
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_private_0(
i32 addrspace(4)* %in, i32* %out) {
entry:
%val = load i32, i32 addrspace(4)* %in, align 4
store i32 %val, i32* %out, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_private_1
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_private_1(
i32 addrspace(4)* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(4)* %in, align 4
%out.gep = getelementptr inbounds i32, i32* %out, i32 %tid
store i32 %val, i32* %out.gep, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_global_0
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_global_0(
i32 addrspace(4)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(4)* %in, align 4
store i32 %val, i32 addrspace(1)* %out, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_global_1
; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_global_1(
i32 addrspace(4)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(4)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_local_0
; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_store_local_0(
i32 addrspace(4)* %in, i32 addrspace(3)* %out) {
entry:
%val = load i32, i32 addrspace(4)* %in, align 4
store i32 %val, i32 addrspace(3)* %out, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_local_1
; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @nontemporal_store_local_1(
i32 addrspace(4)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(4)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_flat_0
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_flat_0(
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
entry:
%val = load i32, i32 addrspace(4)* %in, align 4
store i32 %val, i32 addrspace(4)* %out, !nontemporal !0
ret void
}

; GCN-LABEL: {{^}}nontemporal_store_flat_1
; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
define amdgpu_kernel void @nontemporal_store_flat_1(
i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%val = load i32, i32 addrspace(4)* %in, align 4
%out.gep = getelementptr inbounds i32, i32 addrspace(4)* %out, i32 %tid
store i32 %val, i32 addrspace(4)* %out.gep, !nontemporal !0
ret void
}

!0 = !{i32 1}

0 comments on commit 5f5b586

Please sign in to comment.