From 6d6b11a4a6a418956ac963bb9a7d9b1fd2966e3d Mon Sep 17 00:00:00 2001
From: Justin Lebar
Date: Sun, 11 Sep 2016 01:39:04 +0000
Subject: [PATCH] [NVPTX] Use ldg for explicitly invariant loads.

Summary:
With this change (plus some changes to prevent !invariant from being
clobbered within llvm), clang will be able to model the __ldg CUDA
builtin as an invariant load, rather than as a target-specific llvm
intrinsic. This will let the optimizer play with these loads --
specifically, we should be able to vectorize them in the load-store
vectorizer.

Reviewers: tra

Subscribers: jholewinski, hfinkel, llvm-commits, chandlerc

Differential Revision: https://reviews.llvm.org/D23477

llvm-svn: 281152
---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 35 +++++++++++++--------
 llvm/test/CodeGen/NVPTX/ldg-invariant.ll    | 27 ++++++++++++++++
 2 files changed, 49 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/ldg-invariant.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 14aa3f15f5c9a..7ab15ee94cf0e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -558,21 +558,30 @@ static unsigned int getCodeAddrSpace(MemSDNode *N) {
 
 static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                           unsigned CodeAddrSpace, MachineFunction *F) {
-  // To use non-coherent caching, the load has to be from global
-  // memory and we have to prove that the memory area is not written
-  // to anywhere for the duration of the kernel call, not even after
-  // the load.
+  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
+  // space.
   //
-  // To ensure that there are no writes to the memory, we require the
-  // underlying pointer to be a noalias (__restrict) kernel parameter
-  // that is never used for a write. We can only do this for kernel
-  // functions since from within a device function, we cannot know if
-  // there were or will be writes to the memory from the caller - or we
-  // could, but then we would have to do inter-procedural analysis.
-  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL ||
-      !isKernelFunction(*F->getFunction())) {
+  // We have two ways of identifying invariant loads: Loads may be explicitly
+  // marked as invariant, or we may infer them to be invariant.
+  //
+  // We currently infer invariance only for kernel function pointer params that
+  // are noalias (i.e. __restrict) and never written to.
+  //
+  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
+  // not during the SelectionDAG phase).
+  //
+  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
+  // explicitly invariant loads because these are how clang tells us to use ldg
+  // when the user uses a builtin.
+  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
+    return false;
+
+  if (N->isInvariant())
+    return true;
+
+  // Load wasn't explicitly invariant. Attempt to infer invariance.
+  if (!isKernelFunction(*F->getFunction()))
     return false;
-  }
 
   // We use GetUnderlyingObjects() here instead of
   // GetUnderlyingObject() mainly because the former looks through phi
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
new file mode 100644
index 0000000000000..40dad1f1769ba
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+; Check that invariant loads from the global addrspace are lowered to
+; ld.global.nc.
+
+; CHECK-LABEL: @ld_global
+define i32 @ld_global(i32 addrspace(1)* %ptr) {
+; CHECK: ld.global.nc.{{[a-z]}}32
+  %a = load i32, i32 addrspace(1)* %ptr, !invariant.load !0
+  ret i32 %a
+}
+
+; CHECK-LABEL: @ld_not_invariant
+define i32 @ld_not_invariant(i32 addrspace(1)* %ptr) {
+; CHECK: ld.global.{{[a-z]}}32
+  %a = load i32, i32 addrspace(1)* %ptr
+  ret i32 %a
+}
+
+; CHECK-LABEL: @ld_not_global_addrspace
+define i32 @ld_not_global_addrspace(i32 addrspace(0)* %ptr) {
+; CHECK: ld.{{[a-z]}}32
+  %a = load i32, i32 addrspace(0)* %ptr
+  ret i32 %a
+}
+
+!0 = !{}
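
Usage sketch (not part of the patch; the kernel and variable names are
illustrative only): the Summary's scenario is CUDA code like the following.
Once clang models __ldg as an ordinary load tagged !invariant.load -- roughly
"%v = load float, float addrspace(1)* %p, !invariant.load !0", as in the test
above -- this patch lowers it to ld.global.nc, and the optimizer is free to
vectorize adjacent such loads.

    // Illustrative CUDA; __restrict__ read-only input is the classic __ldg use.
    __global__ void scale(const float *__restrict__ in, float *out, float k) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      // __ldg (sm_35+) reads through the non-coherent cache; with this change
      // it can be modeled as an invariant load rather than an intrinsic.
      out[i] = k * __ldg(&in[i]);
    }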