AMDGPU : Widen extending scalar loads to 32-bits.
Differential Revision: http://reviews.llvm.org/D35146

llvm-svn: 309178
Wei Ding committed Jul 26, 2017
1 parent 92d4dd0 commit a126a13
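
For orientation, a minimal sketch of the IR rewrite this patch performs, on a hypothetical uniform i16 extending load from constant memory with 4-byte alignment (function and value names are illustrative; addrspace(2) is the constant address space used in the tests below):

; Before AMDGPUCodeGenPrepare: a sub-32-bit load from constant memory.
define amdgpu_kernel void @widen_sketch(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
  %v = load i16, i16 addrspace(2)* %in, align 4
  %ext = sext i16 %v to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; After: the load is widened to i32 and truncated back to i16, which lets the
; backend select a scalar load (cf. the buffer_load_ubyte -> s_load_dword
; change in unaligned-load-store.ll below).
define amdgpu_kernel void @widen_sketch(i32 addrspace(1)* %out, i16 addrspace(2)* %in) {
  %cast = bitcast i16 addrspace(2)* %in to i32 addrspace(2)*
  %wide = load i32, i32 addrspace(2)* %cast
  %v = trunc i32 %wide to i16
  %ext = sext i16 %v to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}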
Showing 3 changed files with 238 additions and 1 deletion.
45 changes: 45 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -18,6 +18,7 @@
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
@@ -53,6 +54,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
  DivergenceAnalysis *DA = nullptr;
  Module *Mod = nullptr;
  bool HasUnsafeFPMath = false;
  AMDGPUAS AMDGPUASI;

  /// \brief Copies exact/nsw/nuw flags (if any) from binary operation \p I to
  /// binary operation \p V.
@@ -123,6 +125,15 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
  ///
  /// \returns True.
  bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
  /// \brief Check whether a scalar load can be widened to 32 bits.
  ///
  /// \details Uniform, sub-32-bit loads from constant memory are widened to a
  /// full 32 bits and the result is truncated back to the original type, so
  /// that a scalar load can be selected instead of a vector load.
  ///
  /// \returns True if \p I can be widened.
  bool canWidenScalarExtLoad(LoadInst &I) const;

public:
  static char ID;
@@ -133,6 +144,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,

  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitICmpInst(ICmpInst &I);
  bool visitSelectInst(SelectInst &I);

@@ -223,6 +235,16 @@ static bool promotedOpIsNUW(const Instruction &I) {
  }
}

bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  const DataLayout &DL = Mod->getDataLayout();
  int TySize = DL.getTypeSizeInBits(Ty);
  unsigned Align = I.getAlignment() ?
                   I.getAlignment() : DL.getABITypeAlignment(Ty);

  return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
}
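
As a sketch of what this predicate admits (the constant-address-space check itself happens at the call site in visitLoadInst, and DA->isUniform is expected to hold for loads whose address is a kernel argument; names below are illustrative):

define amdgpu_kernel void @widen_predicate_sketch(i8 addrspace(2)* %p,
                                                  i16 addrspace(2)* %q,
                                                  i32 addrspace(2)* %r) {
  ; Qualifies: simple, uniform, sub-32-bit load with at least 4-byte alignment.
  %a = load i8, i8 addrspace(2)* %p, align 4
  ; Rejected: alignment below 4 (the Align >= 4 check fails).
  %b = load i8, i8 addrspace(2)* %p, align 2
  ; Rejected: volatile loads are not simple (I.isSimple() fails).
  %c = load volatile i16, i16 addrspace(2)* %q, align 4
  ; Rejected: already 32 bits wide (TySize < 32 fails).
  %d = load i32, i32 addrspace(2)* %r, align 4
  ret void
}

These cases mirror the checks in the new widen_extending_scalar_loads.ll test added below.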

bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");
@@ -443,6 +465,29 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
  return Changed;
}

bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
  if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
    Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
    Value *WidenLoad = Builder.CreateLoad(BitCast);

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
    return true;
  }

  return false;
}
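
The final CreateBitCast matters when the loaded type is not an integer of the truncated width. For the <2 x i8>, align-4 case exercised by the new tests, the rewritten IR looks roughly like this (value names are illustrative):

; Original: %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in, align 4
%cast  = bitcast <2 x i8> addrspace(2)* %in to i32 addrspace(2)*
%wide  = load i32, i32 addrspace(2)* %cast
%trunc = trunc i32 %wide to i16          ; IntNTy has the bit width of <2 x i8>
%ld    = bitcast i16 %trunc to <2 x i8>  ; back to the original load type

For scalar integer loads such as i8 or i16, the final bitcast is a no-op that IRBuilder folds away, which is why those tests below only check for the trunc.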

bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -519,7 +519,7 @@ define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)*
}

; SI-LABEL: {{^}}constant_align4_load_i8:
-; SI: buffer_load_ubyte
+; SI: s_load_dword
; SI: buffer_store_byte
define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
%v = load i8, i8 addrspace(2)* %p, align 4
192 changes: 192 additions & 0 deletions llvm/test/CodeGen/AMDGPU/widen_extending_scalar_loads.ll
@@ -0,0 +1,192 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-codegenprepare < %s | FileCheck -check-prefix=OPT %s

declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0

; OPT-LABEL: @constant_load_i1
; OPT: load i1
; OPT-NEXT: store i1
define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
%val = load i1, i1 addrspace(2)* %in
store i1 %val, i1 addrspace(1)* %out
ret void
}

; OPT-LABEL: @constant_load_i1_align2
; OPT: load i1
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_i1_align2(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
%val = load i1, i1 addrspace(2)* %in, align 2
store i1 %val, i1 addrspace(1)* %out, align 2
ret void
}

; OPT-LABEL: @constant_load_i1_align4
; OPT: bitcast
; OPT-NEXT: load i32
; OPT-NEXT: trunc
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_i1_align4(i1 addrspace(1)* %out, i1 addrspace(2)* %in) #0 {
%val = load i1, i1 addrspace(2)* %in, align 4
store i1 %val, i1 addrspace(1)* %out, align 4
ret void
}

; OPT-LABEL: @constant_load_i8
; OPT: load i8
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%val = load i8, i8 addrspace(2)* %in
store i8 %val, i8 addrspace(1)* %out
ret void
}

; OPT-LABEL: @constant_load_i8_align2
; OPT: load i8
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_i8_align2(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%val = load i8, i8 addrspace(2)* %in, align 2
store i8 %val, i8 addrspace(1)* %out, align 2
ret void
}

; OPT-LABEL: @constant_load_i8align4
; OPT: bitcast
; OPT-NEXT: load i32
; OPT-NEXT: trunc
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_i8align4(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
%val = load i8, i8 addrspace(2)* %in, align 4
store i8 %val, i8 addrspace(1)* %out, align 4
ret void
}


; OPT-LABEL: @constant_load_v2i8
; OPT: load <2 x i8>
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
ret void
}

; OPT-LABEL: @constant_load_v2i8_align4
; OPT: bitcast
; OPT-NEXT: load i32
; OPT-NEXT: trunc
; OPT-NEXT: bitcast
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_v2i8_align4(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%ld = load <2 x i8>, <2 x i8> addrspace(2)* %in, align 4
store <2 x i8> %ld, <2 x i8> addrspace(1)* %out, align 4
ret void
}

; OPT-LABEL: @constant_load_v3i8
; OPT: bitcast <3 x i8>
; OPT-NEXT: load i32, i32 addrspace(2)
; OPT-NEXT: trunc i32
; OPT-NEXT: bitcast i24
; OPT-NEXT: store <3 x i8>
define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
ret void
}

; OPT-LABEL: @constant_load_v3i8_align4
; OPT: bitcast <3 x i8>
; OPT-NEXT: load i32, i32 addrspace(2)
; OPT-NEXT: trunc i32
; OPT-NEXT: bitcast i24
; OPT-NEXT: store <3 x i8>
define amdgpu_kernel void @constant_load_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
%ld = load <3 x i8>, <3 x i8> addrspace(2)* %in, align 4
store <3 x i8> %ld, <3 x i8> addrspace(1)* %out, align 4
ret void
}

; OPT-LABEL: @constant_load_i16
; OPT: load i16
; OPT: sext
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_i16(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
%ld = load i16, i16 addrspace(2)* %in
%ext = sext i16 %ld to i32
store i32 %ext, i32 addrspace(1)* %out
ret void
}

; OPT-LABEL: @constant_load_i16_align4
; OPT: bitcast
; OPT-NEXT: load i32
; OPT-NEXT: trunc
; OPT-NEXT: sext
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_i16_align4(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
%ld = load i16, i16 addrspace(2)* %in, align 4
%ext = sext i16 %ld to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}

; OPT-LABEL: @constant_load_f16
; OPT: load half
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_f16(half addrspace(1)* %out, half addrspace(2)* %in) #0 {
%ld = load half, half addrspace(2)* %in
store half %ld, half addrspace(1)* %out
ret void
}

; OPT-LABEL: @constant_load_v2f16
; OPT: load <2 x half>
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %in) #0 {
%ld = load <2 x half>, <2 x half> addrspace(2)* %in
store <2 x half> %ld, <2 x half> addrspace(1)* %out
ret void
}

; OPT-LABEL: @load_volatile
; OPT: load volatile i16
; OPT-NEXT: store
define amdgpu_kernel void @load_volatile(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
%a = load volatile i16, i16 addrspace(2)* %in
store i16 %a, i16 addrspace(1)* %out
ret void
}

; OPT-LABEL: @constant_load_v2i8_volatile
; OPT: load volatile <2 x i8>
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_v2i8_volatile(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
%ld = load volatile <2 x i8>, <2 x i8> addrspace(2)* %in
store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
ret void
}

; OPT-LABEL: @constant_load_v2i8_addrspace1
; OPT: load <2 x i8>
; OPT-NEXT: store
define amdgpu_kernel void @constant_load_v2i8_addrspace1(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
%ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
ret void
}

; OPT-LABEL: @use_dispatch_ptr
; OPT: bitcast
; OPT-NEXT: load i32
; OPT-NEXT: trunc
; OPT-NEXT: zext
; OPT-NEXT: store
define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
%dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
%val = load i8, i8 addrspace(2)* %dispatch.ptr, align 4
%ld = zext i8 %val to i32
store i32 %ld, i32 addrspace(1)* %ptr
ret void
}

attributes #0 = { nounwind }
