Skip to content

Commit

Permalink
AMDGPU: Fix MUBUF offset bugs affecting llvm.amdgcn.buffer.* intrinsics
Browse files Browse the repository at this point in the history
Summary:
This fixes two related bugs. First, the generic optimization passes
unfortunately generate negative constant offsets but the hardware treats
SOffset as an unsigned value.

Second, there is a hardware bug on SI and CI, where address clamping in MUBUF
instructions does not work correctly when SOffset is larger than the buffer
size. This patch works around this bug by never using SOffset.

An alternative workaround would be to do the clamping manually when SOffset
is too large, but generating the required code sequence during instruction
selection would be rather involved, and in any case the resulting code would
probably be worse.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96360

Reviewers: arsenm, tstellarAMD

Subscribers: arsenm, llvm-commits, kzhuravl

Differential Revision: http://reviews.llvm.org/D21326

llvm-svn: 272761
  • Loading branch information
nhaehnle committed Jun 15, 2016
1 parent 3f59c0c commit a609259
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 35 deletions.
43 changes: 30 additions & 13 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Expand Up @@ -131,7 +131,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
SDValue &Offset, SDValue &SLC) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
void SelectMUBUFConstant(SDValue Constant,
bool SelectMUBUFConstant(SDValue Constant,
SDValue &SOffset,
SDValue &ImmOffset) const;
bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset,
Expand Down Expand Up @@ -1168,7 +1168,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE);
}

void AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
SDValue &SOffset,
SDValue &ImmOffset) const {
SDLoc DL(Constant);
Expand All @@ -1193,6 +1193,13 @@ void AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
}
}

// There is a hardware bug in SI and CI which prevents address clamping in
// MUBUF instructions from working correctly with SOffsets. The immediate
// offset is unaffected.
if (Overflow > 0 &&
Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
return false;

ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16);

if (Overflow <= 64)
Expand All @@ -1201,6 +1208,8 @@ void AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant,
SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
CurDAG->getTargetConstant(Overflow, DL, MVT::i32)),
0);

return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset,
Expand All @@ -1211,9 +1220,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset,
if (!isa<ConstantSDNode>(Offset))
return false;

SelectMUBUFConstant(Offset, SOffset, ImmOffset);

return true;
return SelectMUBUFConstant(Offset, SOffset, ImmOffset);
}

bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
Expand All @@ -1223,20 +1230,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset,
SDLoc DL(Offset);

// Don't generate an unnecessary voffset for constant offsets.
if (isa<ConstantSDNode>(Offset))
return false;
if (isa<ConstantSDNode>(Offset)) {
SDValue Tmp1, Tmp2;

// When necessary, use a voffset in <= CI anyway to work around a hardware
// bug.
if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS ||
SelectMUBUFConstant(Offset, Tmp1, Tmp2))
return false;
}

if (CurDAG->isBaseWithConstantOffset(Offset)) {
SDValue N0 = Offset.getOperand(0);
SDValue N1 = Offset.getOperand(1);
SelectMUBUFConstant(N1, SOffset, ImmOffset);
VOffset = N0;
} else {
SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
VOffset = Offset;
if (cast<ConstantSDNode>(N1)->getSExtValue() >= 0 &&
SelectMUBUFConstant(N1, SOffset, ImmOffset)) {
VOffset = N0;
return true;
}
}

SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
VOffset = Offset;

return true;
}

Expand Down
14 changes: 8 additions & 6 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll
@@ -1,9 +1,9 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI

;CHECK-LABEL: {{^}}test1:
;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc
;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
Expand All @@ -13,7 +13,8 @@
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
;CHECK-DAG: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc
;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc
;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}}
define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) {
Expand Down Expand Up @@ -70,7 +71,7 @@ main_body:
;CHECK-LABEL: {{^}}test3:
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc
Expand All @@ -79,7 +80,8 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc
;CHECK-DAG: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc
;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc
;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc
define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) {
main_body:
%o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0)
Expand Down
28 changes: 16 additions & 12 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
@@ -1,5 +1,5 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI

;CHECK-LABEL: {{^}}buffer_load:
;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0
Expand Down Expand Up @@ -27,11 +27,15 @@ main_body:
}

;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 61 offset:4095
;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff
;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4093
;CHECK: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff
;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:1
;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x103c
;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS]], s[0:3], 0 offen
;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 61 offset:4095
;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff
;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4093
;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen
;VI: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff
;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:1
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
main_body:
Expand All @@ -44,11 +48,11 @@ main_body:
}

;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse:
;CHECK: s_movk_i32 [[OFS:s[0-9]+]], 0xfff
;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:65
;CHECK-NOT: s_mov
;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:81
;CHECK: s_waitcnt
;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xfff
;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:65
;VI-NOT: s_mov
;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:81
;VI: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) {
main_body:
%d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0)
Expand Down
19 changes: 15 additions & 4 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@@ -1,5 +1,5 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI

;CHECK-LABEL: {{^}}buffer_load:
;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
Expand Down Expand Up @@ -27,8 +27,9 @@ main_body:
}

;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff
;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1
;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen
;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff
;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1
;CHECK: s_waitcnt
define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
main_body:
Expand Down Expand Up @@ -101,6 +102,16 @@ main_body:
ret <2 x float> %data
}

;CHECK-LABEL: {{^}}buffer_load_negative_offset:
;CHECK: v_add_i32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0
;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen
define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) {
main_body:
%ofs.1 = add i32 %ofs, -16
%data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0)
ret <4 x float> %data
}

declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
Expand Down

0 comments on commit a609259

Please sign in to comment.