[NVPTX] Unify vectorization of load/stores of aggregate arguments and return values.

The original code used vector loads/stores only for explicit vector
arguments. It could also emit more loads/stores than necessary (e.g.
v5f32 would touch 8 f32 values). Aggregate types were loaded one
element at a time, even the vectors contained within them.

This change generalizes (and simplifies) parameter-space loads/stores
so that vector loads/stores can be used more broadly. The patch has
been verified by compiling the Thrust test suite and manually checking
the differences between the PTX generated by LLVM with and without
the patch.

General algorithm:
* ComputePTXValueVTs() flattens an input/output argument into a list
  of scalars to load/store and returns their types and offsets.
* VectorizePTXValueVTs() uses that data to create a vectorization
  plan: an array of flags marking the boundaries of the vectorized
  loads/stores (see the sketch below). Scalars are represented as
  1-element vectors.
* The code that generates loads/stores implements a simple state
  machine that constructs each vector according to the plan.
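
To make the planning step concrete, here is a minimal, self-contained
C++ sketch of such a flatten-then-vectorize planner. It is an
illustration of the idea, not the code added by this patch: the PVF_*
flags mirror the plan-boundary markers described above, while Scalar,
buildPlan() and the greedy power-of-two grouping heuristic are
simplified, hypothetical stand-ins.

// Hypothetical sketch (not the patch's code): given byte offsets and sizes
// of the flattened scalars, emit per-element flags marking where each
// vectorized load/store begins and ends. Scalars are 1-element vectors.
#include <cstdint>
#include <cstdio>
#include <vector>

enum PlanFlag { PVF_INNER, PVF_FIRST, PVF_LAST, PVF_SCALAR };

struct Scalar { uint64_t Offset, Size; }; // one flattened element

// Greedily group contiguous, same-size, naturally aligned elements into
// power-of-two vectors of at most 4 lanes; everything else stays scalar.
std::vector<PlanFlag> buildPlan(const std::vector<Scalar> &Elts) {
  std::vector<PlanFlag> Plan(Elts.size(), PVF_SCALAR);
  for (size_t I = 0; I < Elts.size();) {
    unsigned W = 4; // try the widest group first: v4, then v2
    for (; W > 1; W /= 2) {
      if (I + W > Elts.size())
        continue;
      // The group must start at a naturally aligned offset...
      bool OK = Elts[I].Offset % (Elts[I].Size * W) == 0;
      // ...and its elements must be contiguous and equally sized.
      for (unsigned J = 1; OK && J < W; ++J)
        OK = Elts[I + J].Size == Elts[I].Size &&
             Elts[I + J].Offset == Elts[I].Offset + J * Elts[I].Size;
      if (OK)
        break;
    }
    if (W > 1) { // mark the boundaries of one vectorized access
      Plan[I] = PVF_FIRST;
      Plan[I + W - 1] = PVF_LAST;
      for (unsigned J = 1; J + 1 < W; ++J)
        Plan[I + J] = PVF_INNER;
    }
    I += W; // W == 1 leaves the element marked PVF_SCALAR
  }
  return Plan;
}

int main() {
  // A v5f32 argument flattens to five f32s at offsets 0, 4, 8, 12, 16.
  std::vector<Scalar> V5F32;
  for (uint64_t I = 0; I < 5; ++I)
    V5F32.push_back({I * 4, 4});
  const char *Names[] = {"INNER", "FIRST", "LAST", "SCALAR"};
  for (PlanFlag F : buildPlan(V5F32))
    std::printf("%s ", Names[F]); // prints: FIRST INNER INNER LAST SCALAR
  std::printf("\n");
}

The load/store emitter then walks the plan as a state machine:
PVF_FIRST starts accumulating elements, PVF_INNER keeps accumulating,
and PVF_LAST (or PVF_SCALAR) emits the combined access. For the v5f32
example this yields one v4 load/store plus one scalar access, rather
than touching 8 f32 values as before.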

Differential Revision: https://reviews.llvm.org/D30011

llvm-svn: 295784
Artem-B committed Feb 21, 2017
1 parent 7d6b71d commit 29bbdc1
Showing 9 changed files with 1,384 additions and 746 deletions.
1,130 changes: 420 additions & 710 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Large diffs are not rendered by default.

35 changes: 27 additions & 8 deletions llvm/test/CodeGen/NVPTX/aggregate-return.ll
@@ -1,21 +1,40 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s

 declare <2 x float> @barv(<2 x float> %input)
+declare <3 x float> @barv3(<3 x float> %input)
 declare [2 x float] @bara([2 x float] %input)
 declare {float, float} @bars({float, float} %input)

-define void @foov(<2 x float> %input, <2 x float>* %output) {
-; CHECK-LABEL: @foov
+define void @test_v2f32(<2 x float> %input, <2 x float>* %output) {
+; CHECK-LABEL: @test_v2f32
 %call = tail call <2 x float> @barv(<2 x float> %input)
 ; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: ld.param.v2.f32 {[[ELEMV1:%f[0-9]+]], [[ELEMV2:%f[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0];
 store <2 x float> %call, <2 x float>* %output, align 8
-; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEMV1]], [[ELEMV2]]}
+; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
 ret void
 }

-define void @fooa([2 x float] %input, [2 x float]* %output) {
-; CHECK-LABEL: @fooa
+define void @test_v3f32(<3 x float> %input, <3 x float>* %output) {
+; CHECK-LABEL: @test_v3f32
+;
+%call = tail call <3 x float> @barv3(<3 x float> %input)
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8];
+; Make sure we don't load more values than we need to.
+; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12];
+store <3 x float> %call, <3 x float>* %output, align 8
+; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8],
+; -- This is suboptimal. We should do st.v2.f32 instead
+; of combining 2xf32 into i64.
+; CHECK-DAG: st.u64 [{{%rd[0-9]}}],
+; CHECK: ret;
+ret void
+}
+
+define void @test_a2f32([2 x float] %input, [2 x float]* %output) {
+; CHECK-LABEL: @test_a2f32
 %call = tail call [2 x float] @bara([2 x float] %input)
 ; CHECK: .param .align 4 .b8 retval0[8];
 ; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0+0];
@@ -28,8 +47,8 @@ define void @fooa([2 x float] %input, [2 x float]* %output) {
 ; CHECK: ret
 }

-define void @foos({float, float} %input, {float, float}* %output) {
-; CHECK-LABEL: @foos
+define void @test_s2f32({float, float} %input, {float, float}* %output) {
+; CHECK-LABEL: @test_s2f32
 %call = tail call {float, float} @bars({float, float} %input)
 ; CHECK: .param .align 4 .b8 retval0[8];
 ; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0+0];
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -229,7 +229,7 @@ define half @test_tailcall_flipped(half %a, half %b) #0 {
 ; CHECK-LABEL: test_select(
 ; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0];
 ; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1];
-; CHECK: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
+; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
 ; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
 ; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
5 changes: 4 additions & 1 deletion llvm/test/CodeGen/NVPTX/ldparam-v4.ll
@@ -2,8 +2,11 @@

 declare <4 x float> @bar()

+; CHECK-LABEL: .func foo(
 define void @foo(<4 x float>* %ptr) {
-; CHECK: ld.param.v4.f32
+; CHECK: ld.param.u32 %[[PTR:r[0-9]+]], [foo_param_0];
+; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0+0];
+; CHECK: st.v4.f32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]}
 %val = tail call <4 x float> @bar()
 store <4 x float> %val, <4 x float>* %ptr
 ret void
27 changes: 14 additions & 13 deletions llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
 ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR

 ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
@@ -27,9 +27,9 @@ entry:
 ; PTX: LBB[[LABEL:[_0-9]+]]:
 ; PTX: ld.u8 %rs[[REG:[0-9]+]]
 ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
-; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
-; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
-; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
+; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
+; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; PTX: @%p[[PRED]] bra LBB[[LABEL]]
 }

 define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
@@ -45,9 +45,9 @@ entry:
 ; PTX: LBB[[LABEL:[_0-9]+]]:
 ; PTX: ld.volatile.u8 %rs[[REG:[0-9]+]]
 ; PTX: st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]]
-; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
-; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
-; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
+; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
+; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; PTX: @%p[[PRED]] bra LBB[[LABEL]]
 }

 define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
@@ -78,12 +78,13 @@ entry:
 ; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]]

 ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memset_caller(
-; PTX: ld.param.u8 %rs[[REG:[0-9]+]]
+; PTX: ld.param.u32 %r[[C:[0-9]+]]
+; PTX: cvt.u16.u32 %rs[[REG:[0-9]+]], %r[[C]];
 ; PTX: LBB[[LABEL:[_0-9]+]]:
 ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
-; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
-; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
-; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
+; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
+; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; PTX: @%p[[PRED]] bra LBB[[LABEL]]
 }

 define i8* @volatile_memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
@@ -118,15 +119,15 @@ entry:
 ; PTX-NEXT: @%p[[SRC_GT_THAN_DST]] bra LBB[[FORWARD_BB:[0-9_]+]]
 ; -- this is the backwards copying BB
 ; PTX: @%p[[NEQ0]] bra LBB[[EXIT:[0-9_]+]]
-; PTX: add.s64 %rd[[N]], %rd[[N]], -1
+; PTX: add.s64 %rd{{[0-9]}}, %rd{{[0-9]}}, -1
 ; PTX: ld.u8 %rs[[ELEMENT:[0-9]+]]
 ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]]
 ; -- this is the forwards copying BB
 ; PTX: LBB[[FORWARD_BB]]:
 ; PTX: @%p[[NEQ0]] bra LBB[[EXIT]]
 ; PTX: ld.u8 %rs[[ELEMENT2:[0-9]+]]
 ; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]]
-; PTX: add.s64 %rd[[INDEX:[0-9]+]], %rd[[INDEX]], 1
+; PTX: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 1
 ; -- exit block
 ; PTX: LBB[[EXIT]]:
 ; PTX-NEXT: st.param.b64 [func_retval0
