441 changes: 384 additions & 57 deletions llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp

Large diffs are not rendered by default.

41 changes: 22 additions & 19 deletions llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -45,29 +45,32 @@ define half @fh(ptr %p) {
; ENABLED-LABEL: fh(
; ENABLED: {
; ENABLED-NEXT: .reg .b16 %rs<10>;
; ENABLED-NEXT: .reg .b32 %r<13>;
; ENABLED-NEXT: .reg .b32 %r<17>;
; ENABLED-NEXT: .reg .b64 %rd<2>;
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8];
; ENABLED-NEXT: cvt.f32.f16 %r1, %rs2;
; ENABLED-NEXT: cvt.f32.f16 %r2, %rs1;
; ENABLED-NEXT: add.rn.f32 %r3, %r2, %r1;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r3;
; ENABLED-NEXT: cvt.f32.f16 %r4, %rs4;
; ENABLED-NEXT: cvt.f32.f16 %r5, %rs3;
; ENABLED-NEXT: add.rn.f32 %r6, %r5, %r4;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r6;
; ENABLED-NEXT: cvt.f32.f16 %r7, %rs7;
; ENABLED-NEXT: cvt.f32.f16 %r8, %rs6;
; ENABLED-NEXT: add.rn.f32 %r9, %r8, %r7;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r9;
; ENABLED-NEXT: cvt.f32.f16 %r10, %rs8;
; ENABLED-NEXT: cvt.f32.f16 %r11, %rs5;
; ENABLED-NEXT: add.rn.f32 %r12, %r10, %r11;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r12;
; ENABLED-NEXT: .pragma "used_bytes_mask 1023";
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];

Member:
This does not look right. Our input is presumably an array of f16 elements, but we end up loading 4 x b32 and then appear to ignore the last two elements. It should have been ld.v2.b32, or perhaps the load should have remained ld.v4.f16.


Contributor Author:
Note the difference in the number of ld instructions in the PTX. The old output has two load instructions to load 5 b16s: a ld.v4.b16 and a ld.b16. The new version, in the LSV, "extends" the chain of 5 loads to the next power of two, a chain of 8 loads with 3 unused tail elements, vectorizing it into a single load <8 x i16>. This gets lowered by the backend to a ld.v4.b32, with 2.5 elements (containing the packed 5 b16s) ending up being used and the rest unused.

This reduction from two load instructions to one load instruction is an optimization.
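
To make the byte accounting concrete: the widened ld.v4.b32 covers 16 bytes, of which only the 5 packed b16 values (10 bytes) are consumed, and the used_bytes_mask pragma value appears to be simply the bitmask of those used bytes. A minimal C++ sketch of that calculation, illustrative only and not code from this patch:

// Illustrative sketch only (not from this patch): the used_bytes_mask
// pragma value is a bitmask with one bit per byte of the widened load
// that the original code actually consumes.
#include <cstdint>
#include <cstdio>

static std::uint32_t usedBytesMask(unsigned UsedBytes) {
  // Low UsedBytes bits set; guard the 32-byte case to avoid UB on << 32.
  return UsedBytes >= 32 ? ~0u : ((1u << UsedBytes) - 1u);
}

int main() {
  // fh(): 5 x b16 = 10 used bytes of the 16-byte ld.v4.b32 -> 1023.
  std::printf("%u\n", usedBytesMask(5 * 2));
  // callee_St4x3: 3 x b32 = 12 used bytes of the 16-byte ld.v4.b32 -> 4095.
  std::printf("%u\n", usedBytesMask(3 * 4));
}

The same accounting explains the "used_bytes_mask 4095" lines in the param-vectorize-device.ll checks below.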


Member:
I missed the 5th load of f16; the generated code looks correct.

My next question is whether this extension is always beneficial. E.g. if we do that on shared memory, it may increase bank contention due to the extra loads. In the worst case we'd waste ~25% of shared memory bandwidth for this particular extension from v5f16 to v4b32.

I think we should take AS info into account and have some sort of user-controllable knob to enable/disable the gap filling, if needed. E.g. it's probably always good for loads from global AS, it's a maybe for shared memory (fewer instructions may win over bank conflicts if the extra loads happen to be broadcast to other threads' loads, but would waste bandwidth otherwise), and we can't say much about generic AS, as it could go either way, I think.
For masked writes it's more likely to be a win, as we don't actually write extra data, so the potential downside is a possible register pressure bump.


Comment:
> if we do that on shared memory, it may potentially increase bank contention due to the extra loads.

I don't think that's a concern for CUDA GPUs. But it's a good idea to add AS as a parameter to the TTI API; other targets may want to control this feature for specific address spaces.
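
A rough sketch of what such an AS-aware knob could look like on the target side; the hook name, signature, and the override below are assumptions for illustration, not the TTI interface actually added or proposed in this patch:

// Hypothetical sketch only: names and signatures are assumptions, not
// existing llvm::TargetTransformInfo API.
namespace sketch {

// NVPTX address-space numbering, for reference.
enum : unsigned { ADDRESS_SPACE_GENERIC = 0,
                  ADDRESS_SPACE_GLOBAL = 1,
                  ADDRESS_SPACE_SHARED = 3 };

struct TTIGapFillHook {
  // Ask the target whether the LoadStoreVectorizer may widen a chain by
  // loading extra, unused elements from the given address space.
  virtual bool allowsLoadGapFill(unsigned AddrSpace) const {
    // Conservative default: only fill gaps for global loads, where the
    // extra bytes cost nothing beyond the wider transaction.
    return AddrSpace == ADDRESS_SPACE_GLOBAL;
  }
  virtual ~TTIGapFillHook() = default;
};

struct NVPTXGapFillHook final : TTIGapFillHook {
  bool allowsLoadGapFill(unsigned /*AddrSpace*/) const override {
    // Per the discussion above, the extra lanes are not a practical
    // concern on CUDA GPUs, so allow gap filling in every address space.
    return true;
  }
};

} // namespace sketch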

; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2;
; ENABLED-NEXT: mov.b32 {%rs4, %rs5}, %r1;
; ENABLED-NEXT: cvt.f32.f16 %r5, %rs5;
; ENABLED-NEXT: cvt.f32.f16 %r6, %rs4;
; ENABLED-NEXT: add.rn.f32 %r7, %r6, %r5;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs6, %r7;
; ENABLED-NEXT: cvt.f32.f16 %r8, %rs3;
; ENABLED-NEXT: cvt.f32.f16 %r9, %rs2;
; ENABLED-NEXT: add.rn.f32 %r10, %r9, %r8;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs7, %r10;
; ENABLED-NEXT: cvt.f32.f16 %r11, %rs7;
; ENABLED-NEXT: cvt.f32.f16 %r12, %rs6;
; ENABLED-NEXT: add.rn.f32 %r13, %r12, %r11;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs8, %r13;
; ENABLED-NEXT: cvt.f32.f16 %r14, %rs8;
; ENABLED-NEXT: cvt.f32.f16 %r15, %rs1;
; ENABLED-NEXT: add.rn.f32 %r16, %r14, %r15;
; ENABLED-NEXT: cvt.rn.f16.f32 %rs9, %r16;
; ENABLED-NEXT: st.param.b16 [func_retval0], %rs9;
; ENABLED-NEXT: ret;
;
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
@@ -171,8 +171,8 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by
; CHECK: .func (.param .align 16 .b8 func_retval0[12])
; CHECK-LABEL: callee_St4x3(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12]
; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8];
; CHECK: .pragma "used_bytes_mask 4095";
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0];
; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]];
; CHECK-NEXT: ret;
@@ -394,8 +394,8 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by
; CHECK-LABEL: callee_St4x7(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24];
; CHECK: .pragma "used_bytes_mask 4095";
; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16];
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
; CHECK-DAG: st.param.b32 [func_retval0+24], [[R7]];
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -110,7 +110,7 @@ define dso_local i32 @foo() {
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot1;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: st.b64 [%SP], 4294967297;
; CHECK-PTX-NEXT: st.v2.b32 [%SP], {1, 1};
; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+24], 4607182418800017408;
81 changes: 81 additions & 0 deletions llvm/test/Transforms/LoadStoreVectorizer/NVPTX/extend-chain.ll
@@ -0,0 +1,81 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S -o - %s | FileCheck %s

;; Check that the vectorizer extends a Chain to the next power of two,
;; essentially loading more vector elements than the original
;; code. Alignment and other requirements for vectorization should
;; still be met.

define void @load3to4(ptr %p) #0 {
; CHECK-LABEL: define void @load3to4(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[P_0]], i32 16, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> poison)
; CHECK-NEXT: [[V01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[V12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[V23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[EXTEND4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: ret void
;
%p.0 = getelementptr i32, ptr %p, i32 0
%p.1 = getelementptr i32, ptr %p, i32 1
%p.2 = getelementptr i32, ptr %p, i32 2

%v0 = load i32, ptr %p.0, align 16
%v1 = load i32, ptr %p.1, align 4
%v2 = load i32, ptr %p.2, align 8

ret void
}

define void @load5to8(ptr %p) #0 {
; CHECK-LABEL: define void @load5to8(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[P_0:%.*]] = getelementptr i16, ptr [[P]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr [[P_0]], i32 16, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>, <8 x i16> poison)
; CHECK-NEXT: [[V05:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
; CHECK-NEXT: [[V16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1
; CHECK-NEXT: [[V27:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2
; CHECK-NEXT: [[V38:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
; CHECK-NEXT: [[V49:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
; CHECK-NEXT: [[EXTEND10:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
; CHECK-NEXT: [[EXTEND211:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
; CHECK-NEXT: [[EXTEND412:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
; CHECK-NEXT: ret void
;
%p.0 = getelementptr i16, ptr %p, i32 0
%p.1 = getelementptr i16, ptr %p, i32 1
%p.2 = getelementptr i16, ptr %p, i32 2
%p.3 = getelementptr i16, ptr %p, i32 3
%p.4 = getelementptr i16, ptr %p, i32 4

%v0 = load i16, ptr %p.0, align 16
%v1 = load i16, ptr %p.1, align 2
%v2 = load i16, ptr %p.2, align 4
%v3 = load i16, ptr %p.3, align 8
%v4 = load i16, ptr %p.4, align 2

ret void
}

define void @load3to4_unaligned(ptr %p) #0 {
; CHECK-LABEL: define void @load3to4_unaligned(
; CHECK-SAME: ptr [[P:%.*]]) {
; CHECK-NEXT: [[P_0:%.*]] = getelementptr i32, ptr [[P]], i32 0
; CHECK-NEXT: [[P_2:%.*]] = getelementptr i32, ptr [[P]], i32 2
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P_0]], align 8
; CHECK-NEXT: [[V01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[V12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[P_2]], align 8
; CHECK-NEXT: ret void
;
%p.0 = getelementptr i32, ptr %p, i32 0
%p.1 = getelementptr i32, ptr %p, i32 1
%p.2 = getelementptr i32, ptr %p, i32 2

%v0 = load i32, ptr %p.0, align 8
%v1 = load i32, ptr %p.1, align 4
%v2 = load i32, ptr %p.2, align 8

ret void
}
37 changes: 37 additions & 0 deletions llvm/test/Transforms/LoadStoreVectorizer/NVPTX/gap-fill-cleanup.ll
@@ -0,0 +1,37 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s

; Test that gap filled instructions get deleted if they are not used
%struct.S10 = type { i32, i32, i32, i32 }

; First, confirm that gap instructions get generated and would be vectorized if the alignment is correct
define void @fillTwoGapsCanVectorize(ptr %in) {
; CHECK-LABEL: define void @fillTwoGapsCanVectorize(
; CHECK-SAME: ptr [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[IN]], i32 16, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> poison)
; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: ret void
;
%load0 = load i32, ptr %in, align 16
%getElem = getelementptr i8, ptr %in, i64 12
%load3 = load i32, ptr %getElem, align 4
ret void
}

; Then, confirm that gap instructions get deleted if the alignment prevents the vectorization
define void @fillTwoGapsCantVectorize(ptr %in) {
; CHECK-LABEL: define void @fillTwoGapsCantVectorize(
; CHECK-SAME: ptr [[IN:%.*]]) {
; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[IN]], align 4
; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr i8, ptr [[IN]], i64 12
; CHECK-NEXT: [[LOAD3:%.*]] = load i32, ptr [[GETELEM]], align 4
; CHECK-NEXT: ret void
;
%load0 = load i32, ptr %in, align 4
%getElem = getelementptr i8, ptr %in, i64 12
%load3 = load i32, ptr %getElem, align 4
ret void
}
@@ -0,0 +1,83 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -S < %s | FileCheck %s

; Test that gap filled instructions don't lose invariant metadata
%struct.S10 = type { i32, i32, i32, i32 }

; With no gaps, if every load is invariant, the vectorized load will be too.
define i32 @noGaps(ptr %in) {
; CHECK-LABEL: define i32 @noGaps(
; CHECK-SAME: ptr [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16, !invariant.load [[META0:![0-9]+]]
; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]]
; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]]
; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]]
; CHECK-NEXT: ret i32 [[SUM0123]]
;
%load0 = load i32, ptr %in, align 16, !invariant.load !0
%getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1
%load1 = load i32, ptr %getElem1, align 4, !invariant.load !0
%getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2
%load2 = load i32, ptr %getElem2, align 4, !invariant.load !0
%getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3
%load3 = load i32, ptr %getElem3, align 4, !invariant.load !0
%sum01 = add i32 %load0, %load1
%sum012 = add i32 %sum01, %load2
%sum0123 = add i32 %sum012, %load3
ret i32 %sum0123
}

; If one of the loads is not invariant, the vectorized load will not be invariant.
define i32 @noGapsMissingInvariant(ptr %in) {
; CHECK-LABEL: define i32 @noGapsMissingInvariant(
; CHECK-SAME: ptr [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[IN]], align 16
; CHECK-NEXT: [[TMP01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: [[SUM01:%.*]] = add i32 [[TMP01]], [[TMP12]]
; CHECK-NEXT: [[SUM012:%.*]] = add i32 [[SUM01]], [[TMP23]]
; CHECK-NEXT: [[SUM0123:%.*]] = add i32 [[SUM012]], [[TMP34]]
; CHECK-NEXT: ret i32 [[SUM0123]]
;
%load0 = load i32, ptr %in, align 16, !invariant.load !0
%getElem1 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 1
%load1 = load i32, ptr %getElem1, align 4, !invariant.load !0
%getElem2 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 2
%load2 = load i32, ptr %getElem2, align 4, !invariant.load !0
%getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3
%load3 = load i32, ptr %getElem3, align 4
%sum01 = add i32 %load0, %load1
%sum012 = add i32 %sum01, %load2
%sum0123 = add i32 %sum012, %load3
ret i32 %sum0123
}

; With two gaps, if every real load is invariant, the vectorized load will be too.
define i32 @twoGaps(ptr %in) {
; CHECK-LABEL: define i32 @twoGaps(
; CHECK-SAME: ptr [[IN:%.*]]) {
; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[IN]], i32 16, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> poison), !invariant.load [[META0]]
; CHECK-NEXT: [[LOAD03:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[GAPFILL4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[GAPFILL25:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[LOAD36:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: [[SUM:%.*]] = add i32 [[LOAD03]], [[LOAD36]]
; CHECK-NEXT: ret i32 [[SUM]]
;
%load0 = load i32, ptr %in, align 16, !invariant.load !0
%getElem3 = getelementptr inbounds %struct.S10, ptr %in, i64 0, i32 3
%load3 = load i32, ptr %getElem3, align 4, !invariant.load !0
%sum = add i32 %load0, %load3
ret i32 %sum
}

!0 = !{}
;.
; CHECK: [[META0]] = !{}
;.