Skip to content

Commit

Permalink
[GPGPU] Correctly initialize array order and fixed_element information
Browse files Browse the repository at this point in the history
Summary:
This information is necessary for PPCG to perform correct live-range reordering.
With these changes applied we can live-range reorder some of the important
kernels in COSMO.

We also update and rename one test case, which previously could not be optimized
and now is optimized thanks to live-range reordering. To preserve test coverage
we add a new test case scalar-writes-in-scop-requires-abort.ll, which exercises
our automatic abort in case of scalar writes in the kernel.

Reviewers: Meinersbur, bollu, singam-sanjay

Subscribers: nemanjai, pollydev, llvm-commits, kbarton

Tags: #polly

Differential Revision: https://reviews.llvm.org/D36929

llvm-svn: 311259
  • Loading branch information
tobiasgrosser committed Aug 19, 2017
1 parent 91522ff commit ecb94a0
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 41 deletions.
14 changes: 7 additions & 7 deletions polly/lib/CodeGen/PPCGCodeGeneration.cpp
Expand Up @@ -2815,6 +2815,9 @@ class PPCGCodeGeneration : public ScopPass {
Access->ref_id = Acc->getId().release();
Access->next = Accesses;
Access->n_index = Acc->getScopArrayInfo()->getNumberOfDimensions();
// TODO: Also mark one-element accesses to arrays as fixed-element.
Access->fixed_element =
Acc->isLatestScalarKind() ? isl_bool_true : isl_bool_false;
Accesses = Access;
}

Expand Down Expand Up @@ -3029,6 +3032,7 @@ class PPCGCodeGeneration : public ScopPass {
i++;

collect_references(PPCGProg, &PPCGArray);
PPCGArray.only_fixed_element = only_fixed_element_accessed(&PPCGArray);
}
}

Expand Down Expand Up @@ -3070,13 +3074,6 @@ class PPCGCodeGeneration : public ScopPass {
PPCGProg->to_outer = getArrayIdentity();
// TODO: verify that this assignment is correct.
PPCGProg->any_to_outer = nullptr;

// this needs to be set when live range reordering is enabled.
// NOTE: I believe that is conservatively correct. I'm not sure
// what the semantics of this is.
// Quoting PPCG/gpu.h: "Order dependences on non-scalars."
PPCGProg->array_order =
isl_union_map_empty(isl_set_get_space(PPCGScop->context));
PPCGProg->n_stmts = std::distance(S->begin(), S->end());
PPCGProg->stmts = getStatements();

Expand All @@ -3099,6 +3096,9 @@ class PPCGCodeGeneration : public ScopPass {

createArrays(PPCGProg, ValidSAIs);

PPCGProg->array_order = nullptr;
collect_order_dependences(PPCGProg);

PPCGProg->may_persist = compute_may_persist(PPCGProg);
return PPCGProg;
}
Expand Down
5 changes: 4 additions & 1 deletion polly/lib/External/ppcg/gpu.c
Expand Up @@ -162,7 +162,7 @@ static int is_read_only_scalar(struct gpu_array_info *array,
/* Is "array" only accessed as individual, fixed elements?
* That is, does each access to "array" access a single, fixed element?
*/
static isl_bool only_fixed_element_accessed(struct gpu_array_info *array)
isl_bool only_fixed_element_accessed(struct gpu_array_info *array)
{
int i;

Expand Down Expand Up @@ -250,6 +250,9 @@ static int extract_array_info(struct gpu_prog *prog,
static __isl_give isl_union_map *remove_independences(struct gpu_prog *prog,
struct gpu_array_info *array, __isl_take isl_union_map *order)
{
// We do not have independence information in Polly. Hence, make this
// function a no-op.
return order;
int i;

for (i = 0; i < prog->scop->pet->n_independence; ++i) {
Expand Down
2 changes: 2 additions & 0 deletions polly/lib/External/ppcg/gpu.h
Expand Up @@ -454,4 +454,6 @@ __isl_give isl_ast_node *generate_code(struct gpu_gen *gen,

__isl_give isl_union_set *compute_may_persist(struct gpu_prog *prog);
void collect_references(struct gpu_prog *prog, struct gpu_array_info *array);
void collect_order_dependences(struct gpu_prog *prog);
isl_bool only_fixed_element_accessed(struct gpu_array_info *array);
#endif
@@ -1,34 +1,15 @@
; RUN: opt %loadPolly -analyze -polly-use-llvm-names -polly-scops \
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP
; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \
; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \
; RUN: -polly-acc-dump-code -disable-output \
; RUN: < %s | FileCheck %s -check-prefix=CODE

; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR
; RUN: opt %loadPolly -polly-use-llvm-names -polly-scops \
; RUN: -polly-invariant-load-hoisting -polly-codegen-ppcg \
; RUN: -polly-acc-dump-kernel-ir -disable-output \
; RUN: < %s | FileCheck %s -check-prefix=KERNELIR

; REQUIRES: pollyacc

; SCOP: Function: f
; SCOP-NEXT: Region: %entry.split---%for.end
; SCOP-NEXT: Max Loop Depth: 1
; SCOP-NEXT: Invariant Accesses: {
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOP-NEXT: [tmp, tmp1] -> { Stmt_if_end[i0] -> MemRef_end[0] };
; SCOP-NEXT: Execution Context: [tmp, tmp1] -> { : }
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOP-NEXT: [tmp, tmp1] -> { Stmt_for_body[i0] -> MemRef_control[0] };
; SCOP-NEXT: Execution Context: [tmp, tmp1] -> { : tmp > 0 }
; SCOP-NEXT: }

; Check that we generate a correct "always false" branch.
; HOST-IR: br i1 false, label %polly.start, label %entry.split.pre_entry_bb

; This test case checks that we generate correct code if PPCGCodeGeneration
; decides a build is unsuccessful with invariant load hoisting enabled.
;
; There is a conditional branch which switches between the original code and
; the new code. We try to set this conditional branch to branch on false.
; However, invariant load hoisting changes the structure of the scop, so we
; need to change the way we *locate* this instruction.
;
; void f(const int *end, int *arr, const int *control, const int *readarr) {
; for (int i = 0; i < *end; i++) {
; int t = 0;
Expand All @@ -38,7 +19,20 @@
; arr[i] = t;
; }
; }
;

; This test case tests the ability to infer that `t` is local to each loop
; iteration, and can therefore be privatized.

; CODE: # kernel0
; CODE-NEXT: for (int c0 = 0; c0 <= (tmp - 32 * b0 - 1) / 1048576; c0 += 1)
; CODE-NEXT: if (tmp >= 32 * b0 + t0 + 1048576 * c0 + 1) {
; CODE-NEXT: Stmt_for_body(32 * b0 + t0 + 1048576 * c0);
; CODE-NEXT: if (tmp1 >= 4)
; CODE-NEXT: Stmt_if_then(32 * b0 + t0 + 1048576 * c0);
; CODE-NEXT: Stmt_if_end(32 * b0 + t0 + 1048576 * c0);
; CODE-NEXT: }

; KERNELIR: %private_array = alloca i32

target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
target triple = "i386-apple-macosx10.12.0"
Expand Down
15 changes: 10 additions & 5 deletions polly/test/GPGPU/non-read-only-scalars.ll
Expand Up @@ -68,11 +68,16 @@
; CODE-NEXT: Stmt_bb17();

; CODE: # kernel2
; CODE-NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) {
; CODE-NEXT: Stmt_bb18(c0);
; CODE-NEXT: if (c0 <= 31)
; CODE-NEXT: Stmt_bb20(c0);
; CODE-NEXT: }
; CODE_NEXT: {
; CODE_NEXT: read();
; CODE_NEXT: for (int c0 = 0; c0 <= 32; c0 += 1) {
; CODE_NEXT: Stmt_bb18(c0);
; CODE_NEXT: if (c0 <= 31)
; CODE_NEXT: Stmt_bb20(c0);
; CODE_NEXT: }
; CODE_NEXT: write();
; CODE_NEXT: }


; KERNEL-IR: define ptx_kernel void @FUNC_foo_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_sum_0__phi)
; KERNEL-IR: store float 0.000000e+00, float* %sum.0.phiops
Expand Down
66 changes: 66 additions & 0 deletions polly/test/GPGPU/scalar-writes-in-scop-requires-abort.ll
@@ -0,0 +1,66 @@
; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-scops \
; RUN: -polly-acc-dump-code -analyze \
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=SCOP

; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \
; RUN: -polly-acc-dump-code \
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=CODE

; RUN: opt %loadPolly -S -polly-use-llvm-names -polly-codegen-ppcg \
; RUN: -polly-invariant-load-hoisting < %s | FileCheck %s -check-prefix=HOST-IR

; REQUIRES: pollyacc

; SCOP: Invariant Accesses: {
; SCOP-NEXT: ReadAccess := [Reduction Type: NONE] [Scalar: 0]
; SCOP-NEXT: { Stmt_loop[i0] -> MemRef_p[0] };
; SCOP-NEXT: Execution Context: { : }
; SCOP-NEXT: }

; CODE: # kernel0
; CODE-NEXT: {
; CODE-NEXT: if (32 * b0 + t0 <= 1025) {
; CODE-NEXT: Stmt_loop(32 * b0 + t0);
; CODE-NEXT: write(0);
; CODE-NEXT: }
; CODE-NEXT: sync0();
; CODE-NEXT: }

; Check that we generate a correct "always false" branch.
; HOST-IR: br i1 false, label %polly.start, label %loop.pre_entry_bb

; This test case checks that we generate correct code if PPCGCodeGeneration
; decides a build is unsuccessful with invariant load hoisting enabled.
;
; There is a conditional branch which switches between the original code and
; the new code. We try to set this conditional branch to branch on false.
; However, invariant load hoisting changes the structure of the scop, so we
; need to change the way we *locate* this instruction.

target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
target triple = "i386-apple-macosx10.12.0"

; Test input: two consecutive single-block loops operating on %A.
; The first loop loads from %p on every iteration and stores a constant into
; A[indvar]; the second loop repeatedly stores the loaded value to the first
; element of %A. The value loaded in the first loop is carried into the second
; loop through a PHI node.
define void @foo(float* %A, float* %p) {
entry:
br label %loop

; First loop. The exit test below compares the pre-increment %indvar against
; 1024, so the body executes for indvar = 0 .. 1025 (this matches the
; "32 * b0 + t0 <= 1025" bound in the CODE check lines).
loop:
%indvar = phi i64 [0, %entry], [%indvar.next, %loop]
%indvar.next = add i64 %indvar, 1
; The load address does not depend on %indvar; the SCOP check lines expect it
; to be listed as an invariant read access of MemRef_p[0].
%invariant = load float, float* %p
%ptr = getelementptr float, float* %A, i64 %indvar
store float 42.0, float* %ptr
%cmp = icmp sle i64 %indvar, 1024
br i1 %cmp, label %loop, label %loop2

; Second loop. %indvar2f receives %invariant when entered from %loop and its
; own value on the back edge, so it holds the same value in every iteration.
loop2:
%indvar2 = phi i64 [0, %loop], [%indvar2.next, %loop2]
%indvar2f = phi float [%invariant, %loop], [%indvar2f, %loop2]
%indvar2.next = add i64 %indvar2, 1
; Store through the base pointer %A itself (no GEP), i.e. always the same
; location regardless of %indvar2.
store float %indvar2f, float* %A
%cmp2 = icmp sle i64 %indvar2, 1024
br i1 %cmp2, label %loop2, label %end

end:
ret void
}

0 comments on commit ecb94a0

Please sign in to comment.