Skip to content

Commit

Permalink
[PPCGCodeGen] [3/3] Update PPCGCodeGen + tests to latest ppcg.
Browse files Browse the repository at this point in the history
This commit *WILL COMPILE*.

1. `PPCG` now uses `isl_multi_pw_aff` instead of an array of `pw_aff`.
   This needs us to adjust how we index array bounds and how we construct
   array bounds.

2. `PPCG` introduces two new kinds of nodes: `init_device` and `clear_device`.
   We should investigate the correct way to handle these.

3. `PPCG` has gotten smarter with its use of live range reordering, so some of
   the tests have a qualitative improvement.

4. `PPCG` changed its output style, so many test cases need to be updated to
   fit the new style for `polly-acc-dump-code` checks.

Differential Revision: https://reviews.llvm.org/D35677

llvm-svn: 308625
  • Loading branch information
bollu committed Jul 20, 2017
1 parent 3d4d752 commit 9e3db2b
Show file tree
Hide file tree
Showing 14 changed files with 151 additions and 157 deletions.
97 changes: 77 additions & 20 deletions polly/lib/CodeGen/PPCGCodeGeneration.cpp
Expand Up @@ -137,7 +137,11 @@ struct MustKillsInfo {
/// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] }
isl::union_map TaggedMustKills;

MustKillsInfo() : KillsSchedule(nullptr), TaggedMustKills(nullptr){};
/// Tagged must kills stripped of the tags.
/// [params] -> { Stmt_phantom[] -> scalar_to_kill[] }
isl::union_map MustKills;

MustKillsInfo() : KillsSchedule(nullptr) {}
};

/// Check if SAI's uses are entirely contained within Scop S.
Expand Down Expand Up @@ -179,6 +183,7 @@ static MustKillsInfo computeMustKillsInfo(const Scop &S) {
}

Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace));
Info.MustKills = isl::union_map::empty(isl::space(ParamSpace));

// Initialising KillsSchedule to `isl_set_empty` creates an empty node in the
// schedule:
Expand Down Expand Up @@ -225,6 +230,9 @@ static MustKillsInfo computeMustKillsInfo(const Scop &S) {
isl::map TaggedMustKill = StmtToScalar.domain_product(PhantomRefToScalar);
Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill);

// 2. [param] -> { Stmt[] -> scalar_to_kill[] }
Info.MustKills = Info.TaggedMustKills.domain_factor_domain();

// 3. Create the kill schedule of the form:
// "[param] -> { Stmt_phantom[] }"
// Then add this to Info.KillsSchedule.
Expand Down Expand Up @@ -1004,11 +1012,11 @@ Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);

if (!gpu_array_is_scalar(Array)) {
auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]);
auto OffsetDimZero = isl_multi_pw_aff_get_pw_aff(Array->bound, 0);
isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero);

for (unsigned int i = 1; i < Array->n_index; i++) {
isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]);
isl_pw_aff *Bound_I = isl_multi_pw_aff_get_pw_aff(Array->bound, i);
isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
Res = isl_ast_expr_mul(Res, Expr);
}
Expand Down Expand Up @@ -1048,7 +1056,7 @@ Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {

for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) {
if (i > 0) {
isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]);
isl_pw_aff *Bound_I = isl_multi_pw_aff_get_pw_aff(Array->bound, i - 1);
isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
Result = isl_ast_expr_mul(Result, BExpr);
}
Expand Down Expand Up @@ -1152,7 +1160,18 @@ void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
isl_ast_expr_free(Expr);
return;
}

if (!strcmp(Str, "init_device")) {
initializeAfterRTH();
isl_ast_node_free(UserStmt);
isl_ast_expr_free(Expr);
return;
}
if (!strcmp(Str, "clear_device")) {
finalize();
isl_ast_node_free(UserStmt);
isl_ast_expr_free(Expr);
return;
}
if (isPrefix(Str, "to_device")) {
if (!ManagedMemory)
createDataTransfer(UserStmt, HOST_TO_DEVICE);
Expand Down Expand Up @@ -1766,7 +1785,7 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
Sizes.push_back(nullptr);
for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
Build, isl_multi_pw_aff_get_pw_aff(Kernel->array[i].array->bound, j));
auto V = ExprBuilder.create(DimSize);
Sizes.push_back(SE.getSCEV(V));
}
Expand Down Expand Up @@ -2127,6 +2146,7 @@ class PPCGCodeGeneration : public ScopPass {

Options->debug = DebugOptions;

Options->group_chains = false;
Options->reschedule = true;
Options->scale_tile_loops = false;
Options->wrap = false;
Expand All @@ -2135,17 +2155,26 @@ class PPCGCodeGeneration : public ScopPass {
Options->ctx = nullptr;
Options->sizes = nullptr;

Options->tile = true;
Options->tile_size = 32;

Options->isolate_full_tiles = false;

Options->use_private_memory = PrivateMemory;
Options->use_shared_memory = SharedMemory;
Options->max_shared_memory = 48 * 1024;

Options->target = PPCG_TARGET_CUDA;
Options->openmp = false;
Options->linearize_device_arrays = true;
Options->live_range_reordering = false;
Options->allow_gnu_extensions = false;

Options->unroll_copy_shared = false;
Options->unroll_gpu_tile = false;
Options->live_range_reordering = true;

Options->live_range_reordering = true;
Options->hybrid = false;
Options->opencl_compiler_options = nullptr;
Options->opencl_use_gpu = false;
Options->opencl_n_include_file = 0;
Expand Down Expand Up @@ -2260,6 +2289,8 @@ class PPCGCodeGeneration : public ScopPass {
///
/// @returns A new ppcg scop.
ppcg_scop *createPPCGScop() {
MustKillsInfo KillsInfo = computeMustKillsInfo(*S);

auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

PPCGScop->options = createPPCGOptions();
Expand All @@ -2271,7 +2302,8 @@ class PPCGCodeGeneration : public ScopPass {

PPCGScop->context = S->getContext();
PPCGScop->domain = S->getDomains();
PPCGScop->call = nullptr;
// TODO: investigate this further. PPCG calls collect_call_domains.
PPCGScop->call = isl_union_set_from_set(S->getContext());
PPCGScop->tagged_reads = getTaggedReads();
PPCGScop->reads = S->getReads();
PPCGScop->live_in = nullptr;
Expand All @@ -2280,6 +2312,9 @@ class PPCGCodeGeneration : public ScopPass {
PPCGScop->tagged_must_writes = getTaggedMustWrites();
PPCGScop->must_writes = S->getMustWrites();
PPCGScop->live_out = nullptr;
PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take();
PPCGScop->must_kills = KillsInfo.MustKills.take();

PPCGScop->tagger = nullptr;
PPCGScop->independence =
isl_union_map_empty(isl_set_get_space(PPCGScop->context));
Expand All @@ -2291,19 +2326,17 @@ class PPCGCodeGeneration : public ScopPass {
PPCGScop->tagged_dep_order = nullptr;

PPCGScop->schedule = S->getScheduleTree();

MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
// If we have something non-trivial to kill, add it to the schedule
if (KillsInfo.KillsSchedule.get())
PPCGScop->schedule = isl_schedule_sequence(
PPCGScop->schedule, KillsInfo.KillsSchedule.take());
PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take();

PPCGScop->names = getNames();
PPCGScop->pet = nullptr;

compute_tagger(PPCGScop);
compute_dependences(PPCGScop);
eliminate_dead_code(PPCGScop);

return PPCGScop;
}
Expand Down Expand Up @@ -2458,14 +2491,23 @@ class PPCGCodeGeneration : public ScopPass {
/// @param PPCGArray The array to compute bounds for.
/// @param Array The polly array from which to take the information.
void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
isl_pw_aff_list *BoundsList =
isl_pw_aff_list_alloc(S->getIslCtx(), PPCGArray.n_index);
std::vector<isl::pw_aff> PwAffs;

isl_space *AlignSpace = S->getParamSpace();
AlignSpace = isl_space_add_dims(AlignSpace, isl_dim_set, 1);

if (PPCGArray.n_index > 0) {
if (isl_set_is_empty(PPCGArray.extent)) {
isl_set *Dom = isl_set_copy(PPCGArray.extent);
isl_local_space *LS = isl_local_space_from_space(
isl_space_params(isl_set_get_space(Dom)));
isl_set_free(Dom);
isl_aff *Zero = isl_aff_zero_on_domain(LS);
PPCGArray.bound[0] = isl_pw_aff_from_aff(Zero);
isl_pw_aff *Zero = isl_pw_aff_from_aff(isl_aff_zero_on_domain(LS));
Zero = isl_pw_aff_align_params(Zero, isl_space_copy(AlignSpace));
PwAffs.push_back(isl::manage(isl_pw_aff_copy(Zero)));
BoundsList = isl_pw_aff_list_insert(BoundsList, 0, Zero);
} else {
isl_set *Dom = isl_set_copy(PPCGArray.extent);
Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
Expand All @@ -2478,7 +2520,9 @@ class PPCGCodeGeneration : public ScopPass {
One = isl_aff_add_constant_si(One, 1);
Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
Bound = isl_pw_aff_gist(Bound, S->getContext());
PPCGArray.bound[0] = Bound;
Bound = isl_pw_aff_align_params(Bound, isl_space_copy(AlignSpace));
PwAffs.push_back(isl::manage(isl_pw_aff_copy(Bound)));
BoundsList = isl_pw_aff_list_insert(BoundsList, 0, Bound);
}
}

Expand All @@ -2487,8 +2531,20 @@ class PPCGCodeGeneration : public ScopPass {
auto LS = isl_pw_aff_get_domain_space(Bound);
auto Aff = isl_multi_aff_zero(LS);
Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
PPCGArray.bound[i] = Bound;
Bound = isl_pw_aff_align_params(Bound, isl_space_copy(AlignSpace));
PwAffs.push_back(isl::manage(isl_pw_aff_copy(Bound)));
BoundsList = isl_pw_aff_list_insert(BoundsList, i, Bound);
}

isl_space_free(AlignSpace);
isl_space *BoundsSpace = isl_set_get_space(PPCGArray.extent);

assert(BoundsSpace && "Unable to access space of array.");
assert(BoundsList && "Unable to access list of bounds.");

PPCGArray.bound =
isl_multi_pw_aff_from_pw_aff_list(BoundsSpace, BoundsList);
assert(PPCGArray.bound && "PPCGArray.bound was not constructed correctly.");
}

/// Create the arrays for @p PPCGProg.
Expand All @@ -2511,8 +2567,6 @@ class PPCGCodeGeneration : public ScopPass {
PPCGArray.name = strdup(Array->getName().c_str());
PPCGArray.extent = nullptr;
PPCGArray.n_index = Array->getNumberOfDimensions();
PPCGArray.bound =
isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
PPCGArray.extent = getExtent(Array);
PPCGArray.n_ref = 0;
PPCGArray.refs = nullptr;
Expand All @@ -2527,6 +2581,7 @@ class PPCGCodeGeneration : public ScopPass {
PPCGArray.dep_order = nullptr;
PPCGArray.user = Array;

PPCGArray.bound = nullptr;
setArrayBounds(PPCGArray, Array);
i++;

Expand Down Expand Up @@ -2570,6 +2625,7 @@ class PPCGCodeGeneration : public ScopPass {
isl_union_map_copy(PPCGScop->tagged_must_kills);
PPCGProg->to_inner = getArrayIdentity();
PPCGProg->to_outer = getArrayIdentity();
// TODO: verify that this assignment is correct.
PPCGProg->any_to_outer = nullptr;

// this needs to be set when live range reordering is enabled.
Expand Down Expand Up @@ -2962,15 +3018,16 @@ class PPCGCodeGeneration : public ScopPass {
Condition = isl_ast_expr_and(Condition, SufficientCompute);
isl_ast_build_free(Build);

// preload invariant loads. Note: This should happen before the RTC
// because the RTC may depend on values that are invariant load hoisted.
NodeBuilder.preloadInvariantLoads();

Value *RTC = NodeBuilder.createRTC(Condition);
Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);

Builder.SetInsertPoint(&*StartBlock->begin());

NodeBuilder.initializeAfterRTH();
NodeBuilder.preloadInvariantLoads();
NodeBuilder.create(Root);
NodeBuilder.finalize();

/// In case a sequential kernel has more surrounding loops as any parallel
/// kernel, the SCoP is probably mostly sequential. Hence, there is no
Expand Down
5 changes: 2 additions & 3 deletions polly/test/GPGPU/host-control-flow.ll
Expand Up @@ -14,9 +14,7 @@

; REQUIRES: pollyacc

; CODE: # host
; CODE-NEXT: {
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1)
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
Expand All @@ -26,6 +24,7 @@
; CODE-NEXT: }

; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
; CODE-NEXT: }

; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
Expand Down
6 changes: 1 addition & 5 deletions polly/test/GPGPU/host-statement.ll
Expand Up @@ -18,11 +18,7 @@ declare void @llvm.lifetime.start(i64, i8* nocapture) #0
; This test case tests that we can correctly handle a ScopStmt that is
; scheduled on the host, instead of within a kernel.

; CODE-LABEL: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: {
Expand Down
7 changes: 1 addition & 6 deletions polly/test/GPGPU/invalid-kernel.ll
Expand Up @@ -20,11 +20,7 @@
; were we still lack proper code-generation support. We check here that we
; detect the invalid IR and bail out gracefully.

; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
Expand All @@ -34,7 +30,6 @@
; CODE-NEXT: }

; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
; CODE-NEXT: }

; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
Expand Down
9 changes: 5 additions & 4 deletions polly/test/GPGPU/kernel-params-only-some-arrays.ll
Expand Up @@ -21,7 +21,7 @@
; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"

; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A)
; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B)
; KERNEL-NEXT: entry:
; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-NEXT: %b0 = zext i32 %0 to i64
Expand All @@ -36,7 +36,7 @@
; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"

; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_B)
; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A)
; KERNEL-NEXT: entry:
; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-NEXT: %b0 = zext i32 %0 to i64
Expand All @@ -47,18 +47,19 @@
; KERNEL-NEXT: }


; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]

; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*
; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]


target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @kernel_params_only_some_arrays(float* %A, float* %B) {
Expand Down

0 comments on commit 9e3db2b

Please sign in to comment.