Skip to content

Commit

Permalink
[PPCGCodeGen] [3/3] Update PPCGCodeGen + tests to latest ppcg.
Browse files Browse the repository at this point in the history
This commit *WILL COMPILE*.

1. `PPCG` now uses `isl_multi_pw_aff` instead of an array of `pw_aff`.
   This needs us to adjust how we index array bounds and how we construct
   array bounds.

2. `PPCG` introduces two new kinds of nodes: `init_device` and `clear_device`.
   We should investigate the correct way to handle these.

3. `PPCG` has gotten smarter with its use of live range reordering, so some of
   the tests have a qualitative improvement.

4. `PPCG` changed its output style, so many test cases need to be updated to
   fit the new style for `polly-acc-dump-code` checks.

Differential Revision: https://reviews.llvm.org/D35677

llvm-svn: 308625
  • Loading branch information
bollu committed Jul 20, 2017
1 parent 3d4d752 commit 9e3db2b
Show file tree
Hide file tree
Showing 14 changed files with 151 additions and 157 deletions.
97 changes: 77 additions & 20 deletions polly/lib/CodeGen/PPCGCodeGeneration.cpp
Expand Up @@ -137,7 +137,11 @@ struct MustKillsInfo {
/// [params] -> { [Stmt_phantom[] -> ref_phantom[]] -> scalar_to_kill[] }
isl::union_map TaggedMustKills;

MustKillsInfo() : KillsSchedule(nullptr), TaggedMustKills(nullptr){};
/// Tagged must kills stripped of the tags.
/// [params] -> { Stmt_phantom[] -> scalar_to_kill[] }
isl::union_map MustKills;

MustKillsInfo() : KillsSchedule(nullptr) {}
};

/// Check if SAI's uses are entirely contained within Scop S.
Expand Down Expand Up @@ -179,6 +183,7 @@ static MustKillsInfo computeMustKillsInfo(const Scop &S) {
}

Info.TaggedMustKills = isl::union_map::empty(isl::space(ParamSpace));
Info.MustKills = isl::union_map::empty(isl::space(ParamSpace));

// Initialising KillsSchedule to `isl_set_empty` creates an empty node in the
// schedule:
Expand Down Expand Up @@ -225,6 +230,9 @@ static MustKillsInfo computeMustKillsInfo(const Scop &S) {
isl::map TaggedMustKill = StmtToScalar.domain_product(PhantomRefToScalar);
Info.TaggedMustKills = Info.TaggedMustKills.unite(TaggedMustKill);

// 2. [param] -> { Stmt[] -> scalar_to_kill[] }
Info.MustKills = Info.TaggedMustKills.domain_factor_domain();

// 3. Create the kill schedule of the form:
// "[param] -> { Stmt_phantom[] }"
// Then add this to Info.KillsSchedule.
Expand Down Expand Up @@ -1004,11 +1012,11 @@ Value *GPUNodeBuilder::getArraySize(gpu_array_info *Array) {
Value *ArraySize = ConstantInt::get(Builder.getInt64Ty(), Array->size);

if (!gpu_array_is_scalar(Array)) {
auto OffsetDimZero = isl_pw_aff_copy(Array->bound[0]);
auto OffsetDimZero = isl_multi_pw_aff_get_pw_aff(Array->bound, 0);
isl_ast_expr *Res = isl_ast_build_expr_from_pw_aff(Build, OffsetDimZero);

for (unsigned int i = 1; i < Array->n_index; i++) {
isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i]);
isl_pw_aff *Bound_I = isl_multi_pw_aff_get_pw_aff(Array->bound, i);
isl_ast_expr *Expr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
Res = isl_ast_expr_mul(Res, Expr);
}
Expand Down Expand Up @@ -1048,7 +1056,7 @@ Value *GPUNodeBuilder::getArrayOffset(gpu_array_info *Array) {

for (long i = 0; i < isl_set_dim(Min, isl_dim_set); i++) {
if (i > 0) {
isl_pw_aff *Bound_I = isl_pw_aff_copy(Array->bound[i - 1]);
isl_pw_aff *Bound_I = isl_multi_pw_aff_get_pw_aff(Array->bound, i - 1);
isl_ast_expr *BExpr = isl_ast_build_expr_from_pw_aff(Build, Bound_I);
Result = isl_ast_expr_mul(Result, BExpr);
}
Expand Down Expand Up @@ -1152,7 +1160,18 @@ void GPUNodeBuilder::createUser(__isl_take isl_ast_node *UserStmt) {
isl_ast_expr_free(Expr);
return;
}

if (!strcmp(Str, "init_device")) {
initializeAfterRTH();
isl_ast_node_free(UserStmt);
isl_ast_expr_free(Expr);
return;
}
if (!strcmp(Str, "clear_device")) {
finalize();
isl_ast_node_free(UserStmt);
isl_ast_expr_free(Expr);
return;
}
if (isPrefix(Str, "to_device")) {
if (!ManagedMemory)
createDataTransfer(UserStmt, HOST_TO_DEVICE);
Expand Down Expand Up @@ -1766,7 +1785,7 @@ GPUNodeBuilder::createKernelFunctionDecl(ppcg_kernel *Kernel,
Sizes.push_back(nullptr);
for (long j = 1; j < Kernel->array[i].array->n_index; j++) {
isl_ast_expr *DimSize = isl_ast_build_expr_from_pw_aff(
Build, isl_pw_aff_copy(Kernel->array[i].array->bound[j]));
Build, isl_multi_pw_aff_get_pw_aff(Kernel->array[i].array->bound, j));
auto V = ExprBuilder.create(DimSize);
Sizes.push_back(SE.getSCEV(V));
}
Expand Down Expand Up @@ -2127,6 +2146,7 @@ class PPCGCodeGeneration : public ScopPass {

Options->debug = DebugOptions;

Options->group_chains = false;
Options->reschedule = true;
Options->scale_tile_loops = false;
Options->wrap = false;
Expand All @@ -2135,17 +2155,26 @@ class PPCGCodeGeneration : public ScopPass {
Options->ctx = nullptr;
Options->sizes = nullptr;

Options->tile = true;
Options->tile_size = 32;

Options->isolate_full_tiles = false;

Options->use_private_memory = PrivateMemory;
Options->use_shared_memory = SharedMemory;
Options->max_shared_memory = 48 * 1024;

Options->target = PPCG_TARGET_CUDA;
Options->openmp = false;
Options->linearize_device_arrays = true;
Options->live_range_reordering = false;
Options->allow_gnu_extensions = false;

Options->unroll_copy_shared = false;
Options->unroll_gpu_tile = false;
Options->live_range_reordering = true;

Options->live_range_reordering = true;
Options->hybrid = false;
Options->opencl_compiler_options = nullptr;
Options->opencl_use_gpu = false;
Options->opencl_n_include_file = 0;
Expand Down Expand Up @@ -2260,6 +2289,8 @@ class PPCGCodeGeneration : public ScopPass {
///
/// @returns A new ppcg scop.
ppcg_scop *createPPCGScop() {
MustKillsInfo KillsInfo = computeMustKillsInfo(*S);

auto PPCGScop = (ppcg_scop *)malloc(sizeof(ppcg_scop));

PPCGScop->options = createPPCGOptions();
Expand All @@ -2271,7 +2302,8 @@ class PPCGCodeGeneration : public ScopPass {

PPCGScop->context = S->getContext();
PPCGScop->domain = S->getDomains();
PPCGScop->call = nullptr;
// TODO: investigate this further. PPCG calls collect_call_domains.
PPCGScop->call = isl_union_set_from_set(S->getContext());
PPCGScop->tagged_reads = getTaggedReads();
PPCGScop->reads = S->getReads();
PPCGScop->live_in = nullptr;
Expand All @@ -2280,6 +2312,9 @@ class PPCGCodeGeneration : public ScopPass {
PPCGScop->tagged_must_writes = getTaggedMustWrites();
PPCGScop->must_writes = S->getMustWrites();
PPCGScop->live_out = nullptr;
PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take();
PPCGScop->must_kills = KillsInfo.MustKills.take();

PPCGScop->tagger = nullptr;
PPCGScop->independence =
isl_union_map_empty(isl_set_get_space(PPCGScop->context));
Expand All @@ -2291,19 +2326,17 @@ class PPCGCodeGeneration : public ScopPass {
PPCGScop->tagged_dep_order = nullptr;

PPCGScop->schedule = S->getScheduleTree();

MustKillsInfo KillsInfo = computeMustKillsInfo(*S);
// If we have something non-trivial to kill, add it to the schedule
if (KillsInfo.KillsSchedule.get())
PPCGScop->schedule = isl_schedule_sequence(
PPCGScop->schedule, KillsInfo.KillsSchedule.take());
PPCGScop->tagged_must_kills = KillsInfo.TaggedMustKills.take();

PPCGScop->names = getNames();
PPCGScop->pet = nullptr;

compute_tagger(PPCGScop);
compute_dependences(PPCGScop);
eliminate_dead_code(PPCGScop);

return PPCGScop;
}
Expand Down Expand Up @@ -2458,14 +2491,23 @@ class PPCGCodeGeneration : public ScopPass {
/// @param PPCGArray The array to compute bounds for.
/// @param Array The polly array from which to take the information.
void setArrayBounds(gpu_array_info &PPCGArray, ScopArrayInfo *Array) {
isl_pw_aff_list *BoundsList =
isl_pw_aff_list_alloc(S->getIslCtx(), PPCGArray.n_index);
std::vector<isl::pw_aff> PwAffs;

isl_space *AlignSpace = S->getParamSpace();
AlignSpace = isl_space_add_dims(AlignSpace, isl_dim_set, 1);

if (PPCGArray.n_index > 0) {
if (isl_set_is_empty(PPCGArray.extent)) {
isl_set *Dom = isl_set_copy(PPCGArray.extent);
isl_local_space *LS = isl_local_space_from_space(
isl_space_params(isl_set_get_space(Dom)));
isl_set_free(Dom);
isl_aff *Zero = isl_aff_zero_on_domain(LS);
PPCGArray.bound[0] = isl_pw_aff_from_aff(Zero);
isl_pw_aff *Zero = isl_pw_aff_from_aff(isl_aff_zero_on_domain(LS));
Zero = isl_pw_aff_align_params(Zero, isl_space_copy(AlignSpace));
PwAffs.push_back(isl::manage(isl_pw_aff_copy(Zero)));
BoundsList = isl_pw_aff_list_insert(BoundsList, 0, Zero);
} else {
isl_set *Dom = isl_set_copy(PPCGArray.extent);
Dom = isl_set_project_out(Dom, isl_dim_set, 1, PPCGArray.n_index - 1);
Expand All @@ -2478,7 +2520,9 @@ class PPCGCodeGeneration : public ScopPass {
One = isl_aff_add_constant_si(One, 1);
Bound = isl_pw_aff_add(Bound, isl_pw_aff_alloc(Dom, One));
Bound = isl_pw_aff_gist(Bound, S->getContext());
PPCGArray.bound[0] = Bound;
Bound = isl_pw_aff_align_params(Bound, isl_space_copy(AlignSpace));
PwAffs.push_back(isl::manage(isl_pw_aff_copy(Bound)));
BoundsList = isl_pw_aff_list_insert(BoundsList, 0, Bound);
}
}

Expand All @@ -2487,8 +2531,20 @@ class PPCGCodeGeneration : public ScopPass {
auto LS = isl_pw_aff_get_domain_space(Bound);
auto Aff = isl_multi_aff_zero(LS);
Bound = isl_pw_aff_pullback_multi_aff(Bound, Aff);
PPCGArray.bound[i] = Bound;
Bound = isl_pw_aff_align_params(Bound, isl_space_copy(AlignSpace));
PwAffs.push_back(isl::manage(isl_pw_aff_copy(Bound)));
BoundsList = isl_pw_aff_list_insert(BoundsList, i, Bound);
}

isl_space_free(AlignSpace);
isl_space *BoundsSpace = isl_set_get_space(PPCGArray.extent);

assert(BoundsSpace && "Unable to access space of array.");
assert(BoundsList && "Unable to access list of bounds.");

PPCGArray.bound =
isl_multi_pw_aff_from_pw_aff_list(BoundsSpace, BoundsList);
assert(PPCGArray.bound && "PPCGArray.bound was not constructed correctly.");
}

/// Create the arrays for @p PPCGProg.
Expand All @@ -2511,8 +2567,6 @@ class PPCGCodeGeneration : public ScopPass {
PPCGArray.name = strdup(Array->getName().c_str());
PPCGArray.extent = nullptr;
PPCGArray.n_index = Array->getNumberOfDimensions();
PPCGArray.bound =
isl_alloc_array(S->getIslCtx(), isl_pw_aff *, PPCGArray.n_index);
PPCGArray.extent = getExtent(Array);
PPCGArray.n_ref = 0;
PPCGArray.refs = nullptr;
Expand All @@ -2527,6 +2581,7 @@ class PPCGCodeGeneration : public ScopPass {
PPCGArray.dep_order = nullptr;
PPCGArray.user = Array;

PPCGArray.bound = nullptr;
setArrayBounds(PPCGArray, Array);
i++;

Expand Down Expand Up @@ -2570,6 +2625,7 @@ class PPCGCodeGeneration : public ScopPass {
isl_union_map_copy(PPCGScop->tagged_must_kills);
PPCGProg->to_inner = getArrayIdentity();
PPCGProg->to_outer = getArrayIdentity();
// TODO: verify that this assignment is correct.
PPCGProg->any_to_outer = nullptr;

// this needs to be set when live range reordering is enabled.
Expand Down Expand Up @@ -2962,15 +3018,16 @@ class PPCGCodeGeneration : public ScopPass {
Condition = isl_ast_expr_and(Condition, SufficientCompute);
isl_ast_build_free(Build);

// preload invariant loads. Note: This should happen before the RTC
// because the RTC may depend on values that are invariant load hoisted.
NodeBuilder.preloadInvariantLoads();

Value *RTC = NodeBuilder.createRTC(Condition);
Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);

Builder.SetInsertPoint(&*StartBlock->begin());

NodeBuilder.initializeAfterRTH();
NodeBuilder.preloadInvariantLoads();
NodeBuilder.create(Root);
NodeBuilder.finalize();

/// In case a sequential kernel has more surrounding loops as any parallel
/// kernel, the SCoP is probably mostly sequential. Hence, there is no
Expand Down
5 changes: 2 additions & 3 deletions polly/test/GPGPU/host-control-flow.ll
Expand Up @@ -14,9 +14,7 @@

; REQUIRES: pollyacc

; CODE: # host
; CODE-NEXT: {
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyHostToDevice));
; CODE-NEXT: for (int c0 = 0; c0 <= 99; c0 += 1)
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
Expand All @@ -26,6 +24,7 @@
; CODE-NEXT: }

; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (2) * (100) * sizeof(float), cudaMemcpyDeviceToHost));
; CODE-NEXT: cudaCheckReturn(cudaFree(dev_MemRef_A));
; CODE-NEXT: }

; IR-LABEL: polly.loop_header: ; preds = %polly.loop_header, %polly.loop_preheader
Expand Down
6 changes: 1 addition & 5 deletions polly/test/GPGPU/host-statement.ll
Expand Up @@ -18,11 +18,7 @@ declare void @llvm.lifetime.start(i64, i8* nocapture) #0
; This test case tests that we can correctly handle a ScopStmt that is
; scheduled on the host, instead of within a kernel.

; CODE-LABEL: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_R, MemRef_R, (p_0 + 1) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_Q, MemRef_Q, (512) * (512) * sizeof(double), cudaMemcpyHostToDevice));
; CODE-NEXT: {
Expand Down
7 changes: 1 addition & 6 deletions polly/test/GPGPU/invalid-kernel.ll
Expand Up @@ -20,11 +20,7 @@
; were we still lack proper code-generation support. We check here that we
; detect the invalid IR and bail out gracefully.

; CODE: Code
; CODE-NEXT: ====
; CODE-NEXT: # host
; CODE-NEXT: {
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE: cudaCheckReturn(cudaMemcpy(dev_MemRef_B, MemRef_B, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE-NEXT: cudaCheckReturn(cudaMemcpy(dev_MemRef_A, MemRef_A, (1024) * sizeof(i64), cudaMemcpyHostToDevice));
; CODE-NEXT: {
; CODE-NEXT: dim3 k0_dimBlock(32);
Expand All @@ -34,7 +30,6 @@
; CODE-NEXT: }

; CODE: cudaCheckReturn(cudaMemcpy(MemRef_A, dev_MemRef_A, (1024) * sizeof(i64), cudaMemcpyDeviceToHost));
; CODE-NEXT: }

; CODE: # kernel0
; CODE-NEXT: Stmt_bb2(32 * b0 + t0);
Expand Down
9 changes: 5 additions & 4 deletions polly/test/GPGPU/kernel-params-only-some-arrays.ll
Expand Up @@ -21,7 +21,7 @@
; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"

; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_A)
; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_0(i8 addrspace(1)* %MemRef_B)
; KERNEL-NEXT: entry:
; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-NEXT: %b0 = zext i32 %0 to i64
Expand All @@ -36,7 +36,7 @@
; KERNEL-NEXT: target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
; KERNEL-NEXT: target triple = "nvptx64-nvidia-cuda"

; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_B)
; KERNEL: define ptx_kernel void @FUNC_kernel_params_only_some_arrays_SCOP_0_KERNEL_1(i8 addrspace(1)* %MemRef_A)
; KERNEL-NEXT: entry:
; KERNEL-NEXT: %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
; KERNEL-NEXT: %b0 = zext i32 %0 to i64
Expand All @@ -47,18 +47,19 @@
; KERNEL-NEXT: }


; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_0_params, i64 0, i64 0
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_0_param_0
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_0_param_0 to i8*
; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]

; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_B)
; IR: [[DEVPTR:%.*]] = call i8* @polly_getDevicePtr(i8* %p_dev_array_MemRef_A)
; IR-NEXT: [[SLOT:%.*]] = getelementptr [2 x i8*], [2 x i8*]* %polly_launch_1_params, i64 0, i64 0
; IR-NEXT: store i8* [[DEVPTR]], i8** %polly_launch_1_param_0
; IR-NEXT: [[DATA:%.*]] = bitcast i8** %polly_launch_1_param_0 to i8*
; IR-NEXT: store i8* [[DATA]], i8** [[SLOT]]


target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

define void @kernel_params_only_some_arrays(float* %A, float* %B) {
Expand Down

0 comments on commit 9e3db2b

Please sign in to comment.