Skip to content

Commit

Permalink
[Polly][MatMul] Abandon dependence analysis.
Browse files Browse the repository at this point in the history
The copy statements inserted by the matrix-multiplication optimization
introduce new dependencies between the copy statements and other
statements. As a result, the DependenceInfo must be recomputed.

Not recomputing them caused IslAstInfo to deduce that some loops are
parallel, even though executing them in parallel would cause race
conditions when accessing the packed arrays. As a result, matrix-matrix
multiplication currently cannot be parallelized.

Also see discussion at https://reviews.llvm.org/D125202
  • Loading branch information
Meinersbur committed Jun 29, 2022
1 parent 3944780 commit 6fa65f8
Show file tree
Hide file tree
Showing 6 changed files with 63 additions and 49 deletions.
16 changes: 16 additions & 0 deletions polly/include/polly/DependenceInfo.h
Expand Up @@ -208,6 +208,15 @@ struct DependenceAnalysis final : public AnalysisInfoMixin<DependenceAnalysis> {

/// Recompute dependences from schedule and memory accesses.
const Dependences &recomputeDependences(Dependences::AnalysisLevel Level);

/// Invalidate the dependence information and recompute it when needed
/// again.
/// May be required when the underlying Scop was changed in a way that
/// would add new dependencies (e.g. between new statement instances
/// inserted into the SCoP) or intentionally breaks existing ones. It is
/// not required when updating a schedule that conforms to the existing
/// dependencies.
void abandonDependences();
};
Result run(Scop &S, ScopAnalysisManager &SAM,
ScopStandardAnalysisResults &SAR);
Expand Down Expand Up @@ -241,6 +250,13 @@ class DependenceInfo final : public ScopPass {
/// Recompute dependences from schedule and memory accesses.
const Dependences &recomputeDependences(Dependences::AnalysisLevel Level);

/// Invalidate the dependence information and recompute it when needed again.
/// May be required when the underlying Scop was changed in a way that would
/// add new dependencies (e.g. between new statement instances inserted into
/// the SCoP) or intentionally breaks existing ones. It is not required when
/// updating a schedule that conforms to the existing dependencies.
void abandonDependences();

/// Compute the dependence information for the SCoP @p S.
bool runOnScop(Scop &S) override;

Expand Down
10 changes: 10 additions & 0 deletions polly/lib/Analysis/DependenceInfo.cpp
Expand Up @@ -848,6 +848,11 @@ const Dependences &DependenceAnalysis::Result::recomputeDependences(
return *D[Level];
}

void DependenceAnalysis::Result::abandonDependences() {
for (std::unique_ptr<Dependences> &Deps : D)
Deps.release();
}

DependenceAnalysis::Result
DependenceAnalysis::run(Scop &S, ScopAnalysisManager &SAM,
ScopStandardAnalysisResults &SAR) {
Expand Down Expand Up @@ -890,6 +895,11 @@ DependenceInfo::recomputeDependences(Dependences::AnalysisLevel Level) {
return *D[Level];
}

void DependenceInfo::abandonDependences() {
for (std::unique_ptr<Dependences> &Deps : D)
Deps.release();
}

bool DependenceInfo::runOnScop(Scop &ScopVar) {
S = &ScopVar;
return false;
Expand Down
3 changes: 0 additions & 3 deletions polly/lib/Transform/MatmulOptimizer.cpp
Expand Up @@ -491,9 +491,6 @@ createMacroKernel(isl::schedule_node Node,
Node = permuteBandNodeDimensions(Node, DimOutNum - 2, DimOutNum - 1);
Node = permuteBandNodeDimensions(Node, DimOutNum - 3, DimOutNum - 1);

// Mark the outermost loop as parallelizable.
Node = Node.as<isl::schedule_node_band>().member_set_coincident(0, true);

return Node.child(0).child(0);
}

Expand Down
56 changes: 30 additions & 26 deletions polly/lib/Transform/ScheduleOptimizer.cpp
Expand Up @@ -228,6 +228,7 @@ struct OptimizerAdditionalInfoTy {
bool PatternOpts;
bool Postopts;
bool Prevect;
bool &DepsChanged;
};

class ScheduleTreeOptimizer final {
Expand Down Expand Up @@ -526,6 +527,7 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg,
tryOptimizeMatMulPattern(Node, OAI->TTI, OAI->D);
if (!PatternOptimizedSchedule.is_null()) {
MatMulOpts++;
OAI->DepsChanged = true;
return PatternOptimizedSchedule.release();
}
}
Expand Down Expand Up @@ -676,21 +678,21 @@ static void walkScheduleTreeForStatistics(isl::schedule Schedule, int Version) {
&Version);
}

static bool runIslScheduleOptimizer(
static void runIslScheduleOptimizer(
Scop &S,
function_ref<const Dependences &(Dependences::AnalysisLevel)> GetDeps,
TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
isl::schedule &LastSchedule) {
isl::schedule &LastSchedule, bool &DepsChanged) {

// Skip SCoPs in case they're already optimised by PPCGCodeGeneration
if (S.isToBeSkipped())
return false;
return;

// Skip empty SCoPs but still allow code generation as it will delete the
// loops present but not needed.
if (S.getSize() == 0) {
S.markAsOptimized();
return false;
return;
}

ScopsProcessed++;
Expand All @@ -706,7 +708,7 @@ static bool runIslScheduleOptimizer(
&S, Schedule, GetDeps(Dependences::AL_Statement), ORE);
if (ManuallyTransformed.is_null()) {
LLVM_DEBUG(dbgs() << "Error during manual optimization\n");
return false;
return;
}

if (ManuallyTransformed.get() != Schedule.get()) {
Expand All @@ -724,18 +726,18 @@ static bool runIslScheduleOptimizer(
// metadata earlier in ScopDetection.
if (!HasUserTransformation && S.hasDisableHeuristicsHint()) {
LLVM_DEBUG(dbgs() << "Heuristic optimizations disabled by metadata\n");
return false;
return;
}

// Get dependency analysis.
const Dependences &D = GetDeps(Dependences::AL_Statement);
if (D.getSharedIslCtx() != S.getSharedIslCtx()) {
LLVM_DEBUG(dbgs() << "DependenceInfo for another SCoP/isl_ctx\n");
return false;
return;
}
if (!D.hasValidDependences()) {
LLVM_DEBUG(dbgs() << "Dependency information not available\n");
return false;
return;
}

// Apply ISL's algorithm only if not overriden by the user. Note that
Expand Down Expand Up @@ -769,7 +771,7 @@ static bool runIslScheduleOptimizer(
isl::union_set Domain = S.getDomains();

if (Domain.is_null())
return false;
return;

isl::union_map Validity = D.getDependences(ValidityKinds);
isl::union_map Proximity = D.getDependences(ProximityKinds);
Expand Down Expand Up @@ -847,7 +849,7 @@ static bool runIslScheduleOptimizer(
// In cases the scheduler is not able to optimize the code, we just do not
// touch the schedule.
if (Schedule.is_null())
return false;
return;

if (GreedyFusion) {
isl::union_map Validity = D.getDependences(
Expand All @@ -858,10 +860,12 @@ static bool runIslScheduleOptimizer(

// Apply post-rescheduling optimizations (if enabled) and/or prevectorization.
const OptimizerAdditionalInfoTy OAI = {
TTI, const_cast<Dependences *>(&D),
TTI,
const_cast<Dependences *>(&D),
/*PatternOpts=*/!HasUserTransformation && PMBasedOpts,
/*Postopts=*/!HasUserTransformation && EnablePostopts,
/*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE};
/*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE,
DepsChanged};
if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) {
Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI);
Schedule = hoistExtensionNodes(Schedule);
Expand All @@ -872,7 +876,7 @@ static bool runIslScheduleOptimizer(
// Skip profitability check if user transformation(s) have been applied.
if (!HasUserTransformation &&
!ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule))
return false;
return;

auto ScopStats = S.getStatistics();
ScopsOptimized++;
Expand All @@ -885,8 +889,6 @@ static bool runIslScheduleOptimizer(

if (OptimizedScops)
errs() << S;

return false;
}

bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
Expand All @@ -904,7 +906,13 @@ bool IslScheduleOptimizerWrapperPass::runOnScop(Scop &S) {
getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
return runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule);

bool DepsChanged = false;
runIslScheduleOptimizer(S, getDependences, TTI, &ORE, LastSchedule,
DepsChanged);
if (DepsChanged)
getAnalysis<DependenceInfo>().abandonDependences();
return false;
}

static void runScheduleOptimizerPrinter(raw_ostream &OS,
Expand Down Expand Up @@ -971,22 +979,18 @@ runIslScheduleOptimizerUsingNPM(Scop &S, ScopAnalysisManager &SAM,
OptimizationRemarkEmitter ORE(&S.getFunction());
TargetTransformInfo *TTI = &SAR.TTI;
isl::schedule LastSchedule;
bool Modified = runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule);
bool DepsChanged = false;
runIslScheduleOptimizer(S, GetDeps, TTI, &ORE, LastSchedule, DepsChanged);
if (DepsChanged)
Deps.abandonDependences();

if (OS) {
*OS << "Printing analysis 'Polly - Optimize schedule of SCoP' for region: '"
<< S.getName() << "' in function '" << S.getFunction().getName()
<< "':\n";
runScheduleOptimizerPrinter(*OS, LastSchedule);
}

if (!Modified)
return PreservedAnalyses::all();

PreservedAnalyses PA;
PA.preserveSet<AllAnalysesOn<Module>>();
PA.preserveSet<AllAnalysesOn<Function>>();
PA.preserveSet<AllAnalysesOn<Loop>>();
return PA;
return PreservedAnalyses::all();
}

llvm::PreservedAnalyses
Expand Down
24 changes: 6 additions & 18 deletions polly/test/CodeGen/OpenMP/matmul-parallel.ll
Expand Up @@ -2,20 +2,10 @@
; RUN: opt %loadPolly -polly-parallel -polly-opt-isl -polly-codegen -S < %s | FileCheck --check-prefix=CODEGEN %s
; REQUIRES: asserts

; Parellization of detected matrix-multiplication. The allocations
; Packed_A and Packed_B must be passed to the outlined function.
; llvm.org/PR43164
;
; #define N 1536
; int foo(float A[N][N],float B[N][N],float C[N][N]) {
; for (int i = 0; i < N; i++) {
; for (int j = 0; j < N; j++) {
; for (int k = 0; k < N; k++)
; C[i][j] = C[i][j] + A[i][k] * B[k][j];
; }
; }
; return 0;
; }
; Parallelization of detected matrix-multiplication.
; Currently, this is not supported. Because Packed_A/Packed_B are not private
; per-thread, the outer loops cannot be parallelized, and a
; '#pragma omp parallel for' on an inner loop may impose too much overhead.

target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc19.16.27034"
Expand Down Expand Up @@ -65,8 +55,6 @@ for.body8:
}


; AST: #pragma omp parallel for
; AST-NOT: parallel

; CODGEN-LABEL: define internal void @init_array_polly_subfn(i8* %polly.par.userContext)
; CODEGEN: %polly.subfunc.arg.Packed_A = load
; CODEGEN: %polly.subfunc.arg.Packed_B = load
; CODEGEN-NOT: subfunc
3 changes: 1 addition & 2 deletions polly/test/ScheduleOptimizer/pattern-matching-based-opts.ll
Expand Up @@ -15,8 +15,7 @@
;
; CHECK-NOT: The matrix multiplication pattern was detected
; PATTERN-MATCHING-OPTS: The matrix multiplication pattern was detected
; PARALLEL-AST: #pragma known-parallel
; PARALLEL-AST: #pragma known-parallel
; PARALLEL-AST-NOT: #pragma known-parallel
; STATS: 1 polly-opt-isl - Number of matrix multiplication patterns detected and optimized
;
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
Expand Down

0 comments on commit 6fa65f8

Please sign in to comment.