
[MLIR][OpenMP] Support basic materialization for omp.private ops #81715

Merged Feb 28, 2024 (2 commits)

Conversation

@ergawy (Member) commented Feb 14, 2024


@llvmbot (Collaborator) commented Feb 14, 2024

@llvm/pr-subscribers-mlir
@llvm/pr-subscribers-flang-fir-hlfir
@llvm/pr-subscribers-mlir-openmp

@llvm/pr-subscribers-mlir-llvm

Author: Kareem Ergawy (ergawy)

Changes

Adds basic support for materializing delayed privatization. So far, the
restrictions on the implementation are:

  • Only private clauses are supported (firstprivate support will be
    added in a later PR).
  • Only single-block omp.private -> alloc regions are supported
    (multi-block ones will be supported in a later PR).

This is a follow-up to both #81414 & #81452; only the latest commit (with the same title as the PR) is relevant to this PR.


Patch is 39.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81715.diff

12 Files Affected:

  • (modified) flang/lib/Lower/OpenMP.cpp (+2-1)
  • (modified) mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td (+10-2)
  • (modified) mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp (+19-13)
  • (modified) mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp (+3-1)
  • (modified) mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp (+141-48)
  • (modified) mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (+108-6)
  • (modified) mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir (+26)
  • (modified) mlir/test/Dialect/OpenMP/invalid.mlir (+56)
  • (added) mlir/test/Dialect/OpenMP/ops-2.mlir (+74)
  • (modified) mlir/test/Dialect/OpenMP/ops.mlir (+5-5)
  • (removed) mlir/test/Dialect/OpenMP/roundtrip.mlir (-21)
  • (added) mlir/test/Target/LLVMIR/openmp-private.mlir (+91)
diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index 24f91765cb439b..74b2727961a03d 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -2640,7 +2640,8 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
           ? nullptr
           : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
                                  reductionDeclSymbols),
-      procBindKindAttr);
+      procBindKindAttr, /*private_vars=*/llvm::SmallVector<mlir::Value>{},
+      /*privatizers=*/nullptr);
 }
 
 static mlir::omp::SectionOp
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 0adf186ae0c7e9..9ed40904f22ae2 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -221,6 +221,12 @@ def PrivateClauseOp : OpenMP_Op<"private", [IsolatedFromAbove]> {
       attr-dict
   }];
 
+  let builders = [
+    OpBuilder<(ins CArg<"TypeRange">:$result,
+                   CArg<"StringAttr">:$sym_name,
+                   CArg<"TypeAttr">:$type)>
+  ];
+
   let hasVerifier = 1;
 }
 
@@ -270,7 +276,9 @@ def ParallelOp : OpenMP_Op<"parallel", [
              Variadic<AnyType>:$allocators_vars,
              Variadic<OpenMP_PointerLikeType>:$reduction_vars,
              OptionalAttr<SymbolRefArrayAttr>:$reductions,
-             OptionalAttr<ProcBindKindAttr>:$proc_bind_val);
+             OptionalAttr<ProcBindKindAttr>:$proc_bind_val,
+             Variadic<AnyType>:$private_vars,
+             OptionalAttr<SymbolRefArrayAttr>:$privatizers);
 
   let regions = (region AnyRegion:$region);
 
@@ -291,7 +299,7 @@ def ParallelOp : OpenMP_Op<"parallel", [
                 $allocators_vars, type($allocators_vars)
               ) `)`
           | `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
-    ) custom<ParallelRegion>($region, $reduction_vars, type($reduction_vars), $reductions) attr-dict
+    ) custom<ParallelRegion>($region, $reduction_vars, type($reduction_vars), $reductions, $private_vars, type($private_vars), $privatizers) attr-dict
   }];
   let hasVerifier = 1;
 }
diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
index 730858ffc67a71..2eba4fba105c7b 100644
--- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
+++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
@@ -200,16 +200,20 @@ struct ReductionOpConversion : public ConvertOpToLLVMPattern<omp::ReductionOp> {
   }
 };
 
-struct ReductionDeclareOpConversion
-    : public ConvertOpToLLVMPattern<omp::ReductionDeclareOp> {
-  using ConvertOpToLLVMPattern<omp::ReductionDeclareOp>::ConvertOpToLLVMPattern;
+template <typename OpType>
+struct MultiRegionOpConversion : public ConvertOpToLLVMPattern<OpType> {
+  using ConvertOpToLLVMPattern<OpType>::ConvertOpToLLVMPattern;
   LogicalResult
-  matchAndRewrite(omp::ReductionDeclareOp curOp, OpAdaptor adaptor,
+  matchAndRewrite(OpType curOp, typename OpType::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    auto newOp = rewriter.create<omp::ReductionDeclareOp>(
+    auto newOp = rewriter.create<OpType>(
         curOp.getLoc(), TypeRange(), curOp.getSymNameAttr(),
         TypeAttr::get(this->getTypeConverter()->convertType(
             curOp.getTypeAttr().getValue())));
+
+    if constexpr (std::is_same_v<OpType, mlir::omp::PrivateClauseOp>)
+      newOp.setDataSharingType(curOp.getDataSharingType());
+
     for (unsigned idx = 0; idx < curOp.getNumRegions(); idx++) {
       rewriter.inlineRegionBefore(curOp.getRegion(idx), newOp.getRegion(idx),
                                   newOp.getRegion(idx).end());
@@ -231,11 +235,12 @@ void mlir::configureOpenMPToLLVMConversionLegality(
       mlir::omp::DataOp, mlir::omp::OrderedRegionOp, mlir::omp::ParallelOp,
       mlir::omp::WsLoopOp, mlir::omp::SimdLoopOp, mlir::omp::MasterOp,
       mlir::omp::SectionOp, mlir::omp::SectionsOp, mlir::omp::SingleOp,
-      mlir::omp::TaskGroupOp, mlir::omp::TaskOp>([&](Operation *op) {
-    return typeConverter.isLegal(&op->getRegion(0)) &&
-           typeConverter.isLegal(op->getOperandTypes()) &&
-           typeConverter.isLegal(op->getResultTypes());
-  });
+      mlir::omp::TaskGroupOp, mlir::omp::TaskOp, mlir::omp::PrivateClauseOp>(
+      [&](Operation *op) {
+        return typeConverter.isLegal(&op->getRegion(0)) &&
+               typeConverter.isLegal(op->getOperandTypes()) &&
+               typeConverter.isLegal(op->getResultTypes());
+      });
   target.addDynamicallyLegalOp<
       mlir::omp::AtomicReadOp, mlir::omp::AtomicWriteOp, mlir::omp::FlushOp,
       mlir::omp::ThreadprivateOp, mlir::omp::YieldOp, mlir::omp::EnterDataOp,
@@ -267,9 +272,10 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter,
 
   patterns.add<
       AtomicReadOpConversion, MapInfoOpConversion, ReductionOpConversion,
-      ReductionDeclareOpConversion, RegionOpConversion<omp::CriticalOp>,
-      RegionOpConversion<omp::MasterOp>, ReductionOpConversion,
-      RegionOpConversion<omp::OrderedRegionOp>,
+      MultiRegionOpConversion<omp::ReductionDeclareOp>,
+      MultiRegionOpConversion<omp::PrivateClauseOp>,
+      RegionOpConversion<omp::CriticalOp>, RegionOpConversion<omp::MasterOp>,
+      ReductionOpConversion, RegionOpConversion<omp::OrderedRegionOp>,
       RegionOpConversion<omp::ParallelOp>, RegionOpConversion<omp::WsLoopOp>,
       RegionOpConversion<omp::SectionsOp>, RegionOpConversion<omp::SectionOp>,
       RegionOpConversion<omp::SimdLoopOp>, RegionOpConversion<omp::SingleOp>,
diff --git a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
index ea5f31ee8c6aa7..464a647564aced 100644
--- a/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
+++ b/mlir/lib/Conversion/SCFToOpenMP/SCFToOpenMP.cpp
@@ -450,7 +450,9 @@ struct ParallelOpLowering : public OpRewritePattern<scf::ParallelOp> {
         /* allocators_vars = */ llvm::SmallVector<Value>{},
         /* reduction_vars = */ llvm::SmallVector<Value>{},
         /* reductions = */ ArrayAttr{},
-        /* proc_bind_val = */ omp::ClauseProcBindKindAttr{});
+        /* proc_bind_val = */ omp::ClauseProcBindKindAttr{},
+        /* private_vars = */ ValueRange(),
+        /* privatizers = */ nullptr);
     {
 
       OpBuilder::InsertionGuard guard(rewriter);
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 13fc01d58eced5..3d18b9fe13e42c 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -430,68 +430,102 @@ static void printScheduleClause(OpAsmPrinter &p, Operation *op,
 // Parser, printer and verifier for ReductionVarList
 //===----------------------------------------------------------------------===//
 
-ParseResult
-parseReductionClause(OpAsmParser &parser, Region &region,
-                     SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
-                     SmallVectorImpl<Type> &types, ArrayAttr &reductionSymbols,
-                     SmallVectorImpl<OpAsmParser::Argument> &privates) {
-  if (failed(parser.parseOptionalKeyword("reduction")))
-    return failure();
-
+ParseResult parseClauseWithRegionArgs(
+    OpAsmParser &parser, Region &region,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
+    SmallVectorImpl<Type> &types, ArrayAttr &symbols,
+    SmallVectorImpl<OpAsmParser::Argument> &regionPrivateArgs) {
   SmallVector<SymbolRefAttr> reductionVec;
+  unsigned regionArgOffset = regionPrivateArgs.size();
 
   if (failed(
           parser.parseCommaSeparatedList(OpAsmParser::Delimiter::Paren, [&]() {
             if (parser.parseAttribute(reductionVec.emplace_back()) ||
                 parser.parseOperand(operands.emplace_back()) ||
                 parser.parseArrow() ||
-                parser.parseArgument(privates.emplace_back()) ||
+                parser.parseArgument(regionPrivateArgs.emplace_back()) ||
                 parser.parseColonType(types.emplace_back()))
               return failure();
             return success();
           })))
     return failure();
 
-  for (auto [prv, type] : llvm::zip_equal(privates, types)) {
+  auto *argsBegin = regionPrivateArgs.begin();
+  MutableArrayRef argsSubrange(argsBegin + regionArgOffset,
+                               argsBegin + regionArgOffset + types.size());
+  for (auto [prv, type] : llvm::zip_equal(argsSubrange, types)) {
     prv.type = type;
   }
   SmallVector<Attribute> reductions(reductionVec.begin(), reductionVec.end());
-  reductionSymbols = ArrayAttr::get(parser.getContext(), reductions);
+  symbols = ArrayAttr::get(parser.getContext(), reductions);
   return success();
 }
 
-static void printReductionClause(OpAsmPrinter &p, Operation *op,
-                                 ValueRange reductionArgs, ValueRange operands,
-                                 TypeRange types, ArrayAttr reductionSymbols) {
-  p << "reduction(";
+static void printClauseWithRegionArgs(OpAsmPrinter &p, Operation *op,
+                                      ValueRange argsSubrange,
+                                      StringRef clauseName, ValueRange operands,
+                                      TypeRange types, ArrayAttr symbols) {
+  p << clauseName << "(";
   llvm::interleaveComma(
-      llvm::zip_equal(reductionSymbols, operands, reductionArgs, types), p,
-      [&p](auto t) {
+      llvm::zip_equal(symbols, operands, argsSubrange, types), p, [&p](auto t) {
         auto [sym, op, arg, type] = t;
         p << sym << " " << op << " -> " << arg << " : " << type;
       });
   p << ") ";
 }
 
-static ParseResult
-parseParallelRegion(OpAsmParser &parser, Region &region,
-                    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &operands,
-                    SmallVectorImpl<Type> &types, ArrayAttr &reductionSymbols) {
+static ParseResult parseParallelRegion(
+    OpAsmParser &parser, Region &region,
+    SmallVectorImpl<OpAsmParser::UnresolvedOperand> &reductionVarOperands,
+    SmallVectorImpl<Type> &reductionVarTypes, ArrayAttr &reductionSymbols,
+    llvm::SmallVectorImpl<OpAsmParser::UnresolvedOperand> &privateVarOperands,
+    llvm::SmallVectorImpl<Type> &privateVarsTypes,
+    ArrayAttr &privatizerSymbols) {
+  llvm::SmallVector<OpAsmParser::Argument> regionPrivateArgs;
 
-  llvm::SmallVector<OpAsmParser::Argument> privates;
-  if (succeeded(parseReductionClause(parser, region, operands, types,
-                                     reductionSymbols, privates)))
-    return parser.parseRegion(region, privates);
+  if (succeeded(parser.parseOptionalKeyword("reduction"))) {
+    if (failed(parseClauseWithRegionArgs(parser, region, reductionVarOperands,
+                                         reductionVarTypes, reductionSymbols,
+                                         regionPrivateArgs)))
+      return failure();
+  }
 
-  return parser.parseRegion(region);
+  if (succeeded(parser.parseOptionalKeyword("private"))) {
+    if (failed(parseClauseWithRegionArgs(parser, region, privateVarOperands,
+                                         privateVarsTypes, privatizerSymbols,
+                                         regionPrivateArgs)))
+      return failure();
+  }
+
+  return parser.parseRegion(region, regionPrivateArgs);
 }
 
 static void printParallelRegion(OpAsmPrinter &p, Operation *op, Region &region,
-                                ValueRange operands, TypeRange types,
-                                ArrayAttr reductionSymbols) {
-  if (reductionSymbols)
-    printReductionClause(p, op, region.front().getArguments(), operands, types,
-                         reductionSymbols);
+                                ValueRange reductionVarOperands,
+                                TypeRange reductionVarTypes,
+                                ArrayAttr reductionSymbols,
+                                ValueRange privateVarOperands,
+                                TypeRange privateVarTypes,
+                                ArrayAttr privatizerSymbols) {
+  if (reductionSymbols) {
+    auto *argsBegin = region.front().getArguments().begin();
+    MutableArrayRef argsSubrange(argsBegin,
+                                 argsBegin + reductionVarTypes.size());
+    printClauseWithRegionArgs(p, op, argsSubrange, "reduction",
+                              reductionVarOperands, reductionVarTypes,
+                              reductionSymbols);
+  }
+
+  if (privatizerSymbols) {
+    auto *argsBegin = region.front().getArguments().begin();
+    MutableArrayRef argsSubrange(argsBegin + reductionVarOperands.size(),
+                                 argsBegin + reductionVarOperands.size() +
+                                     privateVarTypes.size());
+    printClauseWithRegionArgs(p, op, argsSubrange, "private",
+                              privateVarOperands, privateVarTypes,
+                              privatizerSymbols);
+  }
+
   p.printRegion(region, /*printEntryBlockArgs=*/false);
 }
 
@@ -1008,9 +1042,8 @@ static LogicalResult verifyMapClause(Operation *op, OperandRange mapOperands) {
         }
 
         if (always || close || implicit) {
-          return emitError(
-              op->getLoc(),
-              "present, mapper and iterator map type modifiers are permitted");
+          return emitError(op->getLoc(), "present, mapper and iterator map "
+                                         "type modifiers are permitted");
         }
 
         to ? updateToVars.insert(updateVar) : updateFromVars.insert(updateVar);
@@ -1070,14 +1103,63 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state,
       builder, state, /*if_expr_var=*/nullptr, /*num_threads_var=*/nullptr,
       /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(),
       /*reduction_vars=*/ValueRange(), /*reductions=*/nullptr,
-      /*proc_bind_val=*/nullptr);
+      /*proc_bind_val=*/nullptr, /*private_vars=*/ValueRange(),
+      /*privatizers=*/nullptr);
   state.addAttributes(attributes);
 }
 
+static LogicalResult verifyPrivateVarList(ParallelOp &op) {
+  auto privateVars = op.getPrivateVars();
+  auto privatizers = op.getPrivatizersAttr();
+
+  if (privateVars.empty() && (privatizers == nullptr || privatizers.empty()))
+    return success();
+
+  auto numPrivateVars = privateVars.size();
+  auto numPrivatizers = (privatizers == nullptr) ? 0 : privatizers.size();
+
+  if (numPrivateVars != numPrivatizers)
+    return op.emitError() << "inconsistent number of private variables and "
+                             "privatizer op symbols, private vars: "
+                          << numPrivateVars
+                          << " vs. privatizer op symbols: " << numPrivatizers;
+
+  for (auto privateVarInfo : llvm::zip(privateVars, privatizers)) {
+    Type varType = std::get<0>(privateVarInfo).getType();
+    SymbolRefAttr privatizerSym =
+        std::get<1>(privateVarInfo).cast<SymbolRefAttr>();
+    PrivateClauseOp privatizerOp =
+        SymbolTable::lookupNearestSymbolFrom<PrivateClauseOp>(op,
+                                                              privatizerSym);
+
+    if (privatizerOp == nullptr)
+      return op.emitError() << "failed to lookup privatizer op with symbol: '"
+                            << privatizerSym << "'";
+
+    Type privatizerType = privatizerOp.getType();
+
+    if (varType != privatizerType)
+      return op.emitError()
+             << "type mismatch between a "
+             << (privatizerOp.getDataSharingType() ==
+                         DataSharingClauseType::Private
+                     ? "private"
+                     : "firstprivate")
+             << " variable and its privatizer op, var type: " << varType
+             << " vs. privatizer op type: " << privatizerType;
+  }
+
+  return success();
+}
+
 LogicalResult ParallelOp::verify() {
   if (getAllocateVars().size() != getAllocatorsVars().size())
     return emitError(
         "expected equal sizes for allocate and allocator variables");
+
+  if (failed(verifyPrivateVarList(*this)))
+    return failure();
+
   return verifyReductionVarList(*this, getReductions(), getReductionVars());
 }
 
@@ -1111,8 +1193,8 @@ LogicalResult TeamsOp::verify() {
       return emitError("expected num_teams upper bound to be defined if the "
                        "lower bound is defined");
     if (numTeamsLowerBound.getType() != numTeamsUpperBound.getType())
-      return emitError(
-          "expected num_teams upper bound and lower bound to be the same type");
+      return emitError("expected num_teams upper bound and lower bound to be "
+                       "the same type");
   }
 
   // Check for allocate clause restrictions
@@ -1174,9 +1256,10 @@ parseWsLoop(OpAsmParser &parser, Region &region,
 
   // Parse an optional reduction clause
   llvm::SmallVector<OpAsmParser::Argument> privates;
-  bool hasReduction = succeeded(
-      parseReductionClause(parser, region, reductionOperands, reductionTypes,
-                           reductionSymbols, privates));
+  bool hasReduction = succeeded(parser.parseOptionalKeyword("reduction")) &&
+                      succeeded(parseClauseWithRegionArgs(
+                          parser, region, reductionOperands, reductionTypes,
+                          reductionSymbols, privates));
 
   if (parser.parseKeyword("for"))
     return failure();
@@ -1223,8 +1306,9 @@ void printWsLoop(OpAsmPrinter &p, Operation *op, Region &region,
   if (reductionSymbols) {
     auto reductionArgs =
         region.front().getArguments().drop_front(loopVarTypes.size());
-    printReductionClause(p, op, reductionArgs, reductionOperands,
-                         reductionTypes, reductionSymbols);
+    printClauseWithRegionArgs(p, op, reductionArgs, "reduction",
+                              reductionOperands, reductionTypes,
+                              reductionSymbols);
   }
 
   p << " for ";
@@ -1464,9 +1548,9 @@ LogicalResult TaskLoopOp::verify() {
   }
 
   if (getGrainSize() && getNumTasks()) {
-    return emitError(
-        "the grainsize clause and num_tasks clause are mutually exclusive and "
-        "may not appear on the same taskloop directive");
+    return emitError("the grainsize clause and num_tasks clause are mutually "
+                     "exclusive and "
+                     "may not appear on the same taskloop directive");
   }
   return success();
 }
@@ -1535,7 +1619,8 @@ LogicalResult OrderedOp::verify() {
 }
 
 LogicalResult OrderedRegionOp::verify() {
-  // TODO: The code generation for ordered simd directive is not supported yet.
+  // TODO: The code generation for ordered simd directive is not supported
+  // yet.
   if (getSimd())
     return failure();
 
@@ -1753,6 +1838,15 @@ LogicalResult DataBoundsOp::verify() {
   return success();
 }
 
+void PrivateClauseOp::build(OpBuilder &odsBuilder, OperationState &odsState,
+                            TypeRange /*result_types*/, StringAttr symName,
+                            TypeAttr type) {
+  PrivateClauseOp::build(
+      odsBuilder, odsState, symName, type,
+      DataSharingClauseTypeAttr::get(odsBuilder.getContext(),
+                                     DataSharingClauseType::Private));
+}
+
 LogicalResult PrivateClauseOp::verify() {
   Type symType = getType();
 
@@ -1785,8 +1879,7 @@ LogicalResult PrivateClauseOp::verify() {
 
     if (region.getNumArguments() != expectedNumArgs)
       return mlir::emitError(region.getLoc())
-             << "`" << regionName << "`: "
-             << "expected " << expectedNumArgs
+             << "`" << regionName << "`: " << "expected " << expectedNumArgs
              << " region arguments, got: " << region.getNumArguments();
 
     for (Block &block : region) {
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 78a2ad76a1e3b8..6b59dc7377fc74 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1000,6 +1000,26 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   return success();
 }
 
+/// Replace the region arguments of the parallel op (which correspond to private
+/// variables) with the actual private varibles t...
[truncated]

github-actions bot commented Feb 14, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@skatrak (Contributor) left a comment

Thanks for this work Kareem. I just have a couple of nits, but mostly there's one part of this approach I don't fully understand. I'd appreciate it if you could elaborate a bit on that.

/// variables) with the actual private varibles they correspond to. This
/// prepares the parallel op so that it matches what is expected by the
/// OMPIRBuilder.
static void prepareOmpParallelForPrivatization(omp::ParallelOp opInst) {
Contributor:

Could you elaborate on why this is necessary to match what the OMPIRBuilder expects? My understanding is that this would make the following transformation:

// Before.
%x = ...
omp.parallel private(@x.privatizer %x -> %arg0 : !llvm.ptr) {
^bb0(%arg0: !llvm.ptr):
  %0 = llvm.load %arg0 : !llvm.ptr -> i32
  omp.terminator
}

// After.
%x = ...
omp.parallel private(@x.privatizer %x -> %??? : !llvm.ptr) {
^bb0():
  %0 = llvm.load %x : !llvm.ptr -> i32
  omp.terminator
}

If that's the case, it also seems like it is making the omp.parallel operation invalid, since the number of block arguments wouldn't match the number of private variables. Is there a chance that the bodyGenCB callback could be tweaked to address the problems that this function solves by making changes to the original operation?

Maybe I'm just not understanding how this function works, so some clarification in that case would be much appreciated.

Member Author (ergawy):

Could you elaborate on why this is necessary to match what the OMPIRBuilder expects?

The OpenMPIRBuilder::createParallel(...) collects the inputs and outputs of the parallel region. The inputs are then iterated to invoke the private/firstprivate/shared callback.

The input as defined by the CodeExtractor is any value defined outside the region and used inside it.

Therefore, if OpenMPIRBuilder::createParallel is invoked with the block arguments still used inside the parallel region, the IR builder won't detect these as inputs and won't invoke the PrivCB for private/firstprivate variables.

That's why prepareOmpParallelForPrivatization "rewires" the uses of private vars inside the region from the block args to the original SSA values.

it also seems like it is making the omp.parallel operation invalid

Indeed, this does invalidate the op. But my understanding is that the op is at the end of its life at this stage and there is no need to keep it valid. However, to make this transformation less damaging, I removed the line that erases the region arguments. That way the region arguments are still there, just not used anymore. The op is then in a transitional state where the original SSA values for private variables are used inside the region but the privatization logic is not inlined yet.

Contributor:

Thank you for the explanation, I understand the problem a bit better now. At the moment it might be the case that this is the last time the omp.parallel operation is used, but in general we shouldn't make modifications to the MLIR operations during translation to LLVM IR, or we would introduce hard-to-debug problems in the future.

When processing the omp.private ops you're creating clones, making changes to those and then deleting them, could something like this be done for omp.parallel? That way we wouldn't have to rely on the omp.parallel operation never being used again after translation.

@ergawy (Member Author) commented Feb 18, 2024:

Done. The pre-processing function now returns a clone of the original op, and the clone is used for codegen and then deleted afterwards to clean up the module.

Contributor:

The OpenMPIRBuilder::createParallel(...) collects the inputs and outputs of the parallel region. The inputs are then iterated to invoke the private/firstprivate/shared callback.
The input as defined by the CodeExtractor is any value defined outside the region and used inside it.
Therefore, if OpenMPIRBuilder::createParallel is invoked with the block arguments still used inside the parallel region, the IR builder won't detect these as inputs and won't invoke the PrivCB for private/firstprivate variables.

Wouldn't it still be called for the original variable (%x here)? And at that point can't we get (%arg0) and do the appropriate processing?

@ergawy (Member Author) commented Feb 20, 2024:

Wouldn't it still be called for the original variable (%x here)? And at that point can't we get (%arg0) and do the appropriate processing?

Doesn't seem like it. To illustrate the difference, I created 2 commits (both will be reverted, but just to clarify what is happening):

  1. 363ae7f, where I still pre-process the op and also added logs to OMPIRBuilder to list the collected inputs. Here is the list of collected inputs:

>>>> collected input:   %tid.addr = alloca i32, align 4
>>>> collected input:   %zero.addr = alloca i32, align 4
>>>> collected input: ptr %0
>>>> calling privCB for: ptr %0

  2. 3933d26, where I do not pre-process the op, with the same logs as above:

>>>> collected input:   %tid.addr = alloca i32, align 4
>>>> collected input:   %zero.addr = alloca i32, align 4
So the private variable is not detected as an input if we don't do the pre-processing. And this actually matches my understanding of how the CodeExtractor works when it collects the inputs/outputs of the region.

I think we can modify the CodeExtractor to detect region operands as inputs but I think there will be other problems down the line.

Let me know if I missed anything.

(both test commits will be reverted before merging of course)

Member Author (ergawy):

@kiranchandramohan please have a look at the above comment and let me know if you have any concerns or comments 🙏.

Contributor:

It is probably the first condition here that is ignoring a variable defined outside, but not used within the region.

if (!SinkCands.count(V) && definedInCaller(Blocks, V))
  Inputs.insert(V);

Would it be unreasonable to leave findInputsOutputs as it is, and add some findPrivVars that checks if (definedInCaller(Blocks, V)) and returns the variables?

@ergawy (Member Author) commented Feb 21, 2024:

and add some findPrivVars that checks for if(definedInCaller(Blocks, V)) and returns the variables?

I think that will not work. If you check the definition of definedInCaller, you will see that it returns true only if a value is defined by an instruction that belongs to a block that is not an element in the Blocks argument to the function.

So, if you do not use findInputsOutputs and instead directly create a findPrivVars that calls definedInCaller(Blocks, V) without iterating over all instructions in the outlined region as findInputsOutputs does, I think you will falsely detect too many values as being private.

Please let me know if I misunderstood your suggestion! :)

Also, what is the disadvantage of the current approach of pre-processing the omp.parallel op clone by remapping the block argument uses to the original SSA values that will be privatized?

Contributor:

Also, what is the disadvantage of the current approach of pre-processing the omp.parallel op clone by remapping the block argument uses to the original SSA values that will be privatized?

Thanks for the analysis. I'm okay with the parallel clone op, if there are no alternatives. Let us wait for others to comment though; I am not super confident on this.

@ergawy force-pushed the delayed_privatization_6 branch 2 times, most recently from ed4a853 to 0e1137a on February 16, 2024 at 06:54
/// variables) with the actual private varibles they correspond to. This
/// prepares the parallel op so that it matches what is expected by the
/// OMPIRBuilder.
static void prepareOmpParallelForPrivatization(omp::ParallelOp opInst) {
Contributor:

Thank you for the explanation, I understand the problem a bit better now. At the moment it might be the case that this is the last time the omp.parallel operation is used, but in general we shouldn't modify MLIR operations during translation to LLVM IR, or we would introduce some hard-to-debug problems in the future.

When processing the omp.private ops you're creating clones, making changes to those and then deleting them, could something like this be done for omp.parallel? That way we wouldn't have to rely on the omp.parallel operation never being used again after translation.

@luporl (Contributor) commented on Feb 16, 2024:
You might consider using stacked PRs next time.
The llvm/llvm-project repository allows user branches for this purpose (https://llvm.org/docs/GitHub.html#branches).

@kiranchandramohan (Contributor):

@jdoerfert FYI, this patch is beginning to use the Privatization Callback for privatization.

/// variables) with the actual private variables they correspond to. This
/// prepares the parallel op so that it matches what is expected by the
/// OMPIRBuilder.
static void prepareOmpParallelForPrivatization(omp::ParallelOp opInst) {
Contributor:

The OpenMPIRBuilder::createParallel(...) collects the inputs and outputs of the parallel region. The inputs are then iterated to invoke the private/firstprivate/shared callback.
The input as defined by the CodeExtractor is any value defined outside the region and used inside it.
Therefore, if OpenMPIRBuilder::createParallel is invoked with the block arguments still used inside the parallel region, the IR builder won't detect these as inputs and won't invoke the PrivCB for private/firstprivate variables.

Wouldn't it still be called for the original variable (%x here)? And at that point can't we get (%arg0) and do the appropriate processing?

@llvmbot llvmbot added the clang:openmp OpenMP related changes to Clang label Feb 20, 2024
@NimishMishra (Contributor) left a comment:

Thanks @ergawy for the patch. I have a few questions.

Additionally, I tried to build this locally and check the LLVM IR. But I am experiencing a crash. I think the same issue is with the build-bot too. Once that is resolved, maybe we can take a relook at the IR.

@ergawy force-pushed the delayed_privatization_6 branch 4 times, most recently from 95bb573 to af31311 on February 21, 2024 at 05:02
@skatrak (Contributor) left a comment:

Thank you for addressing my previous comments. I think this looks good, but I'll leave the approval to other reviewers depending on what's decided about either making some OMPIRBuilder changes or keeping the omp.parallel clone approach.

@kiranchandramohan (Contributor) left a comment:

It is unfortunate we have to clone the operation. This might be costly if there are many parallel operations. Ideally the OpenMPIRBuilder should be modified to make this work.

It is alright as a solution for now. Please wait a day in case @jdoerfert has comments.

@ergawy (Member Author) commented on Feb 22, 2024:

It is unfortunate we have to clone the operation. This might be costly if there are many parallel operations. Ideally the OpenMPIRBuilder should be modified to make this work.

It is alright as a solution for now. Please wait a day in case @jdoerfert has comments.

We don't have to clone. At some point, I directly re-mapped the region arguments to the original SSA values they privatize and things were working fine.

If preferred, I can go back to doing that and then undoing this remapping after the conversion. This way the op is not cloned and at the same time is restored to its original state after lowering.

@kiranchandramohan (Contributor):

Is the privatizer callback called after outlining?
If not, the module translation already has the MLIR and LLVM values that are going to be privatized. So it can effectively work with this information in a parameter-less callback.

@NimishMishra (Contributor):

Is the privatizer callback called after outlining? If not, the module translation already has the MLIR and LLVM values that are going to be privatized. So it can effectively work with this information in a parameter-less callback.

Does outlining remove the MLIR value? I am not aware of this; if you could explain a bit...

@NimishMishra (Contributor) commented on Feb 22, 2024:

If preferred, I can go back to doing that and then undoing this remapping after the conversion. This way the op is not cloned and at the same time is restored to its original state after lowering.

If this approach is taken, I think it implies the following. Correct me please if my understanding is incorrect.

  1. If the remapped operation is short-lived (i.e., if the number of transformations/checks we perform on the remapped operation is reasonably small), then the possibility of unforeseen problems arising with it is lower. I think @skatrak had a comment on hard-to-debug problems with modifying the original operation.

  2. By not explicitly cloning the parallel op, we would be avoiding perf overheads, as @kiranchandramohan mentioned. I am not sure, but would the perf overhead of remapping and then undoing it be less than that of cloning?

    This is of course if the privCB is not called after outlining, as Kiran mentioned.

@ergawy (Member Author) commented on Feb 23, 2024:

Thanks everyone for the discussion.

Just to clarify what I meant, please have a look at this new commit: d9a97e6. With this approach, there is no cloning, and the MLIR op is restored to its original state after conversion.

Is the privatizer callback called after outlining?

@kiranchandramohan I might have misunderstood what you mean, but the problem is not really related to when the callback is called, it is purely on the MLIR level. The callback will not be called unless an input is detected by the CodeExtractor. No input will be detected by the CodeExtractor if the value does not cross into the parallel op's region. Hence the need for the mapping I did on the MLIR level.

@NimishMishra both your points are correct yes. We want to restore the MLIR op so that we don't accidentally invalidate it and make it difficult to debug (for example, printing might crash). We also want to reduce the cost by not cloning.

Is this RAII approach better?
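
As a side note, the "remap, then undo after conversion" idea is essentially a scope guard. The toy C++ below (names hypothetical; it models uses as a plain map of strings rather than real MLIR values) sketches the RAII shape being proposed:

```cpp
#include <cassert>
#include <map>
#include <string>

// Toy model of the RAII remapping: while the guard is alive, uses of the
// region block argument are redirected to the original host value; the
// destructor restores the original mapping when the guard goes out of scope.
class ScopedRemap {
public:
  ScopedRemap(std::map<std::string, std::string> &uses,
              const std::string &blockArg, const std::string &hostValue)
      : uses_(uses), key_(blockArg), saved_(uses[blockArg]) {
    uses_[key_] = hostValue; // remap for the duration of the translation
  }
  ~ScopedRemap() { uses_[key_] = saved_; } // undo the remapping on scope exit

private:
  std::map<std::string, std::string> &uses_;
  std::string key_, saved_;
};
```

The destructor runs even on early returns, which is what makes it hard to accidentally leave the op in a modified state.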

@skatrak (Contributor) left a comment:

This compromise seems reasonable to me. The operation is not invalidated and a full clone is avoided. Without a way to get the OMPIRBuilder to detect these block arguments as external variables to be privatized, this seems like a good plan B.

@ergawy force-pushed the delayed_privatization_6 branch 3 times, most recently from 38385f0 to 4f6d692 on February 25, 2024 at 07:20
@kiranchandramohan (Contributor):

@kiranchandramohan I might have misunderstood what you mean, but the problem is not really related to when the callback is called, it is purely on the MLIR level. The callback will not be called unless an input is detected by the CodeExtractor. No input will be detected by the CodeExtractor if the value does not cross into the parallel op's region. Hence the need for the mapping I did on the MLIR level.

Isn't the CodeExtractor used for outlining?

What I was saying was that an alternative way to implement the callback would be to invoke it before outlining, and without calling it per input. MLIR already carries enough information about the variable/value that is privatized; it only needs materialization of the alloca. So the callback, when invoked, can go through the privatization operands, create allocas for them, and just replace the privatized value with the alloc'ed value.

Note: This is just for info and not requesting a change here.
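
A rough pseudocode sketch of this alternative (helper names are hypothetical; this is not the current OMPIRBuilder API):

```
// Invoked once, before outlining:
for each (privatizer, hostValue, regionArg) in parallelOp.privateClauseOperands():
  privateAlloc = materialize(privatizer.allocRegion, hostValue)
  replaceAllUsesWith(regionArg, privateAlloc)
// After this, outlining only ever sees the materialized allocas.
```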

@ergawy force-pushed the delayed_privatization_6 branch 3 times, most recently from df16ecc to 689bc6c on February 26, 2024 at 10:18
Adds basic support for materializing delayed privatization. So far, the
restrictions on the implementation are:
- Only `private` clauses are supported (`firstprivate` support will be
  added in a later PR).
- Only single-block `omp.private -> alloc` regions are supported
  (multi-block ones will be supported in a later PR).
@ergawy ergawy merged commit 9d56be0 into llvm:main Feb 28, 2024
3 of 4 checks passed
Labels
clang:openmp OpenMP related changes to Clang flang:fir-hlfir flang:openmp flang Flang issues not falling into any other category mlir:llvm mlir:openmp mlir
6 participants