diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h index ef4f64a167742..686bd32217466 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h @@ -107,6 +107,18 @@ class OpenACCClauseCIREmitter final .CaseLower("radeon", mlir::acc::DeviceType::Radeon); } + mlir::acc::GangArgType decodeGangType(OpenACCGangKind gk) { + switch (gk) { + case OpenACCGangKind::Num: + return mlir::acc::GangArgType::Num; + case OpenACCGangKind::Dim: + return mlir::acc::GangArgType::Dim; + case OpenACCGangKind::Static: + return mlir::acc::GangArgType::Static; + } + llvm_unreachable("unknown gang kind"); + } + public: OpenACCClauseCIREmitter(OpTy &operation, CIRGen::CIRGenFunction &cgf, CIRGen::CIRGenBuilderTy &builder, @@ -424,6 +436,42 @@ class OpenACCClauseCIREmitter final return clauseNotImplemented(clause); } } + + void VisitGangClause(const OpenACCGangClause &clause) { + if constexpr (isOneOfTypes) { + if (clause.getNumExprs() == 0) { + operation.addEmptyGang(builder.getContext(), lastDeviceTypeValues); + } else { + llvm::SmallVector values; + llvm::SmallVector argTypes; + for (unsigned i : llvm::index_range(0u, clause.getNumExprs())) { + auto [kind, expr] = clause.getExpr(i); + mlir::Location exprLoc = cgf.cgm.getLoc(expr->getBeginLoc()); + argTypes.push_back(decodeGangType(kind)); + if (kind == OpenACCGangKind::Dim) { + llvm::APInt curValue = + expr->EvaluateKnownConstInt(cgf.cgm.getASTContext()); + // The value is 1, 2, or 3, but the type isn't necessarily smaller + // than 64. + curValue = curValue.sextOrTrunc(64); + values.push_back( + createConstantInt(exprLoc, 64, curValue.getSExtValue())); + } else if (isa(expr)) { + values.push_back(createConstantInt(exprLoc, 64, -1)); + } else { + values.push_back(createIntExpr(expr)); + } + } + + operation.addGangOperands(builder.getContext(), lastDeviceTypeValues, + argTypes, values); + } + } else { + // TODO: When we've implemented this for everything, switch this to an + // unreachable. Combined constructs remain. + return clauseNotImplemented(clause); + } + } }; template diff --git a/clang/test/CIR/CodeGenOpenACC/loop.cpp b/clang/test/CIR/CodeGenOpenACC/loop.cpp index d636d1b37d969..4b7a7e7366323 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop.cpp @@ -323,4 +323,73 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { // CHECK: acc.yield // CHECK-NEXT: } loc } + +#pragma acc parallel + // CHECK: acc.parallel { + { +#pragma acc loop gang + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop gang { + // CHECK: acc.yield + // CHECK-NEXT: } loc +#pragma acc loop gang device_type(nvidia) gang + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop gang([#acc.device_type, #acc.device_type]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc +#pragma acc loop gang(dim:1) device_type(nvidia) gang(dim:2) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64 + // CHECK-NEXT: acc.loop gang({dim=%[[ONE_CONST]] : i64}, {dim=%[[TWO_CONST]] : i64} [#acc.device_type]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc +#pragma acc loop gang(static:N, dim: 1) device_type(nvidia, radeon) gang(static:*, dim : 2) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64 + // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64 + // CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64 + // CHECK-NEXT: acc.loop gang({static=%[[N_CONV]] : si32, dim=%[[ONE_CONST]] : i64}, {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type], {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + } +#pragma acc kernels + // CHECK: acc.kernels { + { +#pragma acc loop gang(num:N) device_type(nvidia, radeon) gang(num:N) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32 + // CHECK-NEXT: acc.loop gang({num=%[[N_CONV]] : si32}, {num=%[[N_CONV2]] : si32} [#acc.device_type], {num=%[[N_CONV2]] : si32} [#acc.device_type]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc +#pragma acc loop gang(static:N) device_type(nvidia) gang(static:*) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64 + // CHECK-NEXT: acc.loop gang({static=%[[N_CONV]] : si32}, {static=%[[STAR_CONST]] : i64} [#acc.device_type]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc +#pragma acc loop gang(static:N, num: N + 1) device_type(nvidia) gang(static:*, num : N + 2) + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32 + // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[CIR_ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[CIR_ONE_CONST]]) nsw : !s32i + // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32 + // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64 + // CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[CIR_TWO_CONST:.*]] = cir.const #cir.int<2> : !s32i + // CHECK-NEXT: %[[N_PLUS_TWO:.*]] = cir.binop(add, %[[N_LOAD3]], %[[CIR_TWO_CONST]]) nsw : !s32i + // CHECK-NEXT: %[[N_PLUS_TWO_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_TWO]] : !s32i to si32 + // CHECK-NEXT: acc.loop gang({static=%[[N_CONV]] : si32, num=%[[N_PLUS_ONE_CONV]] : si32}, {static=%[[STAR_CONST]] : i64, num=%[[N_PLUS_TWO_CONV]] : si32} [#acc.device_type]) { + // CHECK: acc.yield + // CHECK-NEXT: } loc + } } diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index ca564037fad19..5d5add6318e06 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -2231,6 +2231,16 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", // device_types. This is for the case where there is no expression specified // in a 'worker'. void addEmptyWorker(MLIRContext *, llvm::ArrayRef); + + // Adds a collection of operands for a 'gang' clause that has various types + // corresponding to each operand. + void addGangOperands(MLIRContext *, llvm::ArrayRef, + llvm::ArrayRef, mlir::ValueRange); + + // Add an empty value to the 'gang' list with a current list of + // device_types. This is for the case where there is no expression specified + // in a 'gang'. + void addEmptyGang(MLIRContext *, llvm::ArrayRef); }]; let hasCustomAssemblyFormat = 1; diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 9f4645a4a7ca8..7eb72d433c972 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -2748,6 +2748,52 @@ void acc::LoopOp::addEmptyWorker( effectiveDeviceTypes)); } +void acc::LoopOp::addEmptyGang( + MLIRContext *context, llvm::ArrayRef effectiveDeviceTypes) { + setGangAttr(addDeviceTypeAffectedOperandHelper(context, getGangAttr(), + effectiveDeviceTypes)); +} + +void acc::LoopOp::addGangOperands( + MLIRContext *context, llvm::ArrayRef effectiveDeviceTypes, + llvm::ArrayRef argTypes, mlir::ValueRange values) { + llvm::SmallVector segments; + if (std::optional> existingSegments = + getGangOperandsSegments()) + llvm::copy(*existingSegments, std::back_inserter(segments)); + + unsigned beforeCount = segments.size(); + + setGangOperandsDeviceTypeAttr(addDeviceTypeAffectedOperandHelper( + context, getGangOperandsDeviceTypeAttr(), effectiveDeviceTypes, values, + getGangOperandsMutable(), segments)); + + setGangOperandsSegments(segments); + + // This is a bit of extra work to make sure we update the 'types' correctly by + // adding to the types collection the correct number of times. We could + // potentially add something similar to the + // addDeviceTypeAffectedOperandHelper, but it seems that would be pretty + // excessive for a one-off case. + unsigned numAdded = segments.size() - beforeCount; + + if (numAdded > 0) { + llvm::SmallVector gangTypes; + if (getGangOperandsArgTypeAttr()) + llvm::copy(getGangOperandsArgTypeAttr(), std::back_inserter(gangTypes)); + + for (auto i : llvm::index_range(0u, numAdded)) { + llvm::transform(argTypes, std::back_inserter(gangTypes), + [=](mlir::acc::GangArgType gangTy) { + return mlir::acc::GangArgTypeAttr::get(context, gangTy); + }); + (void)i; + } + + setGangOperandsArgTypeAttr(mlir::ArrayAttr::get(context, gangTypes)); + } +} + //===----------------------------------------------------------------------===// // DataOp //===----------------------------------------------------------------------===//