@@ -0,0 +1,54 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -verify-machineinstrs -ppc-asm-full-reg-names -mtriple=powerpc64-ibm-aix-xcoff | FileCheck %s

%struct.anon = type { i32 }

@b = local_unnamed_addr global %struct.anon { i32 -1 }, align 4
@g = local_unnamed_addr global [1 x i1] zeroinitializer, align 1

define noundef signext i32 @main() {
; CHECK-LABEL: main:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: ld r3, L..C0(r2) # @b
; CHECK-NEXT: lwz r3, 0(r3)
; CHECK-NEXT: extsw r4, r3
; CHECK-NEXT: neg r4, r4
; CHECK-NEXT: andi. r5, r3, 65535
; CHECK-NEXT: rldicl r4, r4, 1, 63
; CHECK-NEXT: bne cr0, L..BB0_4
; CHECK-NEXT: # %bb.1: # %lor.rhs.i.i
; CHECK-NEXT: xori r5, r4, 1
; CHECK-NEXT: cmpw r3, r5
; CHECK-NEXT: crnot 4*cr5+lt, eq
; CHECK-NEXT: li r3, 1
; CHECK-NEXT: bc 12, 4*cr5+lt, L..BB0_3
; CHECK-NEXT: # %bb.2: # %lor.rhs.i.i
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: L..BB0_3: # %lor.rhs.i.i
; CHECK-NEXT: ld r5, L..C1(r2) # @g
; CHECK-NEXT: stb r3, 0(r5)
; CHECK-NEXT: L..BB0_4: # %g.exit
; CHECK-NEXT: ld r5, L..C1(r2) # @g
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: stb r4, 0(r5)
; CHECK-NEXT: blr
entry:
  %0 = load i32, ptr @b, align 4
  %conv4.i = sext i32 %0 to i64
  %cmp.i = icmp slt i32 %0, 1
  %conv.i = zext i1 %cmp.i to i32
  %cmp1.i = icmp ne i32 %0, %conv.i
  %conv3.i = trunc i32 %0 to i16
  %tobool.not.i.i = icmp eq i16 %conv3.i, 0
  br i1 %tobool.not.i.i, label %lor.rhs.i.i, label %g.exit

lor.rhs.i.i:                                      ; preds = %entry
  store i1 %cmp1.i, ptr @g, align 1
  br label %g.exit

g.exit:                                           ; preds = %entry, %lor.rhs.i.i
  %1 = trunc i64 %conv4.i to i32
  %cmp.i9.i = icmp sgt i32 %1, 0
  store i1 %cmp.i9.i, ptr @g, align 1
  ret i32 0
}
@@ -11,4 +11,4 @@ target triple = "x86_64-pc-windows-msvc"
; CHECK: U f

declare dllimport void @f()
@fp = constant ptr @f
@@ -12,6 +12,6 @@ declare dllimport void @f()

define void @g() {
  call void @f()
  store i32 42, ptr @v
  ret void
}
@@ -0,0 +1,26 @@
//===- AllInterfaces.h ------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a common entry point for registering all external
// interface implementations for the Linalg dialect.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_LINALG_TRANSFORMS_ALLINTERFACES_H
#define MLIR_DIALECT_LINALG_TRANSFORMS_ALLINTERFACES_H

namespace mlir {
class DialectRegistry;

namespace linalg {
void registerAllDialectInterfaceImplementations(DialectRegistry &registry);
} // namespace linalg

} // namespace mlir

#endif // MLIR_DIALECT_LINALG_TRANSFORMS_ALLINTERFACES_H
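A minimal usage sketch (illustrative, not part of the change): a tool that wants every Linalg external model wires this single entry point into its dialect registry before creating a context.

#include "mlir/Dialect/Linalg/Transforms/AllInterfaces.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/MLIRContext.h"

int main() {
  mlir::DialectRegistry registry;
  // One call attaches all Linalg external interface models (bufferization,
  // tiling, subset insertion, value bounds, mesh sharding) to the registry.
  mlir::linalg::registerAllDialectInterfaceImplementations(registry);
  mlir::MLIRContext context(registry); // models resolve when dialects load
  return 0;
}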
@@ -0,0 +1,20 @@
//===- MeshShardingInterfaceImpl.h ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_LINALG_MESHSHARDINGINTERFACEIMPL_H
#define MLIR_DIALECT_LINALG_MESHSHARDINGINTERFACEIMPL_H

namespace mlir {
class DialectRegistry;

namespace linalg {
void registerMeshShardingInterfaceExternalModels(DialectRegistry &registry);
} // namespace linalg
} // namespace mlir

#endif // MLIR_DIALECT_LINALG_MESHSHARDINGINTERFACEIMPL_H
@@ -0,0 +1,24 @@
//===- AllInterfaces.cpp --------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/AllInterfaces.h"

#include "mlir/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/SubsetInsertionOpInterfaceImpl.h"
#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h"

void mlir::linalg::registerAllDialectInterfaceImplementations(
    DialectRegistry &registry) {
  registerBufferizableOpInterfaceExternalModels(registry);
  registerMeshShardingInterfaceExternalModels(registry);
  registerSubsetOpInterfaceExternalModels(registry);
  registerTilingInterfaceExternalModels(registry);
  registerValueBoundsOpInterfaceExternalModels(registry);
}
@@ -0,0 +1,353 @@
//===- MeshShardingInterfaceImpl.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Linalg/Transforms/MeshShardingInterfaceImpl.h"

#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
#include "mlir/Dialect/Mesh/IR/MeshOps.h"
#include "mlir/Dialect/Mesh/Interfaces/ShardingInterface.h"
#include "mlir/Dialect/Mesh/Interfaces/ShardingInterfaceImpl.h"
#include "mlir/Dialect/Mesh/Transforms/Transforms.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/IR/Value.h"
#include "mlir/Interfaces/TilingInterface.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include <iterator>
#include <optional>
#include <utility>

namespace mlir::linalg {

using MeshAxis = mesh::MeshAxis;
using ReductionKind = mesh::ReductionKind;
using MeshShardingAttr = mesh::MeshShardingAttr;
using ShardingArray = mesh::ShardingArray;
using MeshOp = mesh::MeshOp;

// Returns the corresponding mesh reduction kind for the given arith op.
static ReductionKind getReductionKind(Operation *op) {
  return llvm::TypeSwitch<Operation *, ReductionKind>(op)
      // Floating-point operations.
      .Case([](arith::AddFOp op) { return ReductionKind::Sum; })
      .Case([](arith::MulFOp op) { return ReductionKind::Product; })
      // TODO: handle maxnumf and minnumf.
      .Case([](arith::MaximumFOp op) { return ReductionKind::Max; })
      .Case([](arith::MinimumFOp op) { return ReductionKind::Min; })
      // Integer operations.
      .Case([](arith::AddIOp op) { return ReductionKind::Sum; })
      .Case([](arith::OrIOp op) { return ReductionKind::BitwiseOr; })
      .Case([](arith::XOrIOp op) { return ReductionKind::BitwiseXor; })
      .Case([](arith::AndIOp op) { return ReductionKind::BitwiseAnd; })
      // TODO: handle signless, signed and unsigned types properly.
      // It is assumed that the element type of the collective operands and
      // result drives the meaning of the reduction kind, whether it is signed
      // or unsigned.
      // The reduction op inside the linalg op may have a different result type
      // from the element type of the linalg op's result.
      // Also, signed and unsigned Arith dialect ops may accept signed, unsigned
      // or signless operands.
      // Maybe expand the reduction kinds.
      .Case([](arith::MaxUIOp op) { return ReductionKind::Max; })
      .Case([](arith::MinUIOp op) { return ReductionKind::Min; })
      .Case([](arith::MaxSIOp op) { return ReductionKind::Max; })
      .Case([](arith::MinSIOp op) { return ReductionKind::Min; })
      .Case([](arith::MulIOp op) { return ReductionKind::Product; })
      .Default([](Operation *op) { return ReductionKind::Generic; });
}

static std::optional<Operation *> getCombinerOp(LinalgOp op) {
  SmallVector<Operation *> combinerOps;
  Value reducedValue = matchReduction(op.getRegionOutputArgs(), 0, combinerOps);
  if (!reducedValue || combinerOps.size() != 1) {
    return std::nullopt;
  }

  return combinerOps[0];
}

static ReductionKind getReductionKindOfLinalgOp(LinalgOp op) {
  std::optional<Operation *> reductionOp = getCombinerOp(op);
  if (!reductionOp) {
    return ReductionKind::Generic;
  }
  [[maybe_unused]] Type resultElementType =
      llvm::cast<RankedTensorType>(op->getResult(0).getType()).getElementType();
  // TODO: handle the case when the result type of the reduction op does not
  // match the element type of the result tensor.
  // Would it make sense at all?
  assert(resultElementType == reductionOp.value()->getResult(0).getType());
  return getReductionKind(reductionOp.value());
}

static MeshOp getMesh(Operation *op,
                      ArrayRef<MeshShardingAttr> operandShardings,
                      ArrayRef<MeshShardingAttr> resultShardings,
                      SymbolTableCollection &symbolTable) {
  for (MeshShardingAttr sharding : operandShardings) {
    if (sharding) {
      return mesh::getMesh(op, sharding.getMesh(), symbolTable);
    }
  }

  for (MeshShardingAttr sharding : resultShardings) {
    if (sharding) {
      return mesh::getMesh(op, sharding.getMesh(), symbolTable);
    }
  }

  assert(false);
  return nullptr;
}

// Choose the operand based on the current process index along the reduction
// mesh axes.
// We need to use the initial value only once to avoid including it in the
// reduction multiple times.
// In each process group only the leading process with linear index 0 would use
// the original operand.
// The other processes would use the reduction operation neutral tensor.
static Value createDestinationPassingStyleInitOperand(
    LinalgOp op, Value spmdizedOperand, ArrayRef<MeshAxis> reductionMeshAxes,
    MeshOp meshOp, ImplicitLocOpBuilder &builder) {
  Value processLinearIndexInReductionGroup = mesh::createProcessLinearIndex(
      meshOp.getSymName(), reductionMeshAxes, builder);
  Value zero = builder.create<arith::ConstantIndexOp>(0);
  Value isLeadProcess = builder.create<arith::CmpIOp>(
      builder.getI1Type(), arith::CmpIPredicate::eq,
      processLinearIndexInReductionGroup, zero);
  scf::IfOp ifOp = builder.create<scf::IfOp>(spmdizedOperand.getType(),
                                             isLeadProcess, true, true);
  // Then block.
  {
    OpBuilder::InsertionGuard insertionGuard(builder);
    builder.setInsertionPointToEnd(&ifOp.getThenRegion().front());
    builder.create<scf::YieldOp>(spmdizedOperand);
  }

  // Else block.
  {
    OpBuilder::InsertionGuard insertionGuard(builder);
    builder.setInsertionPointToEnd(&ifOp.getElseRegion().front());
    SmallVector<OpFoldResult> shape =
        tensor::getMixedSizes(builder, builder.getLoc(), spmdizedOperand);
    PartialReductionOpInterface partialReductionIface =
        llvm::cast<PartialReductionOpInterface>(op.getOperation());
    FailureOr<Operation *> reductionNeutralTensorOp =
        partialReductionIface.generateInitialTensorForPartialReduction(
            builder, builder.getLoc(), shape, {});
    assert(succeeded(reductionNeutralTensorOp));
    builder.create<scf::YieldOp>(
        reductionNeutralTensorOp.value()->getResult(0));
  }
  return ifOp.getResult(0);
}

// Create the DPS init operands for the spmdized Linalg op.
// Return all the new spmdized operands.
static SmallVector<Value> createDestinationPassingStyleInitOperands(
    LinalgOp op, MeshOp meshOp, ArrayRef<Value> spmdizedOperands,
    ArrayRef<MeshAxis> reductionMeshAxes, IRMapping &spmdizationMap,
    ImplicitLocOpBuilder &builder) {
  // TODO: add support for multiple destination passing style initial value
  // operands.
  // PartialReductionOpInterface::generateInitialTensorForPartialReduction
  // needs to also support multiple DPS initial operands.
  SmallVector<Value> newOperands = llvm::to_vector(spmdizedOperands);
  auto operandIdx = op.getDpsInitOperand(0)->getOperandNumber();
  Value spmdizedInitOperand =
      spmdizationMap.lookup(op->getOperands()[operandIdx]);
  newOperands[operandIdx] = createDestinationPassingStyleInitOperand(
      op, spmdizedInitOperand, reductionMeshAxes, meshOp, builder);
  return newOperands;
}

static void createAllReduceForResultWithoutPartialSharding(
    Value unshardedLinalgOpResult, ArrayRef<MeshAxis> opReductionMeshAxes,
    MeshShardingAttr resultSharding, ReductionKind reductionKind,
    IRMapping &spmdizationMap, ImplicitLocOpBuilder &builder) {
  SmallVector<MeshAxis> allReduceMeshAxes;
  llvm::copy_if(opReductionMeshAxes, std::back_inserter(allReduceMeshAxes),
                [&resultSharding](MeshAxis axis) {
                  return !llvm::is_contained(resultSharding.getPartialAxes(),
                                             axis);
                });
  if (allReduceMeshAxes.empty()) {
    return;
  }

  Value spmdizedLinalgOpResult = spmdizationMap.lookup(unshardedLinalgOpResult);
  Value reducedValue = builder.create<mesh::AllReduceOp>(
      spmdizedLinalgOpResult, resultSharding.getMesh().getValue(),
      allReduceMeshAxes, reductionKind);
  spmdizationMap.map(unshardedLinalgOpResult, reducedValue);
}

static void createAllReduceForResultsWithoutPartialShardings(
    LinalgOp unshardedOp, ArrayRef<MeshAxis> opReductionMeshAxes,
    ArrayRef<MeshShardingAttr> resultShardings, IRMapping &spmdizationMap,
    ImplicitLocOpBuilder &builder) {
  ReductionKind reductionKind = getReductionKindOfLinalgOp(unshardedOp);
  for (auto [unshardedLinalgOpResult, resultSharding] :
       llvm::zip_equal(unshardedOp->getResults(), resultShardings)) {
    createAllReduceForResultWithoutPartialSharding(
        unshardedLinalgOpResult, opReductionMeshAxes, resultSharding,
        reductionKind, spmdizationMap, builder);
  }
}

static void spmdizeLinalgOpWithShardedReduction(
    LinalgOp op, ArrayRef<Value> spmdizedOperands,
    ArrayRef<MeshShardingAttr> operandShardings,
    ArrayRef<MeshShardingAttr> resultShardings,
    ArrayRef<utils::IteratorType> loopIteratorTypes,
    ArrayRef<SmallVector<MeshAxis>> meshAxisAssignmentForLoopIterators,
    IRMapping &spmdizationMap, SymbolTableCollection &symbolTable,
    ImplicitLocOpBuilder &builder) {
  MeshOp mesh = getMesh(op, operandShardings, resultShardings, symbolTable);
  SmallVector<MeshAxis> reductionMeshAxes = mesh::getReductionMeshAxes(
      loopIteratorTypes, meshAxisAssignmentForLoopIterators);
  SmallVector<Value> spmdizedLinalgOpOperands =
      createDestinationPassingStyleInitOperands(op, mesh, spmdizedOperands,
                                                reductionMeshAxes,
                                                spmdizationMap, builder);
  // We must not change the operand mappings of the original spmdizationMap as
  // they are the mappings for the whole spmdization blob and may be used by
  // others.
  IRMapping internalSpmdizationMap;
  for (auto [unshardedOperand, spmdizedOperand] :
       llvm::zip_equal(op->getOperands(), spmdizedLinalgOpOperands)) {
    internalSpmdizationMap.map(unshardedOperand, spmdizedOperand);
  }
  spmdizeTriviallyShardableOperation(
      *op, spmdizedLinalgOpOperands, operandShardings, resultShardings,
      internalSpmdizationMap, symbolTable, builder);
  for (Value result : op->getResults()) {
    spmdizationMap.map(result, internalSpmdizationMap.lookup(result));
  }

  // Handle partial shardings.
  createAllReduceForResultsWithoutPartialShardings(
      op, reductionMeshAxes, resultShardings, spmdizationMap, builder);
}

namespace {

// ShardingInterface for ops that implement LinalgStructuredInterface.
// The supported ops are only those where the indexing maps are projected
// permutations.
template <typename Op>
struct StructuredOpShardingInterface
    : public mesh::ShardingInterface::ExternalModel<
          StructuredOpShardingInterface<Op>, Op> {
  SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
    return llvm::cast<LinalgOp>(op).getIteratorTypesArray();
  }

  SmallVector<AffineMap> getIndexingMaps(Operation *op) const {
    LinalgOp linalgOp = llvm::cast<LinalgOp>(op);
    SmallVector<AffineMap> res = linalgOp.getIndexingMapsArray();

    // Results must have the same indexing as destination passing style initial
    // operands.
    for (int64_t i = 0; i < linalgOp.getNumDpsInits(); ++i) {
      res.push_back(res[linalgOp.getDpsInitOperand(i)->getOperandNumber()]);
    }

    return res;
  }

  LogicalResult spmdize(Operation *op, ArrayRef<Value> spmdizedOperands,
                        ArrayRef<MeshShardingAttr> operandShardings,
                        ArrayRef<MeshShardingAttr> resultShardings,
                        IRMapping &spmdizationMap,
                        SymbolTableCollection &symbolTable,
                        OpBuilder &builder) const {
    LinalgOp linalgOp = llvm::cast<LinalgOp>(op);

    SmallVector<AffineMap> indexingMaps = linalgOp.getIndexingMapsArray();
    bool allIndexingMapsAreProjectedPermutation =
        llvm::all_of(indexingMaps, [](AffineMap map) {
          return map.isProjectedPermutation();
        });
    if (!allIndexingMapsAreProjectedPermutation) {
      // TODO: handle non-projected permutations.
      return op->emitOpError()
             << "only indexing maps that are projected permutations are "
                "supported";
    }

    SmallVector<utils::IteratorType> loopIteratorTypes =
        linalgOp.getIteratorTypesArray();
    ShardingArray meshAxisAssignmentForLoopIterators =
        getMeshAxisAssignmentForLoopIterators(operandShardings, resultShardings,
                                              loopIteratorTypes, indexingMaps);
    if (mesh::isAtLeastOneReductionIteratorSharded(
            loopIteratorTypes, meshAxisAssignmentForLoopIterators)) {
      ImplicitLocOpBuilder implicitLocBuilder(op->getLoc(), builder);
      spmdizeLinalgOpWithShardedReduction(
          linalgOp, spmdizedOperands, operandShardings, resultShardings,
          loopIteratorTypes, meshAxisAssignmentForLoopIterators, spmdizationMap,
          symbolTable, implicitLocBuilder);
    } else {
      spmdizeTriviallyShardableOperation(*op, spmdizedOperands,
                                         operandShardings, resultShardings,
                                         spmdizationMap, symbolTable, builder);
    }

    return success();
  }
};

} // namespace

template <typename OpType>
static void registerOne(MLIRContext *ctx) {
  OpType::template attachInterface<StructuredOpShardingInterface<OpType>>(*ctx);
}

/// Variadic helper function that registers the interface for each op type.
template <typename... OpTypes>
static void registerAll(MLIRContext *ctx) {
  (registerOne<OpTypes>(ctx), ...);
}
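// Illustration only (hypothetical op list, not part of the change): a call
// such as registerAll<linalg::MatmulOp, linalg::GenericOp>(ctx) unfolds via
// the C++17 comma fold expression into
//   registerOne<linalg::MatmulOp>(ctx);
//   registerOne<linalg::GenericOp>(ctx);
// attaching StructuredOpShardingInterface to each listed op type in turn.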

void registerMeshShardingInterfaceExternalModels(DialectRegistry &registry) {
  registry.addExtension(+[](MLIRContext *ctx, LinalgDialect *dialect) {
    DialectRegistry registry;
    registry.insert<affine::AffineDialect, arith::ArithDialect, scf::SCFDialect,
                    tensor::TensorDialect>();
    ctx->appendDialectRegistry(registry);
    for (StringRef name : registry.getDialectNames())
      ctx->getOrLoadDialect(name);

    registerOne<linalg::GenericOp>(ctx);
    registerAll<
#define GET_OP_LIST
#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
        >(ctx);
  });
}

} // namespace mlir::linalg
@@ -0,0 +1,165 @@
// RUN: mlir-opt \
// RUN:   --mesh-spmdization \
// RUN:   --test-constant-fold \
// RUN:   --split-input-file \
// RUN:   %s | FileCheck %s

// CHECK: #[[$MAP_IDENTITY_1D:.*]] = affine_map<(d0) -> (d0)>
#map_identity_1d = affine_map<(d0) -> (d0)>

mesh.mesh @mesh_1d(shape = 2)

// CHECK-LABEL: func @elementwise_static_1d_mesh_static_1d_tensor
func.func @elementwise_static_1d_mesh_static_1d_tensor(
  // CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<1xi8>,
  %in1: tensor<2xi8>,
  // CHECK-SAME: %[[IN2:[A-Za-z0-9_]+]]: tensor<1xi8>,
  %in2: tensor<2xi8>,
  // CHECK-SAME: %[[DPS_OUT:[A-Za-z0-9_]+]]: tensor<1xi8>
  %dps_out: tensor<2xi8>
// CHECK-SAME: -> tensor<1xi8> {
) -> tensor<2xi8> {
  %in1_shared1 = mesh.shard %in1 to <@mesh_1d, [[0]]> : tensor<2xi8>
  %in1_shared2 = mesh.shard %in1_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<2xi8>
  %in2_shared1 = mesh.shard %in2 to <@mesh_1d, [[0]]> : tensor<2xi8>
  %in2_shared2 = mesh.shard %in2_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<2xi8>
  %dps_out_shared1 = mesh.shard %dps_out to <@mesh_1d, [[0]]> : tensor<2xi8>
  %dps_out_shared2 = mesh.shard %dps_out_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<2xi8>
  // CHECK: %[[RES:.*]] = linalg.generic {
  // CHECK-SAME: indexing_maps = [#[[$MAP_IDENTITY_1D]], #[[$MAP_IDENTITY_1D]], #[[$MAP_IDENTITY_1D]]],
  // CHECK-SAME: iterator_types = ["parallel"]}
  // CHECK-SAME: ins(%[[IN1]], %[[IN2]] : tensor<1xi8>, tensor<1xi8>)
  // CHECK-SAME: outs(%[[DPS_OUT]] : tensor<1xi8>) {
  %res = linalg.generic {
      indexing_maps = [#map_identity_1d, #map_identity_1d, #map_identity_1d],
      iterator_types = ["parallel"]
    } ins(%in1_shared2, %in2_shared2 : tensor<2xi8>, tensor<2xi8>)
      outs(%dps_out_shared2 : tensor<2xi8>) {
    ^bb0(%in1_scalar: i8, %in2_scalar: i8, %out: i8):
      %res_scalar = arith.muli %in1_scalar, %in2_scalar : i8
      linalg.yield %res_scalar : i8
  } -> tensor<2xi8>
  %res_shared1 = mesh.shard %res to <@mesh_1d, [[0]]> : tensor<2xi8>
  %res_shared2 = mesh.shard %res_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<2xi8>
  // CHECK: return %[[RES]] : tensor<1xi8>
  return %res_shared2 : tensor<2xi8>
}

// -----

mesh.mesh @mesh_1d(shape = 4)

// CHECK-LABEL: func @matmul_1d_mesh_static_tensors_parallel_iterator_sharding
func.func @matmul_1d_mesh_static_tensors_parallel_iterator_sharding(
  // CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<1x3xi8>,
  %in1: tensor<4x3xi8>,
  // CHECK-SAME: %[[IN2:[A-Za-z0-9_]+]]: tensor<3x8xi8>,
  %in2: tensor<3x8xi8>,
  // CHECK-SAME: %[[DPS_OUT:[A-Za-z0-9_]+]]: tensor<1x8xi8>
  %dps_out: tensor<4x8xi8>
// CHECK-SAME: -> tensor<1x8xi8> {
) -> tensor<4x8xi8> {
  %in1_shared1 = mesh.shard %in1 to <@mesh_1d, [[0]]> : tensor<4x3xi8>
  %in1_shared2 = mesh.shard %in1_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<4x3xi8>
  %in2_shared1 = mesh.shard %in2 to <@mesh_1d, [[]]> : tensor<3x8xi8>
  %in2_shared2 = mesh.shard %in2_shared1 to <@mesh_1d, [[]]> annotate_for_users: tensor<3x8xi8>
  %dps_out_shared1 = mesh.shard %dps_out to <@mesh_1d, [[0]]> : tensor<4x8xi8>
  %dps_out_shared2 = mesh.shard %dps_out_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<4x8xi8>
  // CHECK: %[[RES:.*]] = linalg.matmul
  // CHECK-SAME: ins(%[[IN1]], %[[IN2]] : tensor<1x3xi8>, tensor<3x8xi8>)
  // CHECK-SAME: outs(%[[DPS_OUT]] : tensor<1x8xi8>)
  // CHECK-SAME: -> tensor<1x8xi8>
  %res = linalg.matmul ins(%in1_shared2, %in2_shared2 : tensor<4x3xi8>, tensor<3x8xi8>)
      outs(%dps_out_shared2 : tensor<4x8xi8>) -> tensor<4x8xi8>
  %res_shared1 = mesh.shard %res to <@mesh_1d, [[0]]> : tensor<4x8xi8>
  %res_shared2 = mesh.shard %res_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<4x8xi8>
  // CHECK: return %[[RES]] : tensor<1x8xi8>
  return %res_shared2 : tensor<4x8xi8>
}

// -----

mesh.mesh @mesh_1d(shape = 3)

// CHECK-LABEL: func @matmul_1d_mesh_static_tensors_reduction_iterator_sharding
func.func @matmul_1d_mesh_static_tensors_reduction_iterator_sharding(
  // CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<4x2xi8>,
  %in1: tensor<4x6xi8>,
  // CHECK-SAME: %[[IN2:[A-Za-z0-9_]+]]: tensor<2x8xi8>,
  %in2: tensor<6x8xi8>,
  // CHECK-SAME: %[[DPS_OUT:[A-Za-z0-9_]+]]: tensor<4x8xi8>
  %dps_out: tensor<4x8xi8>
// CHECK-SAME: -> tensor<4x8xi8> {
) -> tensor<4x8xi8> {
  %in1_shared1 = mesh.shard %in1 to <@mesh_1d, [[], [0]]> : tensor<4x6xi8>
  %in1_shared2 = mesh.shard %in1_shared1 to <@mesh_1d, [[], [0]]> annotate_for_users: tensor<4x6xi8>
  %in2_shared1 = mesh.shard %in2 to <@mesh_1d, [[0]]> : tensor<6x8xi8>
  %in2_shared2 = mesh.shard %in2_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<6x8xi8>
  %dps_out_shared1 = mesh.shard %dps_out to <@mesh_1d, [[]]> : tensor<4x8xi8>
  %dps_out_shared2 = mesh.shard %dps_out_shared1 to <@mesh_1d, [[]]> annotate_for_users: tensor<4x8xi8>
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C0_I8:.*]] = arith.constant 0 : i8
  // CHECK-DAG: %[[PROCESS_IDX:.*]] = mesh.process_multi_index on @mesh_1d axes = [0] : index
  // CHECK-DAG: %[[MESH_SIZE:.*]] = mesh.mesh_shape @mesh_1d axes = [0] : index
  // CHECK: %[[DPS_INIT_OPERAND_CONDITION:.*]] = arith.cmpi eq, %[[PROCESS_IDX]], %[[C0]] : index
  // CHECK: %[[DPS_INIT_OPERAND:.*]] = scf.if %[[DPS_INIT_OPERAND_CONDITION]] -> (tensor<4x8xi8>) {
  // CHECK: scf.yield %[[DPS_OUT]] : tensor<4x8xi8>
  // CHECK: } else {
  // CHECK-DAG: %[[EMPTY_TENSOR:.*]] = tensor.empty() : tensor<4x8xi8>
  // CHECK: %[[NEUTRAL_ELEMENT_FILLED_TENSOR:.*]] = linalg.fill ins(%[[C0_I8]] : i8)
  // CHECK-SAME: outs(%[[EMPTY_TENSOR]] : tensor<4x8xi8>) -> tensor<4x8xi8>
  // CHECK: scf.yield %[[NEUTRAL_ELEMENT_FILLED_TENSOR]] : tensor<4x8xi8>
  // CHECK: }
  // CHECK: %[[SHARDED_MATMUL:.*]] = linalg.matmul ins(%[[IN1]], %[[IN2]] : tensor<4x2xi8>, tensor<2x8xi8>)
  // CHECK-SAME: outs(%[[DPS_INIT_OPERAND]] : tensor<4x8xi8>) -> tensor<4x8xi8>
  // CHECK: %[[ALL_REDUCED:.*]] = mesh.all_reduce %[[SHARDED_MATMUL]] on @mesh_1d mesh_axes = [0] : tensor<4x8xi8> -> tensor<4x8xi8>
  %res = linalg.matmul ins(%in1_shared2, %in2_shared2 : tensor<4x6xi8>, tensor<6x8xi8>)
      outs(%dps_out_shared2 : tensor<4x8xi8>) -> tensor<4x8xi8>
  %res_shared1 = mesh.shard %res to <@mesh_1d, [[]]> : tensor<4x8xi8>
  %res_shared2 = mesh.shard %res_shared1 to <@mesh_1d, [[]]> annotate_for_users: tensor<4x8xi8>
  // CHECK: return %[[ALL_REDUCED]] : tensor<4x8xi8>
  return %res_shared2 : tensor<4x8xi8>
}

// -----

mesh.mesh @mesh_1d(shape = 3)

// CHECK-LABEL: func @matmul_1d_mesh_static_tensors_reduction_iterator_sharding_with_partial_result
func.func @matmul_1d_mesh_static_tensors_reduction_iterator_sharding_with_partial_result(
  // CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<4x2xi8>,
  %in1: tensor<4x6xi8>,
  // CHECK-SAME: %[[IN2:[A-Za-z0-9_]+]]: tensor<2x8xi8>,
  %in2: tensor<6x8xi8>,
  // CHECK-SAME: %[[DPS_OUT:[A-Za-z0-9_]+]]: tensor<4x8xi8>
  %dps_out: tensor<4x8xi8>
// CHECK-SAME: -> tensor<4x8xi8> {
) -> tensor<4x8xi8> {
  %in1_shared1 = mesh.shard %in1 to <@mesh_1d, [[], [0]]> : tensor<4x6xi8>
  %in1_shared2 = mesh.shard %in1_shared1 to <@mesh_1d, [[], [0]]> annotate_for_users: tensor<4x6xi8>
  %in2_shared1 = mesh.shard %in2 to <@mesh_1d, [[0]]> : tensor<6x8xi8>
  %in2_shared2 = mesh.shard %in2_shared1 to <@mesh_1d, [[0]]> annotate_for_users: tensor<6x8xi8>
  %dps_out_shared1 = mesh.shard %dps_out to <@mesh_1d, [[]]> : tensor<4x8xi8>
  %dps_out_shared2 = mesh.shard %dps_out_shared1 to <@mesh_1d, [[]]> annotate_for_users: tensor<4x8xi8>
  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
  // CHECK-DAG: %[[C0_I8:.*]] = arith.constant 0 : i8
  // CHECK-DAG: %[[PROCESS_IDX:.*]] = mesh.process_multi_index on @mesh_1d axes = [0] : index
  // CHECK-DAG: %[[MESH_SIZE:.*]] = mesh.mesh_shape @mesh_1d axes = [0] : index
  // CHECK: %[[DPS_INIT_OPERAND_CONDITION:.*]] = arith.cmpi eq, %[[PROCESS_IDX]], %[[C0]] : index
  // CHECK: %[[DPS_INIT_OPERAND:.*]] = scf.if %[[DPS_INIT_OPERAND_CONDITION]] -> (tensor<4x8xi8>) {
  // CHECK: scf.yield %[[DPS_OUT]] : tensor<4x8xi8>
  // CHECK: } else {
  // CHECK-DAG: %[[EMPTY_TENSOR:.*]] = tensor.empty() : tensor<4x8xi8>
  // CHECK: %[[NEUTRAL_ELEMENT_FILLED_TENSOR:.*]] = linalg.fill ins(%[[C0_I8]] : i8)
  // CHECK-SAME: outs(%[[EMPTY_TENSOR]] : tensor<4x8xi8>) -> tensor<4x8xi8>
  // CHECK: scf.yield %[[NEUTRAL_ELEMENT_FILLED_TENSOR]] : tensor<4x8xi8>
  // CHECK: }
  // CHECK: %[[SHARDED_MATMUL:.*]] = linalg.matmul ins(%[[IN1]], %[[IN2]] : tensor<4x2xi8>, tensor<2x8xi8>)
  // CHECK-SAME: outs(%[[DPS_INIT_OPERAND]] : tensor<4x8xi8>) -> tensor<4x8xi8>
  %res = linalg.matmul ins(%in1_shared2, %in2_shared2 : tensor<4x6xi8>, tensor<6x8xi8>)
      outs(%dps_out_shared2 : tensor<4x8xi8>) -> tensor<4x8xi8>
  %res_shared1 = mesh.shard %res to <@mesh_1d, [[]], partial = sum[0]> : tensor<4x8xi8>
  %res_shared2 = mesh.shard %res_shared1 to <@mesh_1d, [[]], partial = sum[0]> annotate_for_users: tensor<4x8xi8>
  // CHECK: return %[[SHARDED_MATMUL]] : tensor<4x8xi8>
  return %res_shared2 : tensor<4x8xi8>
}
@@ -0,0 +1,124 @@
// RUN: %libomp-compile-and-run
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "omp.h"

#ifndef MAX_BOUND
#define MAX_BOUND 64
#endif
#ifndef _MSC_VER
#define NO_EFFICIENCY_CHECK
#endif

/* To ensure correctness, only valid iterations may execute, and each must
   execute exactly once. Stores the number of times each iteration is
   executed. */
unsigned *execution_count = NULL;
/* Stores the number of iterations executed by each thread. */
unsigned *iterations_per_thread = NULL;

unsigned *Alloc(unsigned bound1, unsigned bound2) {
  return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
}

void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
  memset(p, 0, bound1 * bound2 * sizeof(unsigned));
}

void Free(unsigned *p) { free((void *)p); }

unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
  return &p[i * bound2 + j];
}

int test(unsigned upper_bound) {

  unsigned total_iterations = upper_bound * (upper_bound - 1) / 2;
  unsigned num_threads = omp_get_max_threads();
  unsigned lower_per_chunk = total_iterations / num_threads;
  unsigned upper_per_chunk =
      lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
  int i, j;

  omp_set_num_threads(num_threads);

  ZeroOut(execution_count, upper_bound, upper_bound);
  ZeroOut(iterations_per_thread, num_threads, 1);

#ifdef VERBOSE
  fprintf(stderr,
          "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
          "chunks "
          "loop type lower triangle <,< - ",
          num_threads, upper_bound, total_iterations, lower_per_chunk,
          upper_per_chunk);
#endif

#pragma omp parallel shared(iterations_per_thread, execution_count)
  { /* begin of parallel */
    /* Lower triangular execution_count matrix */
#pragma omp for schedule(static) collapse(2)
    for (i = 0; i < upper_bound; i++) {
      for (j = 0; j < i; j++) {
        (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
        (*Index(execution_count, i, j, upper_bound))++;
      }
    } /* end of for */
  } /* end of parallel */

  /* check the execution_count array */
  for (i = 0; i < upper_bound; i++) {
    for (j = 0; j < i; j++) {
      unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j < i are valid and should have been executed exactly
         once */
      if (value != 1) {
        fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
                i, j, value);
        return 0;
      }
    }
    for (j = i; j < upper_bound; j++) {
      unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j >= i are invalid and should not have been executed
       */
      if (value > 0) {
        fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
                i, j, value);
        return 0;
      }
    }
  }

#ifndef NO_EFFICIENCY_CHECK
  /* Ensure the number of iterations executed by each thread is within bounds */
  for (i = 0; i < num_threads; i++) {
    unsigned value = *Index(iterations_per_thread, i, 0, 1);
    if (value < lower_per_chunk || value > upper_per_chunk) {
      fprintf(stderr,
              "ERROR: Inefficient Collapse thread %d of %d assigned %i "
              "iterations; must be between %d and %d\n",
              i, num_threads, value, lower_per_chunk, upper_per_chunk);
      return 0;
    }
  }
#endif
#ifdef VERBOSE
  fprintf(stderr, "PASSED\r\n");
#endif
  return 1;
}

int main() {

  execution_count = Alloc(MAX_BOUND, MAX_BOUND);
  iterations_per_thread = Alloc(omp_get_max_threads(), 1);

  for (unsigned j = 0; j < MAX_BOUND; j++) {
    if (!test(j))
      return 1;
  }
  Free(execution_count);
  Free(iterations_per_thread);
  return 0;
}
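For intuition about the efficiency bounds checked above: the collapsed triangular loop runs upper_bound * (upper_bound - 1) / 2 iterations in total, so a balanced static schedule hands each thread either the floor or the ceiling of total / num_threads iterations. A standalone sketch of that arithmetic (the bound and thread count here are made-up example values):

#include <cassert>

int main() {
  unsigned upper_bound = 5;                             // example outer bound
  unsigned total = upper_bound * (upper_bound - 1) / 2; // pairs with j < i: 10
  unsigned num_threads = 4;                             // example team size
  unsigned lower_per_chunk = total / num_threads;       // floor -> 2
  unsigned upper_per_chunk =
      lower_per_chunk + ((total % num_threads) ? 1 : 0); // ceiling -> 3
  assert(lower_per_chunk == 2 && upper_per_chunk == 3);
  return 0;
}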
@@ -0,0 +1,124 @@
// RUN: %libomp-compile-and-run
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "omp.h"

#ifndef MAX_BOUND
#define MAX_BOUND 64
#endif
#ifndef _MSC_VER
#define NO_EFFICIENCY_CHECK
#endif

/* To ensure correctness, only valid iterations may execute, and each must
   execute exactly once. Stores the number of times each iteration is
   executed. */
unsigned *execution_count = NULL;
/* Stores the number of iterations executed by each thread. */
unsigned *iterations_per_thread = NULL;

unsigned *Alloc(unsigned bound1, unsigned bound2) {
  return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
}

void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
  memset(p, 0, bound1 * bound2 * sizeof(unsigned));
}

void Free(unsigned *p) { free((void *)p); }

unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
  return &p[i * bound2 + j];
}

int test(unsigned upper_bound) {

  unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
  unsigned num_threads = omp_get_max_threads();
  unsigned lower_per_chunk = total_iterations / num_threads;
  unsigned upper_per_chunk =
      lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
  int i, j;

  omp_set_num_threads(num_threads);

  ZeroOut(execution_count, upper_bound, upper_bound);
  ZeroOut(iterations_per_thread, num_threads, 1);

#ifdef VERBOSE
  fprintf(stderr,
          "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
          "chunks "
          "loop type lower triangle <,<= - ",
          num_threads, upper_bound, total_iterations, lower_per_chunk,
          upper_per_chunk);
#endif

#pragma omp parallel shared(iterations_per_thread, execution_count)
  { /* begin of parallel */
    /* Lower triangular execution_count matrix */
#pragma omp for schedule(static) collapse(2)
    for (i = 0; i < upper_bound; i++) {
      for (j = 0; j <= i; j++) {
        (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
        (*Index(execution_count, i, j, upper_bound))++;
      }
    } /* end of for */
  } /* end of parallel */

  /* check the execution_count array */
  for (i = 0; i < upper_bound; i++) {
    for (j = 0; j <= i; j++) {
      unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j <= i are valid and should have been executed exactly
         once */
      if (value != 1) {
        fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
                i, j, value);
        return 0;
      }
    }
    for (j = i + 1; j < upper_bound; j++) {
      unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j > i are invalid and should not have been executed
       */
      if (value > 0) {
        fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
                i, j, value);
        return 0;
      }
    }
  }

#ifndef NO_EFFICIENCY_CHECK
  /* Ensure the number of iterations executed by each thread is within bounds */
  for (i = 0; i < num_threads; i++) {
    unsigned value = *Index(iterations_per_thread, i, 0, 1);
    if (value < lower_per_chunk || value > upper_per_chunk) {
      fprintf(stderr,
              "ERROR: Inefficient Collapse thread %d of %d assigned %i "
              "iterations; must be between %d and %d\n",
              i, num_threads, value, lower_per_chunk, upper_per_chunk);
      return 0;
    }
  }
#endif
#ifdef VERBOSE
  fprintf(stderr, "PASSED\r\n");
#endif
  return 1;
}

int main() {

  execution_count = Alloc(MAX_BOUND, MAX_BOUND);
  iterations_per_thread = Alloc(omp_get_max_threads(), 1);

  for (unsigned j = 0; j < MAX_BOUND; j++) {
    if (!test(j))
      return 1;
  }
  Free(execution_count);
  Free(iterations_per_thread);
  return 0;
}
@@ -0,0 +1,124 @@
// RUN: %libomp-compile-and-run
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "omp.h"

#ifndef MAX_BOUND
#define MAX_BOUND 64
#endif
#ifndef _MSC_VER
#define NO_EFFICIENCY_CHECK
#endif

/* To ensure correctness, only valid iterations may execute, and each must
   execute exactly once. Stores the number of times each iteration is
   executed. */
unsigned *execution_count = NULL;
/* Stores the number of iterations executed by each thread. */
unsigned *iterations_per_thread = NULL;

unsigned *Alloc(unsigned bound1, unsigned bound2) {
  return (unsigned *)(malloc(bound1 * bound2 * sizeof(unsigned)));
}

void ZeroOut(unsigned *p, unsigned bound1, unsigned bound2) {
  memset(p, 0, bound1 * bound2 * sizeof(unsigned));
}

void Free(unsigned *p) { free((void *)p); }

unsigned *Index(unsigned *p, unsigned i, unsigned j, unsigned bound2) {
  return &p[i * bound2 + j];
}

int test(unsigned upper_bound) {

  unsigned total_iterations = upper_bound * (upper_bound + 1) / 2;
  unsigned num_threads = omp_get_max_threads();
  unsigned lower_per_chunk = total_iterations / num_threads;
  unsigned upper_per_chunk =
      lower_per_chunk + ((total_iterations % num_threads) ? 1 : 0);
  int i, j;

  omp_set_num_threads(num_threads);

  ZeroOut(execution_count, upper_bound, upper_bound);
  ZeroOut(iterations_per_thread, num_threads, 1);

#ifdef VERBOSE
  fprintf(stderr,
          "INFO: Using %6d threads for %6d outer iterations with %6d [%6d:%6d] "
          "chunks "
          "loop type upper triangle <,< - ",
          num_threads, upper_bound, total_iterations, lower_per_chunk,
          upper_per_chunk);
#endif

#pragma omp parallel shared(iterations_per_thread, execution_count)
  { /* begin of parallel */
    /* Upper triangular execution_count matrix */
#pragma omp for schedule(static) collapse(2)
    for (i = 0; i < upper_bound; i++) {
      for (j = i; j < upper_bound; j++) {
        (*Index(iterations_per_thread, omp_get_thread_num(), 0, 1))++;
        (*Index(execution_count, i, j, upper_bound))++;
      }
    } /* end of for */
  } /* end of parallel */

  /* check the execution_count array */
  for (i = 0; i < upper_bound; i++) {
    for (j = i; j < upper_bound; j++) {
      unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j >= i are valid and should have been executed exactly
         once */
      if (value != 1) {
        fprintf(stderr, "ERROR: valid iteration [%i,%i] executed %i times.\n",
                i, j, value);
        return 0;
      }
    }
    for (j = 0; j < i; j++) {
      unsigned value = *Index(execution_count, i, j, upper_bound);
      /* iterations with j < i are invalid and should not have been executed
       */
      if (value > 0) {
        fprintf(stderr, "ERROR: invalid iteration [%i,%i] executed %i times.\n",
                i, j, value);
        return 0;
      }
    }
  }

#ifndef NO_EFFICIENCY_CHECK
  /* Ensure the number of iterations executed by each thread is within bounds */
  for (i = 0; i < num_threads; i++) {
    unsigned value = *Index(iterations_per_thread, i, 0, 1);
    if (value < lower_per_chunk || value > upper_per_chunk) {
      fprintf(stderr,
              "ERROR: Inefficient Collapse thread %d of %d assigned %i "
              "iterations; must be between %d and %d\n",
              i, num_threads, value, lower_per_chunk, upper_per_chunk);
      return 0;
    }
  }
#endif
#ifdef VERBOSE
  fprintf(stderr, "PASSED\r\n");
#endif
  return 1;
}

int main() {

  execution_count = Alloc(MAX_BOUND, MAX_BOUND);
  iterations_per_thread = Alloc(omp_get_max_threads(), 1);

  for (unsigned j = 0; j < MAX_BOUND; j++) {
    if (!test(j))
      return 1;
  }
  Free(execution_count);
  Free(iterations_per_thread);
  return 0;
}