diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index b3d7ab4acf303..18828380abd32 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1359,6 +1359,22 @@ class OpenMPIRBuilder { : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {} }; + /// Generator for `#omp taskloop` + /// + /// \param Loc The location where the taskloop construct was encountered. + /// \param AllocaIP The insertion point to be used for alloca instructions. + /// \param BodyGenCB Callback that will generate the region code. + /// \param LoopInfo Callback that returns the CanonicalLoopInfo (CLI) of the loop. + /// \param LBVal Lower bound value of the loop. + /// \param UBVal Upper bound value of the loop. + /// \param StepVal Step value of the loop. + /// \param Tied True if the task is tied, false if the task is untied. + LLVM_ABI InsertPointOrErrorTy createTaskloop( + const LocationDescription &Loc, InsertPointTy AllocaIP, + BodyGenCallbackTy BodyGenCB, + llvm::function_ref()> LoopInfo, + Value *LBVal, Value *UBVal, Value *StepVal, bool Tied = true); + /// Generator for `#omp task` /// /// \param Loc The location where the task construct was encountered. 
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 46b3d53a4b408..032495dfe9d61 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -95,6 +95,7 @@ __OMP_STRUCT_TYPE(KernelArgs, __tgt_kernel_arguments, false, Int32, Int32, VoidP __OMP_STRUCT_TYPE(AsyncInfo, __tgt_async_info, false, Int8Ptr) __OMP_STRUCT_TYPE(DependInfo, kmp_dep_info, false, SizeTy, SizeTy, Int8) __OMP_STRUCT_TYPE(Task, kmp_task_ompbuilder_t, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr) +__OMP_STRUCT_TYPE(Taskloop, kmp_task_info, false, VoidPtr, VoidPtr, Int32, VoidPtr, VoidPtr, Int64, Int64, Int64) __OMP_STRUCT_TYPE(ConfigurationEnvironment, ConfigurationEnvironmentTy, false, Int8, Int8, Int8, Int32, Int32, Int32, Int32, Int32, Int32) __OMP_STRUCT_TYPE(DynamicEnvironment, DynamicEnvironmentTy, false, Int16) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index fff9a815e5368..e88e722b1370e 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1933,6 +1933,205 @@ static Value *emitTaskDependencies( return DepArray; } +OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop( + const LocationDescription &Loc, InsertPointTy AllocaIP, + BodyGenCallbackTy BodyGenCB, + llvm::function_ref()> loopInfo, + Value *LBVal, Value *UBVal, Value *StepVal, bool Tied) { + + if (!updateToLocation(Loc)) + return InsertPointTy(); + + uint32_t SrcLocStrSize; + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize); + Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize); + + BasicBlock *TaskloopExitBB = + splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit"); + BasicBlock *TaskloopBodyBB = + splitBB(Builder, /*CreateBranch=*/true, "taskloop.body"); + BasicBlock *TaskloopAllocaBB = + splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca"); + + InsertPointTy 
TaskloopAllocaIP = + InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin()); + InsertPointTy TaskloopBodyIP = + InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin()); + + if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP)) + return Err; + + llvm::Expected result = loopInfo(); + if (!result) { + return result.takeError(); + } + + llvm::CanonicalLoopInfo *CLI = result.get(); + OutlineInfo OI; + OI.EntryBB = TaskloopAllocaBB; + OI.OuterAllocaBB = AllocaIP.getBlock(); + OI.ExitBB = TaskloopExitBB; + + // Add the thread ID argument. + SmallVector ToBeDeleted; + // Dummy instruction to be used as a fake argument. + OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal( + Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false)); + + OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Tied, + TaskloopAllocaBB, CLI, Loc, + ToBeDeleted](Function &OutlinedFn) mutable { + // Replace the stale call instruction with the appropriate runtime calls. + assert(OutlinedFn.hasOneUse() && + "there must be a single user for the outlined function"); + CallInst *StaleCI = cast(OutlinedFn.user_back()); + + // HasShareds is true if any variables are captured in the outlined region, + // false otherwise. + bool HasShareds = StaleCI->arg_size() > 1; + Builder.SetInsertPoint(StaleCI); + + // Gather the arguments for emitting the runtime call for + // @__kmpc_omp_task_alloc + Function *TaskAllocFn = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc); + + Value *ThreadID = getOrCreateThreadID(Ident); + + // Emit runtime call for @__kmpc_taskgroup + Function *TaskgroupFn = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup); + Builder.CreateCall(TaskgroupFn, {Ident, ThreadID}); + + // The flags are set to 1 if the task is tied, 0 otherwise. 
+ Value *Flags = Builder.getInt32(Tied); + + Value *TaskSize = Builder.getInt64( + divideCeil(M.getDataLayout().getTypeSizeInBits(Taskloop), 8)); + + Value *SharedsSize = Builder.getInt64(0); + if (HasShareds) { + AllocaInst *ArgStructAlloca = + dyn_cast(StaleCI->getArgOperand(1)); + assert(ArgStructAlloca && + "Unable to find the alloca instruction corresponding to arguments " + "for extracted function"); + StructType *ArgStructType = + dyn_cast(ArgStructAlloca->getAllocatedType()); + assert(ArgStructType && "Unable to find struct type corresponding to " + "arguments for extracted function"); + SharedsSize = + Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType)); + } + + // Emit the @__kmpc_omp_task_alloc runtime call + // The runtime call returns a pointer to an area where the task captured + // variables must be copied before the task is run (TaskData) + CallInst *TaskData = Builder.CreateCall( + TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags, + /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize, + /*task_func=*/&OutlinedFn}); + + // Get pointers to the lb, ub and step fields of the task structure + // and store the lower bound, upper bound and step values + llvm::Value *lb = + Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 5); + // NOTE(review): LBVal/UBVal are stored as-is while StepVal is sign-extended to i64; confirm the bounds already match the Int64 fields + Builder.CreateStore(LBVal, lb); + + llvm::Value *ub = + Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 6); + Builder.CreateStore(UBVal, ub); + + llvm::Value *step = + Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, TaskData, 7); + Value *Step_ext = Builder.CreateSExt(StepVal, Builder.getInt64Ty()); + Builder.CreateStore(Step_ext, step); + llvm::Value *loadstep = Builder.CreateLoad(Builder.getInt64Ty(), step); + + if (HasShareds) { + Value *Shareds = StaleCI->getArgOperand(1); + Align Alignment = 
TaskData->getPointerAlignment(M.getDataLayout()); + Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData); + 
Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment, + SharedsSize); + } + + // Set up the arguments for the __kmpc_taskloop runtime call, + // using default values for ifval, nogroup, sched, grainsize and task_dup + Value *IfVal = Builder.getInt32(1); + Value *NoGroup = Builder.getInt32(1); + Value *Sched = Builder.getInt32(0); + Value *GrainSize = Builder.getInt64(0); + Value *TaskDup = Constant::getNullValue(Builder.getPtrTy()); + + Value *Args[] = {Ident, ThreadID, TaskData, IfVal, lb, ub, + loadstep, NoGroup, Sched, GrainSize, TaskDup}; + + // Emit the @__kmpc_taskloop runtime call + Function *TaskloopFn = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop); + Builder.CreateCall(TaskloopFn, Args); + + // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup + Function *EndTaskgroupFn = + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup); + Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID}); + + StaleCI->eraseFromParent(); + + Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin()); + + if (HasShareds) { + LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1)); + OutlinedFn.getArg(1)->replaceUsesWithIf( + Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; }); + } + + Value *IV = CLI->getIndVar(); + Type *IVTy = IV->getType(); + Constant *One = ConstantInt::get(IVTy, 1); + + Value *task_lb = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, + OutlinedFn.getArg(1), 5, "gep_lb"); + Value *LowerBound = Builder.CreateLoad(IVTy, task_lb, "lb"); + + Value *task_ub = Builder.CreateStructGEP(OpenMPIRBuilder::Taskloop, + OutlinedFn.getArg(1), 6, "gep_ub"); + Value *UpperBound = Builder.CreateLoad(IVTy, task_ub, "ub"); + + Builder.SetInsertPoint(CLI->getPreheader()->getTerminator()); + + Value *TripCountMinusOne = Builder.CreateSub(UpperBound, LowerBound); + Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt"); + // Set the trip count in the CLI + CLI->setTripCount(TripCount); + + 
Builder.SetInsertPoint(CLI->getBody(), + CLI->getBody()->getFirstInsertionPt()); + + llvm::BasicBlock *Body = CLI->getBody(); + for (llvm::Instruction &I : *Body) { + if (auto *Add = llvm::dyn_cast(&I)) { + if (Add->getOpcode() == llvm::Instruction::Add) { + if (llvm::isa(Add->getOperand(0))) { + // Update the starting index of the loop + Add->setOperand(1, LowerBound); + } + } + } + } + + for (Instruction *I : llvm::reverse(ToBeDeleted)) { + I->eraseFromParent(); + } + }; + + addOutlineInfo(std::move(OI)); + Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin()); + return Builder.saveIP(); +} + OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask( const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition, diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 8edec990eaaba..d69fcd3db0413 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -323,6 +323,18 @@ static LogicalResult checkImplementationStatus(Operation &op) { if (op.getDistScheduleChunkSize()) result = todo("dist_schedule with chunk_size"); }; + auto checkFinal = [&todo](auto op, LogicalResult &result) { + if (op.getFinal()) + result = todo("final"); + }; + auto checkGrainsize = [&todo](auto op, LogicalResult &result) { + if (op.getGrainsize()) + result = todo("grainsize"); + }; + auto checkIf = [](auto op, LogicalResult &) { + if (op.getIfExpr()) + op.emitWarning("if"); + }; auto checkHint = [](auto op, LogicalResult &) { if (op.getHint()) op.emitWarning("hint clause discarded"); @@ -340,10 +352,22 @@ static LogicalResult checkImplementationStatus(Operation &op) { if (!op.getLinearVars().empty() || !op.getLinearStepVars().empty()) result = todo("linear"); }; + auto checkMergeable = [&todo](auto op, 
LogicalResult &result) { + if (op.getMergeable()) + result = todo("mergeable"); + }; + auto checkNogroup = [&todo](auto op, LogicalResult &result) { + if (op.getNogroup()) + result = todo("nogroup"); + }; auto checkNowait = [&todo](auto op, LogicalResult &result) { if (op.getNowait()) result = todo("nowait"); }; + auto checkNumTasks = [&todo](auto op, LogicalResult &result) { + if (op.getNumTasks()) + result = todo("num_tasks"); + }; auto checkOrder = [&todo](auto op, LogicalResult &result) { if (op.getOrder() || op.getOrderMod()) result = todo("order"); @@ -417,7 +441,15 @@ static LogicalResult checkImplementationStatus(Operation &op) { checkNowait(op, result); }) .Case([&](omp::TaskloopOp op) { - // TODO: Add other clauses check + checkAllocate(op, result); + checkFinal(op, result); + checkGrainsize(op, result); + checkIf(op, result); + checkInReduction(op, result); + checkMergeable(op, result); + checkNogroup(op, result); + checkNumTasks(op, result); + checkReduction(op, result); checkUntied(op, result); checkPriority(op, result); }) @@ -2097,6 +2129,8 @@ class TaskContextStructManager { /// private decls. void createGEPsToPrivateVars(); + llvm::Value *isAllocated(); + /// De-allocate the task context structure. 
void freeStructPtr(); @@ -2177,13 +2211,26 @@ void TaskContextStructManager::createGEPsToPrivateVars() { } } +llvm::Value *TaskContextStructManager::isAllocated() { + if (!structPtr) + return nullptr; + + return builder.CreateIsNotNull(structPtr); +} + void TaskContextStructManager::freeStructPtr() { if (!structPtr) return; llvm::IRBuilderBase::InsertPointGuard guard{builder}; - // Ensure we don't put the call to free() after the terminator - builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator()); + llvm::BasicBlock *currentBlock = builder.GetInsertBlock(); + if (currentBlock->getTerminator()) { + // Ensure we don't put the call to free() after the terminator + builder.SetInsertPoint(currentBlock->getTerminator()); + } else { + // Insert the call to free() at the end of the current block + builder.SetInsertPoint(currentBlock); + } builder.CreateFree(structPtr); } @@ -2419,6 +2466,207 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder, return success(); } +// Converts an OpenMP taskloop construct into LLVM IR using OpenMPIRBuilder. +static LogicalResult +convertOmpTaskloopOp(Operation &opInst, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; + auto taskloopOp = cast(opInst); + if (failed(checkImplementationStatus(opInst))) + return failure(); + + // Stores the pointers to the allocated firstprivate copies, + // which are used later to free the allocated space. 
+ SmallVector llvmFirstPrivateVars; + PrivateVarsInfo privateVarsInfo(taskloopOp); + TaskContextStructManager taskStructMgr{builder, moduleTranslation, + privateVarsInfo.privatizers}; + + llvm::OpenMPIRBuilder::InsertPointTy allocaIP = + findAllocaInsertPoint(builder, moduleTranslation); + + assert(builder.GetInsertPoint() == builder.GetInsertBlock()->end()); + llvm::BasicBlock *taskloopStartBlock = llvm::BasicBlock::Create( + builder.getContext(), "omp.taskloop.start", + /*Parent=*/builder.GetInsertBlock()->getParent()); + llvm::Instruction *branchToTaskloopStartBlock = + builder.CreateBr(taskloopStartBlock); + builder.SetInsertPoint(branchToTaskloopStartBlock); + + llvm::BasicBlock *copyBlock = + splitBB(builder, /*CreateBranch=*/true, "omp.private.copy"); + llvm::BasicBlock *initBlock = + splitBB(builder, /*CreateBranch=*/true, "omp.private.init"); + + LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocaIP); + + // Allocate and initialize private variables + builder.SetInsertPoint(initBlock->getTerminator()); + + taskStructMgr.generateTaskContextStruct(); + taskStructMgr.createGEPsToPrivateVars(); + + llvmFirstPrivateVars.resize(privateVarsInfo.blockArgs.size()); + int index = 0; + + for (auto [privDecl, mlirPrivVar, blockArg, llvmPrivateVarAlloc] : + llvm::zip_equal(privateVarsInfo.privatizers, privateVarsInfo.mlirVars, + privateVarsInfo.blockArgs, + taskStructMgr.getLLVMPrivateVarGEPs())) { + // To be handled inside the taskloop. 
+ if (!privDecl.readsFromMold()) + continue; + assert(llvmPrivateVarAlloc && + "reads from mold so shouldn't have been skipped"); + + llvm::Expected privateVarOrErr = + initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar, + blockArg, llvmPrivateVarAlloc, initBlock); + if (!privateVarOrErr) + return handleError(privateVarOrErr, *taskloopOp.getOperation()); + + llvmFirstPrivateVars[index++] = privateVarOrErr.get(); + + llvm::IRBuilderBase::InsertPointGuard guard(builder); + builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator()); + + if ((privateVarOrErr.get() != llvmPrivateVarAlloc) && + !mlir::isa(blockArg.getType())) { + builder.CreateStore(privateVarOrErr.get(), llvmPrivateVarAlloc); + // Load it so we have the value pointed to by the GEP + llvmPrivateVarAlloc = builder.CreateLoad(privateVarOrErr.get()->getType(), + llvmPrivateVarAlloc); + } + assert(llvmPrivateVarAlloc->getType() == + moduleTranslation.convertType(blockArg.getType())); + } + + // Firstprivate copy region + setInsertPointForPossiblyEmptyBlock(builder, copyBlock); + if (failed(copyFirstPrivateVars( + taskloopOp, builder, moduleTranslation, privateVarsInfo.mlirVars, + taskStructMgr.getLLVMPrivateVarGEPs(), privateVarsInfo.privatizers, + taskloopOp.getPrivateNeedsBarrier()))) + return llvm::failure(); + + // Set up insertion point for call to createTaskloop() + builder.SetInsertPoint(taskloopStartBlock); + + auto bodyCB = [&](InsertPointTy allocaIP, + InsertPointTy codegenIP) -> llvm::Error { + // Save the alloca insertion point on ModuleTranslation stack for use in + // nested regions. 
+ LLVM::ModuleTranslation::SaveStack frame( + moduleTranslation, allocaIP); + + // Translate the body of the taskloop: + builder.restoreIP(codegenIP); + + llvm::BasicBlock *privInitBlock = nullptr; + privateVarsInfo.llvmVars.resize(privateVarsInfo.blockArgs.size()); + for (auto [i, zip] : llvm::enumerate(llvm::zip_equal( + privateVarsInfo.blockArgs, privateVarsInfo.privatizers, + privateVarsInfo.mlirVars))) { + auto [blockArg, privDecl, mlirPrivVar] = zip; + // This is handled before the task executes + if (privDecl.readsFromMold()) + continue; + + llvm::IRBuilderBase::InsertPointGuard guard(builder); + llvm::Type *llvmAllocType = + moduleTranslation.convertType(privDecl.getType()); + builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); + llvm::Value *llvmPrivateVar = builder.CreateAlloca( + llvmAllocType, /*ArraySize=*/nullptr, "omp.private.alloc"); + + llvm::Expected privateVarOrError = + initPrivateVar(builder, moduleTranslation, privDecl, mlirPrivVar, + blockArg, llvmPrivateVar, privInitBlock); + if (!privateVarOrError) + return privateVarOrError.takeError(); + moduleTranslation.mapValue(blockArg, privateVarOrError.get()); + privateVarsInfo.llvmVars[i] = privateVarOrError.get(); + // Add private var to llvmFirstPrivateVars + llvmFirstPrivateVars[index++] = privateVarOrError.get(); + } + + taskStructMgr.createGEPsToPrivateVars(); + for (auto [i, llvmPrivVar] : + llvm::enumerate(taskStructMgr.getLLVMPrivateVarGEPs())) { + if (!llvmPrivVar) { + assert(privateVarsInfo.llvmVars[i] && + "This is added in the loop above"); + continue; + } + privateVarsInfo.llvmVars[i] = llvmPrivVar; + } + + // Find and map the addresses of each variable within the taskloop context + // structure + for (auto [blockArg, llvmPrivateVar, privateDecl] : + llvm::zip_equal(privateVarsInfo.blockArgs, privateVarsInfo.llvmVars, + privateVarsInfo.privatizers)) { + // This was handled above. 
+ if (!privateDecl.readsFromMold()) + continue; + // Fix broken pass-by-value case for Fortran character boxes + if (!mlir::isa(blockArg.getType())) { + llvmPrivateVar = builder.CreateLoad( + moduleTranslation.convertType(blockArg.getType()), llvmPrivateVar); + } + assert(llvmPrivateVar->getType() == + moduleTranslation.convertType(blockArg.getType())); + moduleTranslation.mapValue(blockArg, llvmPrivateVar); + } + + auto continuationBlockOrError = + convertOmpOpRegions(taskloopOp.getRegion(), "omp.taskloop.region", + builder, moduleTranslation); + ; + if (failed(handleError(continuationBlockOrError, opInst))) + return llvm::make_error(); + + builder.SetInsertPoint(continuationBlockOrError.get()->getTerminator()); + + // Dummy check to ensure that the task context structure is accessed inside + // the outlined function. + llvm::Value *cond = taskStructMgr.isAllocated(); + return llvm::Error::success(); + }; + + auto loopOp = cast(taskloopOp.getWrappedLoop()); + + auto loopInfo = [&]() -> llvm::Expected { + llvm::CanonicalLoopInfo *loopInfo = findCurrentLoopInfo(moduleTranslation); + return loopInfo; + }; + + llvm::OpenMPIRBuilder &ompBuilder = *moduleTranslation.getOpenMPBuilder(); + llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); + llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = + moduleTranslation.getOpenMPBuilder()->createTaskloop( + ompLoc, allocaIP, bodyCB, loopInfo, + moduleTranslation.lookupValue(loopOp.getLoopLowerBounds()[0]), + moduleTranslation.lookupValue(loopOp.getLoopUpperBounds()[0]), + moduleTranslation.lookupValue(loopOp.getLoopSteps()[0])); + + if (failed(handleError(afterIP, opInst))) + return failure(); + + builder.restoreIP(*afterIP); + + // Free the task context structure in the exit block of the taskloop. 
+ if (failed(cleanupPrivateVars(builder, moduleTranslation, taskloopOp.getLoc(), + llvmFirstPrivateVars, + privateVarsInfo.privatizers))) + return failure(); + + taskStructMgr.freeStructPtr(); + + return success(); +} + /// Converts an OpenMP taskgroup construct into LLVM IR using OpenMPIRBuilder. static LogicalResult convertOmpTaskgroupOp(omp::TaskgroupOp tgOp, llvm::IRBuilderBase &builder, @@ -6224,6 +6472,9 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, .Case([&](omp::TaskOp op) { return convertOmpTaskOp(op, builder, moduleTranslation); }) + .Case([&](omp::TaskloopOp op) { + return convertOmpTaskloopOp(*op, builder, moduleTranslation); + }) .Case([&](omp::TaskgroupOp op) { return convertOmpTaskgroupOp(op, builder, moduleTranslation); }) diff --git a/mlir/test/Target/LLVMIR/openmp-taskloop.mlir b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir new file mode 100644 index 0000000000000..536a1fe9d9157 --- /dev/null +++ b/mlir/test/Target/LLVMIR/openmp-taskloop.mlir @@ -0,0 +1,151 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +omp.private {type = private} @_QFtestEi_private_i32 : i32 + +omp.private {type = firstprivate} @_QFtestEa_firstprivate_i32 : i32 copy { +^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.load %arg0 : !llvm.ptr -> i32 + llvm.store %0, %arg1 : i32, !llvm.ptr + omp.yield(%arg1 : !llvm.ptr) +} + + +llvm.func @_QPtest() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr + %3 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr + %6 = llvm.mlir.constant(20 : i32) : i32 + llvm.store %6, %3 : i32, !llvm.ptr + %7 = llvm.mlir.constant(1 : i32) : i32 + %8 = llvm.mlir.constant(5 : i32) : i32 + %9 = llvm.mlir.constant(1 : i32) : i32 + omp.taskloop private(@_QFtestEa_firstprivate_i32 %3 -> %arg0, @_QFtestEi_private_i32 %1 -> %arg1 : !llvm.ptr, !llvm.ptr) { + omp.loop_nest (%arg2) : i32 = (%7) to (%8) inclusive step (%9) { + llvm.store %arg2, 
%arg1 : i32, !llvm.ptr + %10 = llvm.load %arg0 : !llvm.ptr -> i32 + %11 = llvm.mlir.constant(1 : i32) : i32 + %12 = llvm.add %10, %11 : i32 + llvm.store %12, %arg0 : i32, !llvm.ptr + omp.yield + } + } + llvm.return +} + +// CHECK: %struct.kmp_task_info = type { ptr, ptr, i32, ptr, ptr, i64, i64, i64 } + +// CHECK-LABEL: define void @_QPtest() { +// CHECK: %[[STRUCTARG:.*]] = alloca { ptr }, align 8 +// CHECK: %[[VAL1:.*]] = alloca i32, i64 1, align 4 +// CHECK: %[[VAL_X:.*]] = alloca i32, i64 1, align 4 +// CHECK: store i32 20, ptr %[[VAL_X]], align 4 +// CHECK: br label %entry + +// CHECK: entry: +// CHECK: br label %omp.private.init + +// CHECK: omp.private.init: ; preds = %entry +// CHECK: %[[OMP_TASK_CONTEXT_PTR:.*]] = tail call ptr @malloc(i64 ptrtoint (ptr getelementptr ({ i32 }, ptr null, i32 1) to i64)) +// CHECK: %[[PRIV_GEP:.*]] = getelementptr { i32 }, ptr %[[OMP_TASK_CONTEXT_PTR]], i32 0, i32 0 +// CHECK: br label %omp.private.copy + +// CHECK: omp.private.copy: +// CHECK: br label %omp.private.copy1 + +// CHECK: omp.private.copy1: +// CHECK: %[[LOAD_X:.*]] = load i32, ptr %[[VAL_X]], align 4 +// CHECK: store i32 %[[LOAD_X]], ptr %[[PRIV_GEP]], align 4 +// CHECK: br label %omp.taskloop.start + +// CHECK: omp.taskloop.start: +// CHECK: br label %codeRepl + +// CHECK: codeRepl: +// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[STRUCTARG]], i32 0, i32 0 +// CHECK: store ptr %[[OMP_TASK_CONTEXT_PTR]], ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8 +// CHECK: %[[GTID:.*]] = call i32 @__kmpc_global_thread_num(ptr @1) +// CHECK: call void @__kmpc_taskgroup(ptr @1, i32 %[[GTID]]) +// CHECK: %[[TASK_PTR:.*]] = call ptr @__kmpc_omp_task_alloc(ptr @1, i32 %[[GTID]], i32 1, i64 64, i64 8, ptr @_QPtest..omp_par) +// CHECK: %[[LB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 5 +// CHECK: store i32 1, ptr %[[LB_GEP]], align 4 +// CHECK: %[[UB_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, 
ptr %[[TASK_PTR]], i32 0, i32 6 +// CHECK: store i32 5, ptr %[[UB_GEP]], align 4 +// CHECK: %[[STEP_GEP:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR]], i32 0, i32 7 +// CHECK: store i64 1, ptr %[[STEP_GEP]], align 4 +// CHECK: %[[LOAD_STEP:.*]] = load i64, ptr %[[STEP_GEP]], align 4 +// CHECK: %10 = load ptr, ptr %[[TASK_PTR]], align 8 +// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 %10, ptr align 1 %[[STRUCTARG]], i64 8, i1 false) +// CHECK: call void @__kmpc_taskloop(ptr @1, i32 %[[GTID]], ptr %[[TASK_PTR]], i32 1, ptr %[[LB_GEP]], ptr %[[UB_GEP]], i64 %[[LOAD_STEP]], i32 1, i32 0, i64 0, ptr null) +// CHECK: call void @__kmpc_end_taskgroup(ptr @1, i32 %[[GTID]]) +// CHECK: br label %taskloop.exit + +// CHECK: taskloop.exit: +// CHECK: tail call void @free(ptr %[[OMP_TASK_CONTEXT_PTR]]) +// CHECK: ret void +// CHECK: } + +// CHECK-LABEL: define internal void @_QPtest..omp_par +// CHECK-SAME: i32 %[[GLOBAL_TID:.*]], ptr %[[TASK_PTR1:.*]]) { +// CHECK: taskloop.alloca: +// CHECK: %[[LOAD_TASK_PTR:.*]] = load ptr, ptr %[[TASK_PTR1]], align 8 +// CHECK: %[[GEP_LB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 5 +// CHECK: %[[LB:.*]] = load i32, ptr %[[GEP_LB]], align 4 +// CHECK: %[[GEP_UB:.*]] = getelementptr inbounds nuw %struct.kmp_task_info, ptr %[[TASK_PTR1]], i32 0, i32 6 +// CHECK: %[[UB:.*]] = load i32, ptr %[[GEP_UB]], align 4 +// CHECK: %[[GEP_OMP_TASK_CONTEXT_PTR:.*]] = getelementptr { ptr }, ptr %[[LOAD_TASK_PTR]], i32 0, i32 0 +// CHECK: %[[LOADGEP_OMP_TASK_CONTEXT_PTR:.*]] = load ptr, ptr %[[GEP_OMP_TASK_CONTEXT_PTR]], align 8, !align !1 +// CHECK: %[[OMP_PRIVATE_ALLOC:.*]] = alloca i32, align 4 +// CHECK: br label %taskloop.body + +// CHECK: taskloop.body: +// CHECK: %[[LOAD_X:.*]] = getelementptr { i32 }, ptr %[[LOADGEP_OMP_TASK_CONTEXT_PTR]], i32 0, i32 0 +// CHECK: br label %omp.taskloop.region + +// CHECK: omp.taskloop.region: +// CHECK: br label %omp_loop.preheader + +// 
CHECK: omp_loop.preheader: +// CHECK: %[[VAL2:.*]] = sub i32 %[[UB]], %[[LB]] +// CHECK: %[[TRIP_CNT:.*]] = add i32 %[[VAL2]], 1 +// CHECK: br label %omp_loop.header + +// CHECK: omp_loop.header: +// CHECK: %[[OMP_LOOP_IV:.*]] = phi i32 [ 0, %omp_loop.preheader ], [ %omp_loop.next, %omp_loop.inc ] +// CHECK: br label %omp_loop.cond + +// CHECK: omp_loop.cond: +// CHECK: %[[OMP_LOOP_CMP:.*]] = icmp ult i32 %[[OMP_LOOP_IV]], %[[TRIP_CNT]] +// CHECK: br i1 %[[OMP_LOOP_CMP]], label %omp_loop.body, label %omp_loop.exit + +// CHECK: omp_loop.exit: +// CHECK: br label %omp_loop.after + +// CHECK: omp_loop.after: +// CHECK: br label %omp.region.cont + +// CHECK: omp.region.cont: +// CHECK: %[[IS_ALLOCATED:.*]] = icmp ne ptr %[[LOADGEP_OMP_TASK_CONTEXT_PTR]], null +// CHECK: br label %taskloop.exit.exitStub + +// CHECK: omp_loop.body: +// CHECK: %[[VAL3:.*]] = mul i32 %[[OMP_LOOP_IV]], 1 +// CHECK: %[[VAL5:.*]] = add i32 %[[VAL3]], %[[LB]] +// CHECK: br label %omp.loop_nest.region + +// CHECK: omp.loop_nest.region: +// CHECK: store i32 %[[VAL5]], ptr %[[OMP_PRIVATE_ALLOC]], align 4 +// CHECK: %[[VAL6:.*]] = load i32, ptr %[[LOAD_X]], align 4 +// CHECK: %[[RES:.*]] = add i32 %[[VAL6]], 1 +// CHECK: store i32 %[[RES]], ptr %[[LOAD_X]], align 4 +// CHECK: br label %omp.region.cont2 + +// CHECK: omp.region.cont2: +// CHECK: br label %omp_loop.inc + +// CHECK: omp_loop.inc: +// CHECK: %omp_loop.next = add nuw i32 %[[OMP_LOOP_IV]], 1 +// CHECK: br label %omp_loop.header + +// CHECK: taskloop.exit.exitStub: +// CHECK: ret void +// CHECK: } \ No newline at end of file diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir index af6d254cfd3c3..d33cb7e4708b4 100644 --- a/mlir/test/Target/LLVMIR/openmp-todo.mlir +++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir @@ -360,21 +360,8 @@ llvm.func @taskgroup_task_reduction(%x : !llvm.ptr) { // ----- -llvm.func @taskloop(%lb : i32, %ub : i32, %step : i32) { - // expected-error@below {{not yet 
implemented: omp.taskloop}} - // expected-error@below {{LLVM Translation failed for operation: omp.taskloop}} - omp.taskloop { - omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { - omp.yield - } - } - llvm.return -} - -// ----- - llvm.func @taskloop_untied(%lb : i32, %ub : i32, %step : i32) { - // expected-error@below {{not yet implemented: omp.taskloop}} + // expected-error@below {{not yet implemented: Unhandled clause untied in omp.taskloop operation}} // expected-error@below {{LLVM Translation failed for operation: omp.taskloop}} omp.taskloop untied { omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {