diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 7b097d1ac0ee0..71234c7ee898a 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -1731,9 +1731,10 @@ class OpenMPIRBuilder { /// need to be copied to the new function. /// /// \return The ListToGlobalCopy function. - Function *emitListToGlobalCopyFunction(ArrayRef ReductionInfos, - Type *ReductionsBufferTy, - AttributeList FuncAttrs); + Expected + emitListToGlobalCopyFunction(ArrayRef ReductionInfos, + Type *ReductionsBufferTy, + AttributeList FuncAttrs, ArrayRef IsByRef); /// This function emits a helper that copies all the reduction variables from /// the team into the provided global buffer for the reduction variables. @@ -1748,9 +1749,10 @@ class OpenMPIRBuilder { /// need to be copied to the new function. /// /// \return The GlobalToList function. - Function *emitGlobalToListCopyFunction(ArrayRef ReductionInfos, - Type *ReductionsBufferTy, - AttributeList FuncAttrs); + Expected + emitGlobalToListCopyFunction(ArrayRef ReductionInfos, + Type *ReductionsBufferTy, + AttributeList FuncAttrs, ArrayRef IsByRef); /// This function emits a helper that reduces all the reduction variables from /// the team into the provided global buffer for the reduction variables. @@ -1769,10 +1771,11 @@ class OpenMPIRBuilder { /// need to be copied to the new function. /// /// \return The ListToGlobalReduce function. - Function * + Expected emitListToGlobalReduceFunction(ArrayRef ReductionInfos, Function *ReduceFn, Type *ReductionsBufferTy, - AttributeList FuncAttrs); + AttributeList FuncAttrs, + ArrayRef IsByRef); /// This function emits a helper that reduces all the reduction variables from /// the team into the provided global buffer for the reduction variables. @@ -1791,10 +1794,11 @@ class OpenMPIRBuilder { /// need to be copied to the new function. 
/// /// \return The GlobalToListReduce function. - Function * + Expected emitGlobalToListReduceFunction(ArrayRef ReductionInfos, Function *ReduceFn, Type *ReductionsBufferTy, - AttributeList FuncAttrs); + AttributeList FuncAttrs, + ArrayRef IsByRef); /// Get the function name of a reduction function. std::string getReductionFuncName(StringRef Name) const; diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index c962368859730..4264d7cb986af 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3151,9 +3151,9 @@ Expected OpenMPIRBuilder::emitShuffleAndReduceFunction( return SarFunc; } -Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( +Expected OpenMPIRBuilder::emitListToGlobalCopyFunction( ArrayRef ReductionInfos, Type *ReductionsBufferTy, - AttributeList FuncAttrs) { + AttributeList FuncAttrs, ArrayRef IsByRef) { OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( @@ -3223,7 +3223,21 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( switch (RI.EvaluationKind) { case EvalKind::Scalar: { - Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr); + Value *TargetElement; + + if (IsByRef.empty() || !IsByRef[En.index()]) { + TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr); + } else { + InsertPointOrErrorTy GenResult = + RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr); + + if (!GenResult) + return GenResult.takeError(); + + ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr); + TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr); + } + Builder.CreateStore(TargetElement, GlobVal); break; } @@ -3261,9 +3275,9 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( return LtGCFunc; } -Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( +Expected OpenMPIRBuilder::emitListToGlobalReduceFunction( ArrayRef ReductionInfos, 
Function *ReduceFn, - Type *ReductionsBufferTy, AttributeList FuncAttrs) { + Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef IsByRef) { OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( @@ -3302,6 +3316,8 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( Value *LocalReduceList = Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list"); + InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()}; + Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( BufferArgAlloca, Builder.getPtrTy(), BufferArgAlloca->getName() + ".ascast"); @@ -3323,6 +3339,20 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( Type *IndexTy = Builder.getIndexTy( M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); for (auto En : enumerate(ReductionInfos)) { + const ReductionInfo &RI = En.value(); + Value *ByRefAlloc; + + if (!IsByRef.empty() && IsByRef[En.index()]) { + InsertPointTy OldIP = Builder.saveIP(); + Builder.restoreIP(AllocaIP); + + ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType); + ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast( + ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast"); + + Builder.restoreIP(OldIP); + } + Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP( RedListArrayTy, LocalReduceListAddrCast, {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); @@ -3331,7 +3361,21 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( // Global = Buffer.VD[Idx]; Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32( ReductionsBufferTy, BufferVD, 0, En.index()); - Builder.CreateStore(GlobValPtr, TargetElementPtrPtr); + + if (!IsByRef.empty() && IsByRef[En.index()]) { + Value *ByRefDataPtr; + + InsertPointOrErrorTy GenResult = + RI.DataPtrPtrGen(Builder.saveIP(), ByRefAlloc, ByRefDataPtr); + + if (!GenResult) + return GenResult.takeError(); + + 
Builder.CreateStore(GlobValPtr, ByRefDataPtr); + Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr); + } else { + Builder.CreateStore(GlobValPtr, TargetElementPtrPtr); + } } // Call reduce_function(GlobalReduceList, ReduceList) @@ -3344,32 +3388,32 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( return LtGRFunc; } -Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( +Expected OpenMPIRBuilder::emitGlobalToListCopyFunction( ArrayRef ReductionInfos, Type *ReductionsBufferTy, - AttributeList FuncAttrs) { + AttributeList FuncAttrs, ArrayRef IsByRef) { OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()}, /* IsVarArg */ false); - Function *LtGCFunc = + Function *GtLCFunc = Function::Create(FuncTy, GlobalVariable::InternalLinkage, "_omp_reduction_global_to_list_copy_func", &M); - LtGCFunc->setAttributes(FuncAttrs); - LtGCFunc->addParamAttr(0, Attribute::NoUndef); - LtGCFunc->addParamAttr(1, Attribute::NoUndef); - LtGCFunc->addParamAttr(2, Attribute::NoUndef); + GtLCFunc->setAttributes(FuncAttrs); + GtLCFunc->addParamAttr(0, Attribute::NoUndef); + GtLCFunc->addParamAttr(1, Attribute::NoUndef); + GtLCFunc->addParamAttr(2, Attribute::NoUndef); - BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc); + BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc); Builder.SetInsertPoint(EntryBlock); // Buffer: global reduction buffer. - Argument *BufferArg = LtGCFunc->getArg(0); + Argument *BufferArg = GtLCFunc->getArg(0); // Idx: index of the buffer. - Argument *IdxArg = LtGCFunc->getArg(1); + Argument *IdxArg = GtLCFunc->getArg(1); // ReduceList: thread local Reduce list. 
- Argument *ReduceListArg = LtGCFunc->getArg(2); + Argument *ReduceListArg = GtLCFunc->getArg(2); Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr, BufferArg->getName() + ".addr"); @@ -3413,7 +3457,20 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( switch (RI.EvaluationKind) { case EvalKind::Scalar: { - Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr); + Type *ElemType = RI.ElementType; + + if (!IsByRef.empty() && IsByRef[En.index()]) { + ElemType = RI.ByRefElementType; + InsertPointOrErrorTy GenResult = + RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr); + + if (!GenResult) + return GenResult.takeError(); + + ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr); + } + + Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr); Builder.CreateStore(TargetElement, ElemPtr); break; } @@ -3449,35 +3506,35 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( Builder.CreateRetVoid(); Builder.restoreIP(OldIP); - return LtGCFunc; + return GtLCFunc; } -Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( +Expected OpenMPIRBuilder::emitGlobalToListReduceFunction( ArrayRef ReductionInfos, Function *ReduceFn, - Type *ReductionsBufferTy, AttributeList FuncAttrs) { + Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef IsByRef) { OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); LLVMContext &Ctx = M.getContext(); auto *FuncTy = FunctionType::get( Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()}, /* IsVarArg */ false); - Function *LtGRFunc = + Function *GtLRFunc = Function::Create(FuncTy, GlobalVariable::InternalLinkage, "_omp_reduction_global_to_list_reduce_func", &M); - LtGRFunc->setAttributes(FuncAttrs); - LtGRFunc->addParamAttr(0, Attribute::NoUndef); - LtGRFunc->addParamAttr(1, Attribute::NoUndef); - LtGRFunc->addParamAttr(2, Attribute::NoUndef); + GtLRFunc->setAttributes(FuncAttrs); + GtLRFunc->addParamAttr(0, Attribute::NoUndef); + 
GtLRFunc->addParamAttr(1, Attribute::NoUndef); + GtLRFunc->addParamAttr(2, Attribute::NoUndef); - BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc); + BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc); Builder.SetInsertPoint(EntryBlock); // Buffer: global reduction buffer. - Argument *BufferArg = LtGRFunc->getArg(0); + Argument *BufferArg = GtLRFunc->getArg(0); // Idx: index of the buffer. - Argument *IdxArg = LtGRFunc->getArg(1); + Argument *IdxArg = GtLRFunc->getArg(1); // ReduceList: thread local Reduce list. - Argument *ReduceListArg = LtGRFunc->getArg(2); + Argument *ReduceListArg = GtLRFunc->getArg(2); Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr, BufferArg->getName() + ".addr"); @@ -3493,6 +3550,8 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( Value *LocalReduceList = Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list"); + InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()}; + Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast( BufferArgAlloca, Builder.getPtrTy(), BufferArgAlloca->getName() + ".ascast"); @@ -3514,6 +3573,20 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( Type *IndexTy = Builder.getIndexTy( M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace()); for (auto En : enumerate(ReductionInfos)) { + const ReductionInfo &RI = En.value(); + Value *ByRefAlloc; + + if (!IsByRef.empty() && IsByRef[En.index()]) { + InsertPointTy OldIP = Builder.saveIP(); + Builder.restoreIP(AllocaIP); + + ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType); + ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast( + ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast"); + + Builder.restoreIP(OldIP); + } + Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP( RedListArrayTy, ReductionList, {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())}); @@ -3522,7 +3595,19 @@ Function 
*OpenMPIRBuilder::emitGlobalToListReduceFunction( Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs); Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32( ReductionsBufferTy, BufferVD, 0, En.index()); - Builder.CreateStore(GlobValPtr, TargetElementPtrPtr); + + if (!IsByRef.empty() && IsByRef[En.index()]) { + Value *ByRefDataPtr; + InsertPointOrErrorTy GenResult = + RI.DataPtrPtrGen(Builder.saveIP(), ByRefAlloc, ByRefDataPtr); + if (!GenResult) + return GenResult.takeError(); + + Builder.CreateStore(GlobValPtr, ByRefDataPtr); + Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr); + } else { + Builder.CreateStore(GlobValPtr, TargetElementPtrPtr); + } } // Call reduce_function(ReduceList, GlobalReduceList) @@ -3532,7 +3617,7 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( ->addFnAttr(Attribute::NoUnwind); Builder.CreateRetVoid(); Builder.restoreIP(OldIP); - return LtGRFunc; + return GtLRFunc; } std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const { @@ -3788,7 +3873,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType); if (Size > MaxDataSize) MaxDataSize = Size; - ReductionTypeArgs.emplace_back(En.value().ElementType); + Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()]) + ? 
En.value().ByRefElementType + : En.value().ElementType; + ReductionTypeArgs.emplace_back(RedTypeArg); } Value *ReductionDataSize = Builder.getInt64(MaxDataSize * ReductionInfos.size()); @@ -3806,20 +3894,33 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( CodeGenIP = Builder.saveIP(); StructType *ReductionsBufferTy = StructType::create( Ctx, ReductionTypeArgs, "struct._globalized_locals_ty"); - Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr( + Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr( RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer); - Function *LtGCFunc = emitListToGlobalCopyFunction( - ReductionInfos, ReductionsBufferTy, FuncAttrs); - Function *LtGRFunc = emitListToGlobalReduceFunction( - ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs); - Function *GtLCFunc = emitGlobalToListCopyFunction( - ReductionInfos, ReductionsBufferTy, FuncAttrs); - Function *GtLRFunc = emitGlobalToListReduceFunction( - ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs); + + Expected LtGCFunc = emitListToGlobalCopyFunction( + ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef); + if (!LtGCFunc) + return LtGCFunc.takeError(); + + Expected LtGRFunc = emitListToGlobalReduceFunction( + ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef); + if (!LtGRFunc) + return LtGRFunc.takeError(); + + Expected GtLCFunc = emitGlobalToListCopyFunction( + ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef); + if (!GtLCFunc) + return GtLCFunc.takeError(); + + Expected GtLRFunc = emitGlobalToListReduceFunction( + ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef); + if (!GtLRFunc) + return GtLRFunc.takeError(); + Builder.restoreIP(CodeGenIP); Value *KernelTeamsReductionPtr = createRuntimeFunctionCall( - RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr"); + RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr"); Value *Args3[] = {SrcLocInfo, 
KernelTeamsReductionPtr, @@ -3828,10 +3929,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( RL, *SarFunc, WcFunc, - LtGCFunc, - LtGRFunc, - GtLCFunc, - GtLRFunc}; + *LtGCFunc, + *LtGRFunc, + *GtLCFunc, + *GtLRFunc}; Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr( RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2); diff --git a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir index df606150b760a..95d12f304aca0 100644 --- a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir +++ b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction.mlir @@ -1,3 +1,5 @@ +// Tests single-team by-ref GPU reductions. + // RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { diff --git a/mlir/test/Target/LLVMIR/allocatable_gpu_reduction_teams.mlir b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction_teams.mlir new file mode 100644 index 0000000000000..1c73a49b0bf9f --- /dev/null +++ b/mlir/test/Target/LLVMIR/allocatable_gpu_reduction_teams.mlir @@ -0,0 +1,121 @@ +// Tests cross-teams by-ref GPU reductions. 
+ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { + omp.private {type = private} @_QFfooEi_private_i32 : i32 + omp.declare_reduction @add_reduction_byref_box_heap_f32 : !llvm.ptr attributes {byref_element_type = f32} alloc { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> : (i64) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + omp.yield(%2 : !llvm.ptr) + } init { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + omp.yield(%arg1 : !llvm.ptr) + } combiner { + ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr): + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5> + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr + %3 = llvm.mlir.constant(1 : i32) : i32 + %4 = llvm.alloca %3 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> {alignment = 8 : i64} : (i32) -> !llvm.ptr<5> + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr + %6 = llvm.mlir.constant(24 : i32) : i32 + "llvm.intr.memcpy"(%5, %arg0, %6) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %7 = llvm.mlir.constant(24 : i32) : i32 + "llvm.intr.memcpy"(%2, %arg1, %7) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> () + %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %9 = llvm.load %8 : !llvm.ptr -> !llvm.ptr + %10 = llvm.getelementptr %2[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %11 = llvm.load %10 : !llvm.ptr -> !llvm.ptr + %12 = llvm.load %9 : !llvm.ptr -> f32 + %13 = llvm.load %11 : !llvm.ptr -> f32 + %14 = llvm.fadd %12, %13 {fastmathFlags = #llvm.fastmath} : f32 + llvm.store %14, 
%9 : f32, !llvm.ptr + omp.yield(%arg0 : !llvm.ptr) + } data_ptr_ptr { + ^bb0(%arg0: !llvm.ptr): + %0 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + omp.yield(%0 : !llvm.ptr) + } + + llvm.func @foo_() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %4 = llvm.alloca %0 x i1 : (i64) -> !llvm.ptr<5> + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr + %8 = llvm.getelementptr %5[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)> + %9 = omp.map.info var_ptr(%5 : !llvm.ptr, f32) map_clauses(tofrom) capture(ByRef) var_ptr_ptr(%8 : !llvm.ptr) -> !llvm.ptr {name = ""} + %10 = omp.map.info var_ptr(%5 : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(always, descriptor, to, attach) capture(ByRef) members(%9 : [0] : !llvm.ptr) -> !llvm.ptr {name = "scalar_alloc"} + omp.target map_entries(%10 -> %arg0 : !llvm.ptr) { + %14 = llvm.mlir.constant(1000000 : i32) : i32 + %15 = llvm.mlir.constant(1 : i32) : i32 + omp.teams reduction(byref @add_reduction_byref_box_heap_f32 %arg0 -> %arg3 : !llvm.ptr) { + omp.parallel { + omp.distribute { + omp.wsloop reduction(byref @add_reduction_byref_box_heap_f32 %arg3 -> %arg5 : !llvm.ptr) { + omp.loop_nest (%arg6) : i32 = (%15) to (%14) inclusive step (%15) { + omp.yield + } + } {omp.composite} + } {omp.composite} + omp.terminator + } {omp.composite} + omp.terminator + } + omp.terminator + } + llvm.return + } +} + +// CHECK: %[[GLOBALIZED_LOCALS:.*]] = type { float } + +// CHECK: define internal void @_omp_reduction_list_to_global_copy_func({{.*}}) {{.*}} { +// CHECK: %[[RED_ARR_LIST:.*]] = getelementptr inbounds [1 x ptr], ptr %{{.*}}, i64 0, i64 0 +// CHECK: %[[RED_ELEM_PTR:.*]] = load ptr, ptr %[[RED_ARR_LIST]], align 8 +// CHECK: %[[GLOB_ELEM_PTR:.*]] = getelementptr inbounds %[[GLOBALIZED_LOCALS]], ptr %{{.*}}, i32 0, i32 0 +// CHECK: %[[ALLOC_PTR_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[RED_ELEM_PTR]], 
i32 0, i32 0 +// CHECK: %[[ALLOC_PTR:.*]] = load ptr, ptr %[[ALLOC_PTR_PTR]], align 8 +// CHECK: %[[ALLOC_VAL:.*]] = load float, ptr %[[ALLOC_PTR]], align 4 +// Verify that the actual value managed by the descriptor is stored in the globalized +// locals array, rather than a pointer to the descriptor or a pointer to the value. +// CHECK: store float %[[ALLOC_VAL]], ptr %[[GLOB_ELEM_PTR]], align 4 +// CHECK: } + +// CHECK: define internal void @_omp_reduction_list_to_global_reduce_func({{.*}}) {{.*}} { +// Allocate a descriptor to manage the element retrieved from the globalized local array. +// CHECK: %[[ALLOC_DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8, addrspace(5) +// CHECK: %[[ALLOC_DESC_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[ALLOC_DESC]] to ptr + +// CHECK: %[[RED_ARR_LIST:.*]] = getelementptr inbounds [1 x ptr], ptr %{{.*}}, i64 0, i64 0 +// CHECK: %[[GLOB_ELEM_PTR:.*]] = getelementptr inbounds %[[GLOBALIZED_LOCALS]], ptr %{{.*}}, i32 0, i32 0 +// CHECK: %[[ALLOC_PTR_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[ALLOC_DESC_ASCAST]], i32 0, i32 0 +// Store the pointer to the globalized local element into the locally allocated descriptor. +// CHECK: store ptr %[[GLOB_ELEM_PTR]], ptr %[[ALLOC_PTR_PTR]], align 8 +// CHECK: store ptr %[[ALLOC_DESC_ASCAST]], ptr %[[RED_ARR_LIST]], align 8 +// CHECK: } + +// CHECK: define internal void @_omp_reduction_global_to_list_copy_func({{.*}}) {{.*}} { +// CHECK: %[[RED_ARR_LIST:.*]] = getelementptr inbounds [1 x ptr], ptr %{{.*}}, i64 0, i64 0 +// CHECK: %[[RED_ELEM_PTR:.*]] = load ptr, ptr %[[RED_ARR_LIST]], align 8 +// CHECK: %[[GLOB_ELEM_PTR:.*]] = getelementptr inbounds %[[GLOBALIZED_LOCALS]], ptr %{{.*}}, i32 0, i32 0 +// CHECK: %[[ALLOC_PTR_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[RED_ELEM_PTR]], i32 0, i32 0 +// Similar to _omp_reduction_list_to_global_copy_func(...) but in the reverse direction; i.e. 
+// the globalized local array is copied from rather than copied to. +// CHECK: %[[ALLOC_PTR:.*]] = load ptr, ptr %[[ALLOC_PTR_PTR]], align 8 +// CHECK: %[[ALLOC_VAL:.*]] = load float, ptr %[[GLOB_ELEM_PTR]], align 4 +// CHECK: store float %[[ALLOC_VAL]], ptr %[[ALLOC_PTR]], align 4 +// CHECK: } + +// CHECK: define internal void @_omp_reduction_global_to_list_reduce_func({{.*}}) {{.*}} { +// Allocate a descriptor to manage the element retrieved from the globalized local array. +// CHECK: %[[ALLOC_DESC:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8, addrspace(5) +// CHECK: %[[ALLOC_DESC_ASCAST:.*]] = addrspacecast ptr addrspace(5) %[[ALLOC_DESC]] to ptr + +// CHECK: %[[RED_ARR_LIST:.*]] = getelementptr inbounds [1 x ptr], ptr %{{.*}}, i64 0, i64 0 +// CHECK: %[[GLOB_ELEM_PTR:.*]] = getelementptr inbounds %[[GLOBALIZED_LOCALS]], ptr %{{.*}}, i32 0, i32 0 +// CHECK: %[[ALLOC_PTR_PTR:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[ALLOC_DESC_ASCAST]], i32 0, i32 0 +// Store the pointer to the globalized local element into the locally allocated descriptor. +// CHECK: store ptr %[[GLOB_ELEM_PTR]], ptr %[[ALLOC_PTR_PTR]], align 8 +// CHECK: store ptr %[[ALLOC_DESC_ASCAST]], ptr %[[RED_ARR_LIST]], align 8 +// CHECK: }