diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp index 39286d41ec865..a1950cbb62908 100644 --- a/offload/libomptarget/omptarget.cpp +++ b/offload/libomptarget/omptarget.cpp @@ -330,6 +330,54 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, return Rc; } +/// Returns a buffer of the requested \p Size, to be used as the source for +/// `submitData`. +/// +/// For small buffers (`Size <= sizeof(void*)`), uses \p AsyncInfo's +/// getVoidPtrLocation(). +/// For larger buffers, creates a dynamic buffer which will be eventually +/// deleted by \p AsyncInfo's post-processing callback. +static char *getOrCreateSourceBufferForSubmitData(AsyncInfoTy &AsyncInfo, + int64_t Size) { + constexpr int64_t VoidPtrSize = sizeof(void *); + + if (Size <= VoidPtrSize) { + void *&BufferElement = AsyncInfo.getVoidPtrLocation(); + return reinterpret_cast(&BufferElement); + } + + // Create a dynamic buffer for larger data and schedule its deletion. + char *DataBuffer = new char[Size]; + AsyncInfo.addPostProcessingFunction([DataBuffer]() { + delete[] DataBuffer; + return OFFLOAD_SUCCESS; + }); + return DataBuffer; +} + +/// Calculates the target pointee base by applying the host +/// pointee begin/base delta to the target pointee begin. +/// +/// ``` +/// TgtPteeBase = TgtPteeBegin - (HstPteeBegin - HstPteeBase) +/// ``` +static void *calculateTargetPointeeBase(void *HstPteeBase, void *HstPteeBegin, + void *TgtPteeBegin) { + uint64_t Delta = reinterpret_cast(HstPteeBegin) - + reinterpret_cast(HstPteeBase); + void *TgtPteeBase = reinterpret_cast( + reinterpret_cast(TgtPteeBegin) - Delta); + + DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD + ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n", + DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta); + DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD + "\n", + DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin)); + + return TgtPteeBase; +} + /// Utility function to perform a pointer attachment operation. /// /// For something like: @@ -399,16 +447,8 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo, constexpr int64_t VoidPtrSize = sizeof(void *); assert(HstPtrSize >= VoidPtrSize && "PointerSize is too small"); - uint64_t Delta = reinterpret_cast(HstPteeBegin) - - reinterpret_cast(HstPteeBase); - void *TgtPteeBase = reinterpret_cast( - reinterpret_cast(TgtPteeBegin) - Delta); - DP("HstPteeBase: " DPxMOD ", HstPteeBegin: " DPxMOD - ", Delta (HstPteeBegin - HstPteeBase): %" PRIu64 ".\n", - DPxPTR(HstPteeBase), DPxPTR(HstPteeBegin), Delta); - DP("TgtPteeBase (TgtPteeBegin - Delta): " DPxMOD ", TgtPteeBegin : " DPxMOD - "\n", - DPxPTR(TgtPteeBase), DPxPTR(TgtPteeBegin)); + void *TgtPteeBase = + calculateTargetPointeeBase(HstPteeBase, HstPteeBegin, TgtPteeBegin); // Add shadow pointer tracking if (!PtrTPR.getEntry()->addShadowPointer( @@ -435,48 +475,32 @@ static int performPointerAttachment(DeviceTy &Device, AsyncInfoTy &AsyncInfo, return OFFLOAD_SUCCESS; }; - bool IsPtrAFortranDescriptor = HstPtrSize > VoidPtrSize; - if (!IsPtrAFortranDescriptor) { - // For "regular" pointers, we can use the VoidPtrLocation from AsyncInfo as - // the buffer space for the submission. - void *&BufferElement = AsyncInfo.getVoidPtrLocation(); - BufferElement = TgtPteeBase; - - // Submit the updated pointer value to device - return HandleSubmitResult(Device.submitData( - TgtPtrAddr, &BufferElement, VoidPtrSize, AsyncInfo, PtrTPR.getEntry())); + // Get a buffer to be used as the source for data submission. + char *SrcBuffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize); + + // The pointee's address should occupy the first VoidPtrSize bytes + // irrespective of HstPtrSize. + std::memcpy(SrcBuffer, &TgtPteeBase, VoidPtrSize); + + // For larger "pointers" (e.g., Fortran descriptors), copy remaining + // descriptor fields from the host descriptor into the buffer. + if (HstPtrSize > VoidPtrSize) { + uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize; + void *HstDescriptorFieldsAddr = + reinterpret_cast(HstPtrAddr) + VoidPtrSize; + std::memcpy(SrcBuffer + VoidPtrSize, HstDescriptorFieldsAddr, + HstDescriptorFieldsSize); + + DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD + ") (pointer + %" PRId64 " additional bytes from host descriptor " DPxMOD + ")\n", + HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize, + DPxPTR(HstDescriptorFieldsAddr)); } - // For larger "pointers" (like Fortran's descriptors), we create a dynamic - // buffer, which will be eventually destroyed by AsyncInfo's post-processing - // callback. - char *DataBuffer = new char[HstPtrSize]; - - // For such descriptors, to the first VoidPtrSize bytes, we store the - // pointee's device address. - std::memcpy(DataBuffer, &TgtPteeBase, sizeof(void *)); - - // And to the remaining bytes, we copy the remaining contents of the host - // descriptor after the initial VoidPtrSize bytes. - uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize; - void *HstDescriptorFieldsAddr = - reinterpret_cast(HstPtrAddr) + VoidPtrSize; - std::memcpy(DataBuffer + VoidPtrSize, HstDescriptorFieldsAddr, - HstDescriptorFieldsSize); - - DP("Updating %" PRId64 " bytes of descriptor (" DPxMOD ") (pointer + %" PRId64 - " additional bytes from host descriptor " DPxMOD ")\n", - HstPtrSize, DPxPTR(TgtPtrAddr), HstDescriptorFieldsSize, - DPxPTR(HstDescriptorFieldsAddr)); - - // Submit the entire buffer to device - int SubmitResult = Device.submitData(TgtPtrAddr, DataBuffer, HstPtrSize, + // Submit the populated source buffer to device. + int SubmitResult = Device.submitData(TgtPtrAddr, SrcBuffer, HstPtrSize, AsyncInfo, PtrTPR.getEntry()); - - AsyncInfo.addPostProcessingFunction([DataBuffer]() -> int { - delete[] DataBuffer; - return OFFLOAD_SUCCESS; - }); return HandleSubmitResult(SubmitResult); } @@ -525,10 +549,17 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, // ATTACH map-types are supposed to be handled after all mapping for the // construct is done. Defer their processing. if (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH) { - AttachInfo->AttachEntries.emplace_back( - /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin, - /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I], - /*PointeeName=*/HstPtrName); + const bool IsCorrespondingPointerInit = + (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE); + // We don't need to keep track of PRIVATE | ATTACH entries. They + // represent corresponding-pointer-initialization, and are handled + // similar to firstprivate (PRIVATE | TO) entries by + // PrivateArgumentManager. + if (!IsCorrespondingPointerInit) + AttachInfo->AttachEntries.emplace_back( + /*PointerBase=*/HstPtrBase, /*PointeeBegin=*/HstPtrBegin, + /*PointerSize=*/DataSize, /*MapType=*/ArgTypes[I], + /*PointeeName=*/HstPtrName); DP("Deferring ATTACH map-type processing for argument %d\n", I); continue; @@ -1397,13 +1428,24 @@ class PrivateArgumentManagerTy { uint32_t Padding; /// Host pointer name map_var_info_t HstPtrName = nullptr; + /// For corresponding-pointer-initialization: host pointee base address. + void *HstPteeBase = nullptr; + /// For corresponding-pointer-initialization: host pointee begin address. + void *HstPteeBegin = nullptr; + /// Whether this argument needs corresponding-pointer-initialization. + bool IsCorrespondingPointerInit = false; FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size, uint32_t Alignment, uint32_t Padding, - map_var_info_t HstPtrName = nullptr) + map_var_info_t HstPtrName = nullptr, + void *HstPteeBase = nullptr, + void *HstPteeBegin = nullptr, + bool IsCorrespondingPointerInit = false) : HstPtrBegin(reinterpret_cast(HstPtr)), HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment), - Size(Size), Padding(Padding), HstPtrName(HstPtrName) {} + Size(Size), Padding(Padding), HstPtrName(HstPtrName), + HstPteeBase(HstPteeBase), HstPteeBegin(HstPteeBegin), + IsCorrespondingPointerInit(IsCorrespondingPointerInit) {} }; /// A vector of target pointers for all private arguments @@ -1421,6 +1463,153 @@ class PrivateArgumentManagerTy { /// A pointer to a \p AsyncInfoTy object AsyncInfoTy &AsyncInfo; + /// \returns the value of the target pointee's base to be used for + /// corresponding-pointer-initialization. + void *getTargetPointeeBaseForCorrespondingPointerInitialization( + void *HstPteeBase, void *HstPteeBegin) { + // See if the pointee's begin address has corresponding storage on device. + void *TgtPteeBegin = [&]() -> void * { + if (!HstPteeBegin) { + DP("Corresponding-pointer-initialization: pointee begin address is " + "null\n"); + return nullptr; + } + + return Device.getMappingInfo() + .getTgtPtrBegin(HstPteeBegin, /*Size=*/0, /*UpdateRefCount=*/false, + /*UseHoldRefCount=*/false) + .TargetPointer; + }(); + + // If it does, we calculate target pointee base using it, and return it. + // Otherwise, we retain the host pointee's base as the target pointee base + // of the initialized pointer. It's the user's responsibility to ensure + // that if a lookup fails, the host pointee is accessible on the device. + return TgtPteeBegin ? calculateTargetPointeeBase(HstPteeBase, HstPteeBegin, + TgtPteeBegin) + : HstPteeBase; + } + + /// Initialize the source buffer for corresponding-pointer-initialization. + /// + /// It computes and stores the target pointee base address (or the host + /// pointee's base address, if lookup of target pointee fails) to the first + /// `sizeof(void*)` bytes of \p Buffer, and for larger pointers + /// (Fortran descriptors), the remaining fields of the host descriptor + /// \p HstPtr after those `sizeof(void*)` bytes. + /// + /// Corresponding-pointer-initialization represents the initialization of the + /// private version of a base-pointer/referring-pointer on a target construct. + /// + /// For example, for the following test: + /// ```cpp + /// int x[10]; + /// int *px = &x[0]; + /// ... + /// #pragma omp target data map(tofrom:px) + /// { + /// int **ppx = omp_get_mapped_ptr(&px, omp_get_default_device()); + /// #pragma omp target map(tofrom:px[1]) is_device_ptr(ppx) + /// { + /// foo(px, ppx); + /// } + /// } + /// ``` + /// The following shows a possible way to implement the mapping of `px`, + /// which is pre-determined firstprivate and should get initialized + /// via corresponding-pointer-initialization: + /// + /// (A) Possible way to implement the above with PRIVATE | ATTACH: + /// ```llvm + /// ; maps for px: + /// ; &px[0], &px[1], sizeof(px[1]), TO | FROM // (1) + /// ; &px, &px[1], sizeof(px), ATTACH // (2) + /// ; &px, &px[1], sizeof(px), PRIVATE | ATTACH | PARAM // (3) + /// call... @__omp_outlined...(ptr %px, ptr %ppx) + /// define ... @__omp_outlined(ptr %px, ptr %ppx) {... + /// foo(%px, %ppx) + /// ...} + /// ``` + /// `(1)` maps the pointee `px[1]. + /// `(2)` attaches it to the mapped version of `px`. It can be controlled by + /// the user based on the `attach(auto/always/never)` map-type modifier. + /// `(3)` privatizes and initializes the private pointer `px`, and passes it + /// into the kernel as the argument `%px`. Can be skipped if `px` is not + /// referenced in the target construct. + /// + /// While this method is not too beneficial compared to just doing the + /// initialization in the body of the kernel, like: + /// (B) Possible way to implement the above without PRIVATE | ATTACH: + /// ```llvm + /// ; maps for px: + /// ; &px[0], &px[1], sizeof(px[1]), TO | FROM | PARAM // (4) + /// ; &px, &px[1], sizeof(px), ATTACH // (5) + /// call... @__omp_outlined...(ptr %px0, ptr %ppx) + /// define ... __omp_outlined...(ptr %px0, ptr %ppx) { + /// %px = alloca ptr; + /// store ptr %px0, ptr %px + /// foo(%px, %ppx) + /// } + /// ``` + /// + /// (B) is not so convenient for Fortran descriptors, because in + /// addition to the lookup, the remaining fields of the descriptor have + /// to be passed into the kernel to initialize the private copy, which + /// makes (A) a cleaner option for them. e.g. + /// ```f90 + /// integer, pointer :: p(:) + /// !$omp target map(p(1)) + /// ``` + /// + /// (C) Possible mapping for the above Fortran test using PRIVATE | ATTACH: + /// ```llvm + /// ; maps for p: + /// ; &p(1), &p(1), sizeof(p(1)), TO | FROM + /// ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), ATTACH + /// ; &ref_ptr(p), &p(1), sizeof(ref_ptr(p)), PRIVATE | ATTACH | PARAM + /// call... @__omp_outlined...(ptr %ref_ptr_of_p) + void initBufferForCorrespondingPointerInitialization(char *Buffer, + void *HstPtr, + int64_t HstPtrSize, + void *HstPteeBase, + void *HstPteeBegin) { + constexpr int64_t VoidPtrSize = sizeof(void *); + assert(HstPtrSize >= VoidPtrSize && + "corresponding-pointer-initialization: pointer size is too small"); + + void *TgtPteeBase = + getTargetPointeeBaseForCorrespondingPointerInitialization(HstPteeBase, + HstPteeBegin); + + // Store the target pointee base address to the first VoidPtrSize bytes + DP("Initializing corresponding-pointer-initialization source buffer " + "for " DPxMOD ", with pointee base " DPxMOD "\n", + DPxPTR(HstPtr), DPxPTR(TgtPteeBase)); + std::memcpy(Buffer, &TgtPteeBase, VoidPtrSize); + if (HstPtrSize <= VoidPtrSize) + return; + + // For Fortran descriptors, copy the remaining descriptor fields from host + uint64_t HstDescriptorFieldsSize = HstPtrSize - VoidPtrSize; + void *HstDescriptorFieldsAddr = static_cast(HstPtr) + VoidPtrSize; + DP("Copying %" PRId64 + " bytes of descriptor fields into corresponding-pointer-initialization " + "buffer at offset %" PRId64 ", from " DPxMOD "\n", + HstDescriptorFieldsSize, VoidPtrSize, DPxPTR(HstDescriptorFieldsAddr)); + std::memcpy(Buffer + VoidPtrSize, HstDescriptorFieldsAddr, + HstDescriptorFieldsSize); + } + + /// Helper function to create and initialize a buffer to be used as the source + /// for corresponding-pointer-initialization. + void *createAndInitSourceBufferForCorrespondingPointerInitialization( + void *HstPtr, int64_t HstPtrSize, void *HstPteeBase, void *HstPteeBegin) { + char *Buffer = getOrCreateSourceBufferForSubmitData(AsyncInfo, HstPtrSize); + initBufferForCorrespondingPointerInitialization(Buffer, HstPtr, HstPtrSize, + HstPteeBase, HstPteeBegin); + return Buffer; + } + // TODO: What would be the best value here? Should we make it configurable? // If the size is larger than this threshold, we will allocate and transfer it // immediately instead of packing it. @@ -1435,7 +1624,9 @@ class PrivateArgumentManagerTy { int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset, bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex, map_var_info_t HstPtrName = nullptr, - const bool AllocImmediately = false) { + const bool AllocImmediately = false, void *HstPteeBase = nullptr, + void *HstPteeBegin = nullptr, + bool IsCorrespondingPointerInit = false) { // If the argument is not first-private, or its size is greater than a // predefined threshold, we will allocate memory and issue the transfer // immediately. @@ -1458,9 +1649,19 @@ class PrivateArgumentManagerTy { // If first-private, copy data from host if (IsFirstPrivate) { DP("Submitting firstprivate data to the device.\n"); - int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo); + + // The source value used for corresponding-pointer-initialization + // is different vs regular firstprivates. + void *DataSource = + IsCorrespondingPointerInit + ? createAndInitSourceBufferForCorrespondingPointerInitialization( + HstPtr, ArgSize, HstPteeBase, HstPteeBegin) + : HstPtr; + int Ret = Device.submitData(TgtPtr, DataSource, ArgSize, AsyncInfo); if (Ret != OFFLOAD_SUCCESS) { - DP("Copying data to device failed, failed.\n"); + DP("Copying %s data to device failed.\n", + IsCorrespondingPointerInit ? "corresponding-pointer-initialization" + : "firstprivate"); return OFFLOAD_FAIL; } } @@ -1506,8 +1707,10 @@ class PrivateArgumentManagerTy { } } - FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize, - StartAlignment, Padding, HstPtrName); + FirstPrivateArgInfo.emplace_back( + TgtArgsIndex, HstPtr, ArgSize, StartAlignment, Padding, HstPtrName, + HstPteeBase, HstPteeBegin, IsCorrespondingPointerInit); + FirstPrivateArgSize += Padding + ArgSize; } @@ -1526,7 +1729,13 @@ class PrivateArgumentManagerTy { for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { // First pad the pointer as we (have to) pad it on the device too. Itr = std::next(Itr, Info.Padding); - std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); + + if (Info.IsCorrespondingPointerInit) + initBufferForCorrespondingPointerInitialization( + &*Itr, Info.HstPtrBegin, Info.Size, Info.HstPteeBase, + Info.HstPteeBegin); + else + std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); Itr = std::next(Itr, Info.Size); } // Allocate target memory @@ -1682,8 +1891,40 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, TgtPtrBegin = HstPtrBase; TgtBaseOffset = 0; } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { + // For cases like: + // ``` + // int *p = ...; + // #pragma omp target map(p[0:10]) + // ``` + // `p` is predetermined firstprivate on the target construct, and the + // method to determine the initial value of the private copy on the + // device is called "corresponding-pointer-initialization". + // + // Such firstprivate pointers that need + // corresponding-pointer-initialization are represented using the + // `PRIVATE | ATTACH` map-types, in contrast to regular firstprivate + // entries, which use `PRIVATE | TO`. The structure of these + // `PRIVATE | ATTACH` entries is the same as the non-private + // `ATTACH` entries used to represent pointer-attachments, i.e.: + // ``` + // &hst_ptr_base/begin, &hst_ptee_begin, sizeof(hst_ptr) + // ``` + const bool IsAttach = (ArgTypes[I] & OMP_TGT_MAPTYPE_ATTACH); + void *HstPteeBase = nullptr; + void *HstPteeBegin = nullptr; + if (IsAttach) { + // For corresponding-pointer-initialization, Args[I] is HstPteeBegin, + // and ArgBases[I] is both HstPtrBase/HstPtrBegin. + HstPteeBase = *reinterpret_cast(HstPtrBase); + HstPteeBegin = Args[I]; + HstPtrBegin = ArgBases[I]; + } TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; - const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO); + // Corresponding-pointer-initialization is a special case of firstprivate, + // since it also involves initializing the private pointer. + const bool IsFirstPrivate = + (ArgTypes[I] & OMP_TGT_MAPTYPE_TO) || IsAttach; + // If there is a next argument and it depends on the current one, we need // to allocate the private memory immediately. If this is not the case, // then the argument can be marked for optimization and packed with the @@ -1692,9 +1933,11 @@ static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF)); Ret = PrivateArgumentManager.addArg( HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin, - TgtArgs.size(), HstPtrName, AllocImmediately); + /*TgtArgsIndex=*/TgtArgs.size(), HstPtrName, AllocImmediately, + HstPteeBase, HstPteeBegin, /*IsCorrespondingPointerInit=*/IsAttach); if (Ret != OFFLOAD_SUCCESS) { - REPORT("Failed to process %sprivate argument " DPxMOD "\n", + REPORT("Failed to process %s%sprivate argument " DPxMOD "\n", + IsAttach ? "corresponding-pointer-initialization " : "", (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin)); return OFFLOAD_FAIL; }