diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst index 06157fee9385e5..5fc3710ec3d34c 100644 --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -89,6 +89,7 @@ with `-g` for full debug information. A full list of flags supported by * Dump the contents of the device pointer map at kernel exit: ``0x04`` * Indicate when an entry is changed in the device mapping table: ``0x08`` * Print OpenMP kernel information from device plugins: ``0x10`` + * Indicate when data is copied to and from the device: ``0x20`` Any combination of these flags can be used by setting the appropriate bits. For example, to enable printing all data active in an OpenMP target region along @@ -137,44 +138,53 @@ provide the following output from the runtime library. .. code-block:: text - Info: Device supports up to 65536 CUDA blocks and 1024 threads with a warp size of 32 Info: Entering OpenMP data region at zaxpy.cpp:14:1 with 2 arguments: - Info: to(X[0:N])[16384] - Info: tofrom(Y[0:N])[16384] - Info: Creating new map entry with HstPtrBegin=0x00007fff963f4000, - TgtPtrBegin=0x00007fff963f4000, Size=16384, Name=X[0:N] - Info: Creating new map entry with HstPtrBegin=0x00007fff963f8000, - TgtPtrBegin=0x00007fff963f00000, Size=16384, Name=Y[0:N] + Info: to(X[0:N])[16384] + Info: tofrom(Y[0:N])[16384] + Info: Creating new map entry with HstPtrBegin=0x00007ffde9e99000, + TgtPtrBegin=0x00007f15dc600000, Size=16384, Name=X[0:N] + Info: Copying data from host to device, HstPtr=0x00007ffde9e99000, + TgtPtr=0x00007f15dc600000, Size=16384, Name=X[0:N] + Info: Creating new map entry with HstPtrBegin=0x00007ffde9e95000, + TgtPtrBegin=0x00007f15dc604000, Size=16384, Name=Y[0:N] + Info: Copying data from host to device, HstPtr=0x00007ffde9e95000, + TgtPtr=0x00007f15dc604000, Size=16384, Name=Y[0:N] Info: OpenMP Host-Device pointer mappings after block at zaxpy.cpp:14:1: Info: Host Ptr Target Ptr Size (B) RefCount Declaration - Info: 0x00007fff963f4000 
0x00007fd225004000 16384 1 Y[0:N] at zaxpy.cpp:13:17 - Info: 0x00007fff963f8000 0x00007fd225000000 16384 1 X[0:N] at zaxpy.cpp:13:11 + Info: 0x00007ffde9e95000 0x00007f15dc604000 16384 1 Y[0:N] at zaxpy.cpp:13:17 + Info: 0x00007ffde9e99000 0x00007f15dc600000 16384 1 X[0:N] at zaxpy.cpp:13:11 Info: Entering OpenMP kernel at zaxpy.cpp:6:1 with 4 arguments: Info: firstprivate(N)[8] (implicit) Info: use_address(Y)[0] (implicit) Info: tofrom(D)[16] (implicit) Info: use_address(X)[0] (implicit) - Info: Mapping exists (implicit) with HstPtrBegin=0x00007ffe37d8be80, - TgtPtrBegin=0x00007f90ff004000, Size=0, updated RefCount=2, Name=Y - Info: Creating new map entry with HstPtrBegin=0x00007fff963f33ff0, - TgtPtrBegin=0x00007fd225003ff0, Size=16, Name=D - Info: Mapping exists (implicit) with HstPtrBegin=0x00007ffe37d8fe80, - TgtPtrBegin=0x00007f90ff000000, Size=0, updated RefCount=2, Name=X - Info: Launching kernel __omp_offloading_fd02_c2c4ac1a__Z5daxpyPNSt3__17complexIdEES2_S1_m_l6 + Info: Mapping exists (implicit) with HstPtrBegin=0x00007ffde9e95000, + TgtPtrBegin=0x00007f15dc604000, Size=0, updated RefCount=2, Name=Y + Info: Creating new map entry with HstPtrBegin=0x00007ffde9e94fb0, + TgtPtrBegin=0x00007f15dc608000, Size=16, Name=D + Info: Copying data from host to device, HstPtr=0x00007ffde9e94fb0, + TgtPtr=0x00007f15dc608000, Size=16, Name=D + Info: Mapping exists (implicit) with HstPtrBegin=0x00007ffde9e99000, + TgtPtrBegin=0x00007f15dc600000, Size=0, updated RefCount=2, Name=X + Info: Launching kernel __omp_offloading_fd02_e25f6e76__Z5zaxpyPSt7complexIdES1_S0_m_l6 with 8 blocks and 128 threads in SPMD mode - Info: Removing map entry with HstPtrBegin=0x00007fff963f33ff0, - TgtPtrBegin=0x00007fd225003ff0, Size=16, Name=D + Info: Copying data from device to host, TgtPtr=0x00007f15dc608000, + HstPtr=0x00007ffde9e94fb0, Size=16, Name=D + Info: Removing map entry with HstPtrBegin=0x00007ffde9e94fb0, + TgtPtrBegin=0x00007f15dc608000, Size=16, Name=D Info: OpenMP Host-Device 
pointer mappings after block at zaxpy.cpp:6:1: Info: Host Ptr Target Ptr Size (B) RefCount Declaration - Info: 0x00007fff963f4000 0x00007fd225004000 16384 1 Y[0:N] at zaxpy.cpp:13:17 - Info: 0x00007fff963f8000 0x00007fd225000000 16384 1 X[0:N] at zaxpy.cpp:13:11 + Info: 0x00007ffde9e95000 0x00007f15dc604000 16384 1 Y[0:N] at zaxpy.cpp:13:17 + Info: 0x00007ffde9e99000 0x00007f15dc600000 16384 1 X[0:N] at zaxpy.cpp:13:11 Info: Exiting OpenMP data region at zaxpy.cpp:14:1 with 2 arguments: - Info: to(X[0:N])[16384] - Info: tofrom(Y[0:N])[16384] - Info: Removing map entry with HstPtrBegin=0x00007fff963f4000, - TgtPtrBegin=0x00007fff963f4000, Size=16384, Name=X[0:N] - Info: Removing map entry with HstPtrBegin=0x00007fff963f8000, - TgtPtrBegin=0x00007fff963f00000, Size=16384, Name=Y[0:N] + Info: to(X[0:N])[16384] + Info: tofrom(Y[0:N])[16384] + Info: Copying data from device to host, TgtPtr=0x00007f15dc604000, + HstPtr=0x00007ffde9e95000, Size=16384, Name=Y[0:N] + Info: Removing map entry with HstPtrBegin=0x00007ffde9e95000, + TgtPtrBegin=0x00007f15dc604000, Size=16384, Name=Y[0:N] + Info: Removing map entry with HstPtrBegin=0x00007ffde9e99000, + TgtPtrBegin=0x00007f15dc600000, Size=16384, Name=X[0:N] From this information, we can see the OpenMP kernel being launched on the CUDA device with enough threads and blocks for all ``1024`` iterations of the loop in diff --git a/openmp/libomptarget/include/Debug.h b/openmp/libomptarget/include/Debug.h index 6dd2f0e11a0faa..942f39fe9c1a23 100644 --- a/openmp/libomptarget/include/Debug.h +++ b/openmp/libomptarget/include/Debug.h @@ -52,6 +52,8 @@ enum OpenMPInfoType : uint32_t { OMP_INFOTYPE_MAPPING_CHANGED = 0x0008, // Print kernel information from target device plugins. OMP_INFOTYPE_PLUGIN_KERNEL = 0x0010, + // Print whenever data is transferred to or from the device + OMP_INFOTYPE_DATA_TRANSFER = 0x0020, // Enable every flag.
OMP_INFOTYPE_ALL = 0xffffffff, }; diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 648b2066585de2..82692c05d700a8 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -420,6 +420,18 @@ int32_t DeviceTy::deleteData(void *TgtPtrBegin) { // Submit data to device int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, AsyncInfoTy &AsyncInfo) { + if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER) { + LookupResult LR = lookupMapping(HstPtrBegin, Size); + auto *HT = &*LR.Entry; + + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceID, + "Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD + ", Size=%" PRId64 ", Name=%s\n", + DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin), Size, + (HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str() + : "unknown"); + } + if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize) return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); else @@ -430,6 +442,17 @@ int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, // Retrieve data from device int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, AsyncInfoTy &AsyncInfo) { + if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER) { + LookupResult LR = lookupMapping(HstPtrBegin, Size); + auto *HT = &*LR.Entry; + INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceID, + "Copying data from device to host, TgtPtr=" DPxMOD ", HstPtr=" DPxMOD + ", Size=%" PRId64 ", Name=%s\n", + DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin), Size, + (HT && HT->HstPtrName) ? 
getNameFromMapping(HT->HstPtrName).c_str() + : "unknown"); + } + if (!RTL->data_retrieve_async || !RTL->synchronize) return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); else diff --git a/openmp/libomptarget/test/offloading/info.c b/openmp/libomptarget/test/offloading/info.c index bddb87161ddbe4..3498b2fff229dd 100644 --- a/openmp/libomptarget/test/offloading/info.c +++ b/openmp/libomptarget/test/offloading/info.c @@ -1,4 +1,4 @@ -// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -gline-tables-only && env LIBOMPTARGET_INFO=31 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=INFO +// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -gline-tables-only && env LIBOMPTARGET_INFO=63 %libomptarget-run-nvptx64-nvidia-cuda 2>&1 | %fcheck-nvptx64-nvidia-cuda -allow-empty -check-prefix=INFO // REQUIRES: nvptx64-nvidia-cuda #include @@ -14,28 +14,34 @@ int main() { int C[N]; int val = 1; -// INFO: CUDA device 0 info: Device supports up to {{.*}} CUDA blocks and {{.*}} threads with a warp size of {{.*}} -// INFO: Libomptarget device 0 info: Entering OpenMP data region at info.c:{{[0-9]+}}:1 with 3 arguments: +// INFO: CUDA device 0 info: Device supports up to {{[0-9]+}} CUDA blocks and {{[0-9]+}} threads with a warp size of {{[0-9]+}} +// INFO: Libomptarget device 0 info: Entering OpenMP data region at info.c:{{[0-9]+}}:{{[0-9]+}} with 3 arguments: // INFO: Libomptarget device 0 info: alloc(A[0:64])[256] // INFO: Libomptarget device 0 info: tofrom(B[0:64])[256] // INFO: Libomptarget device 0 info: to(C[0:64])[256] // INFO: Libomptarget device 0 info: Creating new map entry with HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, Name=A[0:64] // INFO: Libomptarget device 0 info: Creating new map entry with HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, Name=B[0:64] +// INFO: Libomptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=256, Name=B[0:64] // INFO: Libomptarget 
device 0 info: Creating new map entry with HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, Name=C[0:64] -// INFO: Libomptarget device 0 info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:1: +// INFO: Libomptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=256, Name=C[0:64] +// INFO: Libomptarget device 0 info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:{{[0-9]+}}: // INFO: Libomptarget device 0 info: Host Ptr Target Ptr Size (B) RefCount Declaration -// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 C[0:64] at info.c:{{[0-9]+}}:7 -// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 B[0:64] at info.c:{{[0-9]+}}:7 -// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 A[0:64] at info.c:{{[0-9]+}}:7 -// INFO: Libomptarget device 0 info: Entering OpenMP kernel at info.c:{{[0-9]+}}:1 with 1 arguments: +// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 C[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}} +// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 B[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}} +// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 A[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}} +// INFO: Libomptarget device 0 info: Entering OpenMP kernel at info.c:{{[0-9]+}}:{{[0-9]+}} with 1 arguments: // INFO: Libomptarget device 0 info: firstprivate(val)[4] -// INFO: CUDA device 0 info: Launching kernel {{.*}} with {{.*}} and {{.*}} threads in {{.*}} mode -// INFO: Libomptarget device 0 info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:1: +// INFO: CUDA device 0 info: Launching kernel __omp_offloading_{{.*}}main{{.*}} with {{[0-9]+}} blocks and {{[0-9]+}} threads in Generic mode +// INFO: Libomptarget device 0 info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:{{[0-9]+}}: // INFO: Libomptarget device 0 info: Host Ptr Target Ptr Size (B) RefCount Declaration -// INFO: Libomptarget device 0 info: 0x{{.*}} 0x{{.*}} 256 1 
C[0:64] at info.c:{{[0-9]+}}:7 -// INFO: Libomptarget device 0 info: 0x{{.*}} 0x{{.*}} 256 1 B[0:64] at info.c:{{[0-9]+}}:7 -// INFO: Libomptarget device 0 info: 0x{{.*}} 0x{{.*}} 256 1 A[0:64] at info.c:{{[0-9]+}}:7 -// INFO: Libomptarget device 0 info: Exiting OpenMP data region at info.c:{{[0-9]+}}:1 +// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 C[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}} +// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 B[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}} +// INFO: Libomptarget device 0 info: {{.*}} {{.*}} 256 1 A[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}} +// INFO: Libomptarget device 0 info: Exiting OpenMP data region at info.c:{{[0-9]+}}:{{[0-9]+}} with 3 arguments: +// INFO: Libomptarget device 0 info: alloc(A[0:64])[256] +// INFO: Libomptarget device 0 info: tofrom(B[0:64])[256] +// INFO: Libomptarget device 0 info: to(C[0:64])[256] +// INFO: Libomptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=256, Name=B[0:64] // INFO: Libomptarget device 0 info: Removing map entry with HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, Name=C[0:64] // INFO: Libomptarget device 0 info: Removing map entry with HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, Name=B[0:64] // INFO: Libomptarget device 0 info: Removing map entry with HstPtrBegin={{.*}}, TgtPtrBegin={{.*}}, Size=256, Name=A[0:64]