diff --git a/CMakeLists.txt b/CMakeLists.txt index 4beca48..c7b30d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ set(PROJECT_NAME "ishmem") set(PROJECT_FULL_NAME "Intel® SHMEM") set(ISHMEM_MAJOR_VERSION "1") -set(ISHMEM_MINOR_VERSION "4") +set(ISHMEM_MINOR_VERSION "5") set(ISHMEM_PATCH_VERSION "0") set(PROJECT_VERSION "${ISHMEM_MAJOR_VERSION}.${ISHMEM_MINOR_VERSION}.${ISHMEM_PATCH_VERSION}") @@ -40,7 +40,9 @@ option(ENABLE_AOT_COMPILATION "Enables AOT compilation for GPU kernels" TRUE) # Set default device type(s) for AOT compilation if (NOT ISHMEM_AOT_DEVICE_TYPES) - set(ISHMEM_AOT_DEVICE_TYPES "pvc") + # xe-hpc: Intel(R) Data Center GPU Max Series + # xe2: Intel(R) Arc(TM) B-Series GPU Family + set(ISHMEM_AOT_DEVICE_TYPES "xe-hpc,xe2") endif() # ------------------------------------------------------------------- @@ -87,6 +89,7 @@ if (ENABLE_AOT_COMPILATION) endif() message(STATUS "Enable OpenSHMEM support: ${ENABLE_OPENSHMEM}") message(STATUS "Enable MPI support: ${ENABLE_MPI}") +message(STATUS "Default Runtime: ${ISHMEM_DEFAULT_RUNTIME_STR}") message(STATUS "===================================\n") # ------------------------------------------------------------------- @@ -116,7 +119,7 @@ set(ISHMEM_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/src") # ------------------------------------------------------------------- # Generate and install files -configure_file(${PROJECT_SOURCE_DIR}/src/ishmem_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ishmem/config.h) +configure_file(${PROJECT_SOURCE_DIR}/src/ishmem_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/ishmem/config.h @ONLY) configure_file(${PROJECT_SOURCE_DIR}/cmake/ishmem ${CMAKE_CURRENT_BINARY_DIR}/ishmem @ONLY) configure_file(${PROJECT_SOURCE_DIR}/cmake/vars.sh.in ${CMAKE_CURRENT_BINARY_DIR}/vars.sh @ONLY) configure_file(${PROJECT_SOURCE_DIR}/pkgconfig/ishmem.pc.in ${CMAKE_CURRENT_BINARY_DIR}/pkgconfig/ishmem.pc @ONLY) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0d8b75d..95e9c12 100644 
--- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -34,14 +34,12 @@ Contributed code must be: ### Coding style -The code style and consistency is maintained using `clang-format`. When submitting a contribution, please make sure that it adheres to the existing coding style by using the following command: +The code style and consistency is maintained using `clang-format`. When submitting a contribution, please ensure it adheres to the existing coding style by using the following command from the top-level directory of the repository: ``` clang-format -style=file -i ``` -This will format the code using the `.clang-format` file found in the top-level directory of this repository. - ### Unit tests Be sure to extend the existing tests when fixing an issue. @@ -90,7 +88,9 @@ By making a contribution to this project, I certify that: Then add a line to every git commit message: - Signed-off-by: Kris Smith +``` +Signed-off-by: Kris Smith +``` **Note**: Use your real name. diff --git a/LICENSE b/LICENSE index 2cfae37..b6f76f4 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2024 Intel Corporation. +Copyright (c) 2025 Intel Corporation. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -7,24 +7,24 @@ met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in the +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. -3. Neither the name of the copyright holder nor the names of its -contributors may be used to endorse or promote products derived from +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -“AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +“AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SPDX-License-Identifier: BSD-3-Clause diff --git a/README.md b/README.md index aea993d..b2822e1 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Intel® SHMEM provides an efficient implementation of GPU-initiated communicatio ## Prerequisites - Linux OS -- Intel® oneAPI DPC++/C++ Compiler 2024.0 or higher. 
+- Intel® oneAPI DPC++/C++ Compiler 2025.0 or higher. ### SYCL support Intel® oneAPI DPC++/C++ Compiler with Level Zero support. @@ -34,7 +34,7 @@ To install, download the oneAPI Level Zero from the repository. git clone https://github.com/oneapi-src/level-zero.git ``` -Build Level Zero following instructions below. +Build Level Zero following instructions below. ``` cd level-zero @@ -44,16 +44,16 @@ cmake -DCMAKE_INSTALL_PREFIX= .. make -j make install ``` -### The Host Back-End Library -Intel® SHMEM requires a host OpenSHMEM or MPI back-end to be used for host-sided operations support. In particular, the OpenSHMEM back-end relies on a collection of extension APIs (`shmemx_heap_create`, `shmemx_heap_preinit`, and `shmemx_heap_postinit`) to coordinate the Intel® SHMEM and OpenSHMEM heaps. We recommend [Sandia OpenSHMEM v1.5.3rc1](https://github.com/Sandia-OpenSHMEM/SOS/releases/tag/v1.5.3rc1) or newer for this purpose. A [work-in-progress branch](https://github.com/davidozog/oshmpi/tree/wip/ishmem) of [OSHMPI](https://github.com/pmodels/oshmpi.git) is also supported but is currently considered experimental. See the [Building OSHMPI](#building-oshmpi-optional-and-experimental) section before for more details. +### The Host Backend Library +Intel® SHMEM requires a host OpenSHMEM or MPI backend to be used for scale-out communication. In particular, the OpenSHMEM backend relies on a collection of extension APIs (`shmemx_heap_create`, `shmemx_heap_preinit`, and `shmemx_heap_postinit`) to coordinate the Intel® SHMEM and OpenSHMEM heaps. We recommend [Sandia OpenSHMEM v1.5.3](https://github.com/Sandia-OpenSHMEM/SOS/releases/tag/v1.5.3) or newer for this purpose. A [work-in-progress branch](https://github.com/davidozog/oshmpi/tree/wip/ishmem) of [OSHMPI](https://github.com/pmodels/oshmpi.git) is also supported but is currently considered experimental. See the [Building OSHMPI](#building-oshmpi-optional-and-experimental) section below for more details.
-We recommend the Intel® MPI Library as the MPI back-end option for the current version of Intel® SHMEM. See the [Building Intel® SHMEM](#building-intel-shmem) section below for more details. +We recommend the Intel® MPI Library as the MPI backend option for the current version of Intel® SHMEM. See the [Building Intel® SHMEM](#building-intel-shmem) section below for more details. ### Building Sandia OpenSHMEM (SOS) -Download the SOS repo to be configured as a back-end for Intel® SHMEM. +Download the SOS repo to be configured as a backend for Intel® SHMEM. ``` -git clone --recurse-submodules https://github.com/Sandia-OpenSHMEM/SOS.git SOS +git clone -b v1.5.3 --recurse-submodules https://github.com/Sandia-OpenSHMEM/SOS.git SOS ``` Build SOS following instructions below. `FI_HMEM` support in the provider is required for use with Intel® SHMEM. To enable `FI_HMEM` with a supported provider, we recommend a specific set of config flags. Below are two examples for configuring and building SOS with two providers supporting `FI_HMEM`. To configure SOS with the `verbs;ofi_rxm` provider, use the following instructions: @@ -93,8 +93,7 @@ To download the OSHMPI repository: ``` git clone -b wip/ishmem --recurse-submodules https://github.com/davidozog/oshmpi.git oshmpi ``` -After ensuring Intel® MPI Library is enabled (for example, by sourcing the `/opt/intel/oneapi/setvars.sh` script), -please build OSHMPI following the instructions below. +After ensuring the Intel® MPI Library is present in the environment, please build OSHMPI following the instructions below. ``` cd oshmpi @@ -107,27 +106,24 @@ make install ### Building Intel® SHMEM Check that the SOS build process has successfully created a `` directory with `include` and `lib` as subdirectories. Please find `shmem.h` and `shmemx.h` in `include`. 
-Build Intel® SHMEM with an OpenSHMEM back-end using the following instructions: +Build Intel® SHMEM with an OpenSHMEM backend using the following instructions: ``` cd ishmem mkdir build cd build -CC=icx CXX=icpx cmake .. -DENABLE_OPENSHMEM=ON -DSHMEM_DIR= -DCMAKE_INSTALL_PREFIX= +cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DENABLE_OPENSHMEM=ON -DSHMEM_DIR= -DCMAKE_INSTALL_PREFIX= make -j ``` -Alternatively, Intel® SHMEM can be built by enabling an Intel® MPI Library back-end. +Alternatively, Intel® SHMEM can be built by enabling an Intel® MPI Library backend. Here is information on how to [Get Started with Intel® MPI Library on Linux](https://www.intel.com/content/www/us/en/docs/mpi-library/get-started-guide-linux/2021-11/overview.html). ``` -CC=icx CXX=icpx cmake .. -DENABLE_OPENSHMEM=OFF -DENABLE_MPI=ON -DMPI_DIR= -DCMAKE_INSTALL_PREFIX= +cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DENABLE_MPI=ON -DMPI_DIR= -DCMAKE_INSTALL_PREFIX= ``` where `` is the path to the Intel® MPI Library installation. -Enabling both the OpenSHMEM and MPI back-ends is also supported. In this case, -the desired backend can be selected via the environment variable, -`ISHMEM_RUNTIME`, which can be set to either "OpenSHMEM" or "MPI". -The default value for `ISHMEM_RUNTIME` is "OpenSHMEM". +Enabling both the OpenSHMEM and MPI backends is also supported. You may specify the default runtime at configure time with `-DISHMEM_DEFAULT_RUNTIME=` where `` is `MPI` or `OPENSHMEM` (case-insensitive). Furthermore, the desired backend can be selected at runtime via the environment variable `ISHMEM_RUNTIME=` where is `MPI` or `OPENSHMEM` (case-insensitive). If a default runtime is not specified, it will be automatically selected from the enabled backends in the following order: `OPENSHMEM` then `MPI`. ## Usage @@ -135,29 +131,23 @@ The default value for `ISHMEM_RUNTIME` is "OpenSHMEM". Validate that Intel® SHMEM was built correctly by running an example program. -1. 
Add the path for the back-end library to the environment, for example: +1. Add the path for the backend library to the environment, for example: ``` export LD_LIBRARY_PATH=/lib:$LD_LIBRARY_PATH +source /env/vars.sh ``` -When enabling only the Intel® MPI Library back-end, simply source the appropriate -`setvars.sh` script. When enabling both OpenSHMEM and MPI back-ends, first -source the `setvars.sh` script, then configure the dynamic linker to load the -OpenSHMEM library (for example by prepending `/lib` to -`LD_LIBRARY_PATH`). - 2. Run the example program or test on an allocated node using a process launcher: ``` -ISHMEM_RUNTIME= mpiexec.hydra -n 2 -hosts ./scripts/ishmrun ./test/unit/int_get_device +ISHMEM_RUNTIME= mpiexec.hydra -n 2 -hosts ./scripts/ishmrun ./test/unit/int_get_device ``` -where `` is the selected host back-end library. - *Note:* Current supported launchers include: MPI process launchers (i.e. `mpiexec`, `mpiexec.hydra`, `mpirun`, etc.), Slurm (i.e. `srun`, `salloc`, etc.), and PBS (i.e. `qsub`). -- *Note:* Intel® SHMEM execution model requires applications to use a 1:1 mapping between PEs and GPU devices. Attempting to run an application without the ishmrun launch script may result in undefined behavior if this mapping is not maintained. - - For further details on the device selection, please see [the ONEAPI_DEVICE_SELECTOR](https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#oneapi_device_selector). +- *Note:* Intel® SHMEM execution model requires applications to use a 1:1 mapping between PEs and GPU devices. Attempting to run an application without the `ishmrun` launch script may result in failure if this mapping is not maintained. + - For further details on device selection, please see [the ONEAPI_DEVICE_SELECTOR](https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#oneapi_device_selector). 3. 
Validate the application ran successfully; example output: @@ -211,6 +201,18 @@ The following values may be assigned to `CTEST_LAUNCHER` at configure-time (ex. ``` export JOB_QUEUE= ``` +### Hardware-specific Environment Settings +The following environment settings are either **required** or *recommended* when running Intel® SHMEM on the specified hardware. For GPU-specific environment settings, the launch script `ishmrun` will automatically detect and set the appropriate environment. For interconnect-specific environment settings, it is up to the user to ensure the appropriate environment is set: + +- HPE Slingshot Interconnect + - `FI_CXI_OPTIMIZED_MRS=0` is **required** when running with an OpenSHMEM backend. + - `FI_CXI_DEFAULT_CQ_SIZE=131072` is *recommended* for all backends. +- Mellanox ConnectX® Interconnects + - `MLX5_SCATTER_TO_CQE=0` is **required** when running with an OpenSHMEM backend. +- Intel® Data Center GPU Max Series + - `EnableImplicitScaling=0` is **required**. *Note:* you will also need to ensure `NEOReadDebugKeys=1` in case it is not already set. +- Intel® Arc™ B-Series GPUs + - `RenderCompressedBuffersEnabled=0` is **required**. *Note:* you will also need to ensure `NEOReadDebugKeys=1` in case it is not already set. ## Additional Resources diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 44be292..82f3178 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,6 +1,26 @@ # Release Notes This document contains a list of new features and known limitations of Intel® SHMEM releases. +## Release 1.5.0 + +### New Features and Enhancements +- Support for new collectives: inclusive and exclusive scan. +- Improved affinity assignment through launcher script `ishmrun`. +- Preliminary support for Intel® Arc™ B-Series GPUs. +- Bug fixes improving functionality. 
+ +### Known Limitations +- Only [Sandia OpenSHMEM](https://github.com/Sandia-OpenSHMEM/SOS) and [Intel® MPI Library](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) are currently supported as the host back-end. +- Not all APIs from the OpenSHMEM standard are supported. Please refer to [Supported/Unsupported Features](https://oneapi-src.github.io/ishmem/supported_features.html) to get a complete view. +- Intel® SHMEM requires a one-to-one mapping of PEs to SYCL devices. This implies that Intel® SHMEM executions must launch with a number of processes on each compute node that is no more than the number of available SYCL devices on each one of those nodes. By default, the Intel® SHMEM runtime considers each individual device tile to make up a single SYCL device and assigns a tile per PE. +- All collective operations within a kernel must complete before invoking a subsequent kernel-initiated collective operation. +- To run Intel® SHMEM with SOS enabling the Slingshot provider in OFI, environment variable `FI_CXI_OPTIMIZED_MRS=0` must be used. It is also recommended to use `FI_CXI_DEFAULT_CQ_SIZE=131072`. +- To run Intel® SHMEM with SOS enabling the verbs provider, environment variable `MLX5_SCATTER_TO_CQE=0` must be used. +- To run Intel® SHMEM with Intel® MPI Library, environment variable `I_MPI_OFFLOAD=1` must be used. Additionally, `I_MPI_OFFLOAD_RDMA=1` may be necessary for GPU RDMA depending on the OFI provider. Please refer to the [reference guide](https://www.intel.com/content/www/us/en/docs/mpi-library/developer-reference-linux/2021-16/gpu-buffers-support.html) for further details. +- Inter-node communication in Intel® SHMEM requires [dma-buf](https://www.kernel.org/doc/html/latest/driver-api/dma-buf.html) support in the Linux kernel. Inter-node functionality in Intel® SHMEM Release 1.5.0 is tested with SUSE Linux Enterprise Server 15 SP4. +- Support for Intel® Arc™ B-Series GPUs is preliminary.
As such, not all APIs are currently supported. +- When using Intel® Arc™ B-Series GPUs, environment variable `RenderCompressedBuffersEnabled=0` is required. This is automatically set when running with the launcher script `ishmrun`. + ## Release 1.4.0 ### New Features and Enhancements diff --git a/SECURITY.md b/SECURITY.md index d85d435..ccbbdc5 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,5 +1,5 @@ # Security Policy -Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. +Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. ## Reporting a Vulnerability Please report any security vulnerabilities in this project [utilizing the guidelines here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). diff --git a/cmake/ishmem b/cmake/ishmem index 6e83e39..f28d97a 100644 --- a/cmake/ishmem +++ b/cmake/ishmem @@ -1,7 +1,7 @@ #%Module1.0 ############################################################################## -# Copyright (c) 2025, Intel Corporation +# Copyright (c) 2024, Intel Corporation # SPDX-License-Identifier: BSD-3-Clause ############################################################################## @@ -79,5 +79,3 @@ prepend-path LD_LIBRARY_PATH "$topdir/lib" prepend-path LIBRARY_PATH "$topdir/lib" prepend-path C_INCLUDE_PATH "$topdir/include" prepend-path CPLUS_INCLUDE_PATH "$topdir/include" -prepend-path PKG_CONFIG_PATH "$topdir/lib/pkgconfig" -prepend-path CMAKE_PREFIX_PATH "$topdir/lib/cmake/ishmem" diff --git a/cmake/utils.cmake b/cmake/utils.cmake index e45df31..e7b6bf1 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -1,4 +1,4 @@ -# Copyright (C) 2025 Intel Corporation +# Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: BSD-3-Clause # # CMake utility 
functions @@ -33,13 +33,15 @@ function(setup_compiler_options) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}") # Set compiler settings - set(COMPILER_WARN_FLAGS "-Werror -Wuninitialized -Wunused-variable") + set(COMPILER_WARN_FLAGS "-Wall -Wextra -Wconversion -Wno-unused-parameter -Wformat -Wformat-security") set(COMPILER_DEFAULT_FLAGS "-D_GNU_SOURCE -fvisibility=internal") set(COMPILER_DEBUG_FLAGS "-g -DENABLE_DEBUG -Rno-debug-disables-optimization") set(COMPILER_RELEASE_FLAGS "-O3") set(COMPILER_RELWITH_DEBINFO_FLAGS "-O2 -g") + set(LINKER_RELEASE_FLAGS "-Wl,-z,noexecstack -Wl,-z,nodlopen") + if (ENABLE_AOT_COMPILATION) set(COMPILER_DEFAULT_FLAGS "${COMPILER_DEFAULT_FLAGS} -fsycl-targets=spir64_gen") set(COMPILER_DEFAULT_FLAGS "${COMPILER_DEFAULT_FLAGS} --start-no-unused-arguments -Xs \"-device ${ISHMEM_AOT_DEVICE_TYPES}\" --end-no-unused-arguments") @@ -51,6 +53,8 @@ function(setup_compiler_options) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} ${COMPILER_DEBUG_FLAGS}" PARENT_SCOPE) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} ${COMPILER_RELEASE_FLAGS}" PARENT_SCOPE) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} ${COMPILER_RELWITHDEBINFO_FLAGS}" PARENT_SCOPE) + + set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} ${LINKER_RELEASE_FLAGS}" PARENT_SCOPE) endfunction(setup_compiler_options) function(setup_dependencies) @@ -76,7 +80,7 @@ function(setup_dependencies) endfunction(setup_dependencies) function(setup_runtime_backends) - option(ENABLE_OPENSHMEM "Enable OpenSHMEM support" TRUE) + option(ENABLE_OPENSHMEM "Enable OpenSHMEM support" FALSE) option(ENABLE_MPI "Enable MPI support" FALSE) # At least one of the runtimes must be enabled @@ -84,6 +88,10 @@ function(setup_runtime_backends) message(FATAL_ERROR "At least one of 'ENABLE_OPENSHMEM' and 'ENABLE_MPI' must be enabled") endif() + if (DEFINED ISHMEM_DEFAULT_RUNTIME) + string(TOUPPER "${ISHMEM_DEFAULT_RUNTIME}" ISHMEM_DEFAULT_RUNTIME_UPPER) + 
endif() + if (ENABLE_OPENSHMEM) # Keep support for SHMEM_DIR for backward-compatibility if (EXISTS ${SHMEM_DIR}) @@ -101,6 +109,15 @@ function(setup_runtime_backends) string(REPLACE "${SANDIA_OPENSHMEM_PREFIX}" "${SANDIA_OPENSHMEM_DIR}" SANDIA_OPENSHMEM_INCLUDE_DIRS "${SANDIA_OPENSHMEM_INCLUDE_DIRS}") set(OPENSHMEM_INCLUDE_DIRS "${SANDIA_OPENSHMEM_INCLUDE_DIRS}" PARENT_SCOPE) + + if (NOT DEFINED ISHMEM_DEFAULT_RUNTIME_UPPER) + set(ISHMEM_DEFAULT_RUNTIME_UPPER "OPENSHMEM") + endif() + if ("${ISHMEM_DEFAULT_RUNTIME_UPPER}" STREQUAL "OPENSHMEM") + set(DEFAULT_CONFIRMED TRUE) + set(ISHMEM_DEFAULT_RUNTIME_STR "OPENSHMEM" PARENT_SCOPE) + set(ISHMEM_DEFAULT_RUNTIME_VAL "ISHMEMX_RUNTIME_OPENSHMEM" PARENT_SCOPE) + endif() endif() if (ENABLE_MPI) @@ -119,5 +136,26 @@ function(setup_runtime_backends) set(MPI_CXX_SKIP_MPICXX TRUE) find_package(MPI COMPONENTS REQUIRED CXX) + + if (NOT DEFINED ISHMEM_DEFAULT_RUNTIME_UPPER) + set(ISHMEM_DEFAULT_RUNTIME_UPPER "MPI") + endif() + if ("${ISHMEM_DEFAULT_RUNTIME_UPPER}" STREQUAL "MPI") + set(DEFAULT_CONFIRMED TRUE) + set(ISHMEM_DEFAULT_RUNTIME_STR "MPI" PARENT_SCOPE) + set(ISHMEM_DEFAULT_RUNTIME_VAL "ISHMEMX_RUNTIME_MPI" PARENT_SCOPE) + endif() + endif() + + if (NOT DEFINED DEFAULT_CONFIRMED) + # Diagnose with the upper-cased name so a case-insensitive request for a disabled backend is not reported as unknown + if (NOT ENABLE_OPENSHMEM AND "${ISHMEM_DEFAULT_RUNTIME_UPPER}" STREQUAL "OPENSHMEM") + message(FATAL_ERROR "Attempted to set '${ISHMEM_DEFAULT_RUNTIME}' as default when ENABLE_OPENSHMEM is disabled.") + elseif (NOT ENABLE_MPI AND "${ISHMEM_DEFAULT_RUNTIME_UPPER}" STREQUAL "MPI") + message(FATAL_ERROR "Attempted to set '${ISHMEM_DEFAULT_RUNTIME}' as default when ENABLE_MPI is disabled.") + else() + message(FATAL_ERROR " Attempted to set unknown runtime '${ISHMEM_DEFAULT_RUNTIME}' as default.\n" + " Supported options: \"OPENSHMEM\", \"MPI\".\n") + endif() endif() endfunction(setup_runtime_backends) diff --git a/docs/source/collectives.rst b/docs/source/collectives.rst index d8afd3a..3de9c32 100644 --- a/docs/source/collectives.rst +++
b/docs/source/collectives.rst @@ -1242,3 +1242,129 @@ All threads in **group** must call the routine with identical arguments. reduction may not be the same across all participating PEs, so the results for floating point datatypes may differ slightly. This is because floating addition and multiplication are not associative operations. + + +.. _ishmem_inscan: + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ISHMEM_INSCAN, ISHMEM_EXSCAN +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Performs inclusive or exclusive prefix sum operations. + +In the functions below, TYPE is one of the integer or real types supported for +the SUM reduction operation and has a corresponding TYPENAME specified by Table +:ref:`Reduction Types, Names, and Supporting Operations`. + +.. cpp:function:: template int ishmem_sum_inscan(TYPE* dest, const TYPE* source, size_t nelems) + +.. cpp:function:: template int ishmem_sum_inscan(ishmem_team_t team, TYPE* dest, const TYPE* source, size_t nelems) + +.. cpp:function:: template int ishmem_sum_exscan(TYPE* dest, const TYPE* source, size_t nelems) + +.. cpp:function:: template int ishmem_sum_exscan(ishmem_team_t team, TYPE* dest, const TYPE* source, size_t nelems) + +.. cpp:function:: int ishmem_TYPENAME_sum_inscan(TYPE* dest, const TYPE* source, size_t nelems) + +.. cpp:function:: int ishmem_TYPENAME_sum_inscan(ishmem_team_t team, TYPE* dest, const TYPE* source, size_t nelems) + +.. cpp:function:: int ishmem_TYPENAME_sum_exscan(TYPE* dest, const TYPE* source, size_t nelems) + +.. cpp:function:: int ishmem_TYPENAME_sum_exscan(ishmem_team_t team, TYPE* dest, const TYPE* source, size_t nelems) + + :param dest: Symmetric address of an array, of length **nelems** elements, to receive the result of the scan operation. The type of **dest** should match the TYPE and TYPENAME according to the supported integer or real types for the SUM operation described in table :ref:`Reduction Types`. 
+ :param source: Symmetric address of an array, of length **nelems** elements, that contains one element for each separate scan operation. The type of **source** should match the TYPE and TYPENAME according to the supported integer or real types for the SUM operation described in table :ref:`Reduction Types`. + :param nelems: The number of elements in the **dest** and **source** arrays. **nelems** must be of type **size_t** and have the same value across all PEs. + :param team: A valid ``ishmem`` team handle to a team. + :returns: Zero on successful local completion. Nonzero otherwise. + +Callable from the **host** and **device**. + +**Description:** +The ``ishmem_sum_inscan`` and ``ishmem_sum_exscan`` routines compute one or +more collective scan (or prefix sum) operations across symmetric arrays on +multiple PEs. The operations are performed with the **SUM** operator. + + +The **nelems** argument specifies the number of separate scan operations to +perform. The **source** array provides one element for each scan operation. +The results of the scan operations are placed in **dest** on all participating +PEs. + +The same **dest** and **source** arrays must be passed by all PEs that +participate in the operation. Additionally, the **source** and **dest** +arguments must either be the same symmetric address, or two different +symmetric addresses corresponding to buffers that do not overlap in memory. +That is, they must be completely overlapping or completely disjoint. + +If no **team** argument is passed to either ``ishmem_sum_inscan`` or +``ishmem_sum_exscan``, then all PEs in the world team must participate in the +collective. +Inclusive and exclusive scan routines that accept a **team** argument operate +over all PEs in the provided team. +All PEs in the provided team must participate in the collective. +If **team** compares equal to ``ISHMEM_TEAM_INVALID`` or is otherwise invalid, +the behavior is undefined.
+ +Upon return from a collective routine, the following are true for the local +PE: + +* The **dest** array is updated and the **source** array may be safely + reused. + + +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +ISHMEMX_SUM_INSCAN_ON_QUEUE, ISHMEMX_SUM_EXSCAN_ON_QUEUE +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Performs inclusive or exclusive prefix sum operations. + +In the functions below, TYPE is one of the integer or real types supported for +the SUM reduction operation and has a corresponding TYPENAME specified by Table +:ref:`Reduction Types, Names, and Supporting Operations`. + + +.. cpp:function:: template sycl::event ishmemx_sum_inscan_on_queue(TYPE* dest, const TYPE* source, size_t nelems, int* ret, sycl::queue& q, const std::vector& deps) + +.. cpp:function:: template sycl::event ishmemx_sum_inscan_on_queue(ishmem_team_t team, TYPE* dest, const TYPE* source, size_t nelems, int* ret, sycl::queue& q, const std::vector& deps) + +.. cpp:function:: template sycl::event ishmemx_sum_exscan_on_queue(TYPE* dest, const TYPE* source, size_t nelems, int* ret, sycl::queue& q, const std::vector& deps) + +.. cpp:function:: template sycl::event ishmemx_sum_exscan_on_queue(ishmem_team_t team, TYPE* dest, const TYPE* source, size_t nelems, int* ret, sycl::queue& q, const std::vector& deps) + +.. cpp:function:: sycl::event ishmemx_TYPENAME_sum_inscan_on_queue(TYPE* dest, const TYPE* source, size_t nelems, int* ret, sycl::queue& q, const std::vector& deps) + +.. cpp:function:: sycl::event ishmemx_TYPENAME_sum_inscan_on_queue(ishmem_team_t team, TYPE* dest, const TYPE* source, size_t nelems, int* ret, sycl::queue& q, const std::vector& deps) + +.. cpp:function:: sycl::event ishmemx_TYPENAME_sum_exscan_on_queue(TYPE* dest, const TYPE* source, size_t nelems, int* ret, sycl::queue& q, const std::vector& deps) + +.. 
cpp:function:: sycl::event ishmemx_TYPENAME_sum_exscan_on_queue(ishmem_team_t team, TYPE* dest, const TYPE* source, size_t nelems, int* ret, sycl::queue& q, const std::vector& deps) + + + :param dest: Symmetric address of an array, of length **nelems** elements, to receive the result of the scan operation. The type of **dest** should match the TYPE and TYPENAME according to the supported integer or real types for the SUM operation described in table :ref:`Reduction Types`. + :param source: Symmetric address of an array, of length **nelems** elements, that contains one element for each separate scan operation. The type of **source** should match the TYPE and TYPENAME according to the supported integer or real types for the SUM operation described in table :ref:`Reduction Types`. + :param nelems: The number of elements in the **dest** and **source** arrays. **nelems** must be of type **size_t** and have the same value across all PEs. + :param ret: A pointer whose contents will be set to zero on successful local completion; otherwise, nonzero. **ret** must be accessible from both the host and the device. + :param q: The SYCL queue on which to execute the operation. **q** must be mapped to the GPU tile assigned to the calling PE. + :param deps: An optional vector of SYCL events that the operation depends on. + :param team: A valid ``ishmem`` team handle to a team. + :returns: The SYCL event created upon submitting the operation to the SYCL runtime. + +Callable from the **host**. + +**Description:** +The ``ishmemx_sum_inscan_on_queue`` and ``ishmemx_sum_exscan_on_queue`` +routines have similar semantics and requirements as the ``ishmem_sum_inscan`` +and ``ishmem_sum_exscan`` routines, respectively. +If no **team** argument is passed, then all PEs in the world team must +participate in the collective. +Inclusive and exclusive scan routines that accept a **team** argument operate +over all PEs in the provided team. 
+All PEs in the provided team must participate in the collective. +If **team** compares equal to ``ISHMEM_TEAM_INVALID`` or is otherwise invalid, +the behavior is undefined. + +To ensure the contents of **dest** and **ret** are valid, refer to the +:ref:`on_queue API Completion Semantics` +section. + diff --git a/docs/source/compiling_and_running_programs.rst b/docs/source/compiling_and_running_programs.rst index 17ed1fa..40e5199 100644 --- a/docs/source/compiling_and_running_programs.rst +++ b/docs/source/compiling_and_running_programs.rst @@ -4,29 +4,32 @@ Compiling and Running Programs ============================== -Let's consider the simple example program from Section :ref:`Writing Intel® -SHMEM Programs` and assume the code is in a file called +Consider the simple example program from Section :ref:`Writing Intel® SHMEM +Programs` and assume the code is in a file called ``ishmem_example.cpp``. -To compile the program, we must pass the necessary flags to the Intel® -oneAPI DPC++/C++ Compiler. -For example:: +To compile the program, the necessary flags must be passed to the Intel® +oneAPI DPC++/C++ Compiler. For example:: -$ icpx -I${ISHMEM_INSTALL_DIR}/include -L${ISHMEM_INSTALL_DIR}/lib -fsycl -std=gnu++1z ishmem_example.cpp -o ishmem_example -lsma -lpmi -lze_loader -ldl +$ icpx -I${ISHMEM_INSTALL_DIR}/include -fsycl -std=gnu++1z ishmem_example.cpp ${ISHMEM_INSTALL_DIR}/lib/libishmem.a -o ishmem_example -lpthread -lze_loader -where ``ISHMEM_INSTALL_DIR`` is the path to the Intel® SHMEM -installation directory. +where ``ISHMEM_INSTALL_DIR`` is the path to the Intel® SHMEM installation +directory. -While building Intel® SHMEM with the ``ENABLE_OPENSHMEM`` CMake option enabled, it -may be convenient to use the ``oshc++`` compiler wrapper (instead of ``icpx`` -directly) to easily include the necessary compilation flags that enable the host -OpenSHMEM back-end. 
+Alternatively, when building with CMake, the ``find_package`` command may be +used to define all necessary compiler flags. For example:: -Intel® SHMEM provides a launcher script, ``ishmrun`` that -assigns the environment variable **ZE_AFFINITY_MASK** so that each PE is -assigned a single SYCL device. -To invoke the ``ishmrun`` script, pass it as the first argument to your -process launcher. + find_package(ISHMEM REQUIRED) + add_executable(ishmem_example ishmem_example.cpp) + target_link_libraries(ishmem_example PRIVATE ISHMEM::ISHMEM) + +If Intel® SHMEM is not sourced via the installed environment script, it may +be necessary to prepend the installation path to ``CMAKE_PREFIX_PATH``. + +Intel® SHMEM provides a launcher script, ``ishmrun``, that sets CPU and GPU +affinity so that each PE is assigned a single SYCL device and a corresponding +set of CPU cores with close affinity. To invoke the ``ishmrun`` script, pass it +as the first argument to your process launcher. The following example assumes the ``ISHMEM_INSTALL_DIR/bin`` directory is on your user path and use of the Portable Batch System launcher:: @@ -39,6 +42,7 @@ following environment variables may be required for execution, depending on the Intel® SHMEM build configuration:: ISHMEM_RUNTIME + ISHMEM_MPI_LIB_NAME ISHMEM_SHMEM_LIB_NAME ISHMEM_RUNTIME_USE_OSHMPI @@ -49,7 +53,7 @@ Selecting SPIR-V Compilation Targets ------------------------------------ On some systems, you may encounter an error in which the correct SPIR-V targets -are not successfully selected when linking with Intel® SHMEM. This may result in +are not successfully selected when linking with Intel® SHMEM. 
This may result in problems when using device-initiated communication including compilation warnings: :: @@ -65,8 +69,14 @@ as well as runtime errors: :: Module <0x29941d0>: Unresolved Symbol <_Z13ishmem_putmemPvPKvmi> Module <0x29941d0>: Unresolved Symbol <_Z13ishmem_putmemPvPKvmi> -11 (PI_ERROR_BUILD_PROGRAM_FAILURE) -This error can be resolved by indicating the desired target at compile time. To -compile with the appropriate target for a Intel® Data Center GPU Max 1550 (PVC) -GPU, add the following flags when linking: :: +These errors can be resolved by ensuring the desired target(s) match those +compiled into the Intel® SHMEM library. The target(s) are specified at +Intel® SHMEM's configure time using ``-DISHMEM_AOT_DEVICE_TYPES``. The default +value is ``xe-hpc,xe2`` to target Intel® Data Center GPU Max Series and Intel® Arc™ +B-Series GPUs, respectively. Below is an example set of flags to add to the +linking process for adding these target devices:: + + -fsycl-targets=spir64_gen --start-no-unused-arguments -Xs "-device xe-hpc,xe2" --end-no-unused-arguments --start-no-unused-arguments -Xsycl-target-backend "-q" --end-no-unused-arguments - -fsycl-targets=spir64_gen --start-no-unused-arguments -Xs "-device pvc" --end-no-unused-arguments --start-no-unused-arguments -Xsycl-target-backend "-q" --end-no-unused-arguments +When building with CMake, the ``ISHMEM::ISHMEM`` interface automatically adds +the corresponding target devices to the compilation command. 
diff --git a/docs/source/conf.py b/docs/source/conf.py index 9f6a1e0..a2584ef 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -16,7 +16,7 @@ project = u'Intel® SHMEM' copyright = u'2024 Intel Corporation licensed under Creative Commons BY 4.0' author = u'Intel Corporation' -release = u'1.3.0' +release = u'1.5.0' version = release # -- General configuration --------------------------------------------------- diff --git a/docs/source/supported_features.rst b/docs/source/supported_features.rst index 2f3963a..6dcbd3e 100644 --- a/docs/source/supported_features.rst +++ b/docs/source/supported_features.rst @@ -422,6 +422,18 @@ releases of Intel® SHMEM. +------------------------------------------------+---------------+ | ``ishmemx_TYPENAME_OP_reduce_work_group`` | Yes | +------------------------------------------------+---------------+ +| ``ishmem_TYPENAME_inscan`` | Yes | ++------------------------------------------------+---------------+ +| ``ishmemx_TYPENAME_inscan_on_queue`` | Yes | ++------------------------------------------------+---------------+ +| ``ishmemx_TYPENAME_inscan_work_group`` | No | ++------------------------------------------------+---------------+ +| ``ishmem_TYPENAME_exscan`` | Yes | ++------------------------------------------------+---------------+ +| ``ishmemx_TYPENAME_exscan_on_queue`` | Yes | ++------------------------------------------------+---------------+ +| ``ishmemx_TYPENAME_exscan_work_group`` | No | ++------------------------------------------------+---------------+ | C++ function template routines* | Yes | +------------------------------------------------+---------------+ diff --git a/scripts/ishmrun b/scripts/ishmrun index 72fbc97..1986807 100755 --- a/scripts/ishmrun +++ b/scripts/ishmrun @@ -1,105 +1,394 @@ #!/bin/bash - -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: BSD-3-Clause -assignment="device" -parent_process=$(ps -p $PPID -o comm=) -if [ $parent_process 
= "slurmstepd" ]; then - local_id=$SLURM_LOCALID -elif [ $parent_process = "hydra_pmi_proxy" ]; then - local_id=$MPI_LOCALRANKID -elif [ $parent_process = "palsd" ]; then - local_id=$PALS_LOCAL_RANKID -else - echo -e "\033[33mWARNING: Process not launched with a supported job launcher.\033[0m" >&2 -fi +# ================================================================================================ # +# Globals +# ================================================================================================ # -# See https://spec.oneapi.io/level-zero/latest/core/PROG.html#device-hierarchy regarding possible -# device hierarchy options. -if [ -z "$ZE_FLAT_DEVICE_HIERARCHY" -o "$ZE_FLAT_DEVICE_HIERARCHY" == "COMPOSITE" ]; then - assignment="tile" - echo -e "\033[33mWARNING: Assigning a single tile per PE due to ZE_FLAT_DEVICE_HIERARCHY=${ZE_FLAT_DEVICE_HIERARCHY}.\033[0m" >&2 -fi +script_name=$(basename "$0") +cpu_bind=1 +gpu_bind=1 +use_xpu_smi=0 +show_help=0 +prefix_command="" +local_id=0 +local_size=0 -#Unset ZE_AFFINITY_MASK if previously set by user, so as not to impact results returned from sycl-ls -if [ -n "$ZE_AFFINITY_MASK" ]; then - echo -e "\033[33mWARNING: Previous assignment of ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK} being unset.\033[0m" >&2 - unset ZE_AFFINITY_MASK -fi +which_sycl_ls=1 +which_clinfo=1 +which_xpu_smi=1 -#Unset SYCL_DEVICE_FILTER if previously set by user, so as not to impact results returned from sycl-ls -if [ -n "$SYCL_DEVICE_FILTER" ]; then - echo -e "\033[33mWARNING: Previous assignment of SYCL_DEVICE_FILTER=${SYCL_DEVICE_FILTER} being unset.\033[0m" >&2 - unset SYCL_DEVICE_FILTER -fi +# ================================================================================================ # +# Functions +# ================================================================================================ # + +usage() +{ + cat << EOF +Usage: $script_name [-hcgn] + +Options: + -h, --help Show this message + -c, --disable-cpu-bind Don't perform 
CPU binding + -g, --disable-gpu-bind Don't perform GPU binding + -n, --disable-bind Don't perform CPU or GPU binding + -x, --enable-xpu-smi Use xpu-smi for CPU/GPU affinity checks +EOF +} + +# Parse long options +# Return value indicates if option consumed an argument +parse_long_opt() +{ + local arg="$1" + local next="$2" + + case "$arg" in + --help) + show_help=1 + return 0 + ;; + --disable-cpu-bind) + cpu_bind=0 + return 0 + ;; + --disable-gpu-bind) + gpu_bind=0 + return 0 + ;; + --disable-bind) + cpu_bind=0 + gpu_bind=0 + return 0 + ;; + --enable-xpu-smi) + use_xpu_smi=1 + return 0 + ;; + *) + echo "Error: Unknown option '$arg'" >&2 + usage + exit 1 + ;; + esac +} + +parse_short_opt() +{ + local arg="$1" + + i=0 + while [ $i -lt ${#arg} ]; do + char="${arg:$i:1}" + + case "$char" in + h) + show_help=1 + ;; + c) + cpu_bind=0 + ;; + g) + gpu_bind=0 + ;; + n) + cpu_bind=0 + gpu_bind=0 + ;; + x) + use_xpu_smi=1 + ;; + *) + echo "Error: Unknown option '$arg'" >&2 + usage + exit 1 + ;; + esac + i=$((i + 1)) + done +} + +cpu_binding() +{ + # Note: currently assumes round-robin core numbering + local numactl_output=$(numactl -H) + local cpu_numa_nodes=$(echo "$numactl_output" | grep -Po "^node [0-9]+ cpus: [0-9 ]+" | wc -l) + local numa_split=$((local_size / cpu_numa_nodes)) + local numa_remainder=$((local_size % cpu_numa_nodes)) -#Unset ONEAPI_DEVICE_SELECTOR if previously set by user -if [ -n "$ONEAPI_DEVICE_SELECTOR" ]; then - echo -e "\033[33mWARNING: Previous assignment of ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR} being unset.\033[0m" >&2 + if [ $cpu_numa_nodes -eq 0 ]; then + echo -e "\033[33mWARNING: Could not process NUMA information. Consider running with '--disable-cpu-bind' and using external binding mechanisms to ensure best performance.\033[0m" >&2 + return + fi + + if [ $cpu_numa_nodes -gt 2 ]; then + echo -e "\033[33mWARNING: More than 2 NUMA nodes detected. 
Consider running with '--disable-cpu-bind' and using external binding mechanisms to ensure best performance.\033[0m" >&2 + return + fi + + declare -a node_cpus + declare -a node_core_counts + for idx in $(seq 0 $((cpu_numa_nodes - 1))); do + local cpus=$(echo "$numactl_output" | grep -Po "node $idx cpus: \K[0-9 ]+") + node_cpus[$idx]="$cpus" + node_core_counts[$idx]=$(echo $cpus | wc -w) + done + + # Split procs across socket(s) + local prev=0 + for idx in $(seq 0 $((cpu_numa_nodes - 1))); do + local add=0 + if [ $numa_remainder -gt 0 ]; then + local add=1 + local numa_remainder=$((numa_remainder - 1)) + fi + if [ $local_id -lt $((prev + numa_split + add)) ]; then + # local_numa_id is used in GPU binding as well + local_numa_id=$idx + local plow=$prev + local phigh=$((prev + numa_split + add)) + break + fi + local prev=$((prev + numa_split + add)) + done + + # Split cores across procs + # procs with affinity to $local_numa_id are [$plow, $phigh) + # skip first proc on each socket (reserved for OS) + # skip last proc on each socket (reserved for proxy thread) + # TODO: assumes maximum 2 sockets + local num_skip=2 + local cores_per_proc=$(((node_core_counts[local_numa_id] - num_skip) / (phigh - plow) / 2)) + local core_low=$(echo ${node_cpus[$local_numa_id]} | cut -d ' ' -f $((cores_per_proc * (local_id - plow) + 2))) + local core_high=$((core_low + cores_per_proc - 1)) + local core_assignment="$core_low-$core_high" + + prefix_command="numactl --all -C ${core_assignment}" +} + +gpu_binding() +{ + # Reset Level-zero/SYCL environment + unset ZE_AFFINITY_MASK unset ONEAPI_DEVICE_SELECTOR -fi + unset SYCL_DEVICE_FILTER -#Determine which utilities are available for device/sub-device detection -which clinfo > /dev/null 2>&1 -which_clinfo_exit_code=$? -which sycl-ls > /dev/null 2>&1 -which_sycl_ls_exit_code=$? 
- -#Identify sub-devices available for use -all_sub_devices=() -if [ $which_sycl_ls_exit_code -eq 0 ]; then - sycl_ls_output=$(ONEAPI_DEVICE_SELECTOR=level_zero:* sycl-ls 2>/dev/null) - num_root_devices=$(grep -P -o "\[level_zero:gpu\]" <<< ${sycl_ls_output} | wc -l) - if [ ${num_root_devices} -gt 0 ]; then - root_devices=($(seq 0 $((${num_root_devices} - 1)))) + local root_count=0 + local sub_count=0 + + # Determine how to set ZE_AFFINITY_MASK + if [ $which_sycl_ls -eq 0 ]; then + local root_count=$(ONEAPI_DEVICE_SELECTOR=level_zero:* sycl-ls 2>/dev/null | wc -l) + local sub_count=$(ONEAPI_DEVICE_SELECTOR=level_zero:*.* sycl-ls 2>/dev/null | wc -l) + elif [ $which_clinfo -eq 0 ]; then + local platforms=$(clinfo -l | grep -i Platform | wc -l) + + for ((platform=0; platform < $platforms; platform++)); do + if [ "$(clinfo -d ${platform}:0 --prop DEVICE_TYPE 2>/dev/null | grep -Po GPU | wc -l)" -gt 0 ]; then + break + fi + done + + local root_count=$(clinfo --prop DEVICE_TYPE 2>/dev/null | grep -Po GPU | wc -l) + local sub_count=$(clinfo -d ${platform}:0 --prop MAX_SUB_DEVICES 2>/dev/null | awk '{print $2}') else - root_devices=($(grep -P -o "(?<=gpu:)[0-9]+" <<< ${sycl_ls_output})) + # Can't use xpu-smi here because we can't detect the format for ZE_AFFINITY_MASK + echo -e "\033[33mWARNING: Could not detect devices on system. 
Ensure either clinfo or sycl-ls is available.\033[0m" >&2 + return fi - if [ "$assignment" == "tile" ]; then - for root_device in ${root_devices[@]} - do - sycl_ls_output=$(ONEAPI_DEVICE_SELECTOR=level_zero:${root_device}.* sycl-ls 2>/dev/null) - num_sub_devices=$(grep -P -o "level_zero:gpu" <<< ${sycl_ls_output} | wc -l) - if [ ${num_sub_devices} -gt 0 ]; then - for (( sub_device=0; sub_device<${num_sub_devices}; sub_device++ )); do - all_sub_devices+=("${root_device}.${sub_device}") - done - else - #Root device is not partitionable / has no sub-devices - all_sub_devices+=("${root_device}") - fi + + if [ "$cpu_bind" -eq 0 ]; then + # Get the local numa node in case cpu binding is disabled + local local_numa_id=$(numactl --show | grep -Po "(?<=^nodebind: ).*" | awk '{print $NF}') + fi + + declare -a all_devs + if [ $sub_count -eq 0 ]; then + for root_dev in $(seq 0 $((root_count - 1))); do + all_devs+=("${root_dev}") + done + else + # Current sub_count is total subdevices in the node - fix it to be per device + local sub_count=$((sub_count / root_count)) + for root_dev in $(seq 0 $((root_count - 1))); do + for sub_dev in $(seq 0 $((sub_count - 1))); do + all_devs+=("${root_dev}.${sub_dev}") + done done fi -elif [ $which_clinfo_exit_code -eq 0 ]; then - root_devices=( $(seq 0 $(( $(clinfo | grep -P -o "(?<=Device Type)[ \t]*GPU" | wc -l) - 1 )) ) ) - if [ "$assignment" == "tile" ]; then - for root_device in "${root_devices[@]}"; do - num_sub_devices=$(ZE_AFFINITY_MASK=$root_device clinfo | grep -P -A 10 "(?<=Device Type)[ \t]*GPU" | grep -P -o "(?<=Max number of sub-devices)[ \t]*[0-9]+" | xargs) - if [ $num_sub_devices -eq 0 ]; then - #Root device is not partitionable / has no sub-devices - all_sub_devices+=("${root_device}") + + local idx=0 + local leftover_devs_per_numa=$(((${#all_devs[@]} - local_size ) / 2)) + + # Use xpu-smi to match GPU and CPU affinity + if [ $which_xpu_smi -eq 0 ]; then + # xpu-smi will always provide hierarchy info regarding root and sub 
devices, thus + # xpu_root_count may not equal root_count (i.e. when ZE_FLAT_DEVICE_HIERARCHY=FLAT) + local xpu_root_count=$(xpu-smi discovery --dump 1 2>/dev/null | grep -P [0-9]+ | wc -l) + local xpu_sub_count=$(xpu-smi discovery -d 0 2>/dev/null | grep -Po "(?<=Number of Tiles: )[0-9]+") + + # Check for homogeneity: every other root device (1..N-1) must report the same tile count as device 0 + for root_dev in $(seq 1 $((xpu_root_count - 1))); do + local sub_count=$(xpu-smi discovery -d ${root_dev} 2>/dev/null | grep -Po "(?<=Number of Tiles: )[0-9]+") + if [ $sub_count -ne $xpu_sub_count ]; then + echo -e "\033[33mWARNING: Detected different accelerator architectures. Consider running with '--disable-gpu-bind' and using external binding mechanisms to ensure best performance.\033[0m" >&2 + return + fi + done + + local xpu_total_dev=$((xpu_root_count * xpu_sub_count)) + + declare -a s0_devs + declare -a s1_devs + for root_dev in $(seq 0 $((xpu_root_count - 1))); do + local dev_cpu_list=$(xpu-smi topology -d ${root_dev} 2>/dev/null | grep -Po "(?<=Local CPU List: )[0-9,-]+") + local dev_numa=$(numactl --all -C ${dev_cpu_list} --show | grep -Po "(?<=^nodebind: ).*" | awk '{print $NF}') + for sub_dev in $(seq 0 $((xpu_sub_count - 1))); do + if [ $dev_numa -eq 0 ]; then + s0_devs+=("${all_devs[${idx}]}") else - for (( sub_device=0; sub_device<${num_sub_devices}; sub_device++ )); do - all_sub_devices+=("${root_device}.${sub_device}") - done + s1_devs+=("${all_devs[${idx}]}") fi + local idx=$((idx + 1)) + done done + + local s0_count=${#s0_devs[@]} + local idx_offset=$((s0_count - leftover_devs_per_numa)) + if [ $local_id -lt $idx_offset ]; then + local mask=${s0_devs[${local_id}]} + else + local mask=${s1_devs[$((local_id - idx_offset))]} + fi + else + local s0_count=$((${#all_devs[@]} / 2)) + local idx_offset=$((s0_count - leftover_devs_per_numa)) + if [ $local_id -lt $idx_offset ]; then + local mask=${all_devs[${local_id}]} + else + local mask=${all_devs[$((s0_count + local_id - idx_offset))]} + fi fi -else - echo -e "\033[33mWARNING: 
Could not detect devices on system. Ensure either clinfo or sycl-ls is available.\033[0m" >&2 + + export ZE_AFFINITY_MASK=$mask + export ONEAPI_DEVICE_SELECTOR="level_zero:0" +} + +gpu_env() +{ + export NEOReadDebugKeys=1 + export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1 + + # Determine which utilities are available for device detection and topology + which clinfo > /dev/null 2>&1 + which_clinfo=$? + which sycl-ls > /dev/null 2>&1 + which_sycl_ls=$? + if [ $use_xpu_smi -eq 1 ]; then + which xpu-smi > /dev/null 2>&1 + which_xpu_smi=$? + else + which_xpu_smi=1 + fi + + local output="" + + if [ $which_sycl_ls -eq 0 ]; then + local output=$(ONEAPI_DEVICE_SELECTOR=level_zero:* sycl-ls 2>/dev/null) + elif [ $which_clinfo -eq 0 ]; then + local platforms=$(clinfo -l | grep -i Platform | wc -l) + for ((platform=0; platform < $platforms; platform++)); do + if [ "$(clinfo -d ${platform}:0 --prop DEVICE_TYPE 2>/dev/null | grep -Po GPU | wc -l)" -gt 0 ]; then + local output=$(clinfo -d ${platform}:0 --prop DEVICE_NAME 2>/dev/null) + break + fi + done + elif [ $which_xpu_smi -eq 0 ]; then + # Property ID 2 is Device Name + local output=$(xpu-smi discovery --dump 2 2>/dev/null) + fi + + if [ $(echo $output | grep -Po "Intel.*Arc.*B[0-9]+ Graphics" | wc -l) -gt 0 ]; then + # Intel(R) Arc(TM) B-Series GPU Family + # Necessary for GPU IPC + export RenderCompressedBuffersEnabled=0 + elif [ $(echo $output | grep -Po "Intel.*Data.*Center.*GPU" | wc -l) -gt 0 ]; then + # Intel(R) Data Center GPU Max Series + # No support for Implicit Scaling + export EnableImplicitScaling=0 + fi +} + +# ================================================================================================ # +# Main script +# ================================================================================================ # + +# Parse input arguments +while [ $# -gt 0 ]; do + case "$1" in + # End of options marker (listed first: the '--*' pattern below would otherwise capture it) + --) + shift + break + ;; + + # Long options + --*) + if parse_long_opt "$1" "$2"; then + shift 1 + else + shift 2 + fi + ;; + + # Short options + -*) + 
parse_short_opt "${1#-}" + shift + ;; + + # Non-option args + *) + break + ;; + esac +done + +if [ "$show_help" -eq 1 ]; then + usage + exit 0 fi -#Set visibility of devices for Level Zero -if [ "$assignment" == "tile" ]; then - export ZE_AFFINITY_MASK=${all_sub_devices[${local_id}]} +# Detect process manager +parent_process=$(ps -p $PPID -o comm=) + +# parent_process is quoted so an empty result falls through to the warning branch instead of a test syntax error +if [ "$parent_process" = "slurmstepd" ]; then + local_id=$SLURM_LOCALID + local_size=$(echo $SLURM_STEP_TASKS_PER_NODE | awk -F "(" '{print $1}') +elif [ "$parent_process" = "hydra_pmi_proxy" ]; then + local_id=$MPI_LOCALRANKID + local_size=$MPI_LOCALNRANKS +elif [ "$parent_process" = "palsd" ]; then + local_id=$PALS_LOCAL_RANKID + local_size=$PALS_LOCAL_SIZE else - export ZE_AFFINITY_MASK=${root_devices[${local_id}]} + if [ "$cpu_bind" -eq 1 ] || [ "$gpu_bind" -eq 1 ]; then + echo -e "\033[33mWARNING: Process not launched with a supported process manager.\033[0m" >&2 + fi +fi + +# Perform binding (if applicable) +if [ "$cpu_bind" -eq 1 ]; then + cpu_binding fi -#Set visibility of devices for SYCL -#From SYCL's perspective, device 0 (level_zero:0) will now correspond to the device previously assigned to ZE_AFFINITY_MASK -export ONEAPI_DEVICE_SELECTOR="level_zero:0" +# Even if gpu binding is disabled, some environment variables may need to be set for specific GPUs +gpu_env + +if [ "$gpu_bind" -eq 1 ]; then + gpu_binding +fi # Invoke the main program -NEOReadDebugKeys=1 UseKmdMigration=1 numactl --cpunodebind=all $* +# prefix_command is intentionally unquoted (word-split into numactl and its flags); "$@" keeps each user argument intact +${prefix_command} "$@" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index faeb1b0..e19ca8d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,7 +56,6 @@ set(ISHMEM_SRC proxy.cpp proxy_func.cpp runtime.cpp - collectives.cpp nbi.cpp signaling.cpp synchronization.cpp @@ -66,6 +65,7 @@ set(ISHMEM_SRC collectives/broadcast.cpp collectives/collect.cpp collectives/reduce.cpp + collectives/scan.cpp collectives/sync.cpp err.cpp timestamp.cpp diff 
--git a/src/accelerator.cpp b/src/accelerator.cpp index feb43bb..26f0274 100644 --- a/src/accelerator.cpp +++ b/src/accelerator.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -14,61 +14,71 @@ #include #endif -ze_driver_handle_t *all_drivers = nullptr; -ze_device_handle_t **all_devices = nullptr; -uint32_t driver_count = 0; -bool ishmemi_accelerator_preinitialized = false; -bool ishmemi_accelerator_initialized = false; +namespace { + /* L0 driver */ + ze_driver_handle_t *all_drivers = nullptr; + ze_device_handle_t **all_devices = nullptr; + uint32_t driver_count = 0; + uint32_t driver_idx = 0; + bool driver_found = false; + + /* L0 device */ + ze_device_properties_t device_properties = {}; + + /* L0 queues */ + uint32_t link_queue_count = 0; + std::atomic link_index = 0; /* Used for round-robining link engines */ + ze_command_queue_handle_t compute_queue = {}; + ze_command_queue_handle_t copy_queue = {}; + ze_command_queue_handle_t *link_queues = nullptr; + uint32_t compute_ordinal = 0; + uint32_t copy_ordinal = 0; + uint32_t link_ordinal = 0; + + /* L0 lists */ + ishmemi_thread_safe_vector compute_lists; + ishmemi_thread_safe_vector copy_lists; + ishmemi_thread_safe_vector *link_lists; + + /* Misc */ + bool ishmemi_accelerator_preinitialized = false; + bool ishmemi_accelerator_initialized = false; +} // namespace + +/* L0 Context */ +ze_context_handle_t ishmemi_ze_context = nullptr; +ze_context_desc_t ishmemi_ze_context_desc = {}; +/* L0 device */ ze_driver_handle_t ishmemi_gpu_driver = nullptr; -ze_driver_handle_t ishmemi_fpga_driver = nullptr; ze_device_handle_t ishmemi_gpu_device = nullptr; -ze_device_handle_t ishmemi_fpga_device = nullptr; -ze_context_handle_t ishmemi_ze_context = nullptr; -ze_context_desc_t ishmemi_ze_context_desc = {}; +/* L0 events */ ze_event_pool_handle_t ishmemi_ze_event_pool; -unsigned int ishmemi_link_engine_index = 0; -#ifdef 
ENABLE_REDUCED_LINK_ENGINES -unsigned int ishmemi_link_engine[NUM_LINK_QUEUE] = {2, 4}; -#else -unsigned int ishmemi_link_engine[NUM_LINK_QUEUE] = {2, 4, 6}; -#endif - -ze_command_queue_handle_t ishmemi_ze_cmd_queue; -ze_command_queue_handle_t ishmemi_ze_all_cmd_queue; -ze_command_queue_handle_t ishmemi_ze_link_cmd_queue[NUM_LINK_QUEUE]; -ishmemi_thread_safe_vector ishmemi_ze_cmd_lists; -ishmemi_thread_safe_vector ishmemi_ze_link_cmd_lists[NUM_LINK_QUEUE]; - -uint32_t ishmemi_gpu_driver_idx = 0; -uint32_t ishmemi_fpga_driver_idx = 0; -bool ishmemi_gpu_driver_found = false; -bool ishmemi_fpga_driver_found = false; - /* this should be thread safe because we query the size, then sync * then destroy the first size items, then erase them from the list */ -static int ishmemi_sync_cmd_queue(ze_command_queue_handle_t &queue, - ishmemi_thread_safe_vector &cmd_lists) +static int sync_cq(ze_command_queue_handle_t &queue, + ishmemi_thread_safe_vector &cmd_lists) { + static std::atomic size; size_t cur_size = 0; + int ret = 0; + std::vector::iterator first, last; cmd_lists.mtx.lock(); - int ret = 0; - static std::atomic size; size.store(cmd_lists.size()); cmd_lists.mtx.unlock(); - std::vector::iterator first, last; ZE_CHECK(zeCommandQueueSynchronize(queue, UINT64_MAX)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); cmd_lists.mtx.lock(); cur_size = size.load(); - for (size_t i = 0; i < cur_size; i += 1) + for (size_t i = 0; i < cur_size; ++i) { ZE_CHECK(zeCommandListDestroy(cmd_lists[i])); + } first = cmd_lists.begin(); last = first + static_cast(cur_size); @@ -79,6 +89,12 @@ static int ishmemi_sync_cmd_queue(ze_command_queue_handle_t &queue, return ret; } +static inline uint32_t get_next_link_index() +{ + uint32_t index = link_index.fetch_add(1, std::memory_order_relaxed) % link_queue_count; + return index; +} + int ishmemi_accelerator_preinit() { int ret = 0; @@ -125,31 +141,27 @@ int ishmemi_accelerator_preinit() ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); if (device_count == 0) 
continue; - // Only a single device will be returned because of setting in ishmrun launcher + /* Ensure a single device is detected */ ISHMEM_CHECK_GOTO_MSG(device_count != 1, fn_fail, "Detected more than one device\n"); all_devices[i] = (ze_device_handle_t *) ::malloc(device_count * sizeof(ze_device_handle_t)); ISHMEM_CHECK_GOTO_MSG(all_devices == nullptr, fn_fail, "Allocation of all_drivers[%d] failed\n", i); ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, all_devices[i])); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - ze_device_properties_t device_properties; ZE_CHECK(zeDeviceGetProperties(all_devices[i][0], &device_properties)); ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - /* Storing gpu and fpga devices only for now */ - if (ZE_DEVICE_TYPE_GPU == device_properties.type && !ishmemi_gpu_driver_found) { + + if (ZE_DEVICE_TYPE_GPU == device_properties.type && !driver_found) { ishmemi_gpu_driver = all_drivers[i]; - ishmemi_gpu_driver_idx = i; - ishmemi_gpu_driver_found = true; - } else if (ZE_DEVICE_TYPE_FPGA == device_properties.type && !ishmemi_fpga_driver_found) { - ishmemi_fpga_driver = all_drivers[i]; - ishmemi_fpga_driver_idx = i; - ishmemi_fpga_driver_found = true; + driver_idx = i; + driver_found = true; } } - if (!ishmemi_gpu_driver_found && !ishmemi_fpga_driver_found) { - ISHMEM_ERROR_MSG("No ZE driver found for GPU or FPGA\n"); + if (!driver_found) { + ISHMEM_ERROR_MSG("No ZE driver found for GPU\n"); ret = ISHMEMI_NO_DEVICES; goto fn_fail; } @@ -171,110 +183,104 @@ int ishmemi_accelerator_preinit() int ishmemi_accelerator_init() { - uint32_t device_count = 0; - uint32_t i, j; int ret = 0; - ze_command_queue_desc_t cmdq_desc; + uint32_t i, j; + uint32_t cq_group_count = 0; ze_event_pool_desc_t event_pool_desc; + ze_command_queue_group_properties_t *cq_group_prop = nullptr; ret = ishmemi_accelerator_preinit(); - if (ret != 0) goto fn_exit; - /* set default interval for cmd_list garbage collection */ - ishmemi_ze_cmd_lists.reserve(ishmemi_params.NBI_COUNT); - 
for (int i = 0; i < NUM_LINK_QUEUE; i += 1) { - ishmemi_ze_link_cmd_lists[i].reserve(ishmemi_params.NBI_COUNT); - } + ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - if (ishmemi_gpu_driver_found) { - /* TODO: Make default device assignment topology-aware instead of round-robin */ - /* Set the default device for GPU */ - /* TODO: This currently assumes all devices for the driver are GPU devices */ - ishmemi_gpu_device = all_devices[ishmemi_gpu_driver_idx][0]; - } + if (driver_found) { + /* Set the default GPU */ + ishmemi_gpu_device = all_devices[driver_idx][0]; - if (ishmemi_fpga_driver_found) { - /* Set the default device for FPGA */ - /* TODO: This currently assumes all devices for the driver are FPGA devices */ - ishmemi_fpga_device = all_devices[ishmemi_fpga_driver_idx][0]; - } + /* Discover command queue groups */ + ZE_CHECK( + zeDeviceGetCommandQueueGroupProperties(ishmemi_gpu_device, &cq_group_count, nullptr)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - if (ishmemi_gpu_driver_found) { - /* Get P2P properties between the local device and each GPU device */ - for (i = 0; i < driver_count; i++) { - device_count = 0; - ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, nullptr)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - if (device_count == 0) continue; - - for (j = 0; j < device_count; j++) { - ze_device_properties_t device_properties; - ZE_CHECK(zeDeviceGetProperties(all_devices[i][j], &device_properties)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - } + cq_group_prop = (ze_command_queue_group_properties_t *) ::malloc( + cq_group_count * sizeof(ze_command_queue_group_properties_t)); + ISHMEM_CHECK_GOTO_MSG(cq_group_prop == nullptr, fn_fail, + "Allocation of cq_group_prop failed\n"); + + for (i = 0; i < cq_group_count; ++i) { + cq_group_prop[i] = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES, + .pNext = nullptr, + .flags = 0, + .maxMemoryFillPatternSize = 0, + .numQueues = 0, + }; } - } - if (ishmemi_params.DEBUG) { - ze_device_properties_t 
device_properties; + ZE_CHECK(zeDeviceGetCommandQueueGroupProperties(ishmemi_gpu_device, &cq_group_count, + cq_group_prop)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - if (ishmemi_gpu_driver_found) { - ZE_CHECK(zeDeviceGetProperties(ishmemi_gpu_device, &device_properties)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - ishmemi_print_device_properties(device_properties); + /* Setup all command queues */ + for (i = 0; i < cq_group_count; ++i) { + ze_command_queue_desc_t desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .pNext = nullptr, + .ordinal = i, + .index = 0, + .flags = 0, + .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, + }; + + if (cq_group_prop[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { + ZE_CHECK(zeCommandQueueCreate(ishmemi_ze_context, ishmemi_gpu_device, &desc, + &compute_queue)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + + compute_ordinal = i; + } else if (cq_group_prop[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY && + cq_group_prop[i].numQueues == 1) { + ZE_CHECK(zeCommandQueueCreate(ishmemi_ze_context, ishmemi_gpu_device, &desc, + ©_queue)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + + copy_ordinal = i; + } else if (cq_group_prop[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY && + cq_group_prop[i].numQueues > 1) { + link_queues = (ze_command_queue_handle_t *) ::malloc( + cq_group_prop[i].numQueues * sizeof(ze_command_queue_handle_t)); + ISHMEM_CHECK_GOTO_MSG(link_queues == nullptr, fn_fail, + "Allocation of link_queues failed\n"); + + for (j = 0; j < cq_group_prop[i].numQueues; ++j) { + desc.index = j; + ZE_CHECK(zeCommandQueueCreate(ishmemi_ze_context, ishmemi_gpu_device, &desc, + &link_queues[j])); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + } + + link_ordinal = i; + } } - if (ishmemi_fpga_driver_found) { - ZE_CHECK(zeDeviceGetProperties(ishmemi_fpga_device, &device_properties)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + if (ishmemi_params.DEBUG) { 
ishmemi_print_device_properties(device_properties); } } - /* Create the ZE command queue */ - cmdq_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .pNext = nullptr, - .ordinal = 1, - .index = 0, - .flags = 0, - .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, - }; - ZE_CHECK(zeCommandQueueCreate(ishmemi_ze_context, ishmemi_gpu_device, &cmdq_desc, - &ishmemi_ze_cmd_queue)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - - /* create link queue for group command lists */ - cmdq_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .pNext = nullptr, - .ordinal = 2, - .index = 0, - .flags = 0, - .mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, - }; - ZE_CHECK(zeCommandQueueCreate(ishmemi_ze_context, ishmemi_gpu_device, &cmdq_desc, - &ishmemi_ze_all_cmd_queue)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - - for (uint32_t i = 0; i < NUM_LINK_QUEUE; i += 1) { - cmdq_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .pNext = nullptr, - .ordinal = 2, - .index = 2U + (i * 2), // 2 4 6 - .flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, - .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, - }; - ZE_CHECK(zeCommandQueueCreate(ishmemi_ze_context, ishmemi_gpu_device, &cmdq_desc, - &ishmemi_ze_link_cmd_queue[i])); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + /* Set the default interval for garbage collection for lists */ + compute_lists.reserve(ishmemi_params.NBI_COUNT); + copy_lists.reserve(ishmemi_params.NBI_COUNT); + link_lists = (ishmemi_thread_safe_vector *) ::malloc( + link_queue_count * sizeof(ishmemi_thread_safe_vector)); + for (i = 0; i < link_queue_count; ++i) { + link_lists[i].reserve(ishmemi_params.NBI_COUNT); } + /* Create the ZE event pool */ event_pool_desc = { .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .pNext = nullptr, .flags = 0, .count = 1, }; @@ -294,36 +300,37 @@ int ishmemi_accelerator_fini(void) { int ret = 0; - if 
(ishmemi_ze_cmd_queue) { - ishmemi_sync_cmd_queue(ishmemi_ze_cmd_queue, ishmemi_ze_cmd_lists); - ZE_CHECK(zeCommandQueueDestroy(ishmemi_ze_cmd_queue)); - ishmemi_ze_cmd_queue = nullptr; + if (compute_queue) { + sync_cq(compute_queue, compute_lists); + ZE_CHECK(zeCommandQueueDestroy(compute_queue)); + compute_queue = {}; } - if (ishmemi_ze_all_cmd_queue) { - ZE_CHECK(zeCommandQueueDestroy(ishmemi_ze_all_cmd_queue)); - ishmemi_ze_all_cmd_queue = nullptr; + if (copy_queue) { + sync_cq(copy_queue, copy_lists); + ZE_CHECK(zeCommandQueueDestroy(copy_queue)); + copy_queue = {}; } - for (int i = 0; i < NUM_LINK_QUEUE; i += 1) { - if (ishmemi_ze_link_cmd_queue[i]) { - ishmemi_sync_cmd_queue(ishmemi_ze_link_cmd_queue[i], ishmemi_ze_link_cmd_lists[i]); - ZE_CHECK(zeCommandQueueDestroy(ishmemi_ze_link_cmd_queue[i])); + for (uint32_t i = 0; i < link_queue_count; ++i) { + if (link_queues[i]) { + sync_cq(link_queues[i], link_lists[i]); + ZE_CHECK(zeCommandQueueDestroy(link_queues[i])); + link_queues[i] = {}; } - ishmemi_ze_link_cmd_queue[i] = nullptr; } + ISHMEMI_FREE(::free, link_queues); + link_queues = nullptr; - for (int i = 0; i < driver_count; i++) + for (size_t i = 0; i < driver_count; i++) ISHMEMI_FREE(::free, all_devices[i]); ISHMEMI_FREE(::free, all_devices); ISHMEMI_FREE(::free, all_drivers); ishmemi_accelerator_preinitialized = false; ishmemi_accelerator_initialized = false; - ishmemi_gpu_driver_found = false; - ishmemi_fpga_driver_found = false; - ishmemi_gpu_driver_idx = 0; - ishmemi_fpga_driver_idx = 0; + driver_found = false; + driver_idx = 0; driver_count = 0; if (ishmemi_ze_context) { @@ -337,6 +344,189 @@ int ishmemi_accelerator_fini(void) return ret; } +int ishmemi_create_command_list(ishmemi_queue_type_t queue_type, bool immediate, + ze_command_list_handle_t *list, ze_command_list_flags_t flags) +{ + int ret = 0; + uint32_t ordinal = 0; + uint32_t index = 0; + ze_command_list_desc_t list_desc = {}; + ze_command_queue_desc_t queue_desc = {}; + + 
ISHMEM_CHECK_GOTO_MSG(list == nullptr, fn_fail, + "Failed to create command list - nullptr provided\n"); + + switch (queue_type) { + case COMPUTE_QUEUE: + ordinal = compute_ordinal; + break; + case COPY_QUEUE: + ordinal = copy_ordinal; + break; + case LINK_QUEUE: + if (link_queue_count == 0) { + ordinal = copy_ordinal; + } else { + if (link_queue_count > 1) { + index = get_next_link_index(); + } + ordinal = link_ordinal; + } + break; + default: + ISHMEM_CHECK_GOTO_MSG( + true, fn_fail, "Failed to create command list - undefined queue type provided\n"); + break; + } + + if (immediate) { + /* Currently only use synchronous and normal priority - may need to extend later */ + queue_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .pNext = nullptr, + .ordinal = ordinal, + .index = index, + .flags = 0, + .mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, + }; + + ZE_CHECK(zeCommandListCreateImmediate(ishmemi_ze_context, ishmemi_gpu_device, &queue_desc, + list)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + } else { + list_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + .pNext = nullptr, + .commandQueueGroupOrdinal = ordinal, + .flags = flags, + }; + + ZE_CHECK(zeCommandListCreate(ishmemi_ze_context, ishmemi_gpu_device, &list_desc, list)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + } + +fn_exit: + return ret; +fn_fail: + ret = 1; + goto fn_exit; +} + +int ishmemi_create_command_list_nbi(ishmemi_queue_type_t queue_type, ze_command_list_handle_t *list, + ze_command_list_flags_t flags) +{ + int ret = 0; + uint32_t ordinal = 0; + uint32_t index = 0; + ze_command_list_desc_t list_desc = {}; + + ISHMEM_CHECK_GOTO_MSG(list == nullptr, fn_fail, + "Failed to create command list - nullptr provided\n"); + + switch (queue_type) { + case COMPUTE_QUEUE: + ordinal = compute_ordinal; + break; + case COPY_QUEUE: + ordinal = copy_ordinal; + break; + case LINK_QUEUE: + if (link_queue_count == 0) { + ordinal = copy_ordinal; + } else 
{ + if (link_queue_count > 1) { + index = get_next_link_index(); + } + ordinal = link_ordinal; + } + break; + default: + ISHMEM_CHECK_GOTO_MSG( + true, fn_fail, "Failed to create command list - undefined queue type provided\n"); + break; + } + + list_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + .pNext = nullptr, + .commandQueueGroupOrdinal = ordinal, + .flags = flags, + }; + + ZE_CHECK(zeCommandListCreate(ishmemi_ze_context, ishmemi_gpu_device, &list_desc, list)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + + switch (queue_type) { + case COMPUTE_QUEUE: + compute_lists.push_back_thread_safe(*list); + break; + case COPY_QUEUE: + copy_lists.push_back_thread_safe(*list); + break; + case LINK_QUEUE: + if (link_queue_count == 0) { + copy_lists.push_back_thread_safe(*list); + } else { + link_lists[index].push_back_thread_safe(*list); + } + break; + default: + ISHMEM_CHECK_GOTO_MSG(true, fn_fail, + "Failed to store command list - undefined queue type provided\n"); + break; + } + +fn_exit: + return ret; +fn_fail: + ret = 1; + goto fn_exit; +} + +int ishmemi_execute_command_lists(ishmemi_queue_type_t queue_type, uint32_t list_count, + ze_command_list_handle_t *lists, ze_fence_handle_t fence) +{ + int ret = 0; + uint32_t index = 0; + ze_command_queue_handle_t queue = {}; + + ISHMEM_CHECK_GOTO_MSG(lists == nullptr, fn_fail, + "Failed to execute command list - nullptr provided\n"); + + switch (queue_type) { + case COMPUTE_QUEUE: + queue = compute_queue; + break; + case COPY_QUEUE: + queue = copy_queue; + break; + case LINK_QUEUE: + if (link_queue_count == 0) { + queue = copy_queue; + } else { + if (link_queue_count > 1) { + index = get_next_link_index(); + } + queue = link_queues[index]; + } + break; + default: + ISHMEM_CHECK_GOTO_MSG( + true, fn_fail, "Failed to execute command list - undefined queue type provided\n"); + break; + } + + ZE_CHECK(zeCommandQueueExecuteCommandLists(queue, list_count, lists, fence)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + +fn_exit: 
+ return ret; +fn_fail: + ret = 1; + goto fn_exit; +} + int ishmemi_get_memory_type(const void *ptr, ze_memory_type_t *type) { int ret = 0; @@ -355,9 +545,10 @@ int ishmemi_get_memory_type(const void *ptr, ze_memory_type_t *type) void ishmemi_level_zero_sync() { - ishmemi_sync_cmd_queue(ishmemi_ze_cmd_queue, ishmemi_ze_cmd_lists); - for (int i = 0; i < NUM_LINK_QUEUE; i += 1) { - ishmemi_sync_cmd_queue(ishmemi_ze_link_cmd_queue[i], ishmemi_ze_link_cmd_lists[i]); + sync_cq(compute_queue, compute_lists); + sync_cq(copy_queue, copy_lists); + for (uint32_t i = 0; i < link_queue_count; ++i) { + sync_cq(link_queues[i], link_lists[i]); } } diff --git a/src/accelerator.h b/src/accelerator.h index bb1240f..9512e9e 100644 --- a/src/accelerator.h +++ b/src/accelerator.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -19,50 +19,27 @@ #define ISHMEMI_NO_DRIVERS -2 #define ISHMEMI_NO_DEVICE_ACCESS -3 -extern ze_driver_handle_t *all_drivers; -extern ze_device_handle_t **all_devices; - extern ze_driver_handle_t ishmemi_gpu_driver; -extern ze_driver_handle_t ishmemi_fpga_driver; extern ze_device_handle_t ishmemi_gpu_device; -extern ze_device_handle_t ishmemi_fpga_device; extern ze_context_handle_t ishmemi_ze_context; extern ze_context_desc_t ishmemi_ze_context_desc; -/* ishmemi_ze_cmd_queue is the main copy engine */ -extern ze_command_queue_handle_t ishmemi_ze_cmd_queue; -extern ze_command_queue_handle_t ishmemi_ze_all_cmd_queue; - -/* ishmemi_ze_link_cmd_queue are the bandwidth link copy engines */ -#ifdef ENABLE_REDUCED_LINK_ENGINES -constexpr int NUM_LINK_QUEUE = 2; -#else -constexpr int NUM_LINK_QUEUE = 3; -#endif -extern unsigned int ishmemi_link_engine_index; -extern ze_command_queue_handle_t ishmemi_ze_link_cmd_queue[NUM_LINK_QUEUE]; -extern unsigned int ishmemi_link_engine[NUM_LINK_QUEUE]; - -extern ishmemi_thread_safe_vector ishmemi_ze_cmd_lists; -extern 
ishmemi_thread_safe_vector - ishmemi_ze_link_cmd_lists[NUM_LINK_QUEUE]; + extern ze_event_pool_handle_t ishmemi_ze_event_pool; -extern uint32_t ishmemi_gpu_driver_idx; -static inline unsigned int ishmemi_next_link_engine_index() -{ - unsigned int next = ishmemi_link_engine_index + 1; - if (next >= NUM_LINK_QUEUE) next = 0; - ishmemi_link_engine_index = next; - return (next); -} +typedef enum : uint32_t { + COMPUTE_QUEUE = 0, + COPY_QUEUE, + LINK_QUEUE, + UNDEFINED_QUEUE, +} ishmemi_queue_type_t; static inline void ishmemi_print_device_properties(const ze_device_properties_t &props) { std::stringstream stream; stream << "PE : " << ishmemi_my_pe << " Device info: " << std::endl << " name : " << props.name << std::endl - << " type : " << ((props.type == ZE_DEVICE_TYPE_GPU) ? "GPU" : "FPGA") << std::endl + << " type : " << ((props.type == ZE_DEVICE_TYPE_GPU) ? "GPU" : "Unknown") << std::endl << " vendorId : " << props.vendorId << std::endl << " deviceId : " << props.deviceId << std::endl << " subdeviceId : " << props.subdeviceId << std::endl @@ -99,18 +76,25 @@ int ishmemi_usm_alloc_host(void **, size_t); int ishmemi_usm_alloc_device(void **, size_t); int ishmemi_usm_free(void *); +/* List/queue helper functions */ +int ishmemi_create_command_list(ishmemi_queue_type_t, bool, ze_command_list_handle_t *, + ze_command_list_flags_t flags = 0); +int ishmemi_create_command_list_nbi(ishmemi_queue_type_t, ze_command_list_handle_t *, + ze_command_list_flags_t flags = 0); +int ishmemi_execute_command_lists(ishmemi_queue_type_t, uint32_t, ze_command_list_handle_t *, + ze_fence_handle_t fence = nullptr); + template -T *ishmemi_get_mmap_address(T *device_ptr, size_t size) +T *ishmemi_get_mmap_address(T *device_ptr, size_t size, ze_ipc_mem_handle_t *ze_ipc_handle) { int ret = 0; int fd; int flags = MAP_SHARED; void *base; - ze_ipc_mem_handle_t ze_ipc_handle; - ZE_CHECK(zeMemGetIpcHandle(ishmemi_ze_context, device_ptr, &ze_ipc_handle)); + 
ZE_CHECK(zeMemGetIpcHandle(ishmemi_ze_context, device_ptr, ze_ipc_handle)); ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - memcpy(&fd, &ze_ipc_handle, sizeof(fd)); + memcpy(&fd, ze_ipc_handle, sizeof(fd)); base = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, fd, 0); if (base == (void *) MAP_FAILED) { ISHMEM_CHECK_GOTO_MSG(1, fn_fail, "mmap failed with description: %s\n", strerror(errno)); @@ -122,4 +106,22 @@ T *ishmemi_get_mmap_address(T *device_ptr, size_t size) return (T *) nullptr; } +template +int ishmemi_close_mmap_address(ze_ipc_mem_handle_t ze_ipc_handle, T *host_ptr, size_t size) +{ + int ret; + ret = munmap(host_ptr, size); + ISHMEM_CHECK_GOTO_MSG(ret != 0, fn_fail, "munmap failed with description: %s\n", + strerror(errno)); + + ZE_CHECK(zeMemPutIpcHandle(ishmemi_ze_context, ze_ipc_handle)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + +fn_exit: + return ret; +fn_fail: + ret = -1; + goto fn_exit; +} + #endif /* ISHMEM_ACCELERATOR_H */ diff --git a/src/collectives.h b/src/collectives.h index 8c716df..2314d24 100644 --- a/src/collectives.h +++ b/src/collectives.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -10,10 +10,4 @@ #define ISHMEM_REDUCE_BUFFER_SIZE (1L << 16) #define ISHMEM_SYNC_NUM_PSYNC_ARRS 4 -extern size_t *ishmemi_collect_sizes; -extern size_t *ishmemi_my_collect_size; - -int ishmemi_collectives_init(); -int ishmemi_collectives_fini(); - #endif // ifndef ISHMEM_COLLECTIVES_H diff --git a/src/collectives/alltoall_impl.h b/src/collectives/alltoall_impl.h index e13b14c..c67e2d5 100644 --- a/src/collectives/alltoall_impl.h +++ b/src/collectives/alltoall_impl.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -6,6 +6,7 @@ #define COLLECTIVES_ALLTOALL_IMPL_H #include "collectives.h" +#include "sync_impl.h" #include "runtime.h" #include "on_queue.h" @@ -43,8 +44,9 @@ 
int ishmem_alltoall(ishmem_team_t team, T *dest, const T *src, size_t nelems) /* compute our address of our section of dest in each PE */ int idx = 0; for (int pe = team_ptr->start; idx < team_ptr->size; pe += team_ptr->stride, idx++) { + uint8_t local_index = ISHMEMI_LOCAL_PES[pe]; dptr[idx] = ISHMEMI_ADJUST_PTR( - T, (pe + 1), &dest[nelems * static_cast(team_ptr->my_pe)]); + T, local_index, &dest[nelems * static_cast(team_ptr->my_pe)]); sptr[idx] = &src[nelems * static_cast(idx)]; } /* The idea for the inner loop being over local team PEs is that the outstanding stores @@ -56,7 +58,7 @@ int ishmem_alltoall(ishmem_team_t team, T *dest, const T *src, size_t nelems) dptr[pe][offset] = sptr[pe][offset]; } } - ishmem_team_sync(team); /* assure destination buffers complete */ + ishmemi_team_sync(team); /* assure destination buffers complete */ return ret; } } @@ -86,7 +88,7 @@ int ishmem_alltoall(ishmem_team_t team, T *dest, const T *src, size_t nelems) int ret = ishmemi_ipc_put_v(team_ptr->size, items); ISHMEM_CHECK_GOTO_MSG(ret, fn_fail, "ishmemi_ipc_put_v within team alltoall failed\n"); fn_fail: - ishmem_team_sync(team); /* assure destination buffers complete */ + ishmemi_team_sync(team); /* assure destination buffers complete */ return ret; } ishmemi_ringcompletion_t comp; @@ -120,7 +122,7 @@ sycl::event ishmemx_alltoall_on_queue(ishmem_team_t team, T *dest, const T *src, if (ret) *ret = tmp_ret; }); } else { - cgh.host_task([=]() { + cgh.single_task([=]() { int tmp_ret = ishmem_alltoall(team, dest, src, nelems); if (ret) *ret = tmp_ret; }); @@ -170,8 +172,9 @@ int ishmemx_alltoall_work_group(ishmem_team_t team, T *dest, const T *src, size_ T *dptr[MAX_LOCAL_PES]; /* destination pointer for each pe*/ int idx = 0; for (int pe = team_ptr->start; idx < team_ptr->size; pe += team_ptr->stride, idx++) { + uint8_t local_index = ISHMEMI_LOCAL_PES[pe]; dptr[idx] = ISHMEMI_ADJUST_PTR( - T, (pe + 1), &dest[nelems * static_cast(team_ptr->my_pe)]); + T, local_index, 
&dest[nelems * static_cast(team_ptr->my_pe)]); sptr[idx] = &src[nelems * static_cast(idx)]; } for (size_t offset = work_item_start_idx; diff --git a/src/collectives/barrier.cpp b/src/collectives/barrier.cpp index b8dd16c..e236454 100644 --- a/src/collectives/barrier.cpp +++ b/src/collectives/barrier.cpp @@ -1,10 +1,11 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ #include "ishmem/err.h" #include "proxy_impl.h" #include "collectives.h" +#include "sync_impl.h" #include "runtime.h" #include "on_queue.h" @@ -18,7 +19,7 @@ void ishmem_barrier_all() if (info->only_intra_node) req.op = QUIET; ishmemi_proxy_blocking_request(req); if (info->only_intra_node) { - ishmem_team_sync(ISHMEM_TEAM_WORLD); + ishmemi_team_sync(ISHMEM_TEAM_WORLD); } #else ishmemi_drain_ring(); @@ -34,7 +35,7 @@ sycl::event ishmemx_barrier_all_on_queue(sycl::queue &q, const std::vectorsecond->event, deps); - cgh.host_task([=]() { ishmem_barrier_all(); }); + cgh.single_task([=]() { ishmem_barrier_all(); }); }); ishmemi_on_queue_events_map[&q]->event = e; return e; diff --git a/src/collectives/broadcast_impl.h b/src/collectives/broadcast_impl.h index bed7efe..a8275d5 100644 --- a/src/collectives/broadcast_impl.h +++ b/src/collectives/broadcast_impl.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -6,6 +6,7 @@ #define COLLECTIVES_BROADCAST_IMPL_H #include "collectives.h" +#include "sync_impl.h" #include "runtime.h" #include "rma_impl.h" #include "on_queue.h" @@ -41,7 +42,8 @@ int ishmem_broadcast(ishmem_team_t team, T *dest, const T *src, size_t nelems, i int idx = 0; for (int pe = team_ptr->start; idx < team_ptr->size; pe += team_ptr->stride, idx++) { - ptr[idx] = ISHMEMI_ADJUST_PTR(T, (pe + 1), dest); + uint8_t local_index = ISHMEMI_LOCAL_PES[pe]; + ptr[idx] = ISHMEMI_ADJUST_PTR(T, local_index, dest); } /* The idea for the 
inner loop being over local PEs is that the outstanding stores * will use different links. @@ -54,13 +56,13 @@ int ishmem_broadcast(ishmem_team_t team, T *dest, const T *src, size_t nelems, i } } } - ishmem_team_sync(team); /* assure destination buffers complete */ + ishmemi_team_sync(team); /* assure destination buffers complete */ return ret; #else // BROADCAST_PULL - ishmem_team_sync(team); /* make sure that PE_root's source buffer is ready for use */ + ishmemi_team_sync(team); /* make sure that PE_root's source buffer is ready for use */ ishmem_internal_get(dest, src, nelems, ishmem_team_translate_pe(team, PE_root, ISHMEM_TEAM_WORLD)); - ishmem_team_sync(team); /* sync after to let PE_root know we are done */ + ishmemi_team_sync(team); /* sync after to let PE_root know we are done */ return ret; #endif } @@ -93,18 +95,18 @@ int ishmem_broadcast(ishmem_team_t team, T *dest, const T *src, size_t nelems, i ISHMEM_CHECK_GOTO_MSG(ret, fn_fail, "ishmemi_ipc_put_v within team broadcast failed\n"); } fn_fail: - ishmem_team_sync(team); /* assure all destination buffers complete */ + ishmemi_team_sync(team); /* assure all destination buffers complete */ return ret; } #else if (team_ptr->only_intra && ISHMEMI_HOST_IN_HEAP(src)) { - ishmem_team_sync(team); /* assure PE_root source buffer is ready for use */ + ishmemi_team_sync(team); /* assure PE_root source buffer is ready for use */ ret = ishmem_team_translate_pe(team, PE_root, ISHMEM_TEAM_WORLD); ISHMEM_CHECK_GOTO_MSG((ret < 0), fn_fail, "ishmem_team_translate_pe within team broadcast failed\n"); ret = ishmemi_ipc_get(dest, src, nelems, ret); fn_fail: - ishmem_team_sync(team); /* assure PE_root can reuse source buffer */ + ishmemi_team_sync(team); /* assure PE_root can reuse source buffer */ return ret; } #endif @@ -139,7 +141,7 @@ sycl::event ishmemx_broadcast_on_queue(ishmem_team_t team, T *dest, const T *src if (ret) *ret = tmp_ret; }); } else { - cgh.host_task([=]() { + cgh.single_task([=]() { int tmp_ret = 
ishmem_broadcast(team, dest, src, nelems, PE_root); if (ret) *ret = tmp_ret; }); @@ -186,7 +188,8 @@ int ishmemx_broadcast_work_group(ishmem_team_t team, T *dest, const T *src, size int idx = 0; for (int pe = team_ptr->start; idx < team_ptr->size; pe += team_ptr->stride, idx++) { - ptr[idx] = ISHMEMI_ADJUST_PTR(T, (pe + 1), dest); + uint8_t local_index = ISHMEMI_LOCAL_PES[pe]; + ptr[idx] = ISHMEMI_ADJUST_PTR(T, local_index, dest); } for (size_t offset = work_item_start_idx; offset < work_item_start_idx + my_nelems_work_item; offset += 1) { diff --git a/src/collectives/collect_impl.h b/src/collectives/collect_impl.h index ebe0946..8d0565e 100644 --- a/src/collectives/collect_impl.h +++ b/src/collectives/collect_impl.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -6,6 +6,7 @@ #define COLLECTIVES_COLLECT_IMPL_H #include "collectives.h" +#include "sync_impl.h" #include "runtime.h" #include "runtime_ipc.h" #include "on_queue.h" @@ -37,7 +38,7 @@ int ishmem_collect(ishmem_team_t team, T *dest, const T *src, size_t nelems) ishmemi_info_t *info = global_info; // duplicate load from above if (team_ptr->only_intra) { team_ptr->collect_mynelems = nelems; // save our nelems into symmetric space - ishmem_team_sync( + ishmemi_team_sync( team); // fcollect requires input buffer be ready everywhere when fcollect starts ishmem_fcollect(team, team_ptr->collect_nelems, &team_ptr->collect_mynelems, 1); size_t total_nelems = 0; @@ -59,7 +60,8 @@ int ishmem_collect(ishmem_team_t team, T *dest, const T *src, size_t nelems) /* compute our address of our section of dest in each PE */ for (int teampe = 0, globalpe = team_ptr->start; teampe < team_ptr->size; globalpe += team_ptr->stride, teampe += 1) { - ptr[teampe] = ISHMEMI_ADJUST_PTR(T, (globalpe + 1), &dest[base_nelems]); + uint8_t local_index = ISHMEMI_LOCAL_PES[globalpe]; + ptr[teampe] = ISHMEMI_ADJUST_PTR(T, local_index, 
&dest[base_nelems]); } /* The idea for the inner loop being over local PEs is that the outstanding stores * will use different links */ @@ -70,7 +72,7 @@ int ishmem_collect(ishmem_team_t team, T *dest, const T *src, size_t nelems) } } /* assure all destination buffers complete */ - ishmem_team_sync(team); + ishmemi_team_sync(team); return ret; } } @@ -92,7 +94,7 @@ int ishmem_collect(ishmem_team_t team, T *dest, const T *src, size_t nelems) struct put_item items[MAX_LOCAL_PES]; size_t base_nelems = 0; // nelem index of where our data goes team_ptr->collect_mynelems = nelems; // save our nelems into symmetric space - ishmem_team_sync( + ishmemi_team_sync( team); // fcollect requires input buffer be ready everywhere when fcollect starts ishmem_fcollect(team, team_ptr->collect_nelems, &team_ptr->collect_mynelems, 1); ISHMEM_CHECK_GOTO_MSG(ret, fn_fail, "shmem_size_fcollect on team collect failed\n"); @@ -108,7 +110,7 @@ int ishmem_collect(ishmem_team_t team, T *dest, const T *src, size_t nelems) ret = ishmemi_ipc_put_v(team_ptr->size, items); ISHMEM_CHECK_GOTO_MSG(ret, fn_fail, "ishmemi_ipc_put_v within team collect failed\n"); fn_fail: - ishmem_team_sync(team); /* assure all destination buffers complete */ + ishmemi_team_sync(team); /* assure all destination buffers complete */ return ret; } ishmemi_ringcompletion_t comp; @@ -172,8 +174,8 @@ int ishmemx_collect_work_group(ishmem_team_t team, T *dest, const T *src, size_t if (team_ptr->only_intra) { if (grp.leader()) { team_ptr->collect_mynelems = nelems; // save our nelems into symmetric space - ishmem_team_sync(team); // fcollect requires input buffer be ready everywhere when - // fcollect starts + ishmemi_team_sync(team); // fcollect requires input buffer be ready everywhere when + // fcollect starts ishmem_fcollect(team, team_ptr->collect_nelems, &team_ptr->collect_mynelems, 1); if constexpr (enable_error_checking) { /* this copy of total_nelems is only available to the leader thread */ @@ -202,7 +204,8 @@ int 
ishmemx_collect_work_group(ishmem_team_t team, T *dest, const T *src, size_t T *ptr[MAX_LOCAL_PES]; for (int teampe = 0, globalpe = team_ptr->start; teampe < team_ptr->size; globalpe += team_ptr->stride, teampe += 1) { - ptr[teampe] = ISHMEMI_ADJUST_PTR(T, (globalpe + 1), &dest[base_nelems]); + uint8_t local_index = ISHMEMI_LOCAL_PES[globalpe]; + ptr[teampe] = ISHMEMI_ADJUST_PTR(T, local_index, &dest[base_nelems]); } for (size_t offset = work_item_start_idx; offset < work_item_start_idx + my_nelems_work_item; offset += 1) { @@ -372,7 +375,8 @@ int ishmem_fcollect(ishmem_team_t team, T *dest, const T *src, size_t nelems) /* compute our address of our section of dest in each PE */ for (int teampe = 0, globalpe = team_ptr->start; teampe < team_ptr->size; globalpe += team_ptr->stride, teampe++) { - ptr[teampe] = ISHMEMI_ADJUST_PTR(T, (globalpe + 1), (&dest[base_nelems])); + uint8_t local_index = ISHMEMI_LOCAL_PES[globalpe]; + ptr[teampe] = ISHMEMI_ADJUST_PTR(T, local_index, (&dest[base_nelems])); } /* The idea for the inner loop being over local PEs is that the outstanding stores will * use different links */ @@ -382,7 +386,7 @@ int ishmem_fcollect(ishmem_team_t team, T *dest, const T *src, size_t nelems) ptr[teampe][offset] = data; } } - ishmem_team_sync(team); /* assure all destination buffers complete */ + ishmemi_team_sync(team); /* assure all destination buffers complete */ return ret; } } @@ -411,7 +415,7 @@ int ishmem_fcollect(ishmem_team_t team, T *dest, const T *src, size_t nelems) int ret = ishmemi_ipc_put_v(team_ptr->size, items); ISHMEM_CHECK_GOTO_MSG(ret, fn_fail, "ishmemi_ipc_put_v within team fcollect failed\n"); fn_fail: - ishmem_team_sync(team); /* assure all destination buffers complete */ + ishmemi_team_sync(team); /* assure all destination buffers complete */ return ret; } ishmemi_ringcompletion_t comp; @@ -445,7 +449,7 @@ sycl::event ishmemx_fcollect_on_queue(ishmem_team_t team, T *dest, const T *src, if (ret) *ret = tmp_ret; }); } else { - 
cgh.host_task([=]() { + cgh.single_task([=]() { int tmp_ret = ishmem_fcollect(team, dest, src, nelems); if (ret) *ret = tmp_ret; }); @@ -494,7 +498,8 @@ int ishmemx_fcollect_work_group(ishmem_team_t team, T *dest, const T *src, size_ T *ptr[MAX_LOCAL_PES]; for (int teampe = 0, globalpe = team_ptr->start; teampe < team_ptr->size; globalpe += team_ptr->stride, teampe++) { - ptr[teampe] = ISHMEMI_ADJUST_PTR(T, (globalpe + 1), &dest[base]); + uint8_t local_index = ISHMEMI_LOCAL_PES[globalpe]; + ptr[teampe] = ISHMEMI_ADJUST_PTR(T, local_index, &dest[base]); } for (size_t offset = work_item_start_idx; offset < work_item_start_idx + my_nelems_work_item; offset += 1) { diff --git a/src/collectives/reduce_impl.h b/src/collectives/reduce_impl.h index 456d99e..7169ac1 100644 --- a/src/collectives/reduce_impl.h +++ b/src/collectives/reduce_impl.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -8,6 +8,7 @@ #include "ishmem/err.h" #include "ishmem/copy.h" #include "collectives.h" +#include "sync_impl.h" #include #include "memory.h" #include "runtime.h" @@ -142,8 +143,8 @@ template inline void vector_reduce_work_group(T *d, const T *s, size_t count, const Group &grp) { size_t stride = grp.get_local_linear_range(); - long linear_id = static_cast(grp.get_local_linear_id()); - long idx = linear_id; + size_t linear_id = grp.get_local_linear_id(); + size_t idx = linear_id; T *aligned_s = (T *) sycl::min(((((uintptr_t) s) + ISHMEMI_ALIGNMASK) & (~ISHMEMI_ALIGNMASK)), (uintptr_t) (s + count)); while (((uintptr_t) &s[idx]) < ((uintptr_t) (aligned_s))) { @@ -168,15 +169,15 @@ inline void vector_reduce_work_group(T *d, const T *s, size_t count, const Group sycl::vec vs; sycl::vec vd; while ((idx + ishmemi_vec_length) <= count) { - vs.load(0, ds + idx); - vd.load(0, dd + idx); + vs.load(0, ds + static_cast(idx)); + vd.load(0, dd + static_cast(idx)); reduce_op(vd, vs); - vd.store(0, dd + idx); + 
vd.store(0, dd + static_cast(idx)); idx += vstride; } - idx = linear_id + (static_cast(count) & (~(static_cast(ishmemi_vec_length) - 1))); + idx = linear_id + static_cast((static_cast(count) & (~(ishmemi_vec_length - 1)))); while (idx < count) { - reduce_op(dd[idx], ds[idx]); + reduce_op(dd[static_cast(idx)], ds[static_cast(idx)]); idx += stride; } } @@ -192,9 +193,9 @@ int ishmemi_generic_op_reduce(ishmem_team_t team, T *dest, const T *src, size_t while (nreduce > 0) { size_t this_reduce = nreduce; if (this_reduce > max_reduce) this_reduce = max_reduce; - void *cdst = ishmem_copy((void *) team_ptr->source, (void *) src, this_reduce * sizeof(T)); + void *cdst = ishmemi_copy((void *) team_ptr->source, (void *) src, this_reduce * sizeof(T)); if ((uintptr_t) cdst != (uintptr_t) team_ptr->source) { - ISHMEM_DEBUG_MSG("ishmem_copy in failed\n"); + ISHMEM_DEBUG_MSG("ishmemi_copy in failed\n"); return (1); } @@ -214,9 +215,9 @@ int ishmemi_generic_op_reduce(ishmem_team_t team, T *dest, const T *src, size_t ISHMEM_DEBUG_MSG("runtime reduction failed\n"); return ret; } - cdst = ishmem_copy((void *) dest, (void *) team_ptr->dest, this_reduce * sizeof(T)); + cdst = ishmemi_copy((void *) dest, (void *) team_ptr->dest, this_reduce * sizeof(T)); if ((uintptr_t) cdst != (uintptr_t) dest) { - ISHMEM_DEBUG_MSG("ishmem_copy out failed\n"); + ISHMEM_DEBUG_MSG("ishmemi_copy out failed\n"); return 1; } dest += this_reduce; @@ -240,16 +241,17 @@ inline int ishmemi_sub_reduce(ishmem_team_t team, T *dest, const T *source, size #endif int my_world_pe = ishmem_team_translate_pe(team, team_ptr->my_pe, ISHMEM_TEAM_WORLD); - ishmem_team_sync(team); /* assure all source buffers are ready for use */ + ishmemi_team_sync(team); /* assure all source buffers are ready for use */ int idx = 0; for (int pe = team_ptr->start; idx < team_ptr->size; pe += team_ptr->stride, idx++) { if (pe == my_world_pe) continue; - T *remote = ISHMEMI_FAST_ADJUST(T, info, info->local_pes[pe], source); + uint8_t 
local_index = ISHMEMI_LOCAL_PES[pe]; + T *remote = ISHMEMI_FAST_ADJUST(T, info, local_index, source); vector_reduce_helper(vector_reduce, dest, remote, nreduce); } - ishmem_team_sync(team); + ishmemi_team_sync(team); return ret; } @@ -349,7 +351,8 @@ inline int ishmemi_sub_reduce_work_group(ishmem_team_t team, T *dest, const T *s int my_world_pe = ishmem_team_translate_pe(team, team_ptr->my_pe, ISHMEM_TEAM_WORLD); for (int pe = team_ptr->start; idx < team_ptr->size; pe += team_ptr->stride, idx++) { if (pe == my_world_pe) continue; - T *remote = ISHMEMI_FAST_ADJUST(T, info, info->local_pes[pe], source); + uint8_t local_index = ISHMEMI_LOCAL_PES[pe]; + T *remote = ISHMEMI_FAST_ADJUST(T, info, local_index, source); vector_reduce_helper(vector_reduce_work_group, dest, remote, nreduce, grp); } @@ -460,7 +463,7 @@ sycl::event ishmemi_reduce_on_queue(ishmem_team_t team, T *dest, const T *src, s if (ret) *ret = tmp_ret; }); } else { - cgh.host_task([=]() { + cgh.single_task([=]() { int tmp_ret = ishmemi_reduce(team, dest, src, nreduce); if (ret) *ret = tmp_ret; }); diff --git a/src/collectives/sync.cpp b/src/collectives/sync.cpp index 2920b64..1a0b77b 100644 --- a/src/collectives/sync.cpp +++ b/src/collectives/sync.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -7,64 +7,11 @@ #include "ishmem/types.h" #include "proxy_impl.h" #include "collectives.h" +#include "sync_impl.h" #include "runtime.h" #include "teams.h" #include "on_queue.h" -static inline void sync_team_fallback(ishmem_team_t team) -{ - ishmemi_request_t req; - ishmemi_ringcompletion_t comp __attribute__((unused)); - req.op = TEAM_SYNC; - req.type = NONE; - req.team = team; - -#ifdef __SYCL_DEVICE_ONLY__ - ishmemi_proxy_blocking_request(req); - atomic_fence(sycl::memory_order::seq_cst, sycl::memory_scope::system); -#else - ishmemi_runtime->proxy_funcs[req.op][req.type](&req, &comp); -#endif -} - -inline void 
ishmemi_team_sync(ishmem_team_t team) -{ - /* Node-local, on-device implementation */ - if constexpr (ishmemi_is_device) { - ishmemi_info_t *info = global_info; - ishmemi_team_device_t *team_ptr = &info->team_device_pool[team]; - if (team_ptr->only_intra) { - int index = team_ptr->psync_idx; - long *my_psync = &team_ptr->psync[index]; - int last_i = team_ptr->last_pe + 1; - int stride = team_ptr->stride; - for (int i = team_ptr->start + 1; i <= last_i; i += stride) { - long *psync = ISHMEMI_FAST_ADJUST(long, info, i, my_psync); - /* These atomics can be relaxed because we don't care about their ordering */ - sycl::atomic_ref - atomic_psync(*psync); - atomic_psync += 1L; /* atomic increment info->ipc_buffers[pOffset] */ - } - team_ptr->psync_idx = (index + 1) & (ISHMEM_SYNC_NUM_PSYNC_ARRS - 1); - /* This atomic has to be seq_cst because we definitely want it to happen in order */ - sycl::atomic_ref - atomic_psync(*my_psync); - long expected; - int size = team_ptr->size; - do { - expected = size; - } while (!atomic_psync.compare_exchange_strong( - expected, 0L, sycl::memory_order::seq_cst, sycl::memory_order::seq_cst)); - return; - } - } - - /* Otherwise */ - sync_team_fallback(team); -} - void ishmem_sync_all() { ishmemi_team_sync(ISHMEM_TEAM_WORLD); @@ -96,7 +43,9 @@ ISHMEM_DEVICE_ATTRIBUTES int ishmem_team_sync(ishmem_team_t team) if constexpr (enable_error_checking) { if (team <= ISHMEM_TEAM_INVALID || team >= ISHMEMI_N_TEAMS) return -1; } + ishmemi_team_sync(team); + return 0; } @@ -112,13 +61,13 @@ sycl::event ishmemx_team_sync_on_queue(ishmem_team_t team, int *ret, sycl::queue set_cmd_grp_dependencies(cgh, entry_already_exists, iter->second->event, deps); if (myteam->only_intra) { cgh.single_task([=]() { - int tmp_ret = ishmem_team_sync(team); - if (ret) *ret = tmp_ret; + ishmemi_team_sync(team); + if (ret) *ret = 0; }); } else { - cgh.host_task([=]() { - int tmp_ret = ishmem_team_sync(team); - if (ret) *ret = tmp_ret; + cgh.single_task([=]() { + 
ishmemi_team_sync(team); + if (ret) *ret = 0; }); } }); diff --git a/src/env_utils.cpp b/src/env_utils.cpp index 72b3268..35b6e71 100644 --- a/src/env_utils.cpp +++ b/src/env_utils.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2025 Intel Corporation +/* Copyright (C) 2024 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause * * Portions derived from Sandia OpenSHMEM (https://github.com/Sandia-OpenSHMEM/SOS) diff --git a/src/ipc.cpp b/src/ipc.cpp index cda5911..78b3028 100644 --- a/src/ipc.cpp +++ b/src/ipc.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -172,7 +172,7 @@ int ishmemi_ipc_init() ipc_data.ipc_fd[0] = ipc_data.ipc_fd[1] = -1; - for (int i = 0; i < nfds; ++i) { + for (size_t i = 0; i < nfds; ++i) { memcpy(&ipc_data.ipc_fd[i], &ipc_handle[i], sizeof(int)); memcpy(&ipc_data.ipc_handle[i], &ipc_handle[i], sizeof(ze_ipc_mem_handle_t)); } @@ -306,7 +306,7 @@ static int ipc_init_pidfd() /* Gather the info from other PEs */ ishmemi_runtime->node_fcollect(heap_data, local_heap_data, sizeof(ipc_data_t)); - ishmem_copy(local_data, heap_data, static_cast(local_size) * sizeof(ipc_data_t)); + ishmemi_copy(local_data, heap_data, static_cast(local_size) * sizeof(ipc_data_t)); /* Validate that every pid can be opened locally */ for (int i = 0; i < local_size; ++i) { @@ -428,7 +428,7 @@ static int ipc_init_sockets() /* Gather pids from other PEs */ ishmemi_runtime->node_fcollect(heap_pids, local_heap_pid, sizeof(pid_t)); - ishmem_copy(local_pids, heap_pids, static_cast(local_size) * sizeof(pid_t)); + ishmemi_copy(local_pids, heap_pids, static_cast(local_size) * sizeof(pid_t)); for (int i = 0; i < local_size; ++i) { ISHMEM_DEBUG_MSG("heap_pids[%d] = %d (%d)\n", i, heap_pids[i], local_pids[i]); diff --git a/src/ishmem.cpp b/src/ishmem.cpp index 18e1b84..44b6571 100644 --- a/src/ishmem.cpp +++ b/src/ishmem.cpp @@ -229,7 +229,6 @@ static void ishmemi_init(ishmemx_attr_t *attr, 
bool user_attr) int memory_initialized = 0; int ipc_initialized = 0; int teams_initialized = 0; - int collectives_initialized = 0; ishmemx_runtime_type_t env_runtime; ishmemi_init_op_str(); @@ -347,10 +346,6 @@ static void ishmemi_init(ishmemx_attr_t *attr, bool user_attr) ISHMEM_CHECK_GOTO_MSG(ret, cleanup, "Teams initialization failed '%d'\n", ret); teams_initialized = 1; - ret = ishmemi_collectives_init(); - ISHMEM_CHECK_GOTO_MSG(ret, cleanup, "Collectives initialization failed '%d'\n", ret); - collectives_initialized = 1; - /* proxy_init will initialize ring data structures */ ret = ishmemi_proxy_init(); ISHMEM_CHECK_GOTO_MSG(ret, cleanup, "Proxy initialization failed '%d'\n", ret); @@ -365,10 +360,6 @@ static void ishmemi_init(ishmemx_attr_t *attr, bool user_attr) return; cleanup: - if (collectives_initialized) { - ishmemi_collectives_fini(); - } - if (teams_initialized) { ishmemi_team_fini(); } @@ -444,9 +435,6 @@ void ishmem_finalize() ret = ishmemi_team_fini(); ISHMEM_CHECK_GOTO_MSG(ret, fail, "Teams finalize failed '%d'\n", ret); - ret = ishmemi_collectives_fini(); - ISHMEM_CHECK_GOTO_MSG(ret, fail, "Collectives finalize failed '%d'\n", ret); - if (ishmemi_cpu_info->use_ipc) { ret = ishmemi_ipc_fini(); ISHMEM_CHECK_GOTO_MSG(ret, fail, "IPC finalize failed '%d'\n", ret); diff --git a/src/ishmem.h b/src/ishmem.h index 7ad31fc..48ba241 100644 --- a/src/ishmem.h +++ b/src/ishmem.h @@ -10,11 +10,13 @@ #else #include #endif +#include #define ISHMEM_DEVICE_ATTRIBUTES SYCL_EXTERNAL #define ISHMEM_MAJOR_VERSION 1 -#define ISHMEM_MINOR_VERSION 4 +#define ISHMEM_MINOR_VERSION 5 +#define ISHMEM_PATCH_VERSION 0 #define ISHMEM_MAX_NAME_LEN 256 #define ISHMEM_VENDOR_STRING "Intel® SHMEM" @@ -1234,6 +1236,108 @@ ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint64_prod_reduce(ishmem_team_t, uint64_t * ISHMEM_DEVICE_ATTRIBUTES int ishmem_size_prod_reduce(ishmem_team_t, size_t *, const size_t *, size_t); ISHMEM_DEVICE_ATTRIBUTES int ishmem_ptrdiff_prod_reduce(ishmem_team_t, ptrdiff_t 
*, const ptrdiff_t *, size_t); +/* scan (prefix sum) */ +template ISHMEM_DEVICE_ATTRIBUTES int ishmem_sum_inscan(T *, const T *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_float_sum_inscan(float *, const float *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_double_sum_inscan(double *, const double *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_char_sum_inscan(char *, const char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_schar_sum_inscan(signed char *, const signed char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_short_sum_inscan(short *, const short *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int_sum_inscan(int *, const int *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_long_sum_inscan(long *, const long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_longlong_sum_inscan(long long *, const long long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uchar_sum_inscan(unsigned char *, const unsigned char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ushort_sum_inscan(unsigned short *, const unsigned short *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint_sum_inscan(unsigned int *, const unsigned int *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ulong_sum_inscan(unsigned long *, const unsigned long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ulonglong_sum_inscan(unsigned long long *, const unsigned long long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int8_sum_inscan(int8_t *, const int8_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int16_sum_inscan(int16_t *, const int16_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int32_sum_inscan(int32_t *, const int32_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int64_sum_inscan(int64_t *, const int64_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint8_sum_inscan(uint8_t *, const uint8_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint16_sum_inscan(uint16_t *, const uint16_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint32_sum_inscan(uint32_t *, const 
uint32_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint64_sum_inscan(uint64_t *, const uint64_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_size_sum_inscan(size_t *, const size_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ptrdiff_sum_inscan(ptrdiff_t *, const ptrdiff_t *, size_t); + +template ISHMEM_DEVICE_ATTRIBUTES int ishmem_sum_exscan(T *, const T *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_float_sum_exscan(float *, const float *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_double_sum_exscan(double *, const double *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_char_sum_exscan(char *, const char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_schar_sum_exscan(signed char *, const signed char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_short_sum_exscan(short *, const short *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int_sum_exscan(int *, const int *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_long_sum_exscan(long *, const long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_longlong_sum_exscan(long long *, const long long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uchar_sum_exscan(unsigned char *, const unsigned char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ushort_sum_exscan(unsigned short *, const unsigned short *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint_sum_exscan(unsigned int *, const unsigned int *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ulong_sum_exscan(unsigned long *, const unsigned long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ulonglong_sum_exscan(unsigned long long *, const unsigned long long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int8_sum_exscan(int8_t *, const int8_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int16_sum_exscan(int16_t *, const int16_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int32_sum_exscan(int32_t *, const int32_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int64_sum_exscan(int64_t *, const int64_t *, size_t); 
+ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint8_sum_exscan(uint8_t *, const uint8_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint16_sum_exscan(uint16_t *, const uint16_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint32_sum_exscan(uint32_t *, const uint32_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint64_sum_exscan(uint64_t *, const uint64_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_size_sum_exscan(size_t *, const size_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ptrdiff_sum_exscan(ptrdiff_t *, const ptrdiff_t *, size_t); + +/* scan (prefix sum) on a team */ +template ISHMEM_DEVICE_ATTRIBUTES int ishmem_sum_inscan(ishmem_team_t, T *, const T *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_float_sum_inscan(ishmem_team_t, float *, const float *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_double_sum_inscan(ishmem_team_t, double *, const double *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_char_sum_inscan(ishmem_team_t, char *, const char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_schar_sum_inscan(ishmem_team_t, signed char *, const signed char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_short_sum_inscan(ishmem_team_t, short *, const short *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int_sum_inscan(ishmem_team_t, int *, const int *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_long_sum_inscan(ishmem_team_t, long *, const long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_longlong_sum_inscan(ishmem_team_t, long long *, const long long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uchar_sum_inscan(ishmem_team_t, unsigned char *, const unsigned char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ushort_sum_inscan(ishmem_team_t, unsigned short *, const unsigned short *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint_sum_inscan(ishmem_team_t, unsigned int *, const unsigned int *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ulong_sum_inscan(ishmem_team_t, unsigned long *, const unsigned long *, size_t); 
+ISHMEM_DEVICE_ATTRIBUTES int ishmem_ulonglong_sum_inscan(ishmem_team_t, unsigned long long *, const unsigned long long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int8_sum_inscan(ishmem_team_t, int8_t *, const int8_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int16_sum_inscan(ishmem_team_t, int16_t *, const int16_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int32_sum_inscan(ishmem_team_t, int32_t *, const int32_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int64_sum_inscan(ishmem_team_t, int64_t *, const int64_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint8_sum_inscan(ishmem_team_t, uint8_t *, const uint8_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint16_sum_inscan(ishmem_team_t, uint16_t *, const uint16_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint32_sum_inscan(ishmem_team_t, uint32_t *, const uint32_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint64_sum_inscan(ishmem_team_t, uint64_t *, const uint64_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_size_sum_inscan(ishmem_team_t, size_t *, const size_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ptrdiff_sum_inscan(ishmem_team_t, ptrdiff_t *, const ptrdiff_t *, size_t); + +template ISHMEM_DEVICE_ATTRIBUTES int ishmem_sum_exscan(ishmem_team_t, T *, const T *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_float_sum_exscan(ishmem_team_t, float *, const float *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_double_sum_exscan(ishmem_team_t, double *, const double *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_char_sum_exscan(ishmem_team_t, char *, const char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_schar_sum_exscan(ishmem_team_t, signed char *, const signed char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_short_sum_exscan(ishmem_team_t, short *, const short *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int_sum_exscan(ishmem_team_t, int *, const int *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_long_sum_exscan(ishmem_team_t, long *, const 
long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_longlong_sum_exscan(ishmem_team_t, long long *, const long long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uchar_sum_exscan(ishmem_team_t, unsigned char *, const unsigned char *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ushort_sum_exscan(ishmem_team_t, unsigned short *, const unsigned short *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint_sum_exscan(ishmem_team_t, unsigned int *, const unsigned int *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ulong_sum_exscan(ishmem_team_t, unsigned long *, const unsigned long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ulonglong_sum_exscan(ishmem_team_t, unsigned long long *, const unsigned long long *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int8_sum_exscan(ishmem_team_t, int8_t *, const int8_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int16_sum_exscan(ishmem_team_t, int16_t *, const int16_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int32_sum_exscan(ishmem_team_t, int32_t *, const int32_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_int64_sum_exscan(ishmem_team_t, int64_t *, const int64_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint8_sum_exscan(ishmem_team_t, uint8_t *, const uint8_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint16_sum_exscan(ishmem_team_t, uint16_t *, const uint16_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint32_sum_exscan(ishmem_team_t, uint32_t *, const uint32_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_uint64_sum_exscan(ishmem_team_t, uint64_t *, const uint64_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_size_sum_exscan(ishmem_team_t, size_t *, const size_t *, size_t); +ISHMEM_DEVICE_ATTRIBUTES int ishmem_ptrdiff_sum_exscan(ishmem_team_t, ptrdiff_t *, const ptrdiff_t *, size_t); + /* test */ template ISHMEM_DEVICE_ATTRIBUTES int ishmem_test(T *, int, T); ISHMEM_DEVICE_ATTRIBUTES int ishmem_int_test(int *, int, int); diff --git a/src/ishmem/copy.h b/src/ishmem/copy.h index 
30be53e..58d8627 100644 --- a/src/ishmem/copy.h +++ b/src/ishmem/copy.h @@ -245,7 +245,7 @@ void vec_copy_work_group_strided_push(T *d, const T *s, size_t count, Group grp) if constexpr (ishmemi_is_device) { size_t stride = grp.get_local_linear_range(); size_t linear_id = grp.get_local_linear_id(); - long idx = (long) linear_id; + size_t idx = linear_id; T *aligned_d = (T *) sycl::min(((((uintptr_t) d) + ISHMEMI_ALIGNMASK) & (~ISHMEMI_ALIGNMASK)), (uintptr_t) (d + count)); @@ -256,11 +256,11 @@ void vec_copy_work_group_strided_push(T *d, const T *s, size_t count, Group grp) d[idx] = s[idx]; idx += stride; } - count -= (unsigned long) (aligned_d - d); // pointer difference is in units of T + count -= (size_t) (aligned_d - d); // pointer difference is in units of T s += (aligned_d - d); /* at this point, if count > 0, then d is aligned, s may not be aligned */ if (count == 0) return; - idx = (long) (linear_id * ishmemi_vec_length); + idx = (linear_id * ishmemi_vec_length); size_t vstride = stride * ishmemi_vec_length; sycl::multi_ptr @@ -285,7 +285,7 @@ void vec_copy_work_group_strided_push(T *d, const T *s, size_t count, Group grp) * back to item at a time */ /* idx here should be the postfix index + linear_id */ - idx = (long) (linear_id + (count & (~((unsigned long) ishmemi_vec_length - 1)))); + idx = (linear_id + (count & (~((unsigned long) ishmemi_vec_length - 1)))); while (idx < count) { dd[idx] = ds[idx]; idx += stride; @@ -324,7 +324,7 @@ void vec_copy_work_group_strided_pull(T *d, const T *s, size_t count, Group grp) if constexpr (ishmemi_is_device) { size_t stride = grp.get_local_linear_range(); size_t linear_id = grp.get_local_linear_id(); - long idx = (long) linear_id; + size_t idx = linear_id; T *aligned_s = (T *) sycl::min(((((uintptr_t) s) + ISHMEMI_ALIGNMASK) & (~ISHMEMI_ALIGNMASK)), (uintptr_t) (s + count)); @@ -339,7 +339,7 @@ void vec_copy_work_group_strided_pull(T *d, const T *s, size_t count, Group grp) d += (aligned_s - s); /* at this point, if 
count > 0, then d is aligned, s may not be aligned */ if (count == 0) return; - idx = (long) (linear_id * ishmemi_vec_length); + idx = (linear_id * ishmemi_vec_length); size_t vstride = stride * ishmemi_vec_length; sycl::multi_ptr @@ -363,7 +363,7 @@ void vec_copy_work_group_strided_pull(T *d, const T *s, size_t count, Group grp) * count & (ishmemi_vec_length-1) items * back to item at a time */ - idx = (long) (linear_id + (count & (~((unsigned long) ishmemi_vec_length - 1)))); + idx = (linear_id + (count & (~((unsigned long) ishmemi_vec_length - 1)))); while (idx < count) { dd[idx] = ds[idx]; idx += stride; diff --git a/src/ishmem/env_defs.h b/src/ishmem/env_defs.h index 578fd1a..c8fc523 100644 --- a/src/ishmem/env_defs.h +++ b/src/ishmem/env_defs.h @@ -36,5 +36,6 @@ ISHMEMI_ENV_DEF(TEAM_SHARED_ONLY_SELF, bool, false, "Include only the self PE in ISHMEM_TEAM_SHARED") /* Runtime definitions */ -ISHMEMI_ENV_DEF(RUNTIME, std::string, "OPENSHMEM", "The runtime to use for scale-out communication") +ISHMEMI_ENV_DEF(RUNTIME, std::string, ISHMEM_DEFAULT_RUNTIME_STR, + "The default runtime to use for scale-out communication") ISHMEMI_ENV_DEF(RUNTIME_USE_OSHMPI, bool, false, "Specify the OpenSHMEM backend as OSHMPI") diff --git a/src/ishmem/env_utils.h b/src/ishmem/env_utils.h index f77618f..ac78910 100644 --- a/src/ishmem/env_utils.h +++ b/src/ishmem/env_utils.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2025 Intel Corporation +/* Copyright (C) 2024 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause * * Portions derived from Sandia OpenSHMEM (https://github.com/Sandia-OpenSHMEM/SOS) diff --git a/src/ishmem/types.h b/src/ishmem/types.h index 5e19d0e..ca863e9 100644 --- a/src/ishmem/types.h +++ b/src/ishmem/types.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -70,6 +70,8 @@ typedef enum : uint16_t { MIN_REDUCE, SUM_REDUCE, PROD_REDUCE, + INSCAN, + EXSCAN, WAIT, WAIT_ALL, 
WAIT_ALL_VECTOR, @@ -150,6 +152,8 @@ constexpr ishmemi_op_t ISHMEMI_OP_max_reduce = MAX_REDUCE; constexpr ishmemi_op_t ISHMEMI_OP_min_reduce = MIN_REDUCE; constexpr ishmemi_op_t ISHMEMI_OP_sum_reduce = SUM_REDUCE; constexpr ishmemi_op_t ISHMEMI_OP_prod_reduce = PROD_REDUCE; +constexpr ishmemi_op_t ISHMEMI_OP_in_scan = INSCAN; +constexpr ishmemi_op_t ISHMEMI_OP_ex_scan = EXSCAN; constexpr ishmemi_op_t ISHMEMI_OP_wait_until = WAIT; constexpr ishmemi_op_t ISHMEMI_OP_wait_until_all = WAIT_ALL; constexpr ishmemi_op_t ISHMEMI_OP_wait_until_all_vector = WAIT_ALL_VECTOR; diff --git a/src/ishmem/util.h b/src/ishmem/util.h index f1744d2..aad5372 100644 --- a/src/ishmem/util.h +++ b/src/ishmem/util.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause * * Portions derived from Sandia OpenSHMEM (https://github.com/Sandia-OpenSHMEM/SOS) @@ -266,6 +266,14 @@ ISHMEM_DEVICE_ATTRIBUTES constexpr bool ishmemi_op_is_reduction() else return false; } +template +ISHMEM_DEVICE_ATTRIBUTES constexpr bool ishmemi_op_is_scan() +{ + if constexpr (OP == INSCAN) return true; + else if constexpr (OP == EXSCAN) return true; + else return false; +} + template ISHMEM_DEVICE_ATTRIBUTES constexpr bool ishmemi_op_is_standard_amo() { @@ -340,6 +348,7 @@ template ISHMEM_DEVICE_ATTRIBUTES constexpr bool ishmemi_op_floating_point_matters() { if constexpr (ishmemi_op_is_value_reduction()) return true; + else if constexpr (ishmemi_op_is_scan()) return true; else if constexpr (ishmemi_op_is_extended_amo()) return true; else if constexpr (OP == P) return true; else if constexpr (OP == G) return true; @@ -350,6 +359,7 @@ template ISHMEM_DEVICE_ATTRIBUTES constexpr bool ishmemi_op_sign_matters() { if constexpr (ishmemi_op_is_reduction()) return true; + else if constexpr (ishmemi_op_is_scan()) return true; else if constexpr (ishmemi_op_is_amo()) return true; else if constexpr (OP == SIGNAL_WAIT_UNTIL) return false; else if 
constexpr (ishmemi_op_is_sync()) return true; @@ -472,6 +482,7 @@ template ISHMEM_DEVICE_ATTRIBUTES constexpr bool ishmemi_op_uses_team() { if constexpr (ishmemi_op_is_reduction()) return true; + else if constexpr (ishmemi_op_is_scan()) return true; else if constexpr (OP == ALLTOALL) return true; else if constexpr (OP == BCAST) return true; else if constexpr (OP == COLLECT) return true; diff --git a/src/ishmem_config.h.in b/src/ishmem_config.h.in index 211fc87..44852c9 100644 --- a/src/ishmem_config.h.in +++ b/src/ishmem_config.h.in @@ -8,3 +8,8 @@ #cmakedefine ENABLE_ERROR_CHECKING #cmakedefine ENABLE_REDUCED_LINK_ENGINES #cmakedefine ENABLE_DLMALLOC + +/* clang-format off */ +#define ISHMEM_DEFAULT_RUNTIME @ISHMEM_DEFAULT_RUNTIME_VAL@ +#define ISHMEM_DEFAULT_RUNTIME_STR "@ISHMEM_DEFAULT_RUNTIME_STR@" +/* clang-format on */ diff --git a/src/ishmemx.h b/src/ishmemx.h index 5010a41..de87a1c 100644 --- a/src/ishmemx.h +++ b/src/ishmemx.h @@ -5,7 +5,7 @@ #ifndef I_SHMEMX_H #define I_SHMEMX_H -#include "ishmem.h" +#include #define ISHMEM_DEVICE_ATTRIBUTES SYCL_EXTERNAL #define ISHMEMX_TEAM_NODE 2 @@ -19,8 +19,8 @@ typedef enum : uint8_t { } ishmemx_runtime_type_t; typedef struct ishmemx_attr_t { - /* By default, the runtime is assumed to be OpenSHMEM */ - ishmemx_runtime_type_t runtime = ISHMEMX_RUNTIME_OPENSHMEM; + /* By default, the runtime is assumed to be MPI */ + ishmemx_runtime_type_t runtime = ISHMEM_DEFAULT_RUNTIME; /* By default, runtimes are assumed to be initialized by ISHMEM */ bool initialize_runtime = true; /* By default, gpu is used */ @@ -1801,6 +1801,108 @@ template ISHMEM_DEVICE_ATTRIBUTES int ishmemx_uint64_prod_reduc template ISHMEM_DEVICE_ATTRIBUTES int ishmemx_size_prod_reduce_work_group(ishmem_team_t, size_t *, const size_t *, size_t, const Group &); template ISHMEM_DEVICE_ATTRIBUTES int ishmemx_ptrdiff_prod_reduce_work_group(ishmem_team_t, ptrdiff_t *, const ptrdiff_t *, size_t, const Group &); +/* scan_on_queue (prefix sum) */ +template 
sycl::event ishmemx_sum_inscan_on_queue(T *, const T *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_float_sum_inscan_on_queue(float *, const float *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_double_sum_inscan_on_queue(double *, const double *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_char_sum_inscan_on_queue(char *, const char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_schar_sum_inscan_on_queue(signed char *, const signed char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_short_sum_inscan_on_queue(short *, const short *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int_sum_inscan_on_queue(int *, const int *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_long_sum_inscan_on_queue(long *, const long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_longlong_sum_inscan_on_queue(long long *, const long long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uchar_sum_inscan_on_queue(unsigned char *, const unsigned char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ushort_sum_inscan_on_queue(unsigned short *, const unsigned short *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint_sum_inscan_on_queue(unsigned int *, const unsigned int *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ulong_sum_inscan_on_queue(unsigned long *, const unsigned long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ulonglong_sum_inscan_on_queue(unsigned long long *, const unsigned long long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int8_sum_inscan_on_queue(int8_t *, const int8_t *, size_t, int *, sycl::queue &, const 
std::vector & = {}); +sycl::event ishmemx_int16_sum_inscan_on_queue(int16_t *, const int16_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int32_sum_inscan_on_queue(int32_t *, const int32_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int64_sum_inscan_on_queue(int64_t *, const int64_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint8_sum_inscan_on_queue(uint8_t *, const uint8_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint16_sum_inscan_on_queue(uint16_t *, const uint16_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint32_sum_inscan_on_queue(uint32_t *, const uint32_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint64_sum_inscan_on_queue(uint64_t *, const uint64_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_size_sum_inscan_on_queue(size_t *, const size_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ptrdiff_sum_inscan_on_queue(ptrdiff_t *, const ptrdiff_t *, size_t, int *, sycl::queue &, const std::vector & = {}); + +template sycl::event ishmemx_sum_exscan_on_queue(T *, const T *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_float_sum_exscan_on_queue(float *, const float *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_double_sum_exscan_on_queue(double *, const double *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_char_sum_exscan_on_queue(char *, const char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_schar_sum_exscan_on_queue(signed char *, const signed char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_short_sum_exscan_on_queue(short *, const short *, size_t, int *, sycl::queue &, const std::vector & = {}); 
+sycl::event ishmemx_int_sum_exscan_on_queue(int *, const int *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_long_sum_exscan_on_queue(long *, const long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_longlong_sum_exscan_on_queue(long long *, const long long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uchar_sum_exscan_on_queue(unsigned char *, const unsigned char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ushort_sum_exscan_on_queue(unsigned short *, const unsigned short *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint_sum_exscan_on_queue(unsigned int *, const unsigned int *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ulong_sum_exscan_on_queue(unsigned long *, const unsigned long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ulonglong_sum_exscan_on_queue(unsigned long long *, const unsigned long long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int8_sum_exscan_on_queue(int8_t *, const int8_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int16_sum_exscan_on_queue(int16_t *, const int16_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int32_sum_exscan_on_queue(int32_t *, const int32_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int64_sum_exscan_on_queue(int64_t *, const int64_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint8_sum_exscan_on_queue(uint8_t *, const uint8_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint16_sum_exscan_on_queue(uint16_t *, const uint16_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint32_sum_exscan_on_queue(uint32_t *, const uint32_t *, size_t, 
int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint64_sum_exscan_on_queue(uint64_t *, const uint64_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_size_sum_exscan_on_queue(size_t *, const size_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ptrdiff_sum_exscan_on_queue(ptrdiff_t *, const ptrdiff_t *, size_t, int *, sycl::queue &, const std::vector & = {}); + +/* scan_on_queue (prefix sum) on a team */ +template sycl::event ishmemx_sum_inscan_on_queue(ishmem_team_t, T *, const T *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_float_sum_inscan_on_queue(ishmem_team_t, float *, const float *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_double_sum_inscan_on_queue(ishmem_team_t, double *, const double *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_char_sum_inscan_on_queue(ishmem_team_t, char *, const char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_schar_sum_inscan_on_queue(ishmem_team_t, signed char *, const signed char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_short_sum_inscan_on_queue(ishmem_team_t, short *, const short *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int_sum_inscan_on_queue(ishmem_team_t, int *, const int *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_long_sum_inscan_on_queue(ishmem_team_t, long *, const long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_longlong_sum_inscan_on_queue(ishmem_team_t, long long *, const long long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uchar_sum_inscan_on_queue(ishmem_team_t, unsigned char *, const unsigned char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event 
ishmemx_ushort_sum_inscan_on_queue(ishmem_team_t, unsigned short *, const unsigned short *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint_sum_inscan_on_queue(ishmem_team_t, unsigned int *, const unsigned int *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ulong_sum_inscan_on_queue(ishmem_team_t, unsigned long *, const unsigned long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ulonglong_sum_inscan_on_queue(ishmem_team_t, unsigned long long *, const unsigned long long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int8_sum_inscan_on_queue(ishmem_team_t, int8_t *, const int8_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int16_sum_inscan_on_queue(ishmem_team_t, int16_t *, const int16_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int32_sum_inscan_on_queue(ishmem_team_t, int32_t *, const int32_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int64_sum_inscan_on_queue(ishmem_team_t, int64_t *, const int64_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint8_sum_inscan_on_queue(ishmem_team_t, uint8_t *, const uint8_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint16_sum_inscan_on_queue(ishmem_team_t, uint16_t *, const uint16_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint32_sum_inscan_on_queue(ishmem_team_t, uint32_t *, const uint32_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint64_sum_inscan_on_queue(ishmem_team_t, uint64_t *, const uint64_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_size_sum_inscan_on_queue(ishmem_team_t, size_t *, const size_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event 
ishmemx_ptrdiff_sum_inscan_on_queue(ishmem_team_t, ptrdiff_t *, const ptrdiff_t *, size_t, int *, sycl::queue &, const std::vector & = {}); + +template sycl::event ishmemx_sum_exscan_on_queue(ishmem_team_t, T *, const T *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_float_sum_exscan_on_queue(ishmem_team_t, float *, const float *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_double_sum_exscan_on_queue(ishmem_team_t, double *, const double *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_char_sum_exscan_on_queue(ishmem_team_t, char *, const char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_schar_sum_exscan_on_queue(ishmem_team_t, signed char *, const signed char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_short_sum_exscan_on_queue(ishmem_team_t, short *, const short *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int_sum_exscan_on_queue(ishmem_team_t, int *, const int *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_long_sum_exscan_on_queue(ishmem_team_t, long *, const long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_longlong_sum_exscan_on_queue(ishmem_team_t, long long *, const long long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uchar_sum_exscan_on_queue(ishmem_team_t, unsigned char *, const unsigned char *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ushort_sum_exscan_on_queue(ishmem_team_t, unsigned short *, const unsigned short *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint_sum_exscan_on_queue(ishmem_team_t, unsigned int *, const unsigned int *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ulong_sum_exscan_on_queue(ishmem_team_t, unsigned long *, const 
unsigned long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ulonglong_sum_exscan_on_queue(ishmem_team_t, unsigned long long *, const unsigned long long *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int8_sum_exscan_on_queue(ishmem_team_t, int8_t *, const int8_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int16_sum_exscan_on_queue(ishmem_team_t, int16_t *, const int16_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int32_sum_exscan_on_queue(ishmem_team_t, int32_t *, const int32_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_int64_sum_exscan_on_queue(ishmem_team_t, int64_t *, const int64_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint8_sum_exscan_on_queue(ishmem_team_t, uint8_t *, const uint8_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint16_sum_exscan_on_queue(ishmem_team_t, uint16_t *, const uint16_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint32_sum_exscan_on_queue(ishmem_team_t, uint32_t *, const uint32_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_uint64_sum_exscan_on_queue(ishmem_team_t, uint64_t *, const uint64_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_size_sum_exscan_on_queue(ishmem_team_t, size_t *, const size_t *, size_t, int *, sycl::queue &, const std::vector & = {}); +sycl::event ishmemx_ptrdiff_sum_exscan_on_queue(ishmem_team_t, ptrdiff_t *, const ptrdiff_t *, size_t, int *, sycl::queue &, const std::vector & = {}); + /* test_work_group */ template ISHMEM_DEVICE_ATTRIBUTES int ishmemx_test_work_group(T *, int, T, const Group &); template ISHMEM_DEVICE_ATTRIBUTES int ishmemx_int_test_work_group(int *, int, int, const Group &); diff --git a/src/memory.cpp b/src/memory.cpp index 
d17ca88..4f35408 100644 --- a/src/memory.cpp +++ b/src/memory.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -8,32 +8,34 @@ #include "accelerator.h" #include "runtime.h" -static char *ishmemi_heap_curr = nullptr; // why static ? +namespace { + /* Private immediate command list for copying data */ + ze_command_list_handle_t copy_list = {}; + /* IPC handles for mmap regions */ + ze_ipc_mem_handle_t heap_handle = {}; + ze_ipc_mem_handle_t info_handle = {}; + + /* Heap vars */ + mspace ishmemi_mspace; + char *heap_curr = nullptr; +} // namespace + +/* Heap var */ void *ishmemi_heap_base = nullptr; +void *ishmemi_mmap_heap_base = nullptr; size_t ishmemi_heap_length = 0; uintptr_t ishmemi_heap_last = 0; -void *ishmemi_mmap_heap_base = nullptr; -ishmemi_info_t *ishmemi_gpu_info = nullptr; +/* Info object vars */ size_t ishmemi_info_size = 0; +ishmemi_info_t *ishmemi_gpu_info = nullptr; ishmemi_info_t *ishmemi_mmap_gpu_info = nullptr; -mspace ishmemi_mspace; -ze_command_queue_desc_t ishmem_copy_cmd_queue_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .pNext = nullptr, - .ordinal = 1, - .index = 0, - .flags = 0, - .mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, -}; -ze_command_list_handle_t ishmem_copy_cmd_list = {}; - int ishmemi_memory_init() { int ret = 0; + ISHMEM_DEBUG_MSG("Symmetric heap size %ld\n", ishmemi_params.SYMMETRIC_SIZE); ishmemi_heap_length = ishmemi_params.SYMMETRIC_SIZE + ISHMEMI_HEAP_OVERHEAD; @@ -44,44 +46,55 @@ int ishmemi_memory_init() ishmemi_params.SYMMETRIC_SIZE); /* Allocate symmetric heap, and create mmap access to it */ - if (!ishmemi_params.ENABLE_ACCESSIBLE_HOST_HEAP) { + if (ishmemi_params.ENABLE_ACCESSIBLE_HOST_HEAP) { + /* Host memory alloc */ + ret = ishmemi_usm_alloc_host(&ishmemi_heap_base, ishmemi_heap_length); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + 
ISHMEM_CHECK_GOTO_MSG(ishmemi_heap_base == nullptr, fn_fail, + "Unable to allocate ishmemi_heap_base\n"); + } else { /* Device memory alloc */ ret = ishmemi_usm_alloc_device(&ishmemi_heap_base, ishmemi_heap_length); ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + ISHMEM_CHECK_GOTO_MSG(ishmemi_heap_base == nullptr, fn_fail, + "Unable to allocate ishmemi_heap_base\n"); + /* The idea is for the host to peek and poke the symmetric heap. Possibly useful for - * host initiated operations or for debugging - */ - ishmemi_mmap_heap_base = ishmemi_get_mmap_address(ishmemi_heap_base, ishmemi_heap_length); - if (ishmemi_mmap_heap_base == nullptr) { - RAISE_ERROR_MSG("Unable to mmap GPU symmetric heap\n"); - } + * host initiated operations or for debugging */ + ishmemi_mmap_heap_base = + ishmemi_get_mmap_address(ishmemi_heap_base, ishmemi_heap_length, &heap_handle); + ISHMEM_CHECK_GOTO_MSG(ishmemi_mmap_heap_base == nullptr, fn_fail, + "Unable to mmap GPU symmetric heap\n"); + ::memset(ishmemi_mmap_heap_base, 0, ishmemi_heap_length); ishmemi_heap_last = (uintptr_t) pointer_offset(ishmemi_heap_base, ishmemi_heap_length - 1); - } else { - /* Shared memory alloc */ - ret = ishmemi_usm_alloc_host(&ishmemi_heap_base, ishmemi_heap_length); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); } - /* this initialized the sbrk style symmtric heap allocator */ - ishmemi_heap_curr = (char *) ishmemi_heap_base; + /* This initializes the sbrk style symmtric heap allocator */ + heap_curr = (char *) ishmemi_heap_base; if (ishmemi_params.DEBUG) { ze_device_handle_t temp_gpu_device; - ze_memory_allocation_properties_t mem_properties; - mem_properties.pNext = nullptr; - mem_properties.stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES; + ze_memory_allocation_properties_t mem_properties = { + .stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, + .pNext = nullptr, + .type = ZE_MEMORY_TYPE_UNKNOWN, + .id = 0, + .pageSize = 0, + }; + ZE_CHECK(zeMemGetAllocProperties(ishmemi_ze_context, ishmemi_heap_base, 
&mem_properties, &temp_gpu_device)); ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + ISHMEM_DEBUG_MSG("Heap allocation type: %s\n", (mem_properties.type == ZE_MEMORY_TYPE_SHARED ? "shared" : "device")); } - /* allocate info structure */ - ishmemi_info_size = - sizeof(ishmemi_info_t) + (static_cast(ishmemi_n_pes) * sizeof(uint8_t)); + /* Allocate info structure */ + ishmemi_info_size = sizeof(ishmemi_info_t) + static_cast(ishmemi_n_pes); ret = ishmemi_usm_alloc_device((void **) &ishmemi_gpu_info, ishmemi_info_size); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); /* SYCL queue to initialize global_info */ try { @@ -90,147 +103,153 @@ int ishmemi_memory_init() } catch (...) { ret = -1; } - ISHMEM_CHECK_GOTO_MSG(ret, fn_fail, "ishmemi_alloc_usm_device failed '%d'\n", ret); /* host access for device data */ - ishmemi_mmap_gpu_info = ishmemi_get_mmap_address(ishmemi_gpu_info, ishmemi_info_size); - if (ishmemi_mmap_gpu_info == nullptr) { - RAISE_ERROR_MSG("Unable to mmap GPU info object\n"); - } + ishmemi_mmap_gpu_info = + ishmemi_get_mmap_address(ishmemi_gpu_info, ishmemi_info_size, &info_handle); + ISHMEM_CHECK_GOTO_MSG(ishmemi_mmap_gpu_info == nullptr, fn_fail, + "Unable to mmap GPU info object\n"); + ::memset(ishmemi_mmap_gpu_info, 0, ishmemi_info_size); +#ifdef ENABLE_DLMALLOC if (ishmemi_params.ENABLE_ACCESSIBLE_HOST_HEAP) { ishmemi_mspace = create_mspace_with_base(ishmemi_heap_base, ishmemi_heap_length, 0); } else { -#ifdef ENABLE_DLMALLOC ishmemi_mspace = create_mspace_with_base(ishmemi_mmap_heap_base, ishmemi_heap_length, 0); -#endif } +#endif - /* create ZE command list */ - ZE_CHECK(zeCommandListCreateImmediate(ishmemi_ze_context, ishmemi_gpu_device, - &ishmem_copy_cmd_queue_desc, &ishmem_copy_cmd_list)); + /* create an immediate command list for use in ishmem_copy */ + ret = ishmemi_create_command_list(COPY_QUEUE, true, &copy_list); ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - return (nullptr == ishmemi_heap_base) ?
-1 : 0; - +fn_exit: + return ret; fn_fail: - return -1; + if (!ret) ret = 1; + goto fn_exit; } int ishmemi_memory_fini() { int ret = 0; + if (!ishmemi_params.ENABLE_ACCESSIBLE_HOST_HEAP) { - if (ishmemi_mmap_heap_base != nullptr) munmap(ishmemi_mmap_heap_base, ishmemi_heap_length); + if (ishmemi_mmap_heap_base != nullptr) { + ret = ishmemi_close_mmap_address(heap_handle, ishmemi_mmap_heap_base, + ishmemi_heap_length); + ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + } + ishmemi_mmap_heap_base = nullptr; } - if (ishmemi_heap_base != nullptr) { - /* TODO, should ISHMEMI_FREE check the result of the call to ishmemi_usm_free? And do what? - */ - ISHMEMI_FREE(ishmemi_usm_free, ishmemi_heap_base); - ishmemi_heap_length = 0; - } + + ret = ishmemi_usm_free(ishmemi_heap_base); + ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + ishmemi_heap_base = nullptr; + ishmemi_heap_length = 0; - ret = munmap((void *) ishmemi_mmap_gpu_info, ishmemi_info_size); + ret = ishmemi_close_mmap_address(info_handle, ishmemi_mmap_gpu_info, ishmemi_info_size); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - ZE_CHECK(zeCommandListDestroy(ishmem_copy_cmd_list)); + + ZE_CHECK(zeCommandListDestroy(copy_list)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); ret = ishmemi_usm_free(ishmemi_gpu_info); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + ishmemi_gpu_info = nullptr; + ishmemi_info_size = 0; + fn_exit: return ret; } void *ishmemi_get_next(size_t incr, size_t alignment) { - /* Override user input to ensure minimal alignment*/ + size_t space; + void *old = nullptr; + void *ret = nullptr; + ptrdiff_t used = 0; + + /* Override user input to ensure minimal alignment */ alignment = ISHMEMI_ALLOC_ALIGN > alignment ? 
ISHMEMI_ALLOC_ALIGN : alignment; - /* this is ptrdiff_t so it will be negative if curr < base */ - ptrdiff_t used = ((intptr_t) ishmemi_heap_curr) - ((intptr_t) ishmemi_heap_base); - if ((used < 0) || (used >= ishmemi_heap_length)) { - RAISE_ERROR_MSG("Symmetric heap out of bounds\n"); - /* Not a recoverable error, since we don't know where to move heap_curr */ - } - size_t space = ishmemi_heap_length - - static_cast(used); // this is guaranteed positive, so unsigned is ok - /* std:: align will bump old_ptr by the alignment adjustment */ - void *old_ptr = ishmemi_heap_curr; - void *result = (char *) std::align(alignment, incr, old_ptr, space); - if (result == nullptr) { - ISHMEM_WARN_MSG("Out of symmetric space\n"); - } else { - ishmemi_heap_curr = ((char *) old_ptr) + incr; - } - return result; -} + /* `used` will be negative if curr < base */ + used = ((intptr_t) heap_curr) - ((intptr_t) ishmemi_heap_base); + ISHMEM_CHECK_GOTO_MSG(((used < 0) || (used >= (ptrdiff_t) ishmemi_heap_length)), fn_fail, + "Unable to allocate %zu bytes in symmetric heap\n", incr); -void *ishmem_malloc(size_t size) -{ - if constexpr (enable_error_checking) validate_init(); - void *ret; - if (size == 0) return (nullptr); + space = ishmemi_heap_length - static_cast(used); /* Guaranteed to be positive */ + old = heap_curr; + ret = (char *) std::align(alignment, incr, old, space); + ISHMEM_CHECK_GOTO_MSG(ret == nullptr, fn_exit, + "Unable to allocate %zu bytes in symmetric heap\n", incr); - // TODO: internal barrier, dlmalloc, thread-safety - if (!ishmemi_params.ENABLE_ACCESSIBLE_HOST_HEAP) { -#ifndef ENABLE_DLMALLOC - ret = ishmemi_get_next(size); -#else - void *host_ret = mspace_memalign(ishmemi_mspace, ISHMEMI_ALLOC_ALIGN, size); - ISHMEM_CHECK_GOTO_MSG(host_ret == nullptr, fn_fail, - "Unable to allocate %zu bytes in symmetric space\n", size); - - ret = (void *) (((uintptr_t) host_ret - (uintptr_t) ishmemi_mmap_heap_base) + - (uintptr_t) ishmemi_heap_base); -#endif - } else { - ret = 
mspace_memalign(ishmemi_mspace, ISHMEMI_ALLOC_ALIGN, size); - ISHMEM_CHECK_GOTO_MSG(ret == nullptr, fn_fail, - "Unable to allocate %zu bytes in symmetric space\n", size); - } + heap_curr = ((char *) old) + incr; - ishmemi_runtime->barrier_all(); +fn_exit: return ret; - fn_fail: - return nullptr; + ret = nullptr; + goto fn_exit; } -void *ishmem_align(size_t alignment, size_t size) +void *ishmemi_alloc(size_t size, size_t alignment) { - if constexpr (enable_error_checking) validate_init(); - void *ret; + void *host_ret = nullptr; + void *ret = nullptr; - if (size == 0) return nullptr; - // Undefined behaviour if alignment is not a power of 2 - if (alignment == 0 || (alignment & (alignment - 1)) != 0) return nullptr; + if (size == 0) { + goto fn_fail; + } - if (!ishmemi_params.ENABLE_ACCESSIBLE_HOST_HEAP) { -#ifndef ENABLE_DLMALLOC - ret = ishmemi_get_next(size, alignment); + ISHMEM_CHECK_GOTO_MSG((alignment == 0 || (alignment & (alignment - 1)) != 0), fn_fail, + "Alignment must be a power of 2\n"); + + if (ishmemi_params.ENABLE_ACCESSIBLE_HOST_HEAP) { +#ifdef ENABLE_DLMALLOC + ret = mspace_memalign(ishmemi_mspace, alignment, size); + ISHMEM_CHECK_GOTO_MSG(ret == nullptr, fn_fail, + "Unable to allocate %zu bytes in symmetric heap\n", size); #else - void *host_ret = mspace_memalign(ishmemi_mspace, alignment, size); + ISHMEM_CHECK_GOTO_MSG(ret == nullptr, fn_fail, + "Host-accessibly heap requires dlmalloc to be enabled\n", size); +#endif + } else { +#ifdef ENABLE_DLMALLOC + host_ret = mspace_memalign(ishmemi_mspace, alignment, size); ISHMEM_CHECK_GOTO_MSG(host_ret == nullptr, fn_fail, - "Unable to allocate %zu bytes in symmetric space\n", size); + "Unable to allocate %zu bytes in symmetric heap\n", size); ret = (void *) (((uintptr_t) host_ret - (uintptr_t) ishmemi_mmap_heap_base) + (uintptr_t) ishmemi_heap_base); +#else + ret = ishmemi_get_next(size); #endif - } else { - ret = mspace_memalign(ishmemi_mspace, alignment, size); - ISHMEM_CHECK_GOTO_MSG(ret == nullptr, 
fn_fail, - "Unable to allocate %zu bytes in symmetric space\n", size); } ishmemi_runtime->barrier_all(); - return ret; +fn_exit: + return ret; fn_fail: - return nullptr; + ret = nullptr; + goto fn_exit; +} + +void *ishmem_malloc(size_t size) +{ + if constexpr (enable_error_checking) validate_init(); + return ishmemi_alloc(size); +} + +void *ishmem_align(size_t alignment, size_t size) +{ + if constexpr (enable_error_checking) validate_init(); + return ishmemi_alloc(size, alignment); } void *ishmem_calloc(size_t count, size_t size) @@ -241,63 +260,35 @@ void *ishmem_calloc(size_t count, size_t size) void *ishmemi_calloc(size_t count, size_t size) { - void *ptr; - if (count == 0 || size == 0) return (nullptr); + void *ptr = nullptr; - // TODO: internal barrier, dlmalloc, thread-safety - if (!ishmemi_params.ENABLE_ACCESSIBLE_HOST_HEAP) { -#ifndef ENABLE_DLMALLOC - ptr = ishmemi_get_next(size * count); -#else - void *host_ret = mspace_memalign(ishmemi_mspace, ISHMEMI_ALLOC_ALIGN, count * size); - ISHMEM_CHECK_GOTO_MSG(host_ret == nullptr, fn_fail, - "Unable to allocate %zu bytes in symmetric space\n", count * size); + ptr = ishmemi_alloc(count * size); + ISHMEM_CHECK_GOTO_MSG(ptr == nullptr, fn_fail, "Failed to allocate object\n"); - ptr = (void *) (((uintptr_t) host_ret - (uintptr_t) ishmemi_mmap_heap_base) + - (uintptr_t) ishmemi_heap_base); -#endif - } else { - ptr = mspace_memalign(ishmemi_mspace, ISHMEMI_ALLOC_ALIGN, count * size); - ISHMEM_CHECK_GOTO_MSG(ptr == nullptr, fn_fail, - "Unable to allocate %zu bytes in symmetric space\n", count * size); - } - - if (ptr != nullptr) { - ze_command_queue_desc_t cmd_queue_desc = {.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .pNext = nullptr, - .ordinal = 1, - .index = 0, - .flags = 0, - .mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; - ze_command_list_handle_t cmd_list = {}; - uint32_t zero = 0; - int ret = 0; - ZE_CHECK(zeCommandListCreateImmediate(ishmemi_ze_context, 
ishmemi_gpu_device, - &cmd_queue_desc, &cmd_list)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - ZE_CHECK(zeCommandListAppendMemoryFill(cmd_list, ptr, &zero, 1, count * size, nullptr, 0, - nullptr)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - ZE_CHECK(zeCommandListDestroy(cmd_list)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - } - - ishmemi_runtime->barrier_all(); + ptr = ishmemi_zero(ptr, count * size); + ISHMEM_CHECK_GOTO_MSG(ptr == nullptr, fn_fail, "Failed to zero allocation\n"); +fn_exit: return ptr; - fn_fail: - return nullptr; + ptr = nullptr; + goto fn_exit; } void ishmem_free(void *ptr) { if constexpr (enable_error_checking) validate_init(); + ishmemi_free(ptr); +} + +void ishmemi_free(void *ptr) +{ ishmemi_runtime->barrier_all(); if (ishmemi_params.ENABLE_ACCESSIBLE_HOST_HEAP) { if (ptr != nullptr) { +#ifdef ENABLE_DLMALLOC mspace_free(ishmemi_mspace, ptr); +#endif } } else { if (ptr != nullptr) { @@ -310,46 +301,41 @@ void ishmem_free(void *ptr) } } -void *ishmem_copy(void *dst, const void *src, size_t size) +void *ishmem_copy(void *dest, const void *src, size_t size) { - int ret = 0; - ze_memory_type_t dst_type, src_type; + if constexpr (enable_error_checking) validate_init(); + return ishmemi_copy(dest, src, size); +} - /* Check the pointer type for dst and src */ - ishmemi_get_memory_type(dst, &dst_type); - ishmemi_get_memory_type(src, &src_type); +void *ishmemi_copy(void *dest, const void *src, size_t size) +{ + int ret = 0; - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - ZE_CHECK( - zeCommandListAppendMemoryCopy(ishmem_copy_cmd_list, dst, src, size, nullptr, 0, nullptr)); + ZE_CHECK(zeCommandListAppendMemoryCopy(copy_list, dest, src, size, nullptr, 0, nullptr)); ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - return dst; + return dest; fn_fail: return nullptr; } -void *ishmem_zero(void *dst, size_t size) +void *ishmem_zero(void *dest, size_t size) +{ + if constexpr (enable_error_checking) validate_init(); + return ishmemi_zero(dest, size); +} + +void 
*ishmemi_zero(void *dest, size_t size) { - ze_command_list_handle_t cmd_list = {}; - uint32_t zero = 0; int ret = 0; - ze_command_queue_desc_t cmd_queue_desc = {.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .pNext = nullptr, - .ordinal = 1, - .index = 0, - .flags = 0, - .mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; - ZE_CHECK(zeCommandListCreateImmediate(ishmemi_ze_context, ishmemi_gpu_device, &cmd_queue_desc, - &cmd_list)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - ZE_CHECK(zeCommandListAppendMemoryFill(cmd_list, dst, &zero, 1, size, nullptr, 0, nullptr)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - ZE_CHECK(zeCommandListDestroy(cmd_list)); + uint32_t zero = 0; + + ZE_CHECK(zeCommandListAppendMemoryFill(copy_list, dest, &zero, 1, size, nullptr, 0, nullptr)); ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - return dst; + + return dest; + fn_fail: return nullptr; } diff --git a/src/memory.h b/src/memory.h index 0130def..0865c40 100644 --- a/src/memory.h +++ b/src/memory.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -9,33 +9,27 @@ #include #define ISHMEMI_HEAP_OVERHEAD 1024 +#define ISHMEMI_ALLOC_ALIGN ((size_t) 64) -#ifdef __cplusplus +/* dlmalloc definitions */ extern "C" { -#endif - typedef void *mspace; -/* mspace routines */ + mspace create_mspace_with_base(void *, size_t, int); void *mspace_memalign(mspace, size_t, size_t); void mspace_free(mspace, void *); - -#define ISHMEMI_ALLOC_ALIGN ((size_t) 64) -void *ishmemi_get_next(size_t incr, size_t alignment = ISHMEMI_ALLOC_ALIGN); +} /* Memory routines */ -/* Initialize memory */ int ishmemi_memory_init(); - -/* Finalize memory */ int ishmemi_memory_fini(); -void *ishmemi_calloc(size_t count, size_t size); -void *ishmemi_ptr(const void *dest, int pe); - -#ifdef __cplusplus -} -#endif +void *ishmemi_alloc(size_t, size_t alignment = ISHMEMI_ALLOC_ALIGN); +void 
*ishmemi_calloc(size_t count, size_t); +void *ishmemi_copy(void *, const void *, size_t); +void *ishmemi_zero(void *, size_t); +void *ishmemi_ptr(const void *, int); +void ishmemi_free(void *); #define ISHMEMI_FAST_ADJUST(TYPENAME, info, index, p) \ ((TYPENAME *) (reinterpret_cast(p) + \ diff --git a/src/memory_ordering.cpp b/src/memory_ordering.cpp index 073a3b6..884661e 100644 --- a/src/memory_ordering.cpp +++ b/src/memory_ordering.cpp @@ -75,7 +75,7 @@ sycl::event ishmemx_quiet_on_queue(sycl::queue &q, const std::vectorsecond->event, deps); - cgh.host_task([=]() { ishmem_quiet(); }); + cgh.single_task([=]() { ishmem_quiet(); }); }); ishmemi_on_queue_events_map[&q]->event = e; return e; diff --git a/src/nbi.cpp b/src/nbi.cpp index c2aeccd..7c0e557 100644 --- a/src/nbi.cpp +++ b/src/nbi.cpp @@ -31,7 +31,7 @@ sycl::event ishmemx_put_nbi_on_queue(T *dest, const T *src, size_t nelems, int p auto e = q.submit([&](sycl::handler &cgh) { set_cmd_grp_dependencies(cgh, entry_already_exists, iter->second->event, deps); - cgh.host_task([=]() { ishmem_put_nbi(dest, src, nelems, pe); }); + cgh.single_task([=]() { ishmem_put_nbi(dest, src, nelems, pe); }); }); ishmemi_on_queue_events_map[&q]->event = e; return e; @@ -142,7 +142,7 @@ sycl::event ishmemx_get_nbi_on_queue(T *dest, const T *src, size_t nelems, int p auto e = q.submit([&](sycl::handler &cgh) { set_cmd_grp_dependencies(cgh, entry_already_exists, iter->second->event, deps); - cgh.host_task([=]() { ishmem_get_nbi(dest, src, nelems, pe); }); + cgh.single_task([=]() { ishmem_get_nbi(dest, src, nelems, pe); }); }); ishmemi_on_queue_events_map[&q]->event = e; return e; diff --git a/src/on_queue.h b/src/on_queue.h index 59971c7..2fe3e6a 100644 --- a/src/on_queue.h +++ b/src/on_queue.h @@ -16,7 +16,7 @@ struct ishmemi_on_queue_map_entry_t { } }; -inline void set_cmd_grp_dependencies(sycl::handler &cgh, bool entry_already_exists, sycl::event e, +inline void set_cmd_grp_dependencies(sycl::handler &cgh, bool 
entry_already_exists, sycl::event &e, const std::vector &deps) { if (entry_already_exists) { diff --git a/src/proxy.cpp b/src/proxy.cpp index 25e2c6f..1a9d91f 100644 --- a/src/proxy.cpp +++ b/src/proxy.cpp @@ -189,9 +189,9 @@ int ishmemi_proxy_init() ISHMEM_DEBUG_MSG("can't get proxy thread affinity\n"); } else { off = snprintf(str, sizeof(str), "proxy thread affinity: "); - for (int i = 0; i < sizeof(cpu_set_t) * 8; i += 1) { + for (size_t i = 0; i < sizeof(cpu_set_t) * 8; i += 1) { if (CPU_ISSET(i, &cpuset)) - off += snprintf(str + off, sizeof(str) - static_cast(off), "%d, ", i); + off += snprintf(str + off, sizeof(str) - static_cast(off), "%ld, ", i); } ISHMEM_DEBUG_MSG("%s\n", str); } diff --git a/src/proxy_func.cpp b/src/proxy_func.cpp index 8b19195..68fad47 100644 --- a/src/proxy_func.cpp +++ b/src/proxy_func.cpp @@ -113,12 +113,12 @@ int ishmemi_proxy_func_init() sizeof(ishmemi_runtime_proxy_func_t *) * ISHMEMI_OP_END); ISHMEM_CHECK_GOTO_MSG(ishmemi_upcall_funcs == nullptr, fn_exit, "Allocation of ishmemi_upcall_funcs failed\n"); - for (int i = 0; i < ISHMEMI_OP_END; ++i) { + for (size_t i = 0; i < ISHMEMI_OP_END; ++i) { ishmemi_upcall_funcs[i] = (ishmemi_runtime_proxy_func_t *) ::malloc( sizeof(ishmemi_runtime_proxy_func_t) * ishmemi_runtime->proxy_func_num_types); ISHMEM_CHECK_GOTO_MSG(ishmemi_upcall_funcs[i] == nullptr, fn_exit, "Allocation of ishmemi_upcall_funcs row failed\n"); - for (int j = 0; j < ishmemi_runtime->proxy_func_num_types; ++j) { + for (size_t j = 0; j < ishmemi_runtime->proxy_func_num_types; ++j) { ishmemi_upcall_funcs[i][j] = ishmemi_runtime->proxy_funcs[i][j]; } } @@ -245,8 +245,8 @@ int ishmemi_proxy_func_init() int ishmemi_proxy_func_fini() { - for (int i = 0; i < ISHMEMI_OP_END; ++i) { - for (int j = 0; j < ishmemi_runtime->proxy_func_num_types; ++j) { + for (size_t i = 0; i < ISHMEMI_OP_END; ++i) { + for (size_t j = 0; j < ishmemi_runtime->proxy_func_num_types; ++j) { ishmemi_upcall_funcs[i][j] = nullptr; } 
::free(ishmemi_upcall_funcs[i]); diff --git a/src/rma.cpp b/src/rma.cpp index abe0d3f..837de23 100644 --- a/src/rma.cpp +++ b/src/rma.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2025 Intel Corporation +/* Copyright (C) 2023 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -32,7 +32,7 @@ sycl::event ishmemx_put_on_queue(T *dest, const T *src, size_t nelems, int pe, s auto e = q.submit([&](sycl::handler &cgh) { set_cmd_grp_dependencies(cgh, entry_already_exists, iter->second->event, deps); - cgh.host_task([=]() { ishmem_put(dest, src, nelems, pe); }); + cgh.single_task([=]() { ishmem_put(dest, src, nelems, pe); }); }); ishmemi_on_queue_events_map[&q]->event = e; return e; @@ -181,7 +181,7 @@ sycl::event ishmemx_iput_on_queue(T *dest, const T *src, ptrdiff_t dst, ptrdiff_ ishmemx_iput_work_group(dest, src, dst, sst, nelems, pe, it.get_group()); }); } else { - cgh.host_task([=]() { ishmem_iput(dest, src, dst, sst, nelems, pe); }); + cgh.single_task([=]() { ishmem_iput(dest, src, dst, sst, nelems, pe); }); } }); ishmemi_on_queue_events_map[&q]->event = e; @@ -346,7 +346,7 @@ sycl::event ishmemx_ibput_on_queue(T *dest, const T *src, ptrdiff_t dst, ptrdiff it.get_group()); }); } else { - cgh.host_task([=]() { ishmemx_ibput(dest, src, dst, sst, bsize, nblocks, pe); }); + cgh.single_task([=]() { ishmemx_ibput(dest, src, dst, sst, bsize, nblocks, pe); }); } }); ishmemi_on_queue_events_map[&q]->event = e; @@ -538,7 +538,7 @@ sycl::event ishmemx_get_on_queue(T *dest, const T *src, size_t nelems, int pe, s auto e = q.submit([&](sycl::handler &cgh) { set_cmd_grp_dependencies(cgh, entry_already_exists, iter->second->event, deps); - cgh.host_task([=]() { ishmem_get(dest, src, nelems, pe); }); + cgh.single_task([=]() { ishmem_get(dest, src, nelems, pe); }); }); ishmemi_on_queue_events_map[&q]->event = e; return e; @@ -687,7 +687,7 @@ sycl::event ishmemx_iget_on_queue(T *dest, const T *src, ptrdiff_t dst, ptrdiff_ ishmemx_iget_work_group(dest, src, dst, sst, nelems, pe, 
it.get_group()); }); } else { - cgh.host_task([=]() { ishmem_iget(dest, src, dst, sst, nelems, pe); }); + cgh.single_task([=]() { ishmem_iget(dest, src, dst, sst, nelems, pe); }); } }); ishmemi_on_queue_events_map[&q]->event = e; @@ -852,7 +852,7 @@ sycl::event ishmemx_ibget_on_queue(T *dest, const T *src, ptrdiff_t dst, ptrdiff it.get_group()); }); } else { - cgh.host_task([=]() { ishmemx_ibget(dest, src, dst, sst, bsize, nblocks, pe); }); + cgh.single_task([=]() { ishmemx_ibget(dest, src, dst, sst, bsize, nblocks, pe); }); } }); ishmemi_on_queue_events_map[&q]->event = e; diff --git a/src/runtime/runtime_mpi.cpp b/src/runtime/runtime_mpi.cpp index 611cdd9..7bd91f9 100644 --- a/src/runtime/runtime_mpi.cpp +++ b/src/runtime/runtime_mpi.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -63,6 +63,26 @@ #define CALC_DISP(target, base) (intptr_t) target - (ptrdiff_t) base +#define CONVERT_GPU_BUFFER(QUALIFIER, TYPE, var, size, constexpr_check) \ + QUALIFIER TYPE *var##_host = var; \ + bool var##_gpu = false; \ + ze_ipc_mem_handle_t var##_handle = {}; \ + if constexpr (constexpr_check) { \ + if (ISHMEMI_HOST_IN_HEAP(var)) { \ + var##_host = ISHMEMI_DEVICE_TO_MMAP_ADDR(TYPE, var); \ + } else if ((var##_gpu == is_gpu_buffer(var)) && var##_gpu) { \ + var##_host = ishmemi_get_mmap_address(var, size, &var##_handle); \ + } \ + } + +#define CLEANUP_GPU_BUFFER(TYPE, var, size, constexpr_check) \ + if constexpr (constexpr_check) { \ + if (var##_gpu) { \ + ret = ishmemi_close_mmap_address(var##_handle, (TYPE *) var##_host, size); \ + ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); \ + } \ + } + /* Runtime generic implementations */ namespace { template @@ -86,6 +106,23 @@ namespace { } } + template + static inline bool is_gpu_buffer(const T *ptr) + { + int ret; + ze_memory_type_t type; + + if (ptr == nullptr) return false; + + ret = ishmemi_get_memory_type(ptr, &type); + 
ISHMEM_CHECK_GOTO_MSG(ret == -1, fn_exit, "Failed to check memory type of pointer\n"); + + return (type == ZE_MEMORY_TYPE_DEVICE); + + fn_exit: + return true; + } + inline int barrier_impl(MPI_Comm comm, MPI_Win win) { int ret = 0; @@ -142,6 +179,100 @@ namespace { return ret; } + template + inline int test_multi_impl(T cmp_value, int cmp, MPI_Datatype dt, int rank, MPI_Aint start, + size_t nelems, const int *status, size_t *indices, size_t &complete, + MPI_Win win) + { + int ret = 0; + MPI_Op op = MPI_NO_OP; + MPI_Aint disp = start; + T *results = (T *) ::calloc(nelems, sizeof(T)); + ISHMEM_CHECK_GOTO_MSG(results == nullptr, fn_fail, "Unable to allocate host memory\n"); + + for (size_t i = 0; i < nelems; ++i) { + if (status && status[i]) { + disp = disp + (MPI_Aint) sizeof(T); + continue; + } + + MPI_CHECK_GOTO(fn_fail, ishmemi_mpi_wrappers::Fetch_and_op(nullptr, &results[i], dt, + rank, disp, op, win)); + + disp = disp + (MPI_Aint) sizeof(T); + } + + MPI_CHECK_GOTO(fn_fail, ishmemi_mpi_wrappers::Win_flush_local(rank, win)); + + for (size_t i = 0; i < nelems; ++i) { + int tmp; + if (status && status[i]) { + continue; + } + + tmp = compare(cmp, results[i], cmp_value); + ISHMEM_CHECK_GOTO_MSG(tmp == -1, fn_fail, "Unknown or unsupported comparison op\n"); + if (tmp == 1) { + indices[complete] = i; + ++complete; + } + } + + fn_exit: + ::free(results); + return ret; + fn_fail: + ret = -1; + goto fn_exit; + } + + template + inline int test_multi_impl(const T *cmp_values, int cmp, MPI_Datatype dt, int rank, + MPI_Aint start, size_t nelems, const int *status, size_t *indices, + size_t &complete, MPI_Win win) + { + int ret = 0; + MPI_Op op = MPI_NO_OP; + MPI_Aint disp = start; + T *results = (T *) ::calloc(nelems, sizeof(T)); + ISHMEM_CHECK_GOTO_MSG(results == nullptr, fn_fail, "Unable to allocate host memory\n"); + + for (size_t i = 0; i < nelems; ++i) { + if (status && status[i]) { + disp = disp + (MPI_Aint) sizeof(T); + continue; + } + + MPI_CHECK_GOTO(fn_fail, 
ishmemi_mpi_wrappers::Fetch_and_op(nullptr, &results[i], dt, + rank, disp, op, win)); + + disp = disp + (MPI_Aint) sizeof(T); + } + + MPI_CHECK_GOTO(fn_fail, ishmemi_mpi_wrappers::Win_flush_local(rank, win)); + + for (size_t i = 0; i < nelems; ++i) { + int tmp; + if (status && status[i]) { + continue; + } + + tmp = compare(cmp, results[i], cmp_values[i]); + ISHMEM_CHECK_GOTO_MSG(tmp == -1, fn_fail, "Unknown or unsupported comparison op\n"); + if (tmp == 1) { + indices[complete] = i; + ++complete; + } + } + + fn_exit: + ::free(results); + return ret; + fn_fail: + ret = -1; + goto fn_exit; + } + template inline int test_impl(T cmp_value, int cmp, MPI_Datatype dt, int rank, MPI_Aint disp, MPI_Win win) @@ -344,7 +475,7 @@ namespace impl { bsize = 1; } - if (dst == bsize && sst == bsize) { + if (dst == (ptrdiff_t) bsize && sst == (ptrdiff_t) bsize) { put(msg, comp); } else { MPI_Datatype sdt = MPI_DATATYPE_NULL; @@ -376,13 +507,13 @@ namespace impl { bsize = 1; } - if (dst == bsize && sst == bsize) { + if (dst == (ptrdiff_t) bsize && sst == (ptrdiff_t) bsize) { put(msg, comp); } else { T *src_offset = (T *) src; MPI_Aint disp_offset = disp; - for (int i = 0; i < nelems; ++i) { + for (size_t i = 0; i < nelems; ++i) { MPI_CHECK_GOTO( fn_exit, ishmemi_mpi_wrappers::Put(src_offset, (int) bsize, dt, pe, disp_offset, (int) bsize, dt, win)); @@ -437,7 +568,7 @@ namespace impl { bsize = 1; } - if (dst == bsize && sst == bsize) { + if (dst == (ptrdiff_t) bsize && sst == (ptrdiff_t) bsize) { get(msg, comp); } else { MPI_Datatype sdt = MPI_DATATYPE_NULL; @@ -469,13 +600,13 @@ namespace impl { bsize = 1; } - if (dst == bsize && sst == bsize) { + if (dst == (ptrdiff_t) bsize && sst == (ptrdiff_t) bsize) { get(msg, comp); } else { T *dest_offset = (T *) dest; MPI_Aint disp_offset = disp; - for (int i = 0; i < nelems; ++i) { + for (size_t i = 0; i < nelems; ++i) { MPI_CHECK_GOTO(fn_exit, ishmemi_mpi_wrappers::Get(dest_offset, (int) bsize, dt, pe, disp_offset, (int) bsize, dt, win)); 
@@ -608,7 +739,7 @@ namespace impl { ISHMEMI_RUNTIME_MPI_REQUEST_HELPER(uint8_t, BCAST); if (rank == root) { - ishmem_copy(dest, src, nelems); + ishmemi_copy(dest, src, nelems); } MPI_CHECK(ishmemi_mpi_wrappers::Bcast(dest, (int) nelems, dt, root, comm)); return ret; @@ -625,7 +756,10 @@ namespace impl { int world_team_size = ishmemi_runtime_mpi::teams[ishmemi_runtime_mpi::world_team].size; recvcounts = (int *) ::malloc(sizeof(int) * (size_t) world_team_size); + ISHMEM_CHECK_GOTO_MSG(recvcounts == nullptr, fn_fail, "Unable to allocate host memory\n"); + displs = (int *) ::malloc(sizeof(int) * (size_t) world_team_size); + ISHMEM_CHECK_GOTO_MSG(displs == nullptr, fn_fail, "Unable to allocate host memory\n"); /* Allgather nelems */ MPI_CHECK_GOTO(fn_exit, ishmemi_mpi_wrappers::Allgather(&rcount, 1, MPI_INT, recvcounts, 1, @@ -641,13 +775,16 @@ namespace impl { /* Perform the collect */ MPI_CHECK_GOTO(fn_exit, ishmemi_mpi_wrappers::Allgatherv(src, (int) nelems, dt, dest, recvcounts, displs, dt, comm)); + fn_exit: comp->completion.ret.i = ret; - fn_exit: ::free(displs); ::free(recvcounts); return ret; + fn_fail: + ret = -1; + goto fn_exit; } static int fcollect(ishmemi_request_t *msg, ishmemi_ringcompletion_t *comp) @@ -674,6 +811,29 @@ namespace impl { return ret; } + /* SCAN */ + template + int inscan(ishmemi_request_t *msg, ishmemi_ringcompletion_t *comp) + { + int ret = 0; + ISHMEMI_RUNTIME_MPI_REQUEST_HELPER(T, INSCAN); + + MPI_CHECK(ishmemi_mpi_wrappers::Scan(src, dest, (int) nelems, dt, MPI_SUM, comm)); + comp->completion.ret.i = ret; + return ret; + } + + template + int exscan(ishmemi_request_t *msg, ishmemi_ringcompletion_t *comp) + { + int ret = 0; + ISHMEMI_RUNTIME_MPI_REQUEST_HELPER(T, EXSCAN); + + MPI_CHECK(ishmemi_mpi_wrappers::Exscan(src, dest, (int) nelems, dt, MPI_SUM, comm)); + comp->completion.ret.i = ret; + return ret; + } + /* Point-to-point Synchronization */ template static int test(ishmemi_request_t *msg, ishmemi_ringcompletion_t *comp) @@ 
-698,14 +858,18 @@ namespace impl { if (nelems == 0) ret = 1; - for (int i = 0; i < nelems; ++i) { - if (status && status[i]) { + /* Get host buffers */ + CONVERT_GPU_BUFFER(const, int, status, sizeof(int) * nelems, true); + CONVERT_GPU_BUFFER(const, T, cmp_values, sizeof(T) * nelems, VECTOR); + + for (size_t i = 0; i < nelems; ++i) { + if (status_host && status_host[i]) { disp = disp + (MPI_Aint) sizeof(T); continue; } if constexpr (VECTOR) { - ret = test_impl(cmp_values[i], cmp, dt, rank, disp, win); + ret = test_impl(cmp_values_host[i], cmp, dt, rank, disp, win); } else { ret = test_impl(cmp_value, cmp, dt, rank, disp, win); } @@ -715,6 +879,9 @@ namespace impl { disp = disp + (MPI_Aint) sizeof(T); } + CLEANUP_GPU_BUFFER(int, status, sizeof(int) * nelems, true); + CLEANUP_GPU_BUFFER(T, cmp_values, sizeof(T) * nelems, VECTOR); + comp->completion.ret.i = ret; ret = 0; @@ -729,26 +896,33 @@ namespace impl { int ret = 0; ISHMEMI_RUNTIME_MPI_DISP_REQUEST_HELPER(T, OP, dest); - for (int i = 0; i < nelems; ++i) { - if (status && status[i]) { + /* Get host buffers */ + CONVERT_GPU_BUFFER(const, int, status, sizeof(int) * nelems, true); + CONVERT_GPU_BUFFER(const, T, cmp_values, sizeof(T) * nelems, VECTOR); + + for (size_t i = 0; i < nelems; ++i) { + if (status_host && status_host[i]) { disp = disp + (MPI_Aint) sizeof(T); continue; } if constexpr (VECTOR) { - ret = test_impl(cmp_values[i], cmp, dt, rank, disp, win); + ret = test_impl(cmp_values_host[i], cmp, dt, rank, disp, win); } else { ret = test_impl(cmp_value, cmp, dt, rank, disp, win); } ISHMEM_CHECK_GOTO_MSG(ret == -1, fn_exit, "Failed to execute test_any"); if (ret == 1) { - complete = static_cast(i); + complete = i; break; } disp = disp + (MPI_Aint) sizeof(T); } + CLEANUP_GPU_BUFFER(int, status, sizeof(int) * nelems, true); + CLEANUP_GPU_BUFFER(T, cmp_values, sizeof(T) * nelems, VECTOR); + comp->completion.ret.szt = complete; ret = 0; @@ -763,25 +937,23 @@ namespace impl { int ret = 0; 
ISHMEMI_RUNTIME_MPI_DISP_REQUEST_HELPER(T, OP, dest); - for (int i = 0; i < nelems; ++i) { - if (status && status[i]) { - disp = disp + (MPI_Aint) sizeof(T); - continue; - } - - if constexpr (VECTOR) { - ret = test_impl(cmp_values[i], cmp, dt, rank, disp, win); - } else { - ret = test_impl(cmp_value, cmp, dt, rank, disp, win); - } + /* Get host buffers */ + CONVERT_GPU_BUFFER(, size_t, indices, sizeof(size_t) * nelems, true); + CONVERT_GPU_BUFFER(const, int, status, sizeof(int) * nelems, true); + CONVERT_GPU_BUFFER(const, T, cmp_values, sizeof(T) * nelems, VECTOR); - ISHMEM_CHECK_GOTO_MSG(ret == -1, fn_exit, "Failed to execute test_some"); - if (ret == 1) { - indices[complete] = static_cast(i); - ++complete; - } - disp = disp + (MPI_Aint) sizeof(T); + if constexpr (VECTOR) { + ret = test_multi_impl(cmp_values_host, cmp, dt, rank, disp, nelems, status_host, + indices_host, complete, win); + } else { + ret = test_multi_impl(cmp_value, cmp, dt, rank, disp, nelems, status_host, indices_host, + complete, win); } + ISHMEM_CHECK_GOTO_MSG(ret == -1, fn_exit, "Failed to run multiple test ops\n"); + + CLEANUP_GPU_BUFFER(size_t, indices, sizeof(size_t) * nelems, true); + CLEANUP_GPU_BUFFER(int, status, sizeof(int) * nelems, true); + CLEANUP_GPU_BUFFER(T, cmp_values, sizeof(T) * nelems, VECTOR); comp->completion.ret.szt = complete; ret = 0; @@ -843,23 +1015,27 @@ namespace impl { int ret = 0; ISHMEMI_RUNTIME_MPI_DISP_REQUEST_HELPER(T, OP, dest); - int num_skip = 0; - if (status) { - for (int i = 0; i < nelems; ++i) { - num_skip += (status[i] == 0) ? 0 : 1; + /* Get host buffers */ + CONVERT_GPU_BUFFER(const, int, status, sizeof(int) * nelems, true); + CONVERT_GPU_BUFFER(const, T, cmp_values, sizeof(T) * nelems, VECTOR); + + size_t num_skip = 0; + if (status_host) { + for (size_t i = 0; i < nelems; ++i) { + num_skip += (status_host[i] == 0) ? 
0 : 1; } } if (num_skip < nelems) { /* Iteratively wait_until on each ivar */ - for (int i = 0; i < nelems; ++i) { - if (status && status[i]) { + for (size_t i = 0; i < nelems; ++i) { + if (status_host && status_host[i]) { disp = disp + (MPI_Aint) sizeof(T); continue; } while (true) { if constexpr (VECTOR) { - ret = test_impl(cmp_values[i], cmp, dt, rank, disp, win); + ret = test_impl(cmp_values_host[i], cmp, dt, rank, disp, win); } else { ret = test_impl(cmp_value, cmp, dt, rank, disp, win); } @@ -872,6 +1048,9 @@ namespace impl { } } + CLEANUP_GPU_BUFFER(int, status, sizeof(int) * nelems, true); + CLEANUP_GPU_BUFFER(T, cmp_values, sizeof(T) * nelems, VECTOR); + ret = 0; fn_exit: @@ -886,31 +1065,35 @@ namespace impl { ISHMEMI_RUNTIME_MPI_DISP_REQUEST_HELPER(T, OP, dest); MPI_Aint tmp_disp = disp; - int num_skip = 0; - if (status) { - for (int i = 0; i < nelems; ++i) { - num_skip += (status[i] == 0) ? 0 : 1; + /* Get host buffers */ + CONVERT_GPU_BUFFER(const, int, status, sizeof(int) * nelems, true); + CONVERT_GPU_BUFFER(const, T, cmp_values, sizeof(T) * nelems, VECTOR); + + size_t num_skip = 0; + if (status_host) { + for (size_t i = 0; i < nelems; ++i) { + num_skip += (status_host[i] == 0) ? 
0 : 1; } } if (num_skip < nelems) { while (true) { /* Iteratively test each ivar */ - for (int i = 0; i < nelems; ++i) { - if (status && status[i]) { + for (size_t i = 0; i < nelems; ++i) { + if (status_host && status_host[i]) { tmp_disp = tmp_disp + (MPI_Aint) sizeof(T); continue; } if constexpr (VECTOR) { - ret = test_impl(cmp_values[i], cmp, dt, rank, tmp_disp, win); + ret = test_impl(cmp_values_host[i], cmp, dt, rank, tmp_disp, win); } else { ret = test_impl(cmp_value, cmp, dt, rank, tmp_disp, win); } ISHMEM_CHECK_GOTO_MSG(ret == -1, fn_exit, "Failed to execute wait_until_any"); if (ret == 1) { - complete = static_cast(i); + complete = i; break; } tmp_disp = tmp_disp + (MPI_Aint) sizeof(T); @@ -923,6 +1106,9 @@ namespace impl { } } + CLEANUP_GPU_BUFFER(int, status, sizeof(int) * nelems, true); + CLEANUP_GPU_BUFFER(T, cmp_values, sizeof(T) * nelems, VECTOR); + comp->completion.ret.szt = complete; ret = 0; @@ -938,33 +1124,39 @@ namespace impl { ISHMEMI_RUNTIME_MPI_DISP_REQUEST_HELPER(T, OP, dest); MPI_Aint tmp_disp = disp; - int num_skip = 0; - if (status) { - for (int i = 0; i < nelems; ++i) { - num_skip += (status[i] == 0) ? 0 : 1; + /* Get host buffers */ + CONVERT_GPU_BUFFER(, size_t, indices, sizeof(size_t) * nelems, true); + CONVERT_GPU_BUFFER(const, int, status, sizeof(int) * nelems, true); + CONVERT_GPU_BUFFER(const, T, cmp_values, sizeof(T) * nelems, VECTOR); + + size_t num_skip = 0; + if (status_host) { + for (size_t i = 0; i < nelems; ++i) { + num_skip += (status_host[i] == 0) ? 
0 : 1; } } if (num_skip < nelems) { while (true) { /* Iteratively test each ivar */ - for (int i = 0; i < nelems; ++i) { - if (status && status[i]) { + for (size_t i = 0; i < nelems; ++i) { + if (status_host && status_host[i]) { tmp_disp = tmp_disp + (MPI_Aint) sizeof(T); continue; } if constexpr (VECTOR) { - ret = test_impl(cmp_values[i], cmp, dt, rank, tmp_disp, win); + ret = test_impl(cmp_values_host[i], cmp, dt, rank, tmp_disp, win); } else { ret = test_impl(cmp_value, cmp, dt, rank, tmp_disp, win); } ISHMEM_CHECK_GOTO_MSG(ret == -1, fn_exit, "Failed to execute wait_until_some"); if (ret == 1) { - indices[complete] = static_cast(i); + indices_host[complete] = i; ++complete; } + tmp_disp = tmp_disp + (MPI_Aint) sizeof(T); force_progress(comm); } @@ -975,6 +1167,10 @@ namespace impl { } } + CLEANUP_GPU_BUFFER(size_t, indices, sizeof(size_t) * nelems, true); + CLEANUP_GPU_BUFFER(int, status, sizeof(int) * nelems, true); + CLEANUP_GPU_BUFFER(T, cmp_values, sizeof(T) * nelems, VECTOR); + comp->completion.ret.szt = complete; ret = 0; @@ -1506,10 +1702,10 @@ void ishmemi_runtime_mpi::funcptr_init(void) /* Initialize every function with the "unsupported op" function */ /* Note: KILL operation is covered inside the proxy directly - it is the same for all backends * currently */ - for (int i = 0; i < ISHMEMI_OP_END; ++i) { + for (size_t i = 0; i < ISHMEMI_OP_END; ++i) { proxy_funcs[i] = (ishmemi_runtime_proxy_func_t *) ::malloc( sizeof(ishmemi_runtime_proxy_func_t) * ishmemi_runtime_type::proxy_func_num_types); - for (int j = 0; j < ishmemi_runtime_type::proxy_func_num_types; ++j) { + for (size_t j = 0; j < ishmemi_runtime_type::proxy_func_num_types; ++j) { proxy_funcs[i][j] = ishmemi_runtime_type::unsupported; } } @@ -1851,6 +2047,33 @@ void ishmemi_runtime_mpi::funcptr_init(void) proxy_funcs[SUM_REDUCE][DOUBLE] = impl::reduce; proxy_funcs[PROD_REDUCE][DOUBLE] = impl::reduce; + /* Scan */ + proxy_funcs[INSCAN][UINT8] = impl::inscan; + proxy_funcs[INSCAN][UINT16] = 
impl::inscan; + proxy_funcs[INSCAN][UINT32] = impl::inscan; + proxy_funcs[INSCAN][UINT64] = impl::inscan; + proxy_funcs[INSCAN][ULONGLONG] = impl::inscan; + proxy_funcs[INSCAN][INT8] = impl::inscan; + proxy_funcs[INSCAN][INT16] = impl::inscan; + proxy_funcs[INSCAN][INT32] = impl::inscan; + proxy_funcs[INSCAN][INT64] = impl::inscan; + proxy_funcs[INSCAN][LONGLONG] = impl::inscan; + proxy_funcs[INSCAN][FLOAT] = impl::inscan; + proxy_funcs[INSCAN][DOUBLE] = impl::inscan; + + proxy_funcs[EXSCAN][UINT8] = impl::exscan; + proxy_funcs[EXSCAN][UINT16] = impl::exscan; + proxy_funcs[EXSCAN][UINT32] = impl::exscan; + proxy_funcs[EXSCAN][UINT64] = impl::exscan; + proxy_funcs[EXSCAN][ULONGLONG] = impl::exscan; + proxy_funcs[EXSCAN][INT8] = impl::exscan; + proxy_funcs[EXSCAN][INT16] = impl::exscan; + proxy_funcs[EXSCAN][INT32] = impl::exscan; + proxy_funcs[EXSCAN][INT64] = impl::exscan; + proxy_funcs[EXSCAN][LONGLONG] = impl::exscan; + proxy_funcs[EXSCAN][FLOAT] = impl::exscan; + proxy_funcs[EXSCAN][DOUBLE] = impl::exscan; + /* Point-to-point Synchronization */ proxy_funcs[TEST][INT32] = impl::test; proxy_funcs[TEST_ALL][INT32] = impl::test_all; @@ -1959,8 +2182,8 @@ void ishmemi_runtime_mpi::funcptr_init(void) void ishmemi_runtime_mpi::funcptr_fini(void) { - for (int i = 0; i < ISHMEMI_OP_END; ++i) { - for (int j = 0; j < ishmemi_runtime_type::proxy_func_num_types; ++j) { + for (size_t i = 0; i < ISHMEMI_OP_END; ++i) { + for (size_t j = 0; j < ishmemi_runtime_type::proxy_func_num_types; ++j) { proxy_funcs[i][j] = ishmemi_runtime_type::unsupported; } ISHMEMI_FREE(::free, proxy_funcs[i]); diff --git a/src/runtime/runtime_openshmem.cpp b/src/runtime/runtime_openshmem.cpp index bfce439..d07c3f2 100644 --- a/src/runtime/runtime_openshmem.cpp +++ b/src/runtime/runtime_openshmem.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -746,6 +746,27 @@ int 
ishmemi_openshmem_reduce(ishmemi_request_t *msg, ishmemi_ringcompletion_t *c return comp->completion.ret.i; } +/* Scan */ +template +int ishmemi_openshmem_inscan(ishmemi_request_t *msg, ishmemi_ringcompletion_t *comp) +{ + ISHMEMI_RUNTIME_REQUEST_HELPER(T, INSCAN); + auto wrapper = ishmemi_openshmem_wrappers::inscan(); + comp->completion.ret.i = + wrapper(ishmemi_openshmem_wrappers::SHMEM_TEAM_WORLD, dest, src, nelems); + return comp->completion.ret.i; +} + +template +int ishmemi_openshmem_exscan(ishmemi_request_t *msg, ishmemi_ringcompletion_t *comp) +{ + ISHMEMI_RUNTIME_REQUEST_HELPER(T, EXSCAN); + auto wrapper = ishmemi_openshmem_wrappers::exscan(); + comp->completion.ret.i = + wrapper(ishmemi_openshmem_wrappers::SHMEM_TEAM_WORLD, dest, src, nelems); + return comp->completion.ret.i; +} + /* Point-to-Point Synchronization */ template int ishmemi_openshmem_test(ishmemi_request_t *msg, ishmemi_ringcompletion_t *comp) @@ -1086,10 +1107,10 @@ void ishmemi_runtime_openshmem::funcptr_init() /* Initialize every function with the "unsupported op" function */ /* Note: KILL operation is covered inside the proxy directly - it is the same for all backends * currently */ - for (int i = 0; i < ISHMEMI_OP_END; ++i) { + for (size_t i = 0; i < ISHMEMI_OP_END; ++i) { proxy_funcs[i] = (ishmemi_runtime_proxy_func_t *) ::malloc( sizeof(ishmemi_runtime_proxy_func_t) * ishmemi_runtime_type::proxy_func_num_types); - for (int j = 0; j < ishmemi_runtime_type::proxy_func_num_types; ++j) { + for (size_t j = 0; j < ishmemi_runtime_type::proxy_func_num_types; ++j) { proxy_funcs[i][j] = ishmemi_runtime_type::unsupported; } } @@ -1401,6 +1422,37 @@ void ishmemi_runtime_openshmem::funcptr_init() proxy_funcs[SUM_REDUCE][DOUBLE] = ishmemi_openshmem_reduce; proxy_funcs[PROD_REDUCE][DOUBLE] = ishmemi_openshmem_reduce; + /* Scan */ + if (ishmemi_openshmem_wrappers::inscan_exists) { + proxy_funcs[INSCAN][UINT8] = ishmemi_openshmem_inscan; + proxy_funcs[INSCAN][UINT16] = ishmemi_openshmem_inscan; + 
proxy_funcs[INSCAN][UINT32] = ishmemi_openshmem_inscan; + proxy_funcs[INSCAN][UINT64] = ishmemi_openshmem_inscan; + proxy_funcs[INSCAN][ULONGLONG] = ishmemi_openshmem_inscan; + proxy_funcs[INSCAN][INT8] = ishmemi_openshmem_inscan; + proxy_funcs[INSCAN][INT16] = ishmemi_openshmem_inscan; + proxy_funcs[INSCAN][INT32] = ishmemi_openshmem_inscan; + proxy_funcs[INSCAN][INT64] = ishmemi_openshmem_inscan; + proxy_funcs[INSCAN][LONGLONG] = ishmemi_openshmem_inscan; + proxy_funcs[INSCAN][FLOAT] = ishmemi_openshmem_inscan; + proxy_funcs[INSCAN][DOUBLE] = ishmemi_openshmem_inscan; + } + + if (ishmemi_openshmem_wrappers::exscan_exists) { + proxy_funcs[EXSCAN][UINT8] = ishmemi_openshmem_exscan; + proxy_funcs[EXSCAN][UINT16] = ishmemi_openshmem_exscan; + proxy_funcs[EXSCAN][UINT32] = ishmemi_openshmem_exscan; + proxy_funcs[EXSCAN][UINT64] = ishmemi_openshmem_exscan; + proxy_funcs[EXSCAN][ULONGLONG] = ishmemi_openshmem_exscan; + proxy_funcs[EXSCAN][INT8] = ishmemi_openshmem_exscan; + proxy_funcs[EXSCAN][INT16] = ishmemi_openshmem_exscan; + proxy_funcs[EXSCAN][INT32] = ishmemi_openshmem_exscan; + proxy_funcs[EXSCAN][INT64] = ishmemi_openshmem_exscan; + proxy_funcs[EXSCAN][LONGLONG] = ishmemi_openshmem_exscan; + proxy_funcs[EXSCAN][FLOAT] = ishmemi_openshmem_exscan; + proxy_funcs[EXSCAN][DOUBLE] = ishmemi_openshmem_exscan; + } + /* Point-to-Point Synchronization */ if (oshmpi) { proxy_funcs[TEST][INT32] = ishmemi_openshmem_test; @@ -1650,8 +1702,8 @@ void ishmemi_runtime_openshmem::funcptr_init() void ishmemi_runtime_openshmem::funcptr_fini() { - for (int i = 0; i < ISHMEMI_OP_END; ++i) { - for (int j = 0; j < ishmemi_runtime_type::proxy_func_num_types; ++j) { + for (size_t i = 0; i < ISHMEMI_OP_END; ++i) { + for (size_t j = 0; j < ishmemi_runtime_type::proxy_func_num_types; ++j) { proxy_funcs[i][j] = ishmemi_runtime_type::unsupported; } ISHMEMI_FREE(::free, proxy_funcs[i]); diff --git a/src/runtime/wrapper.h b/src/runtime/wrapper.h index f0b2da3..c846727 100644 --- 
a/src/runtime/wrapper.h +++ b/src/runtime/wrapper.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -9,6 +9,18 @@ #define ISHMEMI_LINK_STRINGIFY(INPUT) #INPUT +#define ISHMEMI_TRY_LINK_SYMBOL(lib_handle, prefix, suffix, exists) \ + do { \ + void **var_ptr = (void **) &suffix; \ + void *tmp = (void *) dlsym(lib_handle, ISHMEMI_LINK_STRINGIFY(prefix##_##suffix)); \ + if (tmp == nullptr) { \ + exists = false; \ + } else { \ + *var_ptr = tmp; \ + wrapper_list.push_back(var_ptr); \ + } \ + } while (0); + #define ISHMEMI_LINK_SYMBOL(lib_handle, prefix, suffix) \ do { \ void **var_ptr = (void **) &suffix; \ diff --git a/src/runtime/wrapper_mpi.cpp b/src/runtime/wrapper_mpi.cpp index 45f33d5..c7a72ba 100644 --- a/src/runtime/wrapper_mpi.cpp +++ b/src/runtime/wrapper_mpi.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -65,6 +65,8 @@ namespace ishmemi_mpi_wrappers { int (*Allgatherv)(const void *, int, MPI_Datatype, void *, const int[], const int[], MPI_Datatype, MPI_Comm); int (*Allreduce)(const void *, void *, int, MPI_Datatype, MPI_Op, MPI_Comm); + int (*Scan)(const void *, void *, int, MPI_Datatype, MPI_Op, MPI_Comm); + int (*Exscan)(const void *, void *, int, MPI_Datatype, MPI_Op, MPI_Comm); int (*Alltoall)(const void *, int, MPI_Datatype, void *, int, MPI_Datatype, MPI_Comm); int (*Barrier)(MPI_Comm); int (*Bcast)(void *, int, MPI_Datatype, int, MPI_Comm); @@ -154,6 +156,8 @@ namespace ishmemi_mpi_wrappers { ISHMEMI_LINK_SYMBOL(mpi_handle, MPI, Allgather); ISHMEMI_LINK_SYMBOL(mpi_handle, MPI, Allgatherv); ISHMEMI_LINK_SYMBOL(mpi_handle, MPI, Allreduce); + ISHMEMI_LINK_SYMBOL(mpi_handle, MPI, Scan); + ISHMEMI_LINK_SYMBOL(mpi_handle, MPI, Exscan); ISHMEMI_LINK_SYMBOL(mpi_handle, MPI, Alltoall); ISHMEMI_LINK_SYMBOL(mpi_handle, MPI, Barrier); ISHMEMI_LINK_SYMBOL(mpi_handle, MPI, 
Bcast); diff --git a/src/runtime/wrapper_mpi.h b/src/runtime/wrapper_mpi.h index 25b0793..308a048 100644 --- a/src/runtime/wrapper_mpi.h +++ b/src/runtime/wrapper_mpi.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -64,6 +64,8 @@ namespace ishmemi_mpi_wrappers { extern int (*Allgatherv)(const void *, int, MPI_Datatype, void *, const int[], const int[], MPI_Datatype, MPI_Comm); extern int (*Allreduce)(const void *, void *, int, MPI_Datatype, MPI_Op, MPI_Comm); + extern int (*Scan)(const void *, void *, int, MPI_Datatype, MPI_Op, MPI_Comm); + extern int (*Exscan)(const void *, void *, int, MPI_Datatype, MPI_Op, MPI_Comm); extern int (*Alltoall)(const void *, int, MPI_Datatype, void *, int, MPI_Datatype, MPI_Comm); extern int (*Barrier)(MPI_Comm); extern int (*Bcast)(void *, int, MPI_Datatype, int, MPI_Comm); diff --git a/src/runtime/wrapper_openshmem.cpp b/src/runtime/wrapper_openshmem.cpp index 9c0c3e1..1041fd8 100644 --- a/src/runtime/wrapper_openshmem.cpp +++ b/src/runtime/wrapper_openshmem.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -340,6 +340,35 @@ namespace ishmemi_openshmem_wrappers { reduce_type double_sum_reduce; reduce_type double_prod_reduce; + /* Scan */ + bool inscan_exists; + scan_type uint8_sum_inscan; + scan_type uint16_sum_inscan; + scan_type uint32_sum_inscan; + scan_type uint64_sum_inscan; + scan_type ulonglong_sum_inscan; + scan_type int8_sum_inscan; + scan_type int16_sum_inscan; + scan_type int32_sum_inscan; + scan_type int64_sum_inscan; + scan_type longlong_sum_inscan; + scan_type float_sum_inscan; + scan_type double_sum_inscan; + + bool exscan_exists; + scan_type uint8_sum_exscan; + scan_type uint16_sum_exscan; + scan_type uint32_sum_exscan; + scan_type uint64_sum_exscan; + scan_type ulonglong_sum_exscan; + scan_type int8_sum_exscan; + scan_type 
int16_sum_exscan; + scan_type int32_sum_exscan; + scan_type int64_sum_exscan; + scan_type longlong_sum_exscan; + scan_type float_sum_exscan; + scan_type double_sum_exscan; + /* Point-to-Point Synchronization */ test_type int32_test; test_all_type int32_test_all; @@ -796,6 +825,35 @@ namespace ishmemi_openshmem_wrappers { ISHMEMI_LINK_SYMBOL(shmem_handle, shmem, double_sum_reduce); ISHMEMI_LINK_SYMBOL(shmem_handle, shmem, double_prod_reduce); + /* Scan */ + inscan_exists = true; + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, uint8_sum_inscan, inscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, uint16_sum_inscan, inscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, uint32_sum_inscan, inscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, uint64_sum_inscan, inscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, ulonglong_sum_inscan, inscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, int8_sum_inscan, inscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, int16_sum_inscan, inscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, int32_sum_inscan, inscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, int64_sum_inscan, inscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, longlong_sum_inscan, inscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, float_sum_inscan, inscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, double_sum_inscan, inscan_exists); + + exscan_exists = true; + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, uint8_sum_exscan, exscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, uint16_sum_exscan, exscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, uint32_sum_exscan, exscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, uint64_sum_exscan, exscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, ulonglong_sum_exscan, exscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, int8_sum_exscan, 
exscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, int16_sum_exscan, exscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, int32_sum_exscan, exscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, int64_sum_exscan, exscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, longlong_sum_exscan, exscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, float_sum_exscan, exscan_exists); + ISHMEMI_TRY_LINK_SYMBOL(shmem_handle, shmemx, double_sum_exscan, exscan_exists); + /* Point-to-Point Synchronization */ ISHMEMI_LINK_SYMBOL(shmem_handle, shmem, int32_test); ISHMEMI_LINK_SYMBOL(shmem_handle, shmem, int32_test_all); diff --git a/src/runtime/wrapper_openshmem.h b/src/runtime/wrapper_openshmem.h index 547ec0b..d765e9f 100644 --- a/src/runtime/wrapper_openshmem.h +++ b/src/runtime/wrapper_openshmem.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -57,6 +57,9 @@ namespace ishmemi_openshmem_wrappers { /* Reductions */ template using reduce_type = int (*)(shmem_team_t, T *, const T *, size_t); + /* Scan */ + template using scan_type = int (*)(shmem_team_t, T *, const T *, size_t); + /* Point-to-Point Synchronization */ template using test_type = int (*)(T *, int, T); template using test_all_type = int (*)(T *, size_t, const int *, int, T); @@ -395,6 +398,35 @@ namespace ishmemi_openshmem_wrappers { extern reduce_type double_sum_reduce; extern reduce_type double_prod_reduce; + /* Scan */ + extern bool inscan_exists; + extern scan_type uint8_sum_inscan; + extern scan_type uint16_sum_inscan; + extern scan_type uint32_sum_inscan; + extern scan_type uint64_sum_inscan; + extern scan_type ulonglong_sum_inscan; + extern scan_type int8_sum_inscan; + extern scan_type int16_sum_inscan; + extern scan_type int32_sum_inscan; + extern scan_type int64_sum_inscan; + extern scan_type longlong_sum_inscan; + extern scan_type float_sum_inscan; + extern 
scan_type double_sum_inscan; + + extern bool exscan_exists; + extern scan_type uint8_sum_exscan; + extern scan_type uint16_sum_exscan; + extern scan_type uint32_sum_exscan; + extern scan_type uint64_sum_exscan; + extern scan_type ulonglong_sum_exscan; + extern scan_type int8_sum_exscan; + extern scan_type int16_sum_exscan; + extern scan_type int32_sum_exscan; + extern scan_type int64_sum_exscan; + extern scan_type longlong_sum_exscan; + extern scan_type float_sum_exscan; + extern scan_type double_sum_exscan; + /* Point-to-Point Synchronization */ extern test_type int32_test; extern test_all_type int32_test_all; @@ -814,6 +846,35 @@ return ulonglong_atomic_compare_swap_nbi; template <> inline auto reduce() -> reduce_type { return double_sum_reduce; } template <> inline auto reduce() -> reduce_type { return double_prod_reduce; } + template static constexpr scan_type inscan() { static_assert(assert_dependency::value, "Undefined wrapper function"); } + template <> inline auto inscan() -> scan_type { return uint8_sum_inscan; } + template <> inline auto inscan() -> scan_type { return uint16_sum_inscan; } + template <> inline auto inscan() -> scan_type { return uint32_sum_inscan; } + template <> inline auto inscan() -> scan_type { return uint64_sum_inscan; } + template <> inline auto inscan() -> scan_type { return ulonglong_sum_inscan; } + template <> inline auto inscan() -> scan_type { return int8_sum_inscan; } + template <> inline auto inscan() -> scan_type { return int16_sum_inscan; } + template <> inline auto inscan() -> scan_type { return int32_sum_inscan; } + template <> inline auto inscan() -> scan_type { return int64_sum_inscan; } + template <> inline auto inscan() -> scan_type { return longlong_sum_inscan; } + template <> inline auto inscan() -> scan_type { return float_sum_inscan; } + template <> inline auto inscan() -> scan_type { return double_sum_inscan; } + + template static constexpr scan_type exscan() { static_assert(assert_dependency::value, "Undefined 
wrapper function"); } + template <> inline auto exscan() -> scan_type { return uint8_sum_exscan; } + template <> inline auto exscan() -> scan_type { return uint16_sum_exscan; } + template <> inline auto exscan() -> scan_type { return uint32_sum_exscan; } + template <> inline auto exscan() -> scan_type { return uint64_sum_exscan; } + template <> inline auto exscan() -> scan_type { return ulonglong_sum_exscan; } + template <> inline auto exscan() -> scan_type { return int8_sum_exscan; } + template <> inline auto exscan() -> scan_type { return int16_sum_exscan; } + template <> inline auto exscan() -> scan_type { return int32_sum_exscan; } + template <> inline auto exscan() -> scan_type { return int64_sum_exscan; } + template <> inline auto exscan() -> scan_type { return longlong_sum_exscan; } + template <> inline auto exscan() -> scan_type { return float_sum_exscan; } + template <> inline auto exscan() -> scan_type { return double_sum_exscan; } + + template static constexpr test_type test() { static_assert(assert_dependency::value, "Undefined wrapper function"); } template <> inline auto test() -> test_type { return uint32_test; } template <> inline auto test() -> test_type { return uint64_test; } diff --git a/src/runtime_ipc.cpp b/src/runtime_ipc.cpp index 48d5a43..a35f656 100644 --- a/src/runtime_ipc.cpp +++ b/src/runtime_ipc.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ #include "ishmem/err.h" @@ -9,35 +9,48 @@ int ishmemi_ipc_put_v(int nitems, struct put_item *items) { int ret = 0; - ze_command_list_desc_t cmd_list_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + void *ipc_dst = nullptr; + ze_command_list_handle_t cmd_list = {}; + + ze_event_handle_t event; + ze_event_desc_t event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, .pNext = nullptr, - .commandQueueGroupOrdinal = 2, - .flags = 0, + .index = 0, + .signal = 0, + .wait = 0, }; - 
ze_command_list_handle_t cmd_list; - - ZE_CHECK( - zeCommandListCreate(ishmemi_ze_context, ishmemi_gpu_device, &cmd_list_desc, &cmd_list)); + ret = ishmemi_create_command_list(COPY_QUEUE, false, &cmd_list); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - for (size_t i = 0; i < nitems; i += 1) { - void *ipc_dst = get_ipc_buffer(items[i].pe, (void *) items[i].dst); - if (ipc_dst == nullptr) return (1); /* dest is not ipc-able */ + for (int i = 0; i < nitems; ++i) { + ipc_dst = get_ipc_buffer(items[i].pe, (void *) items[i].dst); + ISHMEMI_CHECK_RESULT((ipc_dst == nullptr), 0, fn_exit); ZE_CHECK(zeCommandListAppendMemoryCopy(cmd_list, ipc_dst, items[i].src, items[i].size, nullptr, 0, nullptr)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); } + + ZE_CHECK(zeEventCreate(ishmemi_ze_event_pool, &event_desc, &event)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + + ZE_CHECK(zeCommandListAppendSignalEvent(cmd_list, event)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + ZE_CHECK(zeCommandListClose(cmd_list)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - ZE_CHECK(zeCommandQueueExecuteCommandLists(ishmemi_ze_all_cmd_queue, 1, &cmd_list, nullptr)); + ret = ishmemi_execute_command_lists(COPY_QUEUE, 1, &cmd_list); + ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + + ZE_CHECK(zeEventHostSynchronize(event, UINT64_MAX)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); ZE_CHECK(zeCommandListDestroy(cmd_list)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + fn_exit: return ret; } diff --git a/src/runtime_ipc.h b/src/runtime_ipc.h index 74d9860..4c198f2 100644 --- a/src/runtime_ipc.h +++ b/src/runtime_ipc.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -48,36 +48,27 @@ int ishmemi_ipc_put_immediate_cl(TYPENAME *dst, const TYPENAME *src, size_t nele { int ret = 0; size_t bytes = nelems * size_of(); - ze_command_queue_desc_t cmd_queue_desc = {.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .pNext = nullptr, - .ordinal = 2, - .index = 0, 
- .flags = 0, - .mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; ze_command_list_handle_t cmd_list = {}; + ishmemi_queue_type_t queue_type = UNDEFINED_QUEUE; if ((pe == ishmemi_my_pe) || (pe == (ishmemi_my_pe ^ 1))) { - // use main copy engine - cmd_queue_desc.ordinal = 1; - cmd_queue_desc.index = 0; + queue_type = COPY_QUEUE; } else { - // rotate through link copy engines - cmd_queue_desc.ordinal = 2; - cmd_queue_desc.index = ishmemi_link_engine[ishmemi_next_link_engine_index()]; + queue_type = LINK_QUEUE; } + void *ipc_dst = get_ipc_buffer(pe, (void *) dst); - if (ipc_dst == nullptr) return (1); /* dest is not ipc-able */ + ISHMEMI_CHECK_RESULT((ipc_dst == nullptr), 0, fn_exit); - ZE_CHECK(zeCommandListCreateImmediate(ishmemi_ze_context, ishmemi_gpu_device, &cmd_queue_desc, - &cmd_list)); + ret = ishmemi_create_command_list(queue_type, true, &cmd_list); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + ZE_CHECK(zeCommandListAppendMemoryCopy(cmd_list, ipc_dst, src, bytes, nullptr, 0, nullptr)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + ZE_CHECK(zeCommandListDestroy(cmd_list)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - return (ret); fn_exit: return ret; @@ -88,16 +79,11 @@ int ishmemi_ipc_put_regular_cl(TYPENAME *dst, const TYPENAME *src, size_t nelems { int ret = 0; size_t bytes = nelems * size_of(); - ze_command_queue_handle_t cmd_queue; - ze_command_list_desc_t cmd_list_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, - .pNext = nullptr, - .commandQueueGroupOrdinal = 2, - .flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, - }; - ze_command_list_handle_t cmd_list; + ze_command_list_handle_t cmd_list = {}; + ishmemi_queue_type_t queue_type = UNDEFINED_QUEUE; + ze_event_handle_t event; ze_event_desc_t event_desc = { .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, .pNext = nullptr, @@ -105,26 +91,17 @@ int ishmemi_ipc_put_regular_cl(TYPENAME *dst, const TYPENAME *src, size_t nelems .signal = 0, .wait = 0, }; - ze_event_handle_t event; void 
*ipc_dst = get_ipc_buffer(pe, (void *) dst); - if (ipc_dst == nullptr) return (1); /* dest is not ipc-able */ + ISHMEMI_CHECK_RESULT((ipc_dst == nullptr), 0, fn_exit); if ((pe == ishmemi_my_pe) || (pe == (ishmemi_my_pe ^ 1))) { - // use main copy engine - cmd_queue = ishmemi_ze_cmd_queue; - cmd_list_desc.commandQueueGroupOrdinal = 1; /* main copy engine ordinal */ - cmd_list_desc.flags = 0; - /* create command list for the main command queue */ + queue_type = COPY_QUEUE; } else { - // rotate through link copy engines - unsigned int idx = ishmemi_next_link_engine_index(); - cmd_queue = ishmemi_ze_link_cmd_queue[idx]; - cmd_list_desc.commandQueueGroupOrdinal = 2; /* link engines ordinal */ - cmd_list_desc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; + queue_type = LINK_QUEUE; } - ZE_CHECK( - zeCommandListCreate(ishmemi_ze_context, ishmemi_gpu_device, &cmd_list_desc, &cmd_list)); + + ret = ishmemi_create_command_list(queue_type, false, &cmd_list); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); ZE_CHECK(zeEventCreate(ishmemi_ze_event_pool, &event_desc, &event)); @@ -136,7 +113,7 @@ int ishmemi_ipc_put_regular_cl(TYPENAME *dst, const TYPENAME *src, size_t nelems ZE_CHECK(zeCommandListClose(cmd_list)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - ZE_CHECK(zeCommandQueueExecuteCommandLists(cmd_queue, 1, &cmd_list, nullptr)); + ret = ishmemi_execute_command_lists(queue_type, 1, &cmd_list); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); ZE_CHECK(zeEventHostSynchronize(event, UINT64_MAX)); @@ -145,8 +122,6 @@ int ishmemi_ipc_put_regular_cl(TYPENAME *dst, const TYPENAME *src, size_t nelems ZE_CHECK(zeCommandListDestroy(cmd_list)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - return (ret); - fn_exit: return ret; } @@ -165,44 +140,22 @@ template int ishmemi_ipc_put_nbi(TYPENAME *dst, const TYPENAME *src, size_t nelems, int pe) { int ret = 0; - size_t outstanding = 0; size_t bytes = nelems * size_of(); - void *ipc_dst = get_ipc_buffer(pe, (void *) dst); - if (ipc_dst == nullptr) return (1); /* dest is 
not ipc-able */ - /* Check if src is a GPU buffer */ - ze_command_queue_handle_t cmd_queue; + ze_command_list_handle_t cmd_list = {}; - ze_command_list_desc_t cmd_list_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, - .pNext = nullptr, - .commandQueueGroupOrdinal = 2, - .flags = 0, - }; + ishmemi_queue_type_t queue_type = UNDEFINED_QUEUE; + + void *ipc_dst = get_ipc_buffer(pe, (void *) dst); + ISHMEMI_CHECK_RESULT((ipc_dst == nullptr), 0, fn_exit); if ((pe == ishmemi_my_pe) || (pe == (ishmemi_my_pe ^ 1))) { - // use main copy engine - cmd_queue = ishmemi_ze_cmd_queue; - cmd_list_desc.commandQueueGroupOrdinal = 1; /* main copy engine ordinal */ - /* create command list for the main command queue */ - ZE_CHECK( - zeCommandListCreate(ishmemi_ze_context, ishmemi_gpu_device, &cmd_list_desc, &cmd_list)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - /* save the command list for later destruction on synchronize */ - outstanding = ishmemi_ze_cmd_lists.push_back_thread_safe(cmd_list); + queue_type = COPY_QUEUE; } else { - // rotate through link copy engines - unsigned int idx = ishmemi_next_link_engine_index(); - cmd_queue = ishmemi_ze_link_cmd_queue[idx]; - cmd_list_desc.commandQueueGroupOrdinal = 2; /* link engines ordinal */ - /* create command list for the chosen link command queue */ - ZE_CHECK( - zeCommandListCreate(ishmemi_ze_context, ishmemi_gpu_device, &cmd_list_desc, &cmd_list)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - /* save the command list for later destruction on synchronize */ - outstanding = ishmemi_ze_link_cmd_lists[idx].push_back_thread_safe(cmd_list); + queue_type = LINK_QUEUE; } - /* We can assume that dst is a GPU buffer since it has to be on the symmetric heap */ + ret = ishmemi_create_command_list_nbi(queue_type, &cmd_list); + ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); ZE_CHECK(zeCommandListAppendMemoryCopy(cmd_list, ipc_dst, src, bytes, nullptr, 0, nullptr)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); @@ -210,10 +163,10 @@ int 
ishmemi_ipc_put_nbi(TYPENAME *dst, const TYPENAME *src, size_t nelems, int p ZE_CHECK(zeCommandListClose(cmd_list)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - ZE_CHECK(zeCommandQueueExecuteCommandLists(cmd_queue, 1, &cmd_list, nullptr)); + ret = ishmemi_execute_command_lists(queue_type, 1, &cmd_list); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - if (outstanding >= ishmemi_params.NBI_COUNT) ishmemi_level_zero_sync(); + /* TODO: Should we sync here or check periodically in proxy thread? */ fn_exit: return ret; @@ -224,35 +177,27 @@ int ishmemi_ipc_get_immediate_cl(TYPENAME *dst, const TYPENAME *src, size_t nele { int ret = 0; size_t bytes = nelems * size_of(); - ze_command_queue_desc_t cmd_queue_desc = {.stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .pNext = nullptr, - .ordinal = 1, - .index = 0, - .flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, - .mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; + ze_command_list_handle_t cmd_list = {}; + ishmemi_queue_type_t queue_type = UNDEFINED_QUEUE; if ((pe == ishmemi_my_pe) || (pe == (ishmemi_my_pe ^ 1))) { - // use main copy engine - cmd_queue_desc.ordinal = 1; - cmd_queue_desc.index = 0; + queue_type = COPY_QUEUE; } else { - // rotate through link copy engines< - cmd_queue_desc.ordinal = 2; - cmd_queue_desc.index = ishmemi_link_engine[ishmemi_next_link_engine_index()]; + queue_type = LINK_QUEUE; } + void *ipc_src = get_ipc_buffer(pe, (void *) src); - if (ipc_src == nullptr) return (1); /* src is not ipc-able */ + ISHMEMI_CHECK_RESULT((ipc_src == nullptr), 0, fn_exit); - ZE_CHECK(zeCommandListCreateImmediate(ishmemi_ze_context, ishmemi_gpu_device, &cmd_queue_desc, - &cmd_list)); + ret = ishmemi_create_command_list(queue_type, true, &cmd_list); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + ZE_CHECK(zeCommandListAppendMemoryCopy(cmd_list, dst, ipc_src, bytes, nullptr, 0, nullptr)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); + ZE_CHECK(zeCommandListDestroy(cmd_list)); ISHMEMI_CHECK_RESULT(ret, 0, 
fn_exit); - return (ret); fn_exit: return ret; @@ -263,16 +208,11 @@ int ishmemi_ipc_get_regular_cl(TYPENAME *dst, const TYPENAME *src, size_t nelems { int ret = 0; size_t bytes = nelems * size_of(); - ze_command_queue_handle_t cmd_queue; - ze_command_list_desc_t cmd_list_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, - .pNext = nullptr, - .commandQueueGroupOrdinal = 2, - .flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY, - }; - ze_command_list_handle_t cmd_list; + ze_command_list_handle_t cmd_list = {}; + ishmemi_queue_type_t queue_type = UNDEFINED_QUEUE; + ze_event_handle_t event; ze_event_desc_t event_desc = { .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, .pNext = nullptr, @@ -280,26 +220,17 @@ int ishmemi_ipc_get_regular_cl(TYPENAME *dst, const TYPENAME *src, size_t nelems .signal = 0, .wait = 0, }; - ze_event_handle_t event; void *ipc_src = get_ipc_buffer(pe, (void *) src); - if (ipc_src == nullptr) return (1); /* src is not ipc-able */ + ISHMEMI_CHECK_RESULT((ipc_src == nullptr), 0, fn_exit); if ((pe == ishmemi_my_pe) || (pe == (ishmemi_my_pe ^ 1))) { - // use main copy engine - cmd_queue = ishmemi_ze_cmd_queue; - cmd_list_desc.commandQueueGroupOrdinal = 1; /* main copy engine ordinal */ - cmd_list_desc.flags = 0; - /* create command list for the main command queue */ + queue_type = COPY_QUEUE; } else { - // rotate through link copy engines - unsigned int idx = ishmemi_next_link_engine_index(); - cmd_queue = ishmemi_ze_link_cmd_queue[idx]; - cmd_list_desc.commandQueueGroupOrdinal = 2; /* link engines ordinal */ - cmd_list_desc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; + queue_type = LINK_QUEUE; } - ZE_CHECK( - zeCommandListCreate(ishmemi_ze_context, ishmemi_gpu_device, &cmd_list_desc, &cmd_list)); + + ret = ishmemi_create_command_list(queue_type, false, &cmd_list); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); ZE_CHECK(zeEventCreate(ishmemi_ze_event_pool, &event_desc, &event)); @@ -311,7 +242,7 @@ int ishmemi_ipc_get_regular_cl(TYPENAME *dst, const TYPENAME *src, 
size_t nelems ZE_CHECK(zeCommandListClose(cmd_list)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - ZE_CHECK(zeCommandQueueExecuteCommandLists(cmd_queue, 1, &cmd_list, nullptr)); + ret = ishmemi_execute_command_lists(queue_type, 1, &cmd_list); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); ZE_CHECK(zeEventHostSynchronize(event, UINT64_MAX)); @@ -320,8 +251,6 @@ int ishmemi_ipc_get_regular_cl(TYPENAME *dst, const TYPENAME *src, size_t nelems ZE_CHECK(zeCommandListDestroy(cmd_list)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - return (ret); - fn_exit: return ret; } @@ -340,44 +269,22 @@ template int ishmemi_ipc_get_nbi(TYPENAME *dst, const TYPENAME *src, size_t nelems, int pe) { int ret = 0; - size_t outstanding = 0; size_t bytes = nelems * size_of(); - void *ipc_src = get_ipc_buffer(pe, (void *) src); - if (ipc_src == nullptr) return (1); /* src is not ipc-able */ - /* Check if src is a GPU buffer */ - ze_command_queue_handle_t cmd_queue; + ze_command_list_handle_t cmd_list = {}; - ze_command_list_desc_t cmd_list_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, - .pNext = nullptr, - .commandQueueGroupOrdinal = 2, - .flags = 0, - }; + ishmemi_queue_type_t queue_type = UNDEFINED_QUEUE; + + void *ipc_src = get_ipc_buffer(pe, (void *) src); + ISHMEMI_CHECK_RESULT((ipc_src == nullptr), 0, fn_exit); if ((pe == ishmemi_my_pe) || (pe == (ishmemi_my_pe ^ 1))) { - // use main copy engine - cmd_queue = ishmemi_ze_cmd_queue; - cmd_list_desc.commandQueueGroupOrdinal = 1; /* main copy engine ordinal */ - /* create command list for the main command queue */ - ZE_CHECK( - zeCommandListCreate(ishmemi_ze_context, ishmemi_gpu_device, &cmd_list_desc, &cmd_list)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - /* save the command list for later destruction on synchronize */ - outstanding = ishmemi_ze_cmd_lists.push_back_thread_safe(cmd_list); + queue_type = COPY_QUEUE; } else { - // rotate through link copy engines - unsigned int idx = ishmemi_next_link_engine_index(); - cmd_queue = 
ishmemi_ze_link_cmd_queue[idx]; - cmd_list_desc.commandQueueGroupOrdinal = 2; /* link engines ordinal */ - /* create command list for the chosen link command queue */ - ZE_CHECK( - zeCommandListCreate(ishmemi_ze_context, ishmemi_gpu_device, &cmd_list_desc, &cmd_list)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - /* save the command list for later destruction on synchronize */ - outstanding = ishmemi_ze_link_cmd_lists[idx].push_back_thread_safe(cmd_list); + queue_type = LINK_QUEUE; } - /* We can assume that dst is a GPU buffer since it has to be on the symmetric heap */ + ret = ishmemi_create_command_list_nbi(queue_type, &cmd_list); + ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); ZE_CHECK(zeCommandListAppendMemoryCopy(cmd_list, dst, ipc_src, bytes, nullptr, 0, nullptr)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); @@ -385,11 +292,10 @@ int ishmemi_ipc_get_nbi(TYPENAME *dst, const TYPENAME *src, size_t nelems, int p ZE_CHECK(zeCommandListClose(cmd_list)); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - ZE_CHECK(zeCommandQueueExecuteCommandLists(cmd_queue, 1, &cmd_list, nullptr)); + ret = ishmemi_execute_command_lists(queue_type, 1, &cmd_list); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - if (outstanding >= ishmemi_params.NBI_COUNT) ishmemi_level_zero_sync(); - goto fn_exit; + /* TODO: Should we sync here or check periodically in proxy thread? 
*/ fn_exit: return ret; diff --git a/src/signaling.cpp b/src/signaling.cpp index 2f5cf78..820ea39 100644 --- a/src/signaling.cpp +++ b/src/signaling.cpp @@ -133,7 +133,7 @@ sycl::event ishmemx_put_signal_on_queue(T *dest, const T *src, size_t nelems, ui it.get_group()); }); } else { - cgh.host_task( + cgh.single_task( [=]() { ishmem_put_signal(dest, src, nelems, sig_addr, signal, sig_op, pe); }); } }); @@ -455,7 +455,7 @@ sycl::event ishmemx_put_signal_nbi_on_queue(T *dest, const T *src, size_t nelems pe, it.get_group()); }); } else { - cgh.host_task( + cgh.single_task( [=]() { ishmem_put_signal_nbi(dest, src, nelems, sig_addr, signal, sig_op, pe); }); } }); diff --git a/src/synchronization.cpp b/src/synchronization.cpp index b621322..e0f7a61 100644 --- a/src/synchronization.cpp +++ b/src/synchronization.cpp @@ -683,7 +683,7 @@ void ishmem_wait_until_all(T *ivars, size_t nelems, const int *status, int cmp, nelems * sizeof(int), ishmemi_op_t::WAIT_ALL); } #ifdef __SYCL_DEVICE_ONLY__ - for (int i = 0; i < nelems; i++) { + for (size_t i = 0; i < nelems; i++) { if (!status || !status[i]) { ishmem_wait_until(&ivars[i], cmp, cmp_value); } @@ -999,7 +999,7 @@ void ishmem_wait_until_all_vector(T *ivars, size_t nelems, const int *status, in nelems * sizeof(int), ishmemi_op_t::WAIT_ALL_VECTOR); } #ifdef __SYCL_DEVICE_ONLY__ - for (int i = 0; i < nelems; i++) { + for (size_t i = 0; i < nelems; i++) { if (!status || !status[i]) { ishmem_wait_until(&ivars[i], cmp, cmp_values[i]); } diff --git a/src/teams.cpp b/src/teams.cpp index a2111a4..ba0e58c 100644 --- a/src/teams.cpp +++ b/src/teams.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause * * Portions derived from Sandia OpenSHMEM (https://github.com/Sandia-OpenSHMEM/SOS) @@ -73,7 +73,7 @@ int ishmemi_team_local_pes(int start, int stride, int size) static int team_init(ishmem_team_t team, int my_pe, int start, int stride, int size) { /* 
team_host is in the device symmetric heap, so we need to copy to it, rather than store */ - if (team < 0 || team >= ISHMEMI_N_TEAMS) return -1; + if (team < 0 || team >= (long) ISHMEMI_N_TEAMS) return -1; ishmemi_team_host_t *team_host = &ishmemi_cpu_info->team_host_pool[team]; /* team_host is in the device symmetric heap, so we need to copy to it, rather than store */ @@ -251,14 +251,14 @@ int ishmemi_team_init(void) cleanup: ISHMEMI_FREE(ishmemi_runtime->free, ishmemi_cpu_info->team_host_pool); - ISHMEMI_FREE(ishmem_free, ishmemi_mmap_gpu_info->team_device_pool); + ISHMEMI_FREE(ishmemi_free, ishmemi_mmap_gpu_info->team_device_pool); return -1; } void ishmemi_team_destroy(ishmem_team_t team) { - if (team <= ISHMEM_TEAM_INVALID || team >= ISHMEMI_N_TEAMS) return; + if (team <= ISHMEM_TEAM_INVALID || team >= (long) ISHMEMI_N_TEAMS) return; if (team == ISHMEM_TEAM_WORLD || team == ISHMEM_TEAM_SHARED || team == ISHMEMX_TEAM_NODE) { ISHMEM_WARN_MSG("User attempted to destroy a pre-defined team.\n"); @@ -284,7 +284,7 @@ int ishmemi_team_fini(void) } /* Free the device team resources */ - ISHMEMI_FREE(ishmem_free, ishmemi_mmap_gpu_info->team_device_pool); + ISHMEMI_FREE(ishmemi_free, ishmemi_mmap_gpu_info->team_device_pool); /* Free the host team resources */ ISHMEMI_FREE(ishmemi_runtime->free, ishmemi_cpu_info->team_host_pool); @@ -366,7 +366,7 @@ int ishmemi_team_split_strided(ishmem_team_t parent_team_idx, int PE_start, int N_PSYNC_BYTES); ISHMEM_DEBUG_MSG("All pSyncs [ %s ], allocated %d\n", bit_str, *new_team); - if (*new_team == ISHMEM_TEAM_INVALID || *new_team >= ishmemi_params.TEAMS_MAX) { + if (*new_team == ISHMEM_TEAM_INVALID || *new_team >= (long) ishmemi_params.TEAMS_MAX) { ISHMEM_WARN_MSG("No more teams available (max = %ld), try increasing SHMEM_TEAMS_MAX\n", ishmemi_params.TEAMS_MAX); /* No psync was available, but must call barrier across parent team before returning. 
*/ @@ -521,7 +521,7 @@ int ishmemi_team_split_2d(ishmem_team_t parent_team_idx, int xrange, int ishmem_team_my_pe(ishmem_team_t team) { if constexpr (enable_error_checking) validate_init(); - if (team <= ISHMEM_TEAM_INVALID || team >= ISHMEMI_N_TEAMS) return -1; + if (team <= ISHMEM_TEAM_INVALID || team >= (long) ISHMEMI_N_TEAMS) return -1; else #ifdef __SYCL_DEVICE_ONLY__ return global_info->team_device_pool[team].my_pe; @@ -533,7 +533,7 @@ int ishmem_team_my_pe(ishmem_team_t team) int ishmem_team_n_pes(ishmem_team_t team) { if constexpr (enable_error_checking) validate_init(); - if (team <= ISHMEM_TEAM_INVALID || team >= ISHMEMI_N_TEAMS) return -1; + if (team <= ISHMEM_TEAM_INVALID || team >= (long) ISHMEMI_N_TEAMS) return -1; else #ifdef __SYCL_DEVICE_ONLY__ return global_info->team_device_pool[team].size; @@ -545,7 +545,7 @@ int ishmem_team_n_pes(ishmem_team_t team) int ishmem_team_get_config(ishmem_team_t team, long config_mask, ishmem_team_config_t *config) { if constexpr (enable_error_checking) validate_init(); - if (team <= ISHMEM_TEAM_INVALID || team >= ISHMEMI_N_TEAMS) return -1; + if (team <= ISHMEM_TEAM_INVALID || team >= (long) ISHMEMI_N_TEAMS) return -1; #ifdef __SYCL_DEVICE_ONLY__ ishmemi_team_device_t *team_ptr = &global_info->team_device_pool[team]; @@ -573,7 +573,7 @@ int ishmem_team_translate_pe(ishmem_team_t src_team, int src_pe, ishmem_team_t d { if constexpr (enable_error_checking) validate_init(); if (src_team <= ISHMEM_TEAM_INVALID || dest_team <= ISHMEM_TEAM_INVALID || - src_team >= ISHMEMI_N_TEAMS || dest_team >= ISHMEMI_N_TEAMS) + src_team >= (long) ISHMEMI_N_TEAMS || dest_team >= (long) ISHMEMI_N_TEAMS) return -1; #if __SYCL_DEVICE_ONLY__ @@ -617,7 +617,7 @@ int ishmem_team_split_strided(ishmem_team_t parent_team, int PE_start, int PE_st ishmem_team_t *new_team) { if constexpr (enable_error_checking) validate_init(); - if (parent_team <= ISHMEM_TEAM_INVALID || parent_team >= ISHMEMI_N_TEAMS || + if (parent_team <= ISHMEM_TEAM_INVALID || 
parent_team >= (long) ISHMEMI_N_TEAMS || (PE_stride == 0 && PE_size != 1)) return -1; @@ -631,7 +631,7 @@ int ishmem_team_split_2d(ishmem_team_t parent_team, int xrange, long yaxis_mask, ishmem_team_t *yaxis_team) { if constexpr (enable_error_checking) validate_init(); - if (parent_team <= ISHMEM_TEAM_INVALID || parent_team >= ISHMEMI_N_TEAMS) return -1; + if (parent_team <= ISHMEM_TEAM_INVALID || parent_team >= (long) ISHMEMI_N_TEAMS) return -1; return ishmemi_team_split_2d(parent_team, xrange, xaxis_config, xaxis_mask, xaxis_team, yaxis_config, yaxis_mask, yaxis_team); diff --git a/src/teams.h b/src/teams.h index a63e198..d4e54f5 100644 --- a/src/teams.h +++ b/src/teams.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause * * Portions derived from Sandia OpenSHMEM (https://github.com/Sandia-OpenSHMEM/SOS) @@ -46,7 +46,7 @@ * buffer, then an out of place reduction is used. */ -#define SET_HEAP_FIELD(to, value) ishmem_copy(&to, &value, sizeof(to)) +#define SET_HEAP_FIELD(to, value) ishmemi_copy(&to, &value, sizeof(to)) /* Create two different team data structures, one for host memory and one for device memory * They contain duplicate fields as needed to avoid remote references. 
diff --git a/test/cmake/common.cmake b/test/cmake/common.cmake index c049139..8f56d60 100644 --- a/test/cmake/common.cmake +++ b/test/cmake/common.cmake @@ -1,4 +1,4 @@ -# Copyright (C) 2025 Intel Corporation +# Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: BSD-3-Clause # ------------------------------------------------------------------- diff --git a/test/include/ishmem_test_config.h.in b/test/include/ishmem_test_config.h.in index ce6bc15..6e1e785 100644 --- a/test/include/ishmem_test_config.h.in +++ b/test/include/ishmem_test_config.h.in @@ -1,4 +1,4 @@ -/* Copyright (C) 2025 Intel Corporation +/* Copyright (C) 2024 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ diff --git a/test/include/ishmem_tester.h b/test/include/ishmem_tester.h index 855dd5e..6003564 100644 --- a/test/include/ishmem_tester.h +++ b/test/include/ishmem_tester.h @@ -33,6 +33,11 @@ constexpr size_t max_wg = 16; ishmemi_type_t bitwise_reduction_types[] = {UCHAR, USHORT, UINT, ULONG, ULONGLONG, INT8, INT16, INT32, INT64, UINT8, UINT16, UINT32, UINT64, SIZE, ISHMEMI_TYPE_END}; + +ishmemi_type_t scan_types[] = {SHORT, INT, LONG, LONGLONG, PTRDIFF, USHORT, UINT, + ULONG, ULONGLONG, INT16, INT32, INT64, UINT16, UINT32, + UINT64, SIZE, FLOAT, DOUBLE, ISHMEMI_TYPE_END}; + ishmemi_type_t compare_reduction_types[] = { CHAR, SCHAR, SHORT, INT, LONG, LONGLONG, PTRDIFF, UCHAR, USHORT, UINT, ULONG, ULONGLONG, INT8, INT16, INT32, INT64, diff --git a/test/include/runtime.h b/test/include/runtime.h index 3520543..6631032 100644 --- a/test/include/runtime.h +++ b/test/include/runtime.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2025 Intel Corporation +/* Copyright (C) 2024 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ diff --git a/test/performance/alltoall_bw.cpp b/test/performance/alltoall_bw.cpp index e30590f..3d3af0f 100644 --- a/test/performance/alltoall_bw.cpp +++ b/test/performance/alltoall_bw.cpp @@ -4,17 +4,17 @@ #define BW_TEST_HEADER #define BW_TEST_FUNCTION \ - 
for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmem_long_alltoall(ISHMEM_TEAM_WORLD, (long *) dest, (long *) src, nelems); \ } #define BW_TEST_FUNCTION_ON_QUEUE \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_alltoall_on_queue((long *) dest, (long *) src, nelems, test_return, q); \ } #define BW_TEST_FUNCTION_WORK_GROUP \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_alltoall_work_group(ISHMEM_TEAM_WORLD, (long *) dest, (long *) src, nelems, \ grp); \ } diff --git a/test/performance/broadcast_bw.cpp b/test/performance/broadcast_bw.cpp index 5e67301..8bb1e45 100644 --- a/test/performance/broadcast_bw.cpp +++ b/test/performance/broadcast_bw.cpp @@ -5,17 +5,17 @@ #define BW_TEST_HEADER #define BW_TEST_FUNCTION \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmem_long_broadcast(ISHMEM_TEAM_WORLD, (long *) dest, (long *) src, nelems, 0); \ } #define BW_TEST_FUNCTION_ON_QUEUE \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_broadcast_on_queue((long *) dest, (long *) src, nelems, 0, test_return, q); \ } #define BW_TEST_FUNCTION_WORK_GROUP \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_broadcast_work_group(ISHMEM_TEAM_WORLD, (long *) dest, (long *) src, nelems, \ 0, grp); \ } diff --git a/test/performance/collect_bw.cpp b/test/performance/collect_bw.cpp index 5826212..fa1cd2d 100644 --- a/test/performance/collect_bw.cpp +++ b/test/performance/collect_bw.cpp @@ -4,11 +4,11 @@ #define BW_TEST_HEADER #define BW_TEST_FUNCTION \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmem_long_collect(ISHMEM_TEAM_WORLD, (long *) dest, (long *) src, nelems); \ } #define BW_TEST_FUNCTION_WORK_GROUP \ 
- for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_collect_work_group(ISHMEM_TEAM_WORLD, (long *) dest, (long *) src, nelems, \ grp); \ } diff --git a/test/performance/fcollect_bw.cpp b/test/performance/fcollect_bw.cpp index e602e0a..865abe3 100644 --- a/test/performance/fcollect_bw.cpp +++ b/test/performance/fcollect_bw.cpp @@ -4,17 +4,17 @@ #define BW_TEST_HEADER #define BW_TEST_FUNCTION \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmem_long_fcollect(ISHMEM_TEAM_WORLD, (long *) dest, (long *) src, nelems); \ } #define BW_TEST_FUNCTION_ON_QUEUE \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_fcollect_on_queue((long *) dest, (long *) src, nelems, test_return, q); \ } #define BW_TEST_FUNCTION_WORK_GROUP \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_fcollect_work_group(ISHMEM_TEAM_WORLD, (long *) dest, (long *) src, nelems, \ grp); \ } diff --git a/test/performance/get_bw.cpp b/test/performance/get_bw.cpp index 362e0a5..a0ce54e 100644 --- a/test/performance/get_bw.cpp +++ b/test/performance/get_bw.cpp @@ -4,17 +4,17 @@ #define BW_TEST_HEADER int pe = n_pes - 1; #define BW_TEST_FUNCTION \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmem_long_get((long *) dest, (long *) src, nelems, pe); \ } #define BW_TEST_FUNCTION_ON_QUEUE \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_get_on_queue((long *) dest, (long *) src, nelems, pe, q); \ } #define BW_TEST_FUNCTION_WORK_GROUP \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_get_work_group((long *) dest, (long *) src, nelems, pe, grp); \ } diff --git a/test/performance/get_nbi_bw.cpp b/test/performance/get_nbi_bw.cpp index 
c448cac..3b7e082 100644 --- a/test/performance/get_nbi_bw.cpp +++ b/test/performance/get_nbi_bw.cpp @@ -4,13 +4,13 @@ #define BW_TEST_HEADER int pe = n_pes - 1; #define BW_TEST_FUNCTION \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmem_long_get_nbi((long *) dest, (long *) src, nelems, pe); \ } \ ishmem_quiet(); #define BW_TEST_FUNCTION_WORK_GROUP \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_get_nbi_work_group((long *) dest, (long *) src, nelems, pe, grp); \ } \ ishmemx_quiet_work_group(grp); diff --git a/test/performance/put_bw.cpp b/test/performance/put_bw.cpp index 8265865..42e178e 100644 --- a/test/performance/put_bw.cpp +++ b/test/performance/put_bw.cpp @@ -4,17 +4,17 @@ #define BW_TEST_HEADER int pe = n_pes - 1; #define BW_TEST_FUNCTION \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmem_long_put((long *) dest, (long *) src, nelems, pe); \ } #define BW_TEST_FUNCTION_ON_QUEUE \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_put_on_queue((long *) dest, (long *) src, nelems, pe, q); \ } #define BW_TEST_FUNCTION_WORK_GROUP \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_put_work_group((long *) dest, (long *) src, nelems, pe, grp); \ } diff --git a/test/performance/put_nbi_bw.cpp b/test/performance/put_nbi_bw.cpp index 9266f03..ecc18be 100644 --- a/test/performance/put_nbi_bw.cpp +++ b/test/performance/put_nbi_bw.cpp @@ -4,13 +4,13 @@ #define BW_TEST_HEADER int pe = n_pes - 1; #define BW_TEST_FUNCTION \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmem_long_put_nbi((long *) dest, (long *) src, nelems, pe); \ } \ ishmem_quiet(); #define BW_TEST_FUNCTION_WORK_GROUP \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i 
= 0; i < iterations; i += 1) { \ ishmemx_long_put_nbi_work_group((long *) dest, (long *) src, nelems, pe, grp); \ } \ ishmemx_quiet_work_group(grp); diff --git a/test/performance/reduce_bw.cpp b/test/performance/reduce_bw.cpp index 0285b69..9619719 100644 --- a/test/performance/reduce_bw.cpp +++ b/test/performance/reduce_bw.cpp @@ -5,17 +5,17 @@ #define BW_TEST_HEADER #define BW_TEST_FUNCTION \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmem_long_sum_reduce(ISHMEM_TEAM_WORLD, (long *) dest, (long *) src, nelems); \ } #define BW_TEST_FUNCTION_ON_QUEUE \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_sum_reduce_on_queue((long *) dest, (long *) src, nelems, test_return, q); \ } #define BW_TEST_FUNCTION_WORK_GROUP \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_long_sum_reduce_work_group(ISHMEM_TEAM_WORLD, (long *) dest, (long *) src, nelems, \ grp); \ } diff --git a/test/performance/sync_bw.cpp b/test/performance/sync_bw.cpp index 76d7d1d..4241b9b 100644 --- a/test/performance/sync_bw.cpp +++ b/test/performance/sync_bw.cpp @@ -5,12 +5,12 @@ #define BW_TEST_HEADER #define BW_TEST_FUNCTION \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmem_team_sync(ISHMEM_TEAM_WORLD); \ } #define BW_TEST_FUNCTION_WORK_GROUP \ - for (int i = 0; i < iterations; i += 1) { \ + for (size_t i = 0; i < iterations; i += 1) { \ ishmemx_team_sync_work_group(ISHMEM_TEAM_WORLD, grp); \ } #include "ishmem_tester.h" diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index d551803..632f119 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -32,28 +32,33 @@ set(ISHMEM_PE_COUNTS_UNIT_TESTS "2" CACHE STRING "Number of PEs to use for each set(ISHMEM_NON_STANDARD_CTEST_FILES "") set(ISHMEM_SKIP_CTEST_FILES "") -set(ISHMEM_TESTER_MODES device) 
+set(ISHMEM_TESTER_MODES host_device_device device) set(ISHMEM_TESTER_CTEST_FILES - get - put - ibget - ibput alltoall broadcast collect + exscan fcollect - reduce_sum - reduce_prod + get + get_nbi + ibget + ibput + inscan + put + put_nbi + reduce_and reduce_max reduce_min - reduce_and reduce_or + reduce_prod + reduce_sum reduce_xor - test + signal_wait_until test_all test_all_vector test_any test_any_vector + test_single test_some test_some_vector wait_until @@ -62,8 +67,7 @@ set(ISHMEM_TESTER_CTEST_FILES wait_until_any wait_until_any_vector wait_until_some - wait_until_some_vector - signal_wait_until) + wait_until_some_vector) set(ISHMEM_TESTER_GROUP_MODES device_grp1) set(ISHMEM_TESTER_GROUP_TESTS @@ -72,28 +76,33 @@ set(ISHMEM_TESTER_GROUP_TESTS set(ISHMEM_TESTER_ON_QUEUE_MODES on_queue) set(ISHMEM_TESTER_ON_QUEUE_TESTS - get - put - ibget - ibput alltoall broadcast + collect + exscan fcollect - reduce_sum - reduce_prod + get + get_nbi + ibget + ibput + inscan + put + put_nbi + reduce_and reduce_max reduce_min - reduce_and reduce_or + reduce_prod + reduce_sum reduce_xor + signal_wait_until wait_until wait_until_all wait_until_all_vector wait_until_any wait_until_any_vector wait_until_some - wait_until_some_vector - signal_wait_until) + wait_until_some_vector) enable_testing() @@ -136,11 +145,10 @@ foreach(TEST_SOURCE_FILE ${TEST_SOURCE_FILES}) # Default test settings foreach (MODE IN LISTS ISHMEM_TESTER_MODES) foreach (N ${ISHMEM_PE_COUNTS_UNIT_TESTS}) - add_test(NAME ${EXE}-${N} COMMAND ${CTEST_WRAPPER} ${N} ${ISHMEM_RUN_SCRIPT} + add_test(NAME ${EXE}-${MODE}-${N} COMMAND ${CTEST_WRAPPER} ${N} ${ISHMEM_RUN_SCRIPT} ./${EXE} --test_modes ${MODE} --csv COMMAND_EXPAND_LISTS) endforeach() endforeach() - # Special test settings if (${EXE} IN_LIST ISHMEM_TESTER_GROUP_TESTS) foreach (MODE IN LISTS ISHMEM_TESTER_GROUP_MODES) diff --git a/test/unit/alltoall.cpp b/test/unit/alltoall.cpp index bfbb54c..ce3fdfc 100644 --- a/test/unit/alltoall.cpp +++ 
b/test/unit/alltoall.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -60,8 +60,8 @@ size_t alltoall_tester::create_source_pattern(ishmemi_type_t t, ishmemi_op_t op, size_t from_pe = (size_t) my_pe; for (size_t to_pe = 0; to_pe < (size_t) n_pes; to_pe += 1) { for (size_t idx = 0; idx < ((test_size_per_pe / sizeof(long)) + 1); idx += 1) { - host_source[idx] = - (long) ((nelems << 48) + ((0x80L + from_pe) << 40) + ((0x80L + to_pe) << 32) + idx); + host_source[idx] = (long) (((long) nelems << 48) + ((long) (0x80L + from_pe) << 40) + + ((long) (0x80L + to_pe) << 32) + (long) idx); } memcpy((void *) (((uintptr_t) aligned_source) + (to_pe * test_size_per_pe)), host_source, test_size_per_pe); @@ -78,10 +78,10 @@ size_t alltoall_tester::create_check_pattern(ishmemi_type_t t, ishmemi_op_t op, /* this is not offset, because the copy from test_dest to host_result does the alignment */ size_t test_size_per_pe = nelems * typesize(t); size_t to_pe = (size_t) my_pe; - for (size_t from_pe = 0; from_pe < n_pes; from_pe += 1) { + for (size_t from_pe = 0; from_pe < (size_t) n_pes; from_pe += 1) { for (size_t idx = 0; idx < ((test_size_per_pe / sizeof(long)) + 1); idx += 1) { - host_source[idx] = - (long) ((nelems << 48) + ((0x80L + from_pe) << 40) + ((0x80L + to_pe) << 32) + idx); + host_source[idx] = (long) (((long) nelems << 48) + ((long) (0x80L + from_pe) << 40) + + ((long) (0x80L + to_pe) << 32) + (long) idx); } memcpy((void *) (((uintptr_t) host_check) + (from_pe * test_size_per_pe)), host_source, test_size_per_pe); diff --git a/test/unit/amo_fetch_add.cpp b/test/unit/amo_fetch_add.cpp index 1923f51..d6c3a0b 100644 --- a/test/unit/amo_fetch_add.cpp +++ b/test/unit/amo_fetch_add.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -50,19 +50,19 @@ constexpr int N = 5; ishmem_barrier_all(); 
\ auto e_init = q.submit([&](sycl::handler &h) { \ h.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id()[0]; \ + int i = static_cast(idx.get_global_id()[0]); \ remote[i] = (TYPE) 0; \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = (TYPE) 999; \ + for (int j = 0; j < npes; j++) \ + val[j + i * npes] = (TYPE) 999; \ }); \ }); \ e_init.wait_and_throw(); \ ishmem_barrier_all(); \ auto e_run = q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id(0); \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = ishmem_##TYPENAME##_atomic_fetch_add( \ - &remote[i], (TYPE) (1), static_cast(j)); \ + int i = static_cast(idx.get_global_id(0)); \ + for (int j = 0; j < npes; j++) \ + val[j + i * npes] = \ + ishmem_##TYPENAME##_atomic_fetch_add(&remote[i], (TYPE) (1), j); \ }); \ e_run.wait_and_throw(); \ ishmem_barrier_all(); \ diff --git a/test/unit/amo_fetch_add_nbi.cpp b/test/unit/amo_fetch_add_nbi.cpp index ed96a71..b5cfda0 100644 --- a/test/unit/amo_fetch_add_nbi.cpp +++ b/test/unit/amo_fetch_add_nbi.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -50,20 +50,20 @@ constexpr int N = 5; ishmem_barrier_all(); \ auto e_init = q.submit([&](sycl::handler &h) { \ h.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id()[0]; \ + int i = static_cast(idx.get_global_id()[0]); \ remote[i] = (TYPE) 0; \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = (TYPE) 999; \ + for (int j = 0; j < npes; j++) \ + val[j + i * npes] = (TYPE) 999; \ }); \ }); \ e_init.wait_and_throw(); \ ishmem_barrier_all(); \ auto e_run = q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id(0); \ + int i = static_cast(idx.get_global_id(0)); \ auto grp = idx.get_group(); \ - for 
(size_t j = 0; j < npes; j++) \ - ishmem_##TYPENAME##_atomic_fetch_add_nbi(&val[i * (size_t) npes + j], &remote[i], \ - (TYPE) (1), static_cast(j)); \ + for (int j = 0; j < npes; j++) \ + ishmem_##TYPENAME##_atomic_fetch_add_nbi(&val[i * npes + j], &remote[i], \ + (TYPE) (1), j); \ ishmemx_barrier_all_work_group(grp); \ }); \ e_run.wait_and_throw(); \ diff --git a/test/unit/amo_fetch_and.cpp b/test/unit/amo_fetch_and.cpp index 7155a0a..6e71e43 100644 --- a/test/unit/amo_fetch_and.cpp +++ b/test/unit/amo_fetch_and.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -48,19 +48,19 @@ constexpr int N = 5; ishmem_barrier_all(); \ auto e_init = q.submit([&](sycl::handler &h) { \ h.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id()[0]; \ + int i = static_cast(idx.get_global_id()[0]); \ remote[i] = ~(TYPE) 0; \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = (TYPE) 0; \ + for (int j = 0; j < npes; j++) \ + val[j + i * npes] = (TYPE) 0; \ }); \ }); \ e_init.wait_and_throw(); \ ishmem_barrier_all(); \ auto e_run = q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id(0); \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = ishmem_##TYPENAME##_atomic_fetch_and( \ - &remote[i], ~(TYPE) (1LLU << mype), static_cast(j)); \ + int i = static_cast(idx.get_global_id(0)); \ + for (int j = 0; j < npes; j++) \ + val[j + i * npes] = \ + ishmem_##TYPENAME##_atomic_fetch_and(&remote[i], ~(TYPE) (1LLU << mype), j); \ }); \ e_run.wait_and_throw(); \ ishmem_barrier_all(); \ diff --git a/test/unit/amo_fetch_and_nbi.cpp b/test/unit/amo_fetch_and_nbi.cpp index 30005bb..e9dba9f 100644 --- a/test/unit/amo_fetch_and_nbi.cpp +++ b/test/unit/amo_fetch_and_nbi.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * 
SPDX-License-Identifier: BSD-3-Clause */ @@ -48,21 +48,20 @@ constexpr int N = 5; ishmem_barrier_all(); \ auto e_init = q.submit([&](sycl::handler &h) { \ h.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id()[0]; \ + int i = static_cast(idx.get_global_id()[0]); \ remote[i] = ~(TYPE) 0; \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = (TYPE) 0; \ + for (int j = 0; j < npes; j++) \ + val[j + i * npes] = (TYPE) 0; \ }); \ }); \ e_init.wait_and_throw(); \ ishmem_barrier_all(); \ auto e_run = q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id(0); \ + int i = static_cast(idx.get_global_id(0)); \ auto grp = idx.get_group(); \ - for (size_t j = 0; j < npes; j++) \ - ishmem_##TYPENAME##_atomic_fetch_and_nbi(&val[i * (size_t) npes + j], &remote[i], \ - ~(TYPE) (1LLU << mype), \ - static_cast(j)); \ + for (int j = 0; j < npes; j++) \ + ishmem_##TYPENAME##_atomic_fetch_and_nbi(&val[i * npes + j], &remote[i], \ + ~(TYPE) (1LLU << mype), j); \ ishmemx_barrier_all_work_group(grp); \ }); \ e_run.wait_and_throw(); \ diff --git a/test/unit/amo_fetch_inc.cpp b/test/unit/amo_fetch_inc.cpp index 8c4f124..0a659ba 100644 --- a/test/unit/amo_fetch_inc.cpp +++ b/test/unit/amo_fetch_inc.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -50,19 +50,18 @@ constexpr int N = 5; ishmem_barrier_all(); \ auto e_init = q.submit([&](sycl::handler &h) { \ h.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id()[0]; \ + int i = static_cast(idx.get_global_id()[0]); \ remote[i] = (TYPE) 0; \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = (TYPE) 999; \ + for (int j = 0; j < npes; j++) \ + val[j + i * npes] = (TYPE) 999; \ }); \ }); \ e_init.wait_and_throw(); \ ishmem_barrier_all(); \ auto e_run = 
q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id(0); \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = \ - ishmem_##TYPENAME##_atomic_fetch_inc(&remote[i], static_cast(j)); \ + int i = static_cast(idx.get_global_id(0)); \ + for (int j = 0; j < npes; j++) \ + val[j + i * npes] = ishmem_##TYPENAME##_atomic_fetch_inc(&remote[i], j); \ }); \ e_run.wait_and_throw(); \ ishmem_barrier_all(); \ diff --git a/test/unit/amo_fetch_inc_nbi.cpp b/test/unit/amo_fetch_inc_nbi.cpp index 1a0ad4c..0a39b39 100644 --- a/test/unit/amo_fetch_inc_nbi.cpp +++ b/test/unit/amo_fetch_inc_nbi.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -50,20 +50,19 @@ constexpr int N = 5; ishmem_barrier_all(); \ auto e_init = q.submit([&](sycl::handler &h) { \ h.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id()[0]; \ + int i = static_cast(idx.get_global_id()[0]); \ remote[i] = (TYPE) 0; \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = (TYPE) 999; \ + for (int j = 0; j < npes; j++) \ + val[j + i * npes] = (TYPE) 999; \ }); \ }); \ e_init.wait_and_throw(); \ q.single_task([=]() { ishmem_barrier_all(); }).wait_and_throw(); \ auto e_run = q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id(0); \ + int i = static_cast(idx.get_global_id(0)); \ auto grp = idx.get_group(); \ - for (size_t j = 0; j < npes; j++) \ - ishmem_##TYPENAME##_atomic_fetch_inc_nbi(&val[j + i * (size_t) npes], &remote[i], \ - static_cast(j)); \ + for (int j = 0; j < npes; j++) \ + ishmem_##TYPENAME##_atomic_fetch_inc_nbi(&val[j + i * npes], &remote[i], j); \ ishmemx_barrier_all_work_group(grp); \ }); \ e_run.wait_and_throw(); \ diff --git a/test/unit/amo_fetch_or.cpp b/test/unit/amo_fetch_or.cpp index 1904256..d9be35e 100644 --- 
a/test/unit/amo_fetch_or.cpp +++ b/test/unit/amo_fetch_or.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -61,10 +61,10 @@ constexpr int N = 5; int *errors = sycl::malloc_host(1, q); \ ishmem_barrier_all(); \ auto e_run = q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id(0); \ - for (size_t j = 0; j < npes; j++) \ - fetched_val[j + i * (size_t) npes] = ishmem_##TYPENAME##_atomic_fetch_or( \ - &remote[i], (TYPE) (1LLU << mype), static_cast(j)); \ + int i = static_cast(idx.get_global_id(0)); \ + for (int j = 0; j < npes; j++) \ + fetched_val[j + i * npes] = \ + ishmem_##TYPENAME##_atomic_fetch_or(&remote[i], (TYPE) (1LLU << mype), j); \ }); \ e_run.wait_and_throw(); \ ishmem_barrier_all(); \ diff --git a/test/unit/amo_fetch_or_nbi.cpp b/test/unit/amo_fetch_or_nbi.cpp index 43e988c..3932d8f 100644 --- a/test/unit/amo_fetch_or_nbi.cpp +++ b/test/unit/amo_fetch_or_nbi.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -61,12 +61,11 @@ constexpr int N = 5; int *errors = sycl::malloc_host(1, q); \ ishmem_barrier_all(); \ auto e_run = q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id(0); \ + int i = static_cast(idx.get_global_id(0)); \ auto grp = idx.get_group(); \ - for (size_t j = 0; j < npes; j++) \ - ishmem_##TYPENAME##_atomic_fetch_or_nbi(&fetched_val[j + i * (size_t) npes], \ - &remote[i], (TYPE) (1LLU << mype), \ - static_cast(j)); \ + for (int j = 0; j < npes; j++) \ + ishmem_##TYPENAME##_atomic_fetch_or_nbi(&fetched_val[j + i * npes], &remote[i], \ + (TYPE) (1LLU << mype), j); \ ishmemx_barrier_all_work_group(grp); \ }); \ e_run.wait_and_throw(); \ diff --git a/test/unit/amo_fetch_xor.cpp b/test/unit/amo_fetch_xor.cpp index 8017286..164f06f 100644 --- 
a/test/unit/amo_fetch_xor.cpp +++ b/test/unit/amo_fetch_xor.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -48,19 +48,19 @@ constexpr int N = 5; ishmem_barrier_all(); \ auto e_init = q.submit([&](sycl::handler &h) { \ h.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id()[0]; \ + int i = static_cast(idx.get_global_id()[0]); \ remote[i] = ~(TYPE) 0; \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = (TYPE) 0; \ + for (int j = 0; j < npes; j++) \ + val[j + i * npes] = (TYPE) 0; \ }); \ }); \ e_init.wait_and_throw(); \ ishmem_barrier_all(); \ auto e_run = q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id(0); \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = ishmem_##TYPENAME##_atomic_fetch_xor( \ - &remote[i], (TYPE) (1LLU << mype), static_cast(j)); \ + int i = static_cast(idx.get_global_id(0)); \ + for (int j = 0; j < npes; j++) \ + val[j + i * npes] = \ + ishmem_##TYPENAME##_atomic_fetch_xor(&remote[i], (TYPE) (1LLU << mype), j); \ }); \ e_run.wait_and_throw(); \ ishmem_barrier_all(); \ diff --git a/test/unit/amo_fetch_xor_nbi.cpp b/test/unit/amo_fetch_xor_nbi.cpp index 1f86f4f..357b457 100644 --- a/test/unit/amo_fetch_xor_nbi.cpp +++ b/test/unit/amo_fetch_xor_nbi.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2024 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -48,21 +48,20 @@ constexpr int N = 5; ishmem_barrier_all(); \ auto e_init = q.submit([&](sycl::handler &h) { \ h.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id()[0]; \ + int i = static_cast(idx.get_global_id()[0]); \ remote[i] = ~(TYPE) 0; \ - for (size_t j = 0; j < npes; j++) \ - val[j + i * (size_t) npes] = (TYPE) 0; \ + for (int j = 0; j < npes; j++) \ + val[j + i * 
npes] = (TYPE) 0; \ }); \ }); \ e_init.wait_and_throw(); \ ishmem_barrier_all(); \ auto e_run = q.parallel_for(sycl::nd_range<1>{N, N}, [=](sycl::nd_item<1> idx) { \ - size_t i = idx.get_global_id(0); \ + int i = static_cast(idx.get_global_id(0)); \ auto grp = idx.get_group(); \ - for (size_t j = 0; j < npes; j++) \ - ishmem_##TYPENAME##_atomic_fetch_xor_nbi(&val[i * (size_t) npes + j], &remote[i], \ - (TYPE) (1LLU << mype), \ - static_cast(j)); \ + for (int j = 0; j < npes; j++) \ + ishmem_##TYPENAME##_atomic_fetch_xor_nbi(&val[i * npes + j], &remote[i], \ + (TYPE) (1LLU << mype), j); \ ishmemx_barrier_all_work_group(grp); \ }); \ e_run.wait_and_throw(); \ diff --git a/test/unit/collect.cpp b/test/unit/collect.cpp index 382f5e5..f579894 100644 --- a/test/unit/collect.cpp +++ b/test/unit/collect.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -87,11 +87,11 @@ size_t collect_tester::create_check_pattern(ishmemi_type_t t, ishmemi_op_t op, t { size_t test_size_per_pe = 0; size_t total_nelems = 0; - for (size_t from_pe = 0; from_pe < n_pes; from_pe += 1) { - test_size_per_pe = (size_t) collect_nelems_dest[from_pe] * typesize(t); + for (int from_pe = 0; from_pe < n_pes; from_pe += 1) { + test_size_per_pe = collect_nelems_dest[from_pe] * typesize(t); for (size_t idx = 0; idx < ((test_size_per_pe / sizeof(long)) + 1); idx += 1) { - host_source[idx] = (long) ((collect_nelems_dest[from_pe] << 48) + - ((0x80L + from_pe) << 40) + (0xffL << 32) + idx); + host_source[idx] = (long) ((long) (collect_nelems_dest[from_pe] << 48) + + ((0x80L + from_pe) << 40) + (0xffL << 32) + (long) idx); } memcpy((void *) (((uintptr_t) host_check) + (total_nelems * typesize(t))), host_source, test_size_per_pe); @@ -118,7 +118,7 @@ size_t collect_tester::run_offset_tests(ishmemi_op_t op) for (int typeindex = 0; typeindex < num_test_types; typeindex += 1) { ishmemi_type_t t = test_types[typeindex]; if 
(my_pe == 0) { - for (size_t i = 0; i < n_pes; ++i) { + for (int i = 0; i < n_pes; ++i) { collect_nelems_source[i] = (size_t) rand() % max_nelems + 1; } } diff --git a/test/unit/fcollect.cpp b/test/unit/fcollect.cpp index 1b8f986..4f2a3e0 100644 --- a/test/unit/fcollect.cpp +++ b/test/unit/fcollect.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -51,8 +51,8 @@ size_t fcollect_tester::create_source_pattern(ishmemi_type_t t, ishmemi_op_t op, size_t test_size = nelems * typesize(t); size_t from_pe = (size_t) my_pe; for (size_t idx = 0; idx < ((test_size / sizeof(long)) + 1); idx += 1) { - aligned_source[idx] = - (long) ((nelems << 48) + ((0x80L + from_pe) << 40) + (0xffL << 32) + idx); + aligned_source[idx] = (long) (((long) nelems << 48) + ((long) (0x80L + from_pe) << 40) + + (0xffL << 32) + (long) idx); if (patterndebugflag && (idx < 16)) { printf("[%d] source pattern idx %lu val %016lx\n", my_pe, idx, aligned_source[idx]); } @@ -66,10 +66,10 @@ size_t fcollect_tester::create_check_pattern(ishmemi_type_t t, ishmemi_op_t op, size_t nelems) { size_t test_size_per_pe = nelems * typesize(t); - for (size_t from_pe = 0; from_pe < n_pes; from_pe += 1) { + for (size_t from_pe = 0; from_pe < (size_t) n_pes; from_pe += 1) { for (size_t idx = 0; idx < ((test_size_per_pe / sizeof(long)) + 1); idx += 1) { - host_source[idx] = - (long) ((nelems << 48) + ((0x80L + from_pe) << 40) + (0xffL << 32) + idx); + host_source[idx] = (long) (((long) nelems << 48) + ((long) (0x80L + from_pe) << 40) + + (0xffL << 32) + (long) idx); } memcpy((void *) (((uintptr_t) host_check) + (from_pe * test_size_per_pe)), host_source, test_size_per_pe); diff --git a/test/unit/g.cpp b/test/unit/g.cpp index f422377..bc25628 100644 --- a/test/unit/g.cpp +++ b/test/unit/g.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2025 Intel Corporation +/* Copyright (C) 2023 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause 
*/ diff --git a/test/unit/get_nbi.cpp b/test/unit/get_nbi.cpp index 7bca14c..3bf3cc5 100644 --- a/test/unit/get_nbi.cpp +++ b/test/unit/get_nbi.cpp @@ -15,7 +15,7 @@ #define TEST_BRANCH_ON_QUEUE(testname, typeenum, typename, type, op, opname) \ int pe = (ishmem_my_pe() + ishmem_n_pes() - 1) % ishmem_n_pes(); \ ishmemx_##typename##_##testname##_on_queue((type *) dest, (type *) src, nelems, pe, q); \ - ishmem_quiet(); + ishmemx_quiet_on_queue(q); #define TEST_BRANCH_WORK_GROUP(testname, typeenum, typename, type, op, opname) \ int pe = (ishmem_my_pe() + ishmem_n_pes() - 1) % ishmem_n_pes(); \ diff --git a/test/unit/int_amos.cpp b/test/unit/int_amos.cpp index 749acc3..76f8e15 100644 --- a/test/unit/int_amos.cpp +++ b/test/unit/int_amos.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Intel Corporation +/* Copyright (C) 2025 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ @@ -17,8 +17,8 @@ int main(int argc, char **argv) ishmem_init(); - int my_pe = ishmem_my_pe(); - int npes = ishmem_n_pes(); + uint32_t my_pe = static_cast(ishmem_my_pe()); + uint32_t npes = static_cast(ishmem_n_pes()); sycl::queue q; @@ -38,8 +38,8 @@ int main(int argc, char **argv) auto e_init = q.submit([&](sycl::handler &h) { h.parallel_for(sycl::nd_range<1>{data_size, data_size}, [=](sycl::nd_item<1> idx) { uint32_t i = static_cast(idx.get_global_id()[0]); - source[i] = (static_cast(my_pe) << 16) + i; - target[i] = (static_cast(my_pe) << 16) + 0xface; + source[i] = (my_pe << 16) + i; + target[i] = (my_pe << 16) + 0xface; }); }); e_init.wait_and_throw(); @@ -103,8 +103,8 @@ int main(int argc, char **argv) * correctness (0 for correct, 1 for error) for 14 operations */ auto e_verify = q.submit([&](sycl::handler &h) { h.single_task([=]() { - int target_pe = ((my_pe + 1) % npes) << 16; - int source_pe = ((my_pe - 1 + npes) % npes) << 16; + uint32_t target_pe = static_cast(((my_pe + 1) % npes) << 16); + uint32_t source_pe = static_cast(((my_pe - 1 + npes) % npes) << 16); /* Fetch verify */ if 
(target[0] != ((target_pe) + 0)) { *errors = *errors + (1 << 0); @@ -180,8 +180,8 @@ int main(int argc, char **argv) q.memcpy(hosttarget, target, sizeof(uint32_t) * data_size).wait_and_throw(); q.memcpy(hostsource, source, sizeof(uint32_t) * data_size).wait_and_throw(); int err_val = *errors; - int exp1 = ((my_pe + 1) % npes) << 16; - int exp2 = ((my_pe - 1 + npes) % npes) << 16; + uint32_t exp1 = ((my_pe + 1) % npes) << 16; + uint32_t exp2 = ((my_pe - 1 + npes) % npes) << 16; if (check_error_bit(err_val, 0)) fprintf(stdout, "[PE %d] Fetch failed: target[0] = 0x%08x, Expected = 0x%08x\n", my_pe, diff --git a/test/unit/int_iget_device.cpp b/test/unit/int_iget_device.cpp index 46dbd4f..d3fd7c4 100644 --- a/test/unit/int_iget_device.cpp +++ b/test/unit/int_iget_device.cpp @@ -87,16 +87,16 @@ int main(int argc, char **argv) CHECK_ALLOC(hosttarget); q.memcpy(hosttarget, target, sizeof(int) * array_size).wait_and_throw(); int source_idx = 0; - for (int i = 0; i < array_size; i += 1) { + for (size_t i = 0; i < array_size; i += 1) { if (((i % dst) == 0) && (i < (dst * elems_to_copy))) { if (hosttarget[i] != (((my_pe + 1) % npes) << 16) + source_idx) { - fprintf(stdout, "[%d] index %d expected 0x%08x got 0x%08x\n", my_pe, i, + fprintf(stdout, "[%d] index %ld expected 0x%08x got 0x%08x\n", my_pe, i, (uint) ((((my_pe + 1) % npes) << 16) + source_idx), hosttarget[i]); } source_idx += sst; } else { if (hosttarget[i] != ((my_pe << 16) + 0xface)) { - fprintf(stdout, "[%d] index %d expected 0x%08x got 0x%08x\n", my_pe, i, + fprintf(stdout, "[%d] index %ld expected 0x%08x got 0x%08x\n", my_pe, i, ((my_pe << 16) + 0xface), hosttarget[i]); } } diff --git a/test/unit/int_iget_on_queue.cpp b/test/unit/int_iget_on_queue.cpp index 1b0cbde..9c0b08a 100644 --- a/test/unit/int_iget_on_queue.cpp +++ b/test/unit/int_iget_on_queue.cpp @@ -80,16 +80,16 @@ int main(int argc, char **argv) CHECK_ALLOC(hosttarget); q.memcpy(hosttarget, target, sizeof(int) * array_size).wait_and_throw(); int 
source_idx = 0; - for (int i = 0; i < array_size; i += 1) { + for (size_t i = 0; i < array_size; i += 1) { if (((i % dst) == 0) && (i < (dst * elems_to_copy))) { if (hosttarget[i] != (((my_pe + 1) % npes) << 16) + source_idx) { - fprintf(stdout, "[%d] index %d expected 0x%08x got 0x%08x\n", my_pe, i, + fprintf(stdout, "[%d] index %ld expected 0x%08x got 0x%08x\n", my_pe, i, (uint) ((((my_pe + 1) % npes) << 16) + source_idx), hosttarget[i]); } source_idx += sst; } else { if (hosttarget[i] != ((my_pe << 16) + 0xface)) { - fprintf(stdout, "[%d] index %d expected 0x%08x got 0x%08x\n", my_pe, i, + fprintf(stdout, "[%d] index %ld expected 0x%08x got 0x%08x\n", my_pe, i, ((my_pe << 16) + 0xface), hosttarget[i]); } } diff --git a/test/unit/int_iget_work_group_device.cpp b/test/unit/int_iget_work_group_device.cpp index 91fde7f..6f05445 100644 --- a/test/unit/int_iget_work_group_device.cpp +++ b/test/unit/int_iget_work_group_device.cpp @@ -141,7 +141,7 @@ int main(int argc, char **argv) h.single_task([=]() { int source_idx = 0; int expected_value = 0; - for (int i = 0; i < array_size; ++i) { + for (size_t i = 0; i < array_size; ++i) { if (((i % dst) == 0) && (i < (dst * elems_to_copy))) { expected_value = 0x4000000 + (((my_pe + 2) % npes) << 24) + (((my_pe + 1) % npes) << 20) + (source_idx); diff --git a/test/unit/int_iput_device.cpp b/test/unit/int_iput_device.cpp index 3702ef4..2dbb6f3 100644 --- a/test/unit/int_iput_device.cpp +++ b/test/unit/int_iput_device.cpp @@ -66,7 +66,7 @@ int main(int argc, char **argv) h.single_task([=]() { int source_pe = (my_pe > 0) ? (my_pe - 1) : (npes - 1); int source_idx = 0; - for (int i = 0; i < array_size; ++i) { + for (size_t i = 0; i < array_size; ++i) { if (((i % dst) == 0) && (i < (dst * elems_to_copy))) { if (target[i] != (source_pe << 16) + source_idx) { *errors = *errors + 1; @@ -89,16 +89,16 @@ int main(int argc, char **argv) q.memcpy(hosttarget, target, sizeof(int) * array_size).wait_and_throw(); int source_pe = (my_pe > 0) ? 
(my_pe - 1) : (npes - 1); int source_idx = 0; - for (int i = 0; i < array_size; i += 1) { + for (size_t i = 0; i < array_size; i += 1) { if (((i % dst) == 0) && (i < (dst * elems_to_copy))) { if (hosttarget[i] != (source_pe << 16) + source_idx) { - fprintf(stdout, "[%d] index %d expected 0x%08x got 0x%08x\n", my_pe, i, + fprintf(stdout, "[%d] index %ld expected 0x%08x got 0x%08x\n", my_pe, i, (source_pe << 16) + source_idx, hosttarget[i]); } source_idx += sst; } else { if (hosttarget[i] != ((my_pe << 16) + 0xface)) { - fprintf(stdout, "[%d] index %d expected 0x%08x got 0x%08x\n", my_pe, i, + fprintf(stdout, "[%d] index %ld expected 0x%08x got 0x%08x\n", my_pe, i, ((my_pe << 16) + 0xface), hosttarget[i]); } } diff --git a/test/unit/int_iput_on_queue.cpp b/test/unit/int_iput_on_queue.cpp index 8ba6da3..125cc4c 100644 --- a/test/unit/int_iput_on_queue.cpp +++ b/test/unit/int_iput_on_queue.cpp @@ -59,7 +59,7 @@ int main(int argc, char **argv) h.single_task([=]() { int source_pe = (my_pe > 0) ? (my_pe - 1) : (npes - 1); int source_idx = 0; - for (int i = 0; i < array_size; ++i) { + for (size_t i = 0; i < array_size; ++i) { if (((i % dst) == 0) && (i < (dst * elems_to_copy))) { if (target[i] != (source_pe << 16) + source_idx) { *errors = *errors + 1; @@ -82,16 +82,16 @@ int main(int argc, char **argv) q.memcpy(hosttarget, target, sizeof(int) * array_size).wait_and_throw(); int source_pe = (my_pe > 0) ? 
(my_pe - 1) : (npes - 1); int source_idx = 0; - for (int i = 0; i < array_size; i += 1) { + for (size_t i = 0; i < array_size; i += 1) { if (((i % dst) == 0) && (i < (dst * elems_to_copy))) { if (hosttarget[i] != (source_pe << 16) + source_idx) { - fprintf(stdout, "[%d] index %d expected 0x%08x got 0x%08x\n", my_pe, i, + fprintf(stdout, "[%d] index %ld expected 0x%08x got 0x%08x\n", my_pe, i, (source_pe << 16) + source_idx, hosttarget[i]); } source_idx += sst; } else { if (hosttarget[i] != ((my_pe << 16) + 0xface)) { - fprintf(stdout, "[%d] index %d expected 0x%08x got 0x%08x\n", my_pe, i, + fprintf(stdout, "[%d] index %ld expected 0x%08x got 0x%08x\n", my_pe, i, ((my_pe << 16) + 0xface), hosttarget[i]); } } diff --git a/test/unit/int_iput_work_group_device.cpp b/test/unit/int_iput_work_group_device.cpp index 9fa16d2..44d043a 100644 --- a/test/unit/int_iput_work_group_device.cpp +++ b/test/unit/int_iput_work_group_device.cpp @@ -142,7 +142,7 @@ int main(int argc, char **argv) int source_pe = (my_pe > 0) ? 
(my_pe - 1) : (npes - 1); int source_idx = 0; int expected_value = 0; - for (int i = 0; i < array_size; ++i) { + for (size_t i = 0; i < array_size; ++i) { if (((i % dst) == 0) && (i < (dst * elems_to_copy))) { expected_value = 0x4000000 + (((source_pe + 1) % npes) << 24) + ((source_pe % npes) << 20) + (source_idx); diff --git a/test/unit/int_put_nbi_device.cpp b/test/unit/int_put_nbi_device.cpp index 2ceb6ee..ebd0d33 100644 --- a/test/unit/int_put_nbi_device.cpp +++ b/test/unit/int_put_nbi_device.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2025 Intel Corporation +/* Copyright (C) 2023 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ diff --git a/test/unit/put_nbi.cpp b/test/unit/put_nbi.cpp index 8bd9917..ccf74a3 100644 --- a/test/unit/put_nbi.cpp +++ b/test/unit/put_nbi.cpp @@ -15,7 +15,7 @@ #define TEST_BRANCH_ON_QUEUE(testname, typeenum, typename, type, op, opname) \ int pe = (ishmem_my_pe() + 1) % ishmem_n_pes(); \ ishmemx_##typename##_##testname##_on_queue((type *) dest, (type *) src, nelems, pe, q); \ - ishmem_quiet(); + ishmemx_quiet_on_queue(q); #define TEST_BRANCH_WORK_GROUP(testname, typeenum, typename, type, op, opname) \ int pe = (ishmem_my_pe() + 1) % ishmem_n_pes(); \ diff --git a/test/unit/signal_wait_until_host.cpp b/test/unit/signal_wait_until_host.cpp index 369c953..3ee4d36 100644 --- a/test/unit/signal_wait_until_host.cpp +++ b/test/unit/signal_wait_until_host.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2025 Intel Corporation +/* Copyright (C) 2024 Intel Corporation * SPDX-License-Identifier: BSD-3-Clause */ diff --git a/test/unit/test_all.cpp b/test/unit/test_all.cpp index a557bfc..d78f533 100644 --- a/test/unit/test_all.cpp +++ b/test/unit/test_all.cpp @@ -99,6 +99,7 @@ size_t test_all_tester::create_check_pattern(ishmemi_type_t t, ishmemi_op_t op, int main(int argc, char **argv) { class test_all_tester t(argc, argv); + t.max_nelems = 512; size_t bufsize = std::max(NUM_COMPARISON_OPERATORS * sizeof(uint64_t), t.max_nelems * sizeof(uint64_t)); 
diff --git a/test/unit/test_all_vector.cpp b/test/unit/test_all_vector.cpp index 24c8b69..8ffb9a6 100644 --- a/test/unit/test_all_vector.cpp +++ b/test/unit/test_all_vector.cpp @@ -236,6 +236,7 @@ size_t test_all_vector_tester::create_check_pattern(ishmemi_type_t t, ishmemi_op int main(int argc, char **argv) { class test_all_vector_tester t(argc, argv); + t.max_nelems = 512; size_t bufsize = std::max(NUM_COMPARISON_OPERATORS * sizeof(uint64_t), t.max_nelems * sizeof(uint64_t)); diff --git a/test/unit/test_any.cpp b/test/unit/test_any.cpp index 31b1c34..edab8ca 100644 --- a/test/unit/test_any.cpp +++ b/test/unit/test_any.cpp @@ -116,6 +116,7 @@ size_t test_any_tester::create_check_pattern(ishmemi_type_t t, ishmemi_op_t op, int main(int argc, char **argv) { class test_any_tester t(argc, argv); + t.max_nelems = 512; size_t bufsize = std::max(NUM_COMPARISON_OPERATORS * sizeof(uint64_t), t.max_nelems * sizeof(uint64_t)); diff --git a/test/unit/test_any_vector.cpp b/test/unit/test_any_vector.cpp index 20371d1..392f7d5 100644 --- a/test/unit/test_any_vector.cpp +++ b/test/unit/test_any_vector.cpp @@ -242,6 +242,7 @@ size_t test_any_vector_tester::create_check_pattern(ishmemi_type_t t, ishmemi_op int main(int argc, char **argv) { class test_any_vector_tester t(argc, argv); + t.max_nelems = 512; size_t bufsize = std::max(NUM_COMPARISON_OPERATORS * sizeof(uint64_t), t.max_nelems * sizeof(uint64_t)); diff --git a/test/unit/test_some.cpp b/test/unit/test_some.cpp index 93d8a9b..2557235 100644 --- a/test/unit/test_some.cpp +++ b/test/unit/test_some.cpp @@ -140,6 +140,7 @@ size_t test_some_tester::create_check_pattern(ishmemi_type_t t, ishmemi_op_t op, int main(int argc, char **argv) { class test_some_tester t(argc, argv); + t.max_nelems = 512; size_t bufsize = (NUM_COMPARISON_OPERATORS + (NUM_COMPARISON_OPERATORS * t.max_nelems)) * sizeof(size_t); diff --git a/test/unit/test_some_vector.cpp b/test/unit/test_some_vector.cpp index bb734ec..e9a6054 100644 --- 
a/test/unit/test_some_vector.cpp +++ b/test/unit/test_some_vector.cpp @@ -286,6 +286,7 @@ size_t test_some_vector_tester::create_check_pattern(ishmemi_type_t t, ishmemi_o int main(int argc, char **argv) { class test_some_vector_tester t(argc, argv); + t.max_nelems = 512; size_t bufsize = (NUM_COMPARISON_OPERATORS + (NUM_COMPARISON_OPERATORS * t.max_nelems)) * sizeof(size_t);