docs/_tutorials/advanced-install.md (1 addition, 1 deletion)
@@ -84,7 +84,7 @@ This should complete the full build 2-3 times faster. You can adjust `-j` to spe
 You can also build a binary wheel and install it on multiple machines that have the same type of GPUs and the same software environment (CUDA toolkit, PyTorch, Python, etc.)
-This will create a pypi binary wheel under `dist`, e.g., ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` and then you can install it directly on multiple machines, in our example:
+You may also want to ensure your model configuration is compliant with FlashAttention's requirements. For instance, to achieve optimal performance, the head size should be divisible by 8. Refer to the FlashAttention documentation for more details.
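For context, the removed line above refers to the wheel-building workflow covered earlier in this tutorial. Below is a minimal sketch of that workflow, assuming a standard DeepSpeed source checkout; the exact build flags and the wheel filename depend on your DeepSpeed version and environment, so treat this as illustrative rather than the tutorial's verbatim commands.

```bash
# On the build machine: pre-build the ops and produce a binary wheel under dist/
# (-j8 controls how many cores are used for compilation)
DS_BUILD_OPS=1 python setup.py build_ext -j8 bdist_wheel

# Copy the resulting wheel to each target machine with the same GPUs, CUDA toolkit,
# PyTorch, and Python, then install it directly, e.g.:
pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl
```

The wheel bakes in the compiled ops for the build machine's environment, which is why the context line stresses that the target machines must match in GPU type and software stack.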
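The added line mentions FlashAttention's head-size requirement. As a quick, hypothetical sanity check (the hidden size and head count below are placeholder values, not taken from this PR), you can verify that the per-head dimension is a multiple of 8:

```bash
# Hypothetical check: head size = hidden_size / num_attention_heads should be divisible by 8
HIDDEN_SIZE=4096
NUM_HEADS=32
HEAD_SIZE=$((HIDDEN_SIZE / NUM_HEADS))
if [ $((HEAD_SIZE % 8)) -eq 0 ]; then
  echo "head size ${HEAD_SIZE}: OK for FlashAttention"
else
  echo "head size ${HEAD_SIZE}: adjust hidden_size or num_attention_heads"
fi
```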