microsoft · jeffra · Nov 12, 2020 · Oct 30, 2020 · Oct 30, 2020 · Oct 30, 2020
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,7 @@ build/
 dist/
 *.so
 deepspeed.egg-info/
+build.txt
 
 # Website
 docs/_site/
@@ -23,3 +24,7 @@ docs/code-docs/build
 
 # Testing data
 tests/unit/saved_checkpoint/
+
+# Dev/IDE data
+.vscode
+.theia
diff --git a/.gitmodules b/.gitmodules
@@ -1,6 +1,3 @@
-[submodule "third_party/apex"]
-	path = third_party/apex
-	url = https://github.com/NVIDIA/apex.git
 [submodule "DeepSpeedExamples"]
 	path = DeepSpeedExamples
 	url = https://github.com/microsoft/DeepSpeedExamples

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+global-include *.cpp *.h *.cu *.tr *.cuh *.cc *.txt
diff --git a/README.md b/README.md
@@ -1,4 +1,5 @@
 [![Build Status](https://dev.azure.com/DeepSpeedMSFT/DeepSpeed/_apis/build/status/microsoft.DeepSpeed?branchName=master)](https://dev.azure.com/DeepSpeedMSFT/DeepSpeed/_build/latest?definitionId=1&branchName=master)
+[![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://badge.fury.io/py/deepspeed)
 [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)
 [![License MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE)
 [![Docker Pulls](https://img.shields.io/docker/pulls/deepspeed/deepspeed)](https://hub.docker.com/r/deepspeed/deepspeed)
@@ -31,29 +32,25 @@ information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale)
 
 
 # News
-* [2020/09/10] [DeepSpeed: Extreme-scale model training for everyone](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/)
+* [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation)
+* [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html)
+* [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/)
   * [Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention-news.html)
   * [Training a trillion parameters with pipeline parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html)
   * [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html)
   * [10x bigger model training on a single GPU with ZeRO-Offload](https://www.deepspeed.ai/news/2020/09/08/ZeRO-Offload.html)
 * [2020/08/07] [DeepSpeed Microsoft Research Webinar](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html) is now available on-demand
-* [2020/07/24] [DeepSpeed Microsoft Research Webinar](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html) on August 6th, 2020
-  [![DeepSpeed webinar](docs/assets/images/webinar-aug2020.png)](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-Live.html)
-* [2020/05/19] [ZeRO-2 & DeepSpeed: Shattering Barriers of Deep Learning Speed & Scale](https://www.microsoft.com/en-us/research/blog/zero-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/)
-* [2020/05/19] [An Order-of-Magnitude Larger and Faster Training with ZeRO-2](https://www.deepspeed.ai/news/2020/05/18/zero-stage2.html)
-* [2020/05/19] [The Fastest and Most Efficient BERT Training through Optimized Transformer Kernels](https://www.deepspeed.ai/news/2020/05/18/bert-record.html)
-* [2020/02/13] [Turing-NLG: A 17-billion-parameter language model by Microsoft](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/)
-* [2020/02/13] [ZeRO & DeepSpeed: New system optimizations enable training models with over 100 billion parameters](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/)
 
 
 # Table of Contents
 | Section                                 | Description                                 |
 | --------------------------------------- | ------------------------------------------- |
 | [Why DeepSpeed?](#why-deepspeed)        |  DeepSpeed overview                         |
-| [Features](#features)                   |  DeepSpeed features                         |
-| [Further Reading](#further-reading)     |  DeepSpeed documentation, tutorials, etc.   |
-| [Contributing](#contributing)           |  Instructions for contributing to DeepSpeed |
-| [Publications](#publications)           |  DeepSpeed publications                     |
+| [Install](#installation)                |  Installation details                       |
+| [Features](#features)                   |  Feature list and overview                  |
+| [Further Reading](#further-reading)     |  Documentation, tutorials, etc.             |
+| [Contributing](#contributing)           |  Instructions for contributing              |
+| [Publications](#publications)           |  Publications related to DeepSpeed          |
 
 # Why DeepSpeed?
 Training advanced deep learning models is challenging. Beyond model design,
@@ -65,8 +62,32 @@ a large model easily runs out of memory with pure data parallelism and it is
 difficult to use model parallelism. DeepSpeed addresses these challenges to
 accelerate model development *and* training.
 
-# Features
+# Installation
+
+The quickest way to get started with DeepSpeed is via pip; this will install
+the latest release of DeepSpeed which is not tied to specific PyTorch or CUDA
+versions. DeepSpeed includes several C++/CUDA extensions that we commonly refer
+to as our 'ops'.  By default, all of these extensions/ops will be built
+just-in-time (JIT) using [torch's JIT C++ extension loader that relies on
+ninja](https://pytorch.org/docs/stable/cpp_extension.html) to build and
+dynamically link them at runtime.
+
+```bash
+pip install deepspeed
+```
+
+After installation you can validate your install and see which extensions/ops
+your machine is compatible with via the DeepSpeed environment report.
 
+```bash
+ds_report
+```
+
+If you would like to pre-install any of the DeepSpeed extensions/ops (instead
+of JIT compiling) or install pre-compiled ops via PyPI, please see our [advanced
+installation instructions](https://www.deepspeed.ai/tutorials/advanced-install/).
+
+# Features
 Below we provide a brief feature list, see our detailed [feature
 overview](https://www.deepspeed.ai/features/) for descriptions and usage.
 

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -43,17 +43,15 @@ jobs:
       conda install -q --yes conda
       conda install -q --yes pip
       conda install -q --yes gxx_linux-64
-      if [[ $(cuda.version) != "10.2" ]]; then conda install --yes -c conda-forge cudatoolkit-dev=$(cuda.version) ; fi
       echo "PATH=$PATH, LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
     displayName: 'Setup environment python=$(python.version) pytorch=$(pytorch.version) cuda=$(cuda.version)'
 
     # Manually install torch/torchvision first to enforce versioning.
   - script: |
       source activate $(conda_env)
       pip install --progress-bar=off torch==$(pytorch.version) torchvision==$(torchvision.version)
-      #-f https://download.pytorch.org/whl/torch_stable.html
-      ./install.sh --local_only
-      #python -I basic_install_test.py
+      pip install .[dev]
+      ds_report
     displayName: 'Install DeepSpeed'
 
   - script: |
@@ -71,7 +69,8 @@ jobs:
 
   - script: |
       source activate $(conda_env)
-      pytest --durations=0 --forked --verbose -x tests/unit/
+      if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+      TORCH_EXTENSIONS_DIR=./torch-extensions pytest --durations=0 --forked --verbose -x tests/unit/
     displayName: 'Unit tests'
 
 #   - script: |

diff --git a/basic_install_test.py b/basic_install_test.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+
+from deepspeed.env_report import main
+
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,14 @@
+/* Copyright 2020 The Microsoft DeepSpeed Team
+   Copyright NVIDIA/apex
+   This file is adapted from fused adam in NVIDIA/apex, commit a109f85
+*/
+
+#ifndef TORCH_CHECK
+#define TORCH_CHECK AT_CHECK
+#endif
+
+#ifdef VERSION_GE_1_3
+#define DATA_PTR data_ptr
+#else
+#define DATA_PTR data
+#endif
@@ -2,32 +2,40 @@
 
 #include "custom_cuda_layers.h"
 
+//__global__ void param_update_kernel(const float* input, __half* output, int size)
+//{
+//    const float4* input_cast = reinterpret_cast<const float4*>(input);
+//    float2* output_cast = reinterpret_cast<float2*>(output);
+//
+//    int id = blockIdx.x * blockDim.x + threadIdx.x;
+//
+//    if (id < size) {
+//        float4 data = input_cast[id];
+//        float2 cast_data;
+//        __half* output_h = reinterpret_cast<__half*>(&cast_data);
+//
+//        output_h[0] = (__half)data.x;
+//        output_h[1] = (__half)data.y;
+//        output_h[2] = (__half)data.z;
+//        output_h[3] = (__half)data.w;
+//
+//        output_cast[id] = cast_data;
+//
+//    }
+//}
+
 __global__ void param_update_kernel(const float* input, __half* output, int size)
 {
-    const float4* input_cast = reinterpret_cast<const float4*>(input);
-    float2* output_cast = reinterpret_cast<float2*>(output);
-
     int id = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (id < size) {
-        float4 data = input_cast[id];
-        float2 cast_data;
-        __half* output_h = reinterpret_cast<__half*>(&cast_data);
-
-        output_h[0] = (__half)data.x;
-        output_h[1] = (__half)data.y;
-        output_h[2] = (__half)data.z;
-        output_h[3] = (__half)data.w;
-
-        output_cast[id] = cast_data;
-    }
+    if (id < size) { output[id] = (__half)input[id]; }
 }
 
 void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream)
 {
     int threads = 512;
 
-    size /= 4;
+    // size /= 4;  (disabled: param_update_kernel now converts one element per thread)
     dim3 grid_dim((size - 1) / threads + 1);
     dim3 block_dim(threads);
 

@@ -0,0 +1,20 @@
+#include <torch/extension.h>
+
+void multi_tensor_adam_cuda(int chunk_size,
+                            at::Tensor noop_flag,
+                            std::vector<std::vector<at::Tensor>> tensor_lists,
+                            const float lr,
+                            const float beta1,
+                            const float beta2,
+                            const float epsilon,
+                            const int step,
+                            const int mode,
+                            const int bias_correction,
+                            const float weight_decay);
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    m.def("multi_tensor_adam",
+          &multi_tensor_adam_cuda,
+          "Compute and apply gradient update to parameters for Adam optimizer");
+}