Added Ray remote backend and Dask distributed preprocessing #1090

Merged: 128 commits, Mar 19, 2021
Changes shown from 115 commits

Commits (128)
3b8713d
POC of Dask replacing Pandas for CSV
tgaddair Oct 18, 2020
197314c
WIP performance improvements for categorical
tgaddair Oct 22, 2020
6bf7083
Removed debug code
tgaddair Oct 22, 2020
dd52d5c
Auto parallelize across CPU cores
tgaddair Oct 22, 2020
1f228b7
Added DataProcessingEngine
tgaddair Oct 23, 2020
12bfea7
Fixed split
tgaddair Oct 23, 2020
b39d372
Fixed API
tgaddair Oct 23, 2020
5b5fc60
Fixed data processing
tgaddair Oct 24, 2020
8b2b594
Drop index
tgaddair Oct 24, 2020
b7f9546
Added Petastorm dataset
tgaddair Oct 24, 2020
c952130
Cleaned up dataset creation
tgaddair Oct 24, 2020
2c93b60
Added Dataset
tgaddair Oct 24, 2020
ea6b4a7
Train from dataset
tgaddair Oct 24, 2020
b203fa6
Fixed bugs
tgaddair Oct 24, 2020
6b3bb08
Fixed string_utils
tgaddair Oct 25, 2020
ef2a314
Fixed tests
tgaddair Oct 25, 2020
9a743fe
Fixed temp dataset
tgaddair Oct 25, 2020
a630f14
Added Backend
tgaddair Oct 25, 2020
945d56e
Plumb through backend
tgaddair Oct 25, 2020
2aab9c5
Plumb backend through get_feature_meta
tgaddair Oct 25, 2020
0a0a7c4
Plumb through backend to add_feature_data
tgaddair Oct 25, 2020
3419178
Plumb in preprocess_for_prediction
tgaddair Oct 25, 2020
9d13c71
Fixed Pandas processing
tgaddair Oct 25, 2020
95a7952
Added cache management
tgaddair Oct 25, 2020
22b7538
Fixed unit tests
tgaddair Oct 25, 2020
fd7cbab
Removed context, engine to processor
tgaddair Oct 25, 2020
b63b316
Added numerical test
tgaddair Oct 25, 2020
0941ecd
RayBackend -> DaskBackend
tgaddair Oct 25, 2020
77a59f9
Fixed read_xsv
tgaddair Oct 25, 2020
cab90a1
Fixed set feature
tgaddair Oct 26, 2020
755c204
Untracked Netflix example
tgaddair Oct 26, 2020
a105e41
Added Dask requirements
tgaddair Oct 26, 2020
0cbb582
Fixed bag feature
tgaddair Oct 26, 2020
87f6e4b
Fixed vector feature
tgaddair Oct 26, 2020
5e089d4
Fixed h3
tgaddair Oct 26, 2020
4f6c0ba
Fixed date
tgaddair Oct 26, 2020
93cbccd
Fixed timeseries
tgaddair Oct 26, 2020
0e93043
Converted audio features processing
tgaddair Oct 27, 2020
bb33fc0
Fixed reshaping
tgaddair Oct 27, 2020
7e8d3c3
Fixed tests
tgaddair Oct 27, 2020
8924ef6
Removed debug print
tgaddair Oct 27, 2020
8e95be2
Fixed image processing
tgaddair Oct 27, 2020
b19ea58
Added tests for exceptions
tgaddair Oct 27, 2020
0afade5
meta_kwargs -> map_objects
tgaddair Oct 27, 2020
aabb582
Removed unused methods
tgaddair Oct 27, 2020
ac29b9d
Removed prints
tgaddair Oct 27, 2020
e3e7a14
Reduced runtime
tgaddair Oct 27, 2020
318da2f
Removed Dask dependency on critical code paths
tgaddair Oct 28, 2020
1a98f33
Added dask extra
tgaddair Oct 28, 2020
f625402
Fixed concatenation
tgaddair Oct 28, 2020
418600a
Fixed split empty dataset
tgaddair Oct 28, 2020
ed9451b
Fixed subselect
tgaddair Oct 28, 2020
a3de815
Restored tests, removed subselect
tgaddair Oct 28, 2020
eacfd63
Moved meta.json
tgaddair Oct 30, 2020
4d8e690
Fixed cache key
tgaddair Oct 30, 2020
cd70992
Updated Petastorm
tgaddair Oct 30, 2020
6c94f22
Spawn Dask tests
tgaddair Oct 30, 2020
cb8bb91
Merge branch 'master' into dask
tgaddair Oct 30, 2020
985b5bd
Fixed test_sequence_features.py
tgaddair Oct 30, 2020
60ad4f4
Added tables
tgaddair Oct 30, 2020
dff8461
Fixed image features
tgaddair Oct 31, 2020
0469097
Fixed string_utils.py
tgaddair Oct 31, 2020
1922f35
Fixed kfold
tgaddair Oct 31, 2020
8952e23
Fixed test splits
tgaddair Oct 31, 2020
5057be6
Fixed test_visualization_api.py
tgaddair Oct 31, 2020
20351e0
Fixed test_visualization.py
tgaddair Oct 31, 2020
92d64c1
Fixed Dask
tgaddair Oct 31, 2020
25ab59b
Fixed test_experiment.py
tgaddair Oct 31, 2020
9f92c38
Changed backend to processor in string_utils
tgaddair Nov 6, 2020
f13fbe5
Added RayBackend to Dask preprocessing and Horovod training on a Ray …
tgaddair Nov 2, 2020
1963305
Replaced Trainer with Backend abstraction
tgaddair Nov 2, 2020
bbfe0de
Added Ray test
tgaddair Nov 3, 2020
498f788
Added Ray test implementation
tgaddair Nov 6, 2020
4d57e18
Refactored into mixins
tgaddair Nov 6, 2020
e0bebc2
Fixed TensorFlow initialization
tgaddair Nov 6, 2020
4ab92cf
Added remote utils
tgaddair Nov 6, 2020
61262ef
Fixed ECD serialization
tgaddair Nov 6, 2020
5fa2b17
Added sync_model to backend
tgaddair Nov 8, 2020
e15e93b
Created RayPredictor
tgaddair Nov 8, 2020
a9bb14b
Fixed kwargs
tgaddair Nov 8, 2020
e461c00
Fixed Horovod in Ray
tgaddair Nov 8, 2020
1294a21
Added return_on_master
tgaddair Nov 8, 2020
dc17f92
Renamed return_first for clarity
tgaddair Nov 8, 2020
1e59b3c
Refactored broadcast_return
tgaddair Nov 9, 2020
004902f
Added Backend plumbing in hyperopt
tgaddair Nov 10, 2020
cbcaaa1
Refactored horovod_utils
tgaddair Nov 10, 2020
21a3660
Removed occurrences of use_horovod
tgaddair Nov 10, 2020
f343a64
Replaced is_on_master
tgaddair Nov 10, 2020
83c9aad
Replaced is_on_master with is_coordinator for Trainer and Predictor
tgaddair Nov 10, 2020
3cb9185
Removed remaining occurrences of is_on_master
tgaddair Nov 10, 2020
594ca61
master -> coordinator
tgaddair Nov 10, 2020
902e8b7
Fixed implicit Horovod backend
tgaddair Nov 10, 2020
1e65b21
Fixed hyperopt
tgaddair Nov 10, 2020
b245055
Added requirements for Ray
tgaddair Nov 10, 2020
08bf843
Store model weights on Ray object store
tgaddair Nov 12, 2020
2febe92
Merged from master
tgaddair Jan 17, 2021
e8ed0f5
Merged master
tgaddair Feb 4, 2021
5b979a3
Removed processor module
tgaddair Feb 4, 2021
fe07d3f
Removed remote_utils
tgaddair Feb 4, 2021
273c85d
Fixed Dask tests
tgaddair Feb 4, 2021
b10393a
Temp fix tests
tgaddair Feb 7, 2021
73c5ad1
Spawn test
tgaddair Feb 7, 2021
9fab810
Merged master
tgaddair Feb 21, 2021
77e1b1f
Force local backend for prediction
tgaddair Feb 22, 2021
7c2d65b
Set Ray as Dask backend
tgaddair Mar 2, 2021
1822501
Fixed comments
tgaddair Mar 2, 2021
745c96c
Merge
tgaddair Mar 2, 2021
ff795a4
Removed link
tgaddair Mar 2, 2021
8f65656
Added test
tgaddair Mar 2, 2021
b87e7bc
TEST: skip experiment.py
tgaddair Mar 2, 2021
f6e3686
Split backend tests
tgaddair Mar 2, 2021
15920c1
Added pytest.ini
tgaddair Mar 2, 2021
1094f78
backend -> distributed
tgaddair Mar 2, 2021
7c324c1
Reordered tests
tgaddair Mar 2, 2021
ff598e3
Configure Dask parallelism
tgaddair Mar 2, 2021
7ea761f
TEST: disable fiber
tgaddair Mar 4, 2021
23b4be0
Test without Ray hyperopt
tgaddair Mar 6, 2021
214f718
Test ray only
tgaddair Mar 6, 2021
65f3293
Test all distributed
tgaddair Mar 6, 2021
f6a4227
Run distributed
tgaddair Mar 6, 2021
740ed9d
Only ray
tgaddair Mar 7, 2021
d10e14f
Serialize on load
tgaddair Mar 10, 2021
faf0052
Only return the weights
tgaddair Mar 10, 2021
0259e07
Revert test changes
tgaddair Mar 10, 2021
97139a6
Revert changes to visualization_utils
tgaddair Mar 10, 2021
54f1939
Merge branch 'master' into ray
tgaddair Mar 10, 2021
eb9fb2e
Resolved merge conflicts
tgaddair Mar 16, 2021
6dfbce6
Addressed comments
tgaddair Mar 19, 2021
14 changes: 10 additions & 4 deletions .travis.yml
@@ -5,11 +5,17 @@ language: python
 jobs:
   include:
     - python: "3.6"
-      env: TENSORFLOW=2.3.1
+      env: TENSORFLOW=2.3.1 TEST_FILTER="not distributed"
+    - python: "3.6"
+      env: TENSORFLOW=2.3.1 TEST_FILTER="distributed"
+    - python: "3.7"
+      env: TENSORFLOW=2.4.0 TEST_FILTER="not distributed"
     - python: "3.7"
-      env: TENSORFLOW=2.4.0
+      env: TENSORFLOW=2.4.0 TEST_FILTER="distributed"
+    - python: "3.8"
+      env: TENSORFLOW=nightly TEST_FILTER="not distributed"
     - python: "3.8"
-      env: TENSORFLOW=nightly
+      env: TENSORFLOW=nightly TEST_FILTER="distributed"
 before_install:
   - sudo apt-get update
   - sudo apt-get install -y cmake libsndfile1
@@ -28,4 +34,4 @@ install:
   - HOROVOD_WITH_TENSORFLOW=1 HOROVOD_WITHOUT_MPI=1 HOROVOD_WITHOUT_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 pip install --no-cache-dir '.[test]'
 script:
   - pip list
-  - pytest -v --timeout 300 tests
+  - pytest -v --timeout 300 --durations 10 -m "$TEST_FILTER" tests
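
The new TEST_FILTER variable feeds pytest's -m marker expression, so the distributed (Ray/Dask/Horovod) tests run in their own CI jobs. A hedged sketch of how a test can opt into that marker; the test body and name are illustrative, while the "distributed" marker itself comes from the pytest.ini added in this PR:

import pytest

@pytest.mark.distributed  # selected by TEST_FILTER="distributed", skipped by "not distributed"
def test_ray_backend_smoke():
    ray = pytest.importorskip('ray')  # skip on environments without Ray installed
    assert ray is not None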
16 changes: 15 additions & 1 deletion ludwig/backend/__init__.py
@@ -22,23 +22,37 @@
 LOCAL_BACKEND = LocalBackend()

 LOCAL = 'local'
+DASK = 'dask'
 HOROVOD = 'horovod'
+RAY = 'ray'

-ALL_BACKENDS = [LOCAL, HOROVOD]
+ALL_BACKENDS = [LOCAL, DASK, HOROVOD, RAY]


 def get_local_backend():
     return LOCAL_BACKEND


+def create_dask_backend():
+    from ludwig.backend.dask import DaskBackend
+    return DaskBackend()
+
+
 def create_horovod_backend():
     from ludwig.backend.horovod import HorovodBackend
     return HorovodBackend()


+def create_ray_backend():
+    from ludwig.backend.ray import RayBackend
+    return RayBackend()
+
+
 backend_registry = {
     LOCAL: get_local_backend,
+    DASK: create_dask_backend,
     HOROVOD: create_horovod_backend,
+    RAY: create_ray_backend,
     None: get_local_backend,
 }
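
A minimal sketch of how this registry resolves a backend by name. Constructing the backend directly here is only for illustration; in Ludwig the lookup is normally done internally from the backend name the user supplies:

from ludwig.backend import backend_registry, RAY

backend = backend_registry[RAY]()   # factory returns a RayBackend instance; None maps to the local backend
backend.initialize()                # connects to Ray and routes Dask through the Ray scheduler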

41 changes: 41 additions & 0 deletions ludwig/backend/dask.py
@@ -0,0 +1,41 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2020 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

from ludwig.backend.base import Backend, LocalTrainingMixin
from ludwig.constants import NAME
from ludwig.data.dataframe.dask import DaskEngine


class DaskBackend(LocalTrainingMixin, Backend):
    def __init__(self):
        super().__init__()
        self._df_engine = DaskEngine()

    def initialize(self):
        pass

    @property
    def df_engine(self):
        return self._df_engine

    @property
    def supports_multiprocessing(self):
        return False

    def check_lazy_load_supported(self, feature):
        raise ValueError(f'DaskBackend does not support lazy loading of data files at train time. '
                         f'Set preprocessing config `in_memory: True` for feature {feature[NAME]}')
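
For reference, the error above points users at the `in_memory` preprocessing flag. An illustrative config fragment that satisfies the check for an image feature; the feature names are hypothetical and the fragment is a sketch, not taken from this PR:

config = {
    'input_features': [
        {
            'name': 'image_path',                    # hypothetical feature name
            'type': 'image',
            'preprocessing': {'in_memory': True},    # required when training with DaskBackend/RayBackend
        },
    ],
    'output_features': [{'name': 'label', 'type': 'category'}],
}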
187 changes: 187 additions & 0 deletions ludwig/backend/ray.py
@@ -0,0 +1,187 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2020 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import logging
from collections import defaultdict

import dask
import ray
from horovod.ray import RayExecutor
from ray.util.dask import ray_dask_get

from ludwig.backend.base import Backend, RemoteTrainingMixin
from ludwig.constants import NAME
from ludwig.data.dataframe.dask import DaskEngine
from ludwig.models.predictor import BasePredictor, RemotePredictor
from ludwig.models.trainer import BaseTrainer, RemoteTrainer
from ludwig.utils.tf_utils import initialize_tensorflow


logger = logging.getLogger(__name__)


def get_dask_kwargs():
    # TODO ray: select this more intelligently,
    # must be greater than or equal to number of Horovod workers
    return dict(
        parallelism=int(ray.cluster_resources()['CPU'])
    )

Review thread on the parallelism default:

Author: @clarkzinzow does this make sense as the default repartition value? One partition per CPU? Not sure if there's a more reasonable heuristic for this. The one restriction we have is that for Petastorm, we must have at least one row group per Horovod worker, and the safest way to guarantee this at the moment is to repartition the dataframe.

Reviewer: That's the typical heuristic, yes, under the soft constraint of those chunks/partitions fitting nicely into each worker's memory.
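
A sketch of the stricter rule the TODO and the thread above describe, not part of this PR: one partition per CPU, but never fewer partitions than Horovod workers, so Petastorm can hand every worker at least one row group. The function name and argument are illustrative:

def get_dask_kwargs_with_worker_floor(num_horovod_workers):
    # default to one partition per CPU, floored at the number of training workers
    num_cpus = int(ray.cluster_resources()['CPU'])
    return dict(parallelism=max(num_cpus, num_horovod_workers))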


def get_horovod_kwargs():
    # TODO ray: https://github.com/horovod/horovod/issues/2702
    resources = [node['Resources'] for node in ray.state.nodes()]
    use_gpu = int(ray.cluster_resources().get('GPU', 0)) > 0

    # Our goal is to maximize the number of training resources we can
    # form into a homogenous configuration. The priority is GPUs, but
    # can fall back to CPUs if there are no GPUs available.
    key = 'GPU' if use_gpu else 'CPU'

    # Bucket the per node resources by the number of the target resource
    # available on that host (equivalent to number of slots).
    buckets = defaultdict(list)
    for node_resources in resources:
        buckets[int(node_resources.get(key, 0))].append(node_resources)

    # Maximize for the total number of the target resource = num_slots * num_workers
    def get_total_resources(bucket):
        slots, resources = bucket
        return slots * len(resources)

    best_slots, best_resources = max(buckets.items(), key=get_total_resources)
    return dict(
        num_slots=best_slots,
        num_hosts=len(best_resources),
        use_gpu=use_gpu
    )

Review thread on the homogeneity requirement:

Reviewer: is it possible to support non-homogenous configurations?

Author: From Horovod's perspective: definitely. I think we would just need to rework the RayExecutor interface a little. Namely, you could imagine the user just saying num_gpus=N, then we place however many processes per host we need to in order to meet this request (so no more num_hosts or num_slots params in this mode).

Reviewer: yeah, that sounds good. Let me make an issue on horovod then!

Review thread on lines +47 to +59 (the per-node resource accounting):

Reviewer: I think moving forward, we want to move away from the ray.nodes() API.

Author: What's the preferred alternative?

Reviewer: I think if we get HorovodRay to just support num_gpus=N, then we wouldn't need to do the accounting. Generally, we're trying to move towards a more 'serverless' abstraction where programmers think about 'resources' rather than 'nodes'.

Reviewer: I think it's safe to keep this here for now (at least until we support num_gpus in horovod)

Author: Sweet, I'll add a TODO referencing this issue.
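
A small worked example of the bucketing heuristic above, run on a hypothetical cluster of two 4-GPU nodes and one 2-GPU node:

from collections import defaultdict

resources = [{'GPU': 4, 'CPU': 16}, {'GPU': 4, 'CPU': 16}, {'GPU': 2, 'CPU': 8}]

buckets = defaultdict(list)
for node_resources in resources:
    buckets[int(node_resources.get('GPU', 0))].append(node_resources)

# buckets = {4: [two nodes], 2: [one node]}; 4 * 2 = 8 beats 2 * 1 = 2,
# so training runs with num_slots=4 on num_hosts=2 and leaves the 2-GPU node out.
best_slots, best_resources = max(buckets.items(), key=lambda b: b[0] * len(b[1]))
print(best_slots, len(best_resources))  # 4 2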


class RayRemoteModel:
    def __init__(self, model):
        self.cls, self.args, state = list(model.__reduce__())
        self.state = ray.put(state)

    def load(self):
        obj = self.cls(*self.args)
        obj.__setstate__(ray.get(self.state))
        return obj
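
A minimal sketch of the object-store round trip this class relies on; the weights dict here is a stand-in, not a Ludwig model. The state is put into the Ray object store once, and each remote task receives the stored value when the reference is passed as an argument:

import numpy as np
import ray

ray.init(ignore_reinit_error=True)

weights = {'dense/kernel': np.zeros((128, 64))}
weights_ref = ray.put(weights)  # stored once, shared by every worker

@ray.remote
def weight_norm(w):
    # Ray resolves the ObjectRef argument to the stored dict before the task runs
    return sum(float(np.linalg.norm(v)) for v in w.values())

print(ray.get(weight_norm.remote(weights_ref)))  # 0.0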


class RayTrainer(BaseTrainer):
    def __init__(self, horovod_kwargs, trainer_kwargs):
        # TODO ray: make this more configurable by allowing YAML overrides of timeout_s, etc.
        setting = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(setting, **{**get_horovod_kwargs(), **horovod_kwargs})
        self.executor.start(executable_cls=RemoteTrainer, executable_kwargs=trainer_kwargs)

    def train(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda trainer: trainer.train(model.load(), *args, **kwargs)
        )
        return results[0]

    def train_online(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda trainer: trainer.train_online(model.load(), *args, **kwargs)
        )
        return results[0]

    @property
    def validation_field(self):
        return self.executor.execute_single(lambda trainer: trainer.validation_field)

    @property
    def validation_metric(self):
        return self.executor.execute_single(lambda trainer: trainer.validation_metric)

    def shutdown(self):
        self.executor.shutdown()

Review thread on the RayExecutor settings:

Reviewer: should we expose more settings here?

Author: Good point. I definitely want to make this more configurable via the Ludwig YAML or similar. I think we can do this in a follow-up to allow specifying the backend in a YAML file, so I will add a TODO for now. Does that seem reasonable to you?

Reviewer: yeah sounds good!


class RayPredictor(BasePredictor):
    def __init__(self, horovod_kwargs, predictor_kwargs):
        # TODO ray: investigate using Dask for prediction instead of Horovod
        setting = RayExecutor.create_settings(timeout_s=30)
        self.executor = RayExecutor(setting, **{**get_horovod_kwargs(), **horovod_kwargs})
        self.executor.start(executable_cls=RemotePredictor, executable_kwargs=predictor_kwargs)

    def batch_predict(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda predictor: predictor.batch_predict(model.load(), *args, **kwargs)
        )
        return results[0]

    def batch_evaluation(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        results = self.executor.execute(
            lambda predictor: predictor.batch_evaluation(model.load(), *args, **kwargs)
        )
        return results[0]

    def batch_collect_activations(self, model, *args, **kwargs):
        model = RayRemoteModel(model)
        return self.executor.execute_single(
            lambda predictor: predictor.batch_collect_activations(model.load(), *args, **kwargs)
        )

    def shutdown(self):
        self.executor.shutdown()


class RayBackend(RemoteTrainingMixin, Backend):
    def __init__(self, horovod_kwargs=None):
        super().__init__()
        self._df_engine = DaskEngine(**get_dask_kwargs())
        self._horovod_kwargs = horovod_kwargs or {}
        self._tensorflow_kwargs = {}

    def initialize(self):
        try:
            ray.init('auto', ignore_reinit_error=True)
        except ConnectionError:
            logger.info('Initializing new Ray cluster...')
            ray.init(ignore_reinit_error=True)
        dask.config.set(scheduler=ray_dask_get)

    def initialize_tensorflow(self, **kwargs):
        # Make sure we don't claim any GPU resources on the head node
        initialize_tensorflow(gpus=-1)
        self._tensorflow_kwargs = kwargs

    def create_trainer(self, **kwargs):
        executable_kwargs = {**kwargs, **self._tensorflow_kwargs}
        return RayTrainer(self._horovod_kwargs, executable_kwargs)

    def create_predictor(self, **kwargs):
        executable_kwargs = {**kwargs, **self._tensorflow_kwargs}
        return RayPredictor(self._horovod_kwargs, executable_kwargs)

    @property
    def df_engine(self):
        return self._df_engine

    @property
    def supports_multiprocessing(self):
        return False

    def check_lazy_load_supported(self, feature):
        raise ValueError(f'RayBackend does not support lazy loading of data files at train time. '
                         f'Set preprocessing config `in_memory: True` for feature {feature[NAME]}')
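
A usage sketch based only on the class above. In practice the backend is resolved through backend_registry and driven by LudwigModel rather than constructed by hand, and the trainer kwargs are placeholders:

backend = RayBackend(horovod_kwargs={'num_slots': 2})  # override the inferred slot count
backend.initialize()             # attach to a running Ray cluster, or start a local one
backend.initialize_tensorflow()  # recorded and forwarded to the remote trainer workers

trainer = backend.create_trainer()  # starts a Horovod-on-Ray job via RayExecutor
# ... trainer.train(model, ...) would run here ...
trainer.shutdown()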
55 changes: 55 additions & 0 deletions ludwig/data/batcher/iterable.py
@@ -0,0 +1,55 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2020 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from ludwig.data.batcher.base import Batcher


class IterableBatcher(Batcher):
    def __init__(self,
                 dataset,
                 data,
                 steps_per_epoch,
                 ignore_last=False):
        self.dataset = dataset
        self.data = data
        self.data_it = iter(data)

        self.ignore_last = ignore_last
        self.steps_per_epoch = steps_per_epoch
        self.step = 0

    def next_batch(self):
        if self.last_batch():
            raise StopIteration()

        sub_batch = {}
        batch = next(self.data_it)
        for features_name in self.dataset.features:
            sub_batch[features_name] = self.dataset.get(
                features_name,
                batch
            )

        self.step += 1
        return sub_batch

    def last_batch(self):
        return self.step >= self.steps_per_epoch or (
            self.ignore_last and
            self.step + 1 >= self.steps_per_epoch)

    def set_epoch(self, epoch):
        self.step = 0
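
A minimal sketch of driving IterableBatcher by hand. The toy dataset is a stand-in for the Petastorm-backed dataset in this PR; all the batcher requires is the `features` attribute and `get(feature_name, batch)` method used above:

class ToyDataset:
    features = ['x', 'y']

    def get(self, feature_name, batch):
        return batch[feature_name]

data = [{'x': [1, 2], 'y': [0, 1]}, {'x': [3, 4], 'y': [1, 0]}]
batcher = IterableBatcher(ToyDataset(), data, steps_per_epoch=len(data))

while not batcher.last_batch():
    sub_batch = batcher.next_batch()  # {'x': [...], 'y': [...]} for the current step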