## (Advanced) PopulationBasedTraining with Ray Tune, TensorFlow, MLFlow

## step 1 import libraries 

In [3]:
import ray; print(f'ray version {ray.__version__}')
import xgboost_ray; print('xgboost_ray', xgboost_ray.__version__)
import xgboost; print('xgboost', xgboost.__version__)
import lightgbm_ray; print('lightgbm_ray', lightgbm_ray.__version__)
import pandas as pd; print('pandas version', pd.__version__)

ray version 1.8.0
xgboost_ray 0.1.5
xgboost 1.5.1
lightgbm_ray 0.1.2
pandas version 1.2.3


In [4]:
from __future__ import print_function

from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import (Input, Activation, Dense, Permute,
                                     Dropout)
from tensorflow.keras.layers import add, dot, concatenate
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import get_file
from tensorflow.keras.preprocessing.sequence import pad_sequences

from filelock import FileLock
import os
import argparse
import tarfile
import numpy as np
import re

from ray import tune
import mlflow
from ray.tune.integration.mlflow import MLflowLoggerCallback, mlflow_mixin

In [5]:
def tokenize(sent):
    """Return the tokens of a sentence including punctuation.

    >>> tokenize("Bob dropped the apple. Where is the apple?")
    ["Bob", "dropped", "the", "apple", ".", "Where", "is", "the", "apple", "?"]
    """
    return [x.strip() for x in re.split(r"(\W+)?", sent) if x and x.strip()]


In [6]:
def parse_stories(lines, only_supporting=False):
    """Parse stories provided in the bAbi tasks format

    If only_supporting is true, only the sentences
    that support the answer are kept.
    """
    data = []
    story = []
    for line in lines:
        line = line.decode("utf-8").strip()
        nid, line = line.split(" ", 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if "\t" in line:
            q, a, supporting = line.split("\t")
            q = tokenize(q)
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append("")
        else:
            sent = tokenize(line)
            story.append(sent)
    return data


In [7]:
def get_stories(f, only_supporting=False, max_length=None):
    """Given a file name, read the file,
    retrieve the stories,
    and then convert the sentences into a single story.

    If max_length is supplied,
    any stories longer than max_length tokens will be discarded.
    """

    def flatten(data):
        return sum(data, [])

    data = parse_stories(f.readlines(), only_supporting=only_supporting)
    data = [(flatten(story), q, answer) for story, q, answer in data
            if not max_length or len(flatten(story)) < max_length]
    return data


In [8]:
def vectorize_stories(word_idx, story_maxlen, query_maxlen, data):
    inputs, queries, answers = [], [], []
    for story, query, answer in data:
        inputs.append([word_idx[w] for w in story])
        queries.append([word_idx[w] for w in query])
        answers.append(word_idx[answer])
    return (pad_sequences(inputs, maxlen=story_maxlen),
            pad_sequences(queries, maxlen=query_maxlen), np.array(answers))


In [9]:
def read_data(finish_fast=False):
    # Get the file
    try:
        path = get_file(
            "babi-tasks-v1-2.tar.gz",
            origin="https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz")
            # origin = "gs://shakdemo-hyperplane/data/nlp/babi_tasks_1-20_v1-2.tar.gz")
            # origin = "s3://d2v-tmp/demo/data/qa/babi_tasks_1-20_v1-2.tar.gz")
    except Exception:
        print(
            "Error downloading dataset, please download it manually:\n"
            "$ wget http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2"  # noqa: E501
            ".tar.gz\n"
            "$ mv tasks_1-20_v1-2.tar.gz ~/.keras/datasets/babi-tasks-v1-2.tar.gz"  # noqa: E501
        )
        raise

    # Choose challenge
    challenges = {
        # QA1 with 10,000 samples
        "single_supporting_fact_10k": "tasks_1-20_v1-2/en-10k/qa1_"
        "single-supporting-fact_{}.txt",
        # QA2 with 10,000 samples
        "two_supporting_facts_10k": "tasks_1-20_v1-2/en-10k/qa2_"
        "two-supporting-facts_{}.txt",
    }
    challenge_type = "single_supporting_fact_10k"
    challenge = challenges[challenge_type]

    with tarfile.open(path) as tar:
        train_stories = get_stories(tar.extractfile(challenge.format("train")))
        test_stories = get_stories(tar.extractfile(challenge.format("test")))
    if finish_fast:
        train_stories = train_stories[:64]
        test_stories = test_stories[:64]
    return train_stories, test_stories


In [10]:
class MemNNModel(tune.Trainable):
    @mlflow_mixin
    def build_model(self):
        """Helper method for creating the model"""
        vocab = set()
        for story, q, answer in self.train_stories + self.test_stories:
            vocab |= set(story + q + [answer])
        vocab = sorted(vocab)

        # Reserve 0 for masking via pad_sequences
        vocab_size = len(vocab) + 1
        story_maxlen = max(
            len(x) for x, _, _ in self.train_stories + self.test_stories)
        query_maxlen = max(
            len(x) for _, x, _ in self.train_stories + self.test_stories)

        word_idx = {c: i + 1 for i, c in enumerate(vocab)}
        self.inputs_train, self.queries_train, self.answers_train = (
            vectorize_stories(word_idx, story_maxlen, query_maxlen,
                              self.train_stories))
        self.inputs_test, self.queries_test, self.answers_test = (
            vectorize_stories(word_idx, story_maxlen, query_maxlen,
                              self.test_stories))

        # placeholders
        input_sequence = Input((story_maxlen, ))
        question = Input((query_maxlen, ))

        # encoders
        # embed the input sequence into a sequence of vectors
        input_encoder_m = Sequential()
        input_encoder_m.add(Embedding(input_dim=vocab_size, output_dim=64))
        input_encoder_m.add(Dropout(self.config.get("dropout", 0.3)))
        # output: (samples, story_maxlen, embedding_dim)

        # embed the input into a sequence of vectors of size query_maxlen
        input_encoder_c = Sequential()
        input_encoder_c.add(
            Embedding(input_dim=vocab_size, output_dim=query_maxlen))
        input_encoder_c.add(Dropout(self.config.get("dropout", 0.3)))
        # output: (samples, story_maxlen, query_maxlen)

        # embed the question into a sequence of vectors
        question_encoder = Sequential()
        question_encoder.add(
            Embedding(
                input_dim=vocab_size, output_dim=64,
                input_length=query_maxlen))
        question_encoder.add(Dropout(self.config.get("dropout", 0.3)))
        # output: (samples, query_maxlen, embedding_dim)

        # encode input sequence and questions (which are indices)
        # to sequences of dense vectors
        input_encoded_m = input_encoder_m(input_sequence)
        input_encoded_c = input_encoder_c(input_sequence)
        question_encoded = question_encoder(question)

        # compute a "match" between the first input vector sequence
        # and the question vector sequence
        # shape: `(samples, story_maxlen, query_maxlen)`
        match = dot([input_encoded_m, question_encoded], axes=(2, 2))
        match = Activation("softmax")(match)

        # add the match matrix with the second input vector sequence
        response = add(
            [match, input_encoded_c])  # (samples, story_maxlen, query_maxlen)
        response = Permute(
            (2, 1))(response)  # (samples, query_maxlen, story_maxlen)

        # concatenate the match matrix with the question vector sequence
        answer = concatenate([response, question_encoded])

        # the original paper uses a matrix multiplication.
        # we choose to use a RNN instead.
        answer = LSTM(32)(answer)  # (samples, 32)

        # one regularization layer -- more would probably be needed.
        answer = Dropout(self.config.get("dropout", 0.3))(answer)
        answer = Dense(vocab_size)(answer)  # (samples, vocab_size)
        # we output a probability distribution over the vocabulary
        answer = Activation("softmax")(answer)

        # build the final model
        model = Model([input_sequence, question], answer)
        return model
    
    @mlflow_mixin
    def setup(self, config):
        with FileLock(os.path.expanduser("~/.tune.lock")):
            self.train_stories, self.test_stories = read_data(
                config["finish_fast"])
        model = self.build_model()
        rmsprop = RMSprop(
            lr=self.config.get("lr", 1e-3), rho=self.config.get("rho", 0.9))
        model.compile(
            optimizer=rmsprop,
            loss="sparse_categorical_crossentropy",
            metrics=["accuracy"])
        self.model = model
        
    @mlflow_mixin
    def step(self):
        # train
        mlflow.tensorflow.autolog()
        self.model.fit(
            [self.inputs_train, self.queries_train],
            self.answers_train,
            batch_size=self.config.get("batch_size", 32),
            epochs=self.config.get("epochs", 1),
            validation_data=([self.inputs_test, self.queries_test],
                             self.answers_test),
            verbose=0)
        _, accuracy = self.model.evaluate(
            [self.inputs_train, self.queries_train],
            self.answers_train,
            verbose=0)
        return {"mean_accuracy": accuracy}
    
    def save_checkpoint(self, checkpoint_dir):
        file_path = checkpoint_dir + "/model"
        self.model.save(file_path)
        return file_path

    def load_checkpoint(self, path):
        # See https://stackoverflow.com/a/42763323
        del self.model
        self.model = load_model(path)


## initialize a Ray cluster 

In [11]:
from hyperplane.ray_common import initialize_ray_cluster, stop_ray_cluster, find_ray_workers
num_workers = 2
cpu_core_per_worker = 15
ram_gb_per_worker = 12 #110 GB allocatible for 16_128 nodes, 12 for 16_16 nodes, 27 for 32_32 nodes
ray_cluster = initialize_ray_cluster(num_workers, cpu_core_per_worker, ram_gb_per_worker)

👉 Hyperplane: selecting worker node pool
best pool spec {'pool_env_var': 'DASK_POOL_16_16', 'allocatable_cores': 15.0, 'allocatable_ram': 12.0}


2021-12-08 15:02:31,381	INFO services.py:1270 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Waiting for worker ray-worker-77eb2548-bc5f-43bd-9c1a-4ff59fb03b26...
Waiting for worker ray-worker-e3fc8c5b-84c0-41b9-ba23-149d722384be...


In [12]:
import ray
from ray.tune.schedulers import PopulationBasedTraining

pbt = PopulationBasedTraining(
    perturbation_interval=2,
    hyperparam_mutations={
        "dropout": lambda: np.random.uniform(0, 1),
        "lr": lambda: 10**np.random.randint(-10, 0),
        "rho": lambda: np.random.uniform(0, 1)
    })

In [14]:
experiment_name = 'pbt_babi_memnn'
mlflow.set_tracking_uri(os.environ.get('DATABASE_URL_NO_PARAMS')[:-12]) ## this one 
mlflow.set_experiment(experiment_name)

In [15]:
results = tune.run(
        MemNNModel,
        name="pbt_babi_memnn",
        scheduler=pbt,
        metric="mean_accuracy",
        mode="max",
        stop={"training_iteration": 2},
        num_samples=2,
        config={
            "finish_fast": True,
            "batch_size": 32,
            "epochs": 1,
            "dropout": 0.3,
            "lr": 0.01,
            "rho": 0.9,
            "mlflow": {
                "experiment_name": experiment_name,
                "tracking_uri": mlflow.get_tracking_uri()
            }
        },
        verbose = 2,
        # sync_config=tune.SyncConfig(
        # sync_to_driver=False,
        # upload_dir="gs://shakdemo-hyperplane/results/ray_tf_nl/"
        # upload_dir="s3://d2v-tmp/demo/ray"
        # )

)


[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:50.528779: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:07:51.107925: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


[2m[36m(MemNNModel pid=73, ip=10.1.168.3)[0m Downloading data from https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz
   73728/11745123 [..............................] - ETA: 13s
  385024/11745123 [..............................] - ETA: 4s 


[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:54.598060: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:54.598522: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:54.598555: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:54.598597: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ray-worker-e3fc8c5b-84c0-41b9-ba23-149d722384be): /proc/driver/nvidia/version does not exist
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:54.598884: I te

[2m[36m(MemNNModel pid=72, ip=10.1.167.6)[0m Downloading data from https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz
    8192/11745123 [..............................] - ETA: 37s
  278528/11745123 [..............................] - ETA: 5s 


[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:57.043080: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:57.043129: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:57.060707: I tensorflow/core/profiler/lib/profiler_session.cc:71] Profiler session collecting data.
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:57.067952: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:57.078221: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: /tmp/tmpp0u_vntr/train/plugins/profile/2021_12_08_15_07_57
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:07:57.086345: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to /tmp/tmpp0u_vntr/train/pl



[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:07:58.648163: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:07:58.648699: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:07:58.648734: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:07:58.648763: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ray-worker-77eb2548-bc5f-43bd-9c1a-4ff59fb03b26): /proc/driver/nvidia/version does not exist
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:07:58.649017: I te

[2m[36m(pid=72, ip=10.1.167.6)[0m 2021/12/08 15:07:59 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '70941480a7c54cf1bb98ed663d74218a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:07:59.489430: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:07:59.489491: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:07:59.489664: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:07:59.543847: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:07:59.5

[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:08:00.009744: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:08:01.570623: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:08:01.570672: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:08:01.596288: I tensorflow/core/profiler/lib/profiler_session.cc:71] Profiler session collecting data.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:08:01.619962: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:08:01.648155: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: /tmp/tmp7byfgk8t/train/plugins/profile

Trial MemNNModel_9b515_00000 reported mean_accuracy=0.22 with parameters={'finish_fast': True, 'batch_size': 32, 'epochs': 1, 'dropout': 0.3, 'lr': 0.01, 'rho': 0.9, 'mlflow': {'experiment_name': 'pbt_babi_memnn', 'tracking_uri': 'postgresql://postgres:postgres@postgresql.postgres-m288j5y2'}}.


[2m[36m(pid=73, ip=10.1.168.3)[0m 2021/12/08 15:08:03 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '95e460991eed4863b4b20f534ac1e845', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:08:03.415144: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:08:03.415177: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:08:03.415215: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:08:03.540455: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
[2m[36m(pid=73, ip=10.1.168.3)[0m 2021-12-08 15:08:03.540494: I tensorflow/core/profiler/lib/pro



Trial MemNNModel_9b515_00001 reported mean_accuracy=0.22 with parameters={'finish_fast': True, 'batch_size': 32, 'epochs': 1, 'dropout': 0.3, 'lr': 0.01, 'rho': 0.9, 'mlflow': {'experiment_name': 'pbt_babi_memnn', 'tracking_uri': 'postgresql://postgres:postgres@postgresql.postgres-m288j5y2'}}.
Trial MemNNModel_9b515_00000 reported mean_accuracy=0.23 with parameters={'finish_fast': True, 'batch_size': 32, 'epochs': 1, 'dropout': 0.3, 'lr': 0.01, 'rho': 0.9, 'mlflow': {'experiment_name': 'pbt_babi_memnn', 'tracking_uri': 'postgresql://postgres:postgres@postgresql.postgres-m288j5y2'}}. This trial completed.


[2m[36m(pid=72, ip=10.1.167.6)[0m 2021/12/08 15:08:14 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '505ff502b39148f99da5fdcfbe3f8f1a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:08:14.653185: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:08:14.653224: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:08:14.653279: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:08:14.796667: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
[2m[36m(pid=72, ip=10.1.167.6)[0m 2021-12-08 15:08:14.796719: I tensorflow/core/profiler/lib/pro



Trial MemNNModel_9b515_00001 reported mean_accuracy=0.28 with parameters={'finish_fast': True, 'batch_size': 32, 'epochs': 1, 'dropout': 0.3, 'lr': 0.01, 'rho': 0.9, 'mlflow': {'experiment_name': 'pbt_babi_memnn', 'tracking_uri': 'postgresql://postgres:postgres@postgresql.postgres-m288j5y2'}}. This trial completed.


Trial name,status,loc,acc,iter,total time (s)
MemNNModel_9b515_00000,TERMINATED,10.1.168.3:73,0.234375,2,14.3511
MemNNModel_9b515_00001,TERMINATED,10.1.167.6:72,0.28125,2,16.1567


2021-12-08 15:08:21,074	INFO tune.py:630 -- Total run time: 32.30 seconds (31.61 seconds for the tuning loop).


In [16]:
print("Best hyperparameters found were: ", results.best_config)

Best hyperparameters found were:  {'finish_fast': True, 'batch_size': 32, 'epochs': 1, 'dropout': 0.3, 'lr': 0.01, 'rho': 0.9, 'mlflow': {'experiment_name': 'pbt_babi_memnn', 'tracking_uri': 'postgresql://postgres:postgres@postgresql.postgres-m288j5y2'}}


In [17]:
stop_ray_cluster(ray_cluster)

Deleting ray-worker-77eb2548-bc5f-43bd-9c1a-4ff59fb03b26
Deleting ray-worker-e3fc8c5b-84c0-41b9-ba23-149d722384be
