## Use Ray Tune and MLflow on Hyperplane

In [None]:
# Optional: uncomment to install dependencies that are missing from the kernel image.
# %pip install tensorboardX --quiet
# %pip install kubernetes==18.20 --quiet

In [None]:
import ray
import tensorflow as tf
import torch 
# Print the installed ray / tensorflow / torch versions so the recorded output
# documents the environment this notebook was run in.
print(f'ray version {ray.__version__}')
print(f'tf version {tf.__version__}')
print(f'torch version {torch.__version__}')


2021-12-08 05:29:55.971324: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


ray version 1.8.0
tf version 2.4.1
torch version 1.7.1+cpu


In [None]:
from hyperplane.ray_common import initialize_ray_cluster, stop_ray_cluster, find_ray_workers
# Size of the Ray cluster to request: number of workers, then CPU cores and
# RAM (in GB) for each worker. All three values are passed positionally to
# initialize_ray_cluster below.
num_workers = 2
cpu_core_per_worker = 7
ram_gb_per_worker = 6 # 110 GB allocatable for 16_128 nodes, 12 for 16_16 nodes, 27 for 32_32 nodes
# Keep the returned handle: it is what stop_ray_cluster() is given at the end
# of the notebook to tear the cluster down.
ray_cluster = initialize_ray_cluster(num_workers, cpu_core_per_worker, ram_gb_per_worker)

👉 Hyperplane: selecting worker node pool
best pool spec {'pool_env_var': 'DASK_POOL_16_16', 'allocatable_cores': 15.0, 'allocatable_ram': 12.0}




Waiting for worker ray-worker-d41fba09-79e7-47c2-8da7-0a75d0eab126...
Waiting for worker ray-worker-24b53498-d460-42f6-a1a7-ade2d34dc1e9...


In [None]:
import os
import tempfile
import time

import mlflow

from ray import tune
from ray.tune.integration.mlflow import MLflowLoggerCallback, mlflow_mixin
# Record the mlflow version alongside the other library versions.
print('mlflow version', mlflow.__version__)

[2m[33m(raylet, ip=10.1.155.4)[0m [2021-12-08 05:30:09,168 E 17 17] agent_manager.cc:134: Not all required Ray dependencies for the runtime_env feature were found. To install the required dependencies, please run `pip install 'ray[default]'`.
[2m[33m(raylet, ip=10.1.155.4)[0m [2021-12-08 05:30:09,168 E 17 17] worker_pool.cc:566: [Eagerly] Couldn't create a runtime environment for job 01000000.


mlflow version 1.17.0


[2m[33m(raylet, ip=10.1.156.4)[0m [2021-12-08 05:30:09,377 E 16 16] agent_manager.cc:134: Not all required Ray dependencies for the runtime_env feature were found. To install the required dependencies, please run `pip install 'ray[default]'`.
[2m[33m(raylet, ip=10.1.156.4)[0m [2021-12-08 05:30:09,377 E 16 16] worker_pool.cc:566: [Eagerly] Couldn't create a runtime environment for job 01000000.


In [None]:
def evaluation_fn(step, width, height):
    """Return the synthetic loss for one training step.

    The loss is the reciprocal of ``0.1 + width * step / 100`` plus
    ``height * 0.1``.

    Args:
        step: Index of the current iteration.
        width: Hyperparameter scaling the step-dependent term.
        height: Hyperparameter contributing a constant offset.

    Returns:
        The loss value as a float.
    """
    denominator = 0.1 + width * step / 100
    step_term = denominator ** (-1)
    offset_term = height * 0.1
    return step_term + offset_term


In [None]:
def easy_objective(config):
    """Toy Tune trainable: report a synthetic loss once per step.

    Args:
        config: Trial configuration. Must contain ``"width"`` and
            ``"height"``; ``"steps"`` is optional and defaults to 100.
    """
    # Hyperparameters for this trial.
    width = config["width"]
    height = config["height"]
    total_steps = config.get("steps", 100)

    for iteration in range(total_steps):
        # Stand-in for a real training iteration.
        loss = evaluation_fn(iteration, width, height)
        # Hand the current score back to Tune.
        tune.report(iterations=iteration, mean_loss=loss)
        time.sleep(0.1)


In [None]:
def tune_function(mlflow_tracking_uri, finish_fast=False):
    """Run ``easy_objective`` under Tune with an MLflow logger callback.

    Args:
        mlflow_tracking_uri: Tracking URI handed to ``MLflowLoggerCallback``.
        finish_fast: When true each trial runs 5 steps instead of 100.
    """
    # The callback is built before the search space, matching the order in
    # which the original call evaluated its arguments.
    mlflow_logger = MLflowLoggerCallback(
        tracking_uri=mlflow_tracking_uri,
        experiment_name="mixin_example",
        save_artifact=True,
    )

    search_space = {
        "width": tune.randint(10, 100),
        "height": tune.randint(0, 100),
        "steps": 5 if finish_fast else 100,
    }

    tune.run(
        easy_objective,
        name="mlflow",
        num_samples=5,
        callbacks=[mlflow_logger],
        config=search_space,
    )

In [None]:
@mlflow_mixin
def decorated_easy_objective(config):
    """Toy Tune trainable that logs each step to both MLflow and Tune.

    Args:
        config: Trial configuration. Must contain ``"width"`` and
            ``"height"``; ``"steps"`` is optional and defaults to 100.
            ``tune_decorated`` additionally supplies an ``"mlflow"`` entry,
            which is presumably consumed by the ``mlflow_mixin`` decorator
            (see the Ray Tune MLflow integration docs).
    """
    # Hyperparameters for this trial.
    width = config["width"]
    height = config["height"]
    total_steps = config.get("steps", 100)

    for iteration in range(total_steps):
        # Stand-in for a real training iteration.
        loss = evaluation_fn(iteration, width, height)
        # Record the metric in MLflow for this step ...
        mlflow.log_metrics({"mean_loss": loss}, step=iteration)
        # ... and hand the same score back to Tune.
        tune.report(iterations=iteration, mean_loss=loss)
        time.sleep(0.1)


In [None]:
def tune_decorated(mlflow_tracking_uri, finish_fast=False):
    """Run ``decorated_easy_objective`` under Tune, logging to MLflow.

    Args:
        mlflow_tracking_uri: Tracking URI to configure MLflow with.
        finish_fast: When true each trial runs 5 steps instead of 100.
    """
    experiment = "mixin_example"

    # Point MLflow at the tracking server and select the experiment
    # (creating it if it does not exist yet).
    mlflow.set_tracking_uri(mlflow_tracking_uri)
    mlflow.set_experiment(experiment_name=experiment)

    # The "mlflow" entry forwards the experiment name and tracking URI to
    # each trial through its config.
    trial_config = {
        "width": tune.randint(10, 100),
        "height": tune.randint(0, 100),
        "steps": 5 if finish_fast else 100,
        "mlflow": {
            "experiment_name": experiment,
            "tracking_uri": mlflow.get_tracking_uri(),
        },
    }

    tune.run(
        decorated_easy_objective,
        name="mlflow",
        verbose=1,
        num_samples=5,
        config=trial_config,
    )


## Set up the MLflow tracking URI

In [None]:
import os

# The MLflow tracking URI is derived from the DATABASE_URL_NO_PARAMS
# environment variable by dropping its last 12 characters.
# NOTE(review): the 12-character suffix is presumably a path/database
# component that must not be part of the tracking URI -- confirm against the
# deployment's URL format.
database_url = os.environ.get('DATABASE_URL_NO_PARAMS')
if not database_url:
    # Fail with an actionable message instead of the opaque
    # "'NoneType' object is not subscriptable" the bare slice would raise.
    raise RuntimeError(
        "Environment variable DATABASE_URL_NO_PARAMS is not set; "
        "cannot configure the MLflow tracking URI."
    )
mlflow.set_tracking_uri(database_url[:-12])
tracking_uri = mlflow.get_tracking_uri()
# NOTE(review): the URI embeds the database user and password, so printing it
# stores those credentials in the saved notebook output.
print(tracking_uri)

# NOTE(review): this name is not referenced by the cells visible in this
# notebook; tune_function/tune_decorated hard-code "mixin_example".
experiment_name = 'pbt_babi_memnn'


postgresql://postgres:postgres@postgresql.postgres-m288j5y2


In [None]:
# Launch the mlflow_mixin-based tuning run against the tracking URI configured
# above (finish_fast is left at its default of False).
tune_decorated(tracking_uri)

2021-12-08 05:36:35,856	INFO tune.py:630 -- Total run time: 18.43 seconds (18.26 seconds for the tuning loop).


In [None]:
# Tear down the Ray cluster using the handle returned by
# initialize_ray_cluster; the recorded output shows both workers being deleted.
stop_ray_cluster(ray_cluster)

Deleting ray-worker-d41fba09-79e7-47c2-8da7-0a75d0eab126
Deleting ray-worker-24b53498-d460-42f6-a1a7-ade2d34dc1e9
