
### Quick examples to demonstrate AML Ray/Dask cluster usage

### Interactive use cases

Install these packages in your notebook's conda environment to run the interactive examples.

In [None]:
pip install --upgrade ray==2.2.0 ray[air]==2.2.0 ray[data]==2.2.0 azure-ai-ml ray-on-aml

Start ML client 

In [2]:
from azure.ai.ml import MLClient
from azure.ai.ml import command, Input
from azure.identity import DefaultAzureCredential
# Details of the target AML workspace -- replace these with your own values.
subscription_id = "840b5c5c-3f4a-459a-94fc-6bad2a969f9d"
resource_group = "ml"
workspace = "ws02ent"

# Build a handle to the workspace, authenticating with the default Azure
# credential.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace,
)


#### Please make sure to create the compute cluster in the same VNet as your compute instance. A VNet is required; otherwise the compute nodes cannot communicate with each other.


In [7]:
from ray_on_aml.core import Ray_On_AML

# from src.ray_on_aml.core import Ray_On_AML

import logging
# Bind ray-on-aml to the workspace and to the "d13" compute cluster.
ray_on_aml = Ray_On_AML(ml_client=ml_client, compute_cluster="d13")

# NOTE: if you customize the cluster's pip packages you must also list the Ray
# package(s) yourself (e.g. ray[data]), pinned to the same version as the Ray
# package(s) on your compute instance. If pip_packages is not specified,
# ray[default] is inserted automatically.

# Client mode: request a 2-node cluster, then connect to its head node.
ray = ray_on_aml.getRay(
    num_node=2,
    pip_packages=[
        "ray[air]==2.2.0",
        "ray[data]==2.2.0",
        "torch==1.13.0",
        "fastparquet==2022.12.0",
        "azureml-mlflow==1.48.0",
        "pyarrow==6.0.1",
        "dask==2022.12.0",
        "adlfs==2022.11.2",
        "fsspec==2022.11.0",
    ],
)
client = ray.init(f"ray://{ray_on_aml.headnode_private_ip}:10001")

# Alternative: use the compute instance (CI) as the head node.
# ray = ray_on_aml.getRay(ci_is_head=True, num_node=2)

[32mUploading .tmp (0.01 MBs): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6679/6679 [00:00<00:00, 97620.47i

Waiting cluster to start and return head node's ip
............................................................................................................................................
 cluster is ready, head node ip  10.0.0.6


[2m[33m(raylet, ip=10.0.0.8)[0m   aiogrpc.init_grpc_aio()


In [8]:
# Display the resources Ray reports for the connected cluster.
ray.cluster_resources()

{'CPU': 16.0,
 'node:10.0.0.6': 1.0,
 'node:10.0.0.8': 1.0,
 'memory': 87326674944.0,
 'object_store_memory': 24481313586.0}

#### Dask on Ray

##### You can use Dask on this Ray cluster by telling Dask to use Ray as the scheduler. By doing this, you will have a cluster with both Dask and Ray without having to set them up separately.

In [9]:
# Scale up data processing with the Dask DataFrame API, scheduled on Ray.
# Make sure you have pandas 1.4+ installed (and restart after installing it)
# for this to run successfully.
# pip install pandas==1.4.2
import dask
import dask.dataframe as dd
from ray.util.dask import enable_dask_on_ray

# Tell Dask to use Ray as its scheduler.
enable_dask_on_ray()

storage_options = {'account_name': 'azureopendatastorage'}
ddf = dd.read_parquet(
    'az://nyctlc/green/puYear=2015/puMonth=*/*.parquet',
    storage_options=storage_options,
)
ddf.count().compute()


vendorID                19233765
lpepPickupDatetime      19233765
lpepDropoffDatetime     19233765
passengerCount          19233765
tripDistance            19233765
puLocationId                   0
doLocationId                   0
pickupLongitude         19233765
pickupLatitude          19233765
dropoffLongitude        19233765
dropoffLatitude         19233765
rateCodeID              19233765
storeAndFwdFlag         19233765
paymentType             19233765
fareAmount              19233765
extra                   19233765
mtaTax                  19233765
improvementSurcharge    19233765
tipAmount               19233765
tollsAmount             19233765
ehailFee                       0
totalAmount             19233765
tripType                19233693
puYear                  19233765
puMonth                 19233765
dtype: int64

### Ray Dataset

In [10]:
# Ray also supports Ray Datasets. You can read data into a Ray Dataset and then
# convert it to Dask or another format that is convenient for ML training.
# See https://docs.ray.io/en/latest/data/dataset.html
# You may need to upgrade pyarrow to run this successfully.
from adlfs import AzureBlobFileSystem

abfs = AzureBlobFileSystem(
    account_name="azureopendatastorage",
    container_name="isdweatherdatacontainer",
)

# To read all years and months instead (1,584,481,119 is the count for all data):
# data = ray.data.read_parquet("az://isdweatherdatacontainer/ISDWeather//", filesystem=abfs)
data = ray.data.read_parquet(
    ["az://isdweatherdatacontainer/ISDWeather/year=2015/"],
    filesystem=abfs,
)
data.count()


[2m[36m(_get_read_tasks pid=589)[0m [dataset]: Run `pip install tqdm` to enable progress reporting.


116341246

### Mount remote data to your ray cluster

In [6]:
# Disconnect the Ray client, then shut down the current ray-on-aml cluster.
client.disconnect()
ray_on_aml.shutdown()

In [None]:
#Example of using input and output for interactive job. You need to define adlsstore0001 in your Azure ML workspace first
from azure.ai.ml import command, Input, Output
from ray_on_aml.core import Ray_On_AML
import logging
# Bind ray-on-aml to the "ds11" compute cluster with INFO-level logging.
ray_on_aml = Ray_On_AML(ml_client=ml_client, compute_cluster="ds11", verbosity=logging.INFO)

# Data made available to the interactive job, keyed by the name used to look
# up the mount point later.
inputs = {
    "ISDWeather": Input(
        type="uri_folder",
        path="azureml://datastores/adlsstore0001/paths/ISDWeather/year=2008",
    )
}

# Output folders for the interactive job.
outputs = {
    "output1": Output(
        type="uri_folder",
        path="azureml://datastores/adlsstore0001/paths/dev",
    ),
    "output2": Output(
        type="uri_folder",
        path="azureml://datastores/adlsstore0001/paths/dev",
    ),
}

# Package pins are kept identical to the first interactive cluster in this
# notebook (dask was 2022.2.0 here but 2022.12.0 there), so the same kernel
# talks to matching library versions on both clusters.
ray = ray_on_aml.getRay(
    inputs=inputs,
    outputs=outputs,
    num_node=2,
    pip_packages=[
        "ray[air]==2.2.0",
        "ray[data]==2.2.0",
        "torch==1.13.0",
        "fastparquet==2022.12.0",
        "azureml-mlflow==1.48.0",
        "pyarrow==6.0.1",
        "dask==2022.12.0",
        "adlfs==2022.11.2",
        "fsspec==2022.11.0",
    ],
)
# Connect to the head node in client mode.
client = ray.init(f"ray://{ray_on_aml.headnode_private_ip}:10001")


In [None]:
# Read the parquet data from the mount point registered under the
# "ISDWeather" input key, then count it.
data = ray.data.read_parquet(ray_on_aml.mount_points['ISDWeather'])
data.count()

#### Ray Tune for distributed ML tuning

In [None]:
 import sklearn.datasets
 import sklearn.metrics
 from sklearn.model_selection import train_test_split
 import xgboost as xgb

 from ray import tune


 def train_breast_cancer(config):
     """Train one XGBoost model on the breast-cancer data and report its accuracy.

     Args:
         config: XGBoost parameter dict, passed straight to ``xgb.train``.

     The accuracy on a held-out 25% split is reported to Ray Tune as
     ``mean_accuracy``, with ``done=True``.
     """
     features, targets = sklearn.datasets.load_breast_cancer(return_X_y=True)

     # Hold out a quarter of the samples for evaluation.
     x_fit, x_eval, y_fit, y_eval = train_test_split(
         features, targets, test_size=0.25)

     # Wrap both splits in XGBoost's input matrix type.
     fit_matrix = xgb.DMatrix(x_fit, label=y_fit)
     eval_matrix = xgb.DMatrix(x_eval, label=y_eval)

     # xgb.train fills this dict with the per-round evaluation metrics.
     eval_history = {}
     xgb.train(
         config,
         fit_matrix,
         evals=[(eval_matrix, "eval")],
         evals_result=eval_history,
         verbose_eval=False)

     # Accuracy is one minus the last recorded "error" value.
     final_error = eval_history["eval"]["error"][-1]
     tune.report(mean_accuracy=1. - final_error, done=True)


 # Search space: the first two entries are fixed XGBoost settings, the
 # remaining four are sampled by Ray Tune for every trial.
 config = {
     "objective": "binary:logistic",
     "eval_metric": ["logloss", "error"],
     "max_depth": tune.randint(1, 9),
     "min_child_weight": tune.choice([1, 2, 3]),
     "subsample": tune.uniform(0.5, 1.0),
     "eta": tune.loguniform(1e-4, 1e-1),
 }

 # Run 10 sampled trials, each given one CPU.
 analysis = tune.run(
     train_breast_cancer,
     config=config,
     num_samples=10,
     resources_per_trial={"cpu": 1},
 )


#### Distributed XGBoost https://docs.ray.io/en/latest/xgboost-ray.html

In [None]:
from xgboost_ray import RayXGBClassifier, RayParams
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

seed = 42

# Load the data; 25% is used for training and the rest for prediction.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.25, random_state=seed
)

# In XGBoost-Ray, n_jobs sets the number of actors.
clf = RayXGBClassifier(n_jobs=10, random_state=seed)

# The scikit-learn API automatically converts the data to RayDMatrix format
# as needed. You can also pass X as a RayDMatrix, in which case y is ignored.
clf.fit(X_train, y_train)

pred_ray = clf.predict(X_test)
print(pred_ray.shape)

pred_proba_ray = clf.predict_proba(X_test)
print(pred_proba_ray.shape)

# A RayParams object can also be passed to fit/predict/predict_proba; it
# overrides the n_jobs value set during initialization.
clf.fit(X_train, y_train, ray_params=RayParams(num_actors=10))

pred_ray = clf.predict(X_test, ray_params=RayParams(num_actors=10))
print(pred_ray.shape)


In [None]:
from xgboost_ray import RayDMatrix, RayParams, train
from sklearn.datasets import load_breast_cancer

# Wrap the full breast-cancer dataset in a RayDMatrix.
train_x, train_y = load_breast_cancer(return_X_y=True)
train_set = RayDMatrix(train_x, train_y)

# train() records the evaluation metrics in this dict.
evals_result = {}
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
}
bst = train(
    xgb_params,
    train_set,
    evals=[(train_set, "train")],
    evals_result=evals_result,
    verbose_eval=False,
    # 10 remote actors with 1 CPU each.
    ray_params=RayParams(num_actors=10, cpus_per_actor=1),
)

bst.save_model("model.xgb")

# Report the last recorded "error" value on the training set.
final_error = evals_result["train"]["error"][-1]
print("Final training error: {:.4f}".format(final_error))


### Reinforcement Learning

In [None]:
## Install libraries on the compute instance: pip install gym dm-tree

In [None]:
# Start a single-node Ray cluster with the additional libraries RLlib needs.
#
# This uses the same ml_client-based API as the other cluster cells in this
# notebook (Ray_On_AML(ml_client=...) + getRay(num_node=..., pip_packages=...)),
# so it no longer depends on `ws`, which is only defined in a later cell.
import time  # was missing: time.sleep() below raised NameError on a fresh kernel

ray_on_aml = Ray_On_AML(ml_client=ml_client, compute_cluster="d15-v2")

# When pip_packages is customized, the Ray package itself must be listed too.
# NOTE(review): gym was pinned to 0.2.1, which cannot provide the "Taxi-v3"
# environment used by the RLlib example; the range below is the one Ray 2.2.0
# RLlib is believed to require -- confirm against your Ray version.
ray = ray_on_aml.getRay(
    num_node=1,
    pip_packages=[
        "ray[rllib]==2.2.0",
        "torch==1.10.0",
        "torchvision",
        "sklearn",
        "pyspark",
        "gym>=0.21.0,<0.24.0",
        "dm-tree",
        "scikit-image",
        "opencv-python",
        "tensorflow",
    ],
)
# Connect to the head node in client mode, as in the other cluster cells.
client = ray.init(f"ray://{ray_on_aml.headnode_private_ip}:10001")

# Brief pause before querying the cluster, kept from the original cell.
time.sleep(20)
ray.cluster_resources()

In [None]:
# Import the RL algorithm (Trainer) we would like to use.
from ray.rllib.agents.ppo import PPOTrainer

# Algorithm configuration.
config = {
    # Environment (RLlib understands OpenAI gym registered strings).
    "env": "Taxi-v3",
    # Two environment workers ("rollout workers") collect samples in
    # parallel, each from its own environment clone(s).
    "num_workers": 2,
    # Deep-learning framework: "torch" for PyTorch; use "tf2" instead for
    # tf2.x eager execution.
    "framework": "torch",
    # Tweaks to the default model RLlib builds from the environment's
    # observation and action spaces.
    "model": {
        "fcnet_hiddens": [64, 64],
        "fcnet_activation": "relu",
    },
    # A separate evaluation worker set, used by the `trainer.evaluate()`
    # call after training (see below).
    "evaluation_num_workers": 1,
    # Render the env, for evaluation runs only.
    "evaluation_config": {
        "render_env": True,
    },
}

# Create the RLlib Trainer.
trainer = PPOTrainer(config=config)

# Run a few training iterations. Each iteration covers parallel sample
# collection by the environment workers, loss calculation on the collected
# batch, and a model update.
num_training_iterations = 3
for _ in range(num_training_iterations):
    iteration_result = trainer.train()
    print(iteration_result)

# Evaluate the trained Trainer (rendering each timestep to the shell's output).
trainer.evaluate()



### Shutdown interactive cluster when not used


In [None]:
# Shut down the interactive cluster once it is no longer needed.
ray_on_aml.shutdown()


### Ray on Job Cluster with GPU (the interactive Ray cluster does not need to be running to submit the AML job)

In [None]:
from azureml.core import Workspace, Experiment, Environment,ScriptRunConfig
# from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DockerConfiguration,RunConfiguration

# Remember: the AML job has to use distributed settings (MPI type) for
# ray-on-aml to work correctly.
ws = Workspace.from_config()

# This can be a different cluster from the interactive cluster.
compute_cluster = 'gpunc6'
ray_cluster = ComputeTarget(workspace=ws, name=compute_cluster)

# Job environment: conda specification on top of an OpenMPI/CUDA base image.
rayEnv = Environment.from_conda_specification(name="RLEnv", file_path="conda_env.yml")
rayEnv.docker.base_image = "mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu18.04:20220329.v1"

docker_config = DockerConfiguration(use_docker=True, shm_size='48gb')

# Two-node run configuration using the OpenMpi communicator.
aml_run_config_ml = RunConfiguration(communicator='OpenMpi')
aml_run_config_ml.target = ray_cluster
aml_run_config_ml.node_count = 2
aml_run_config_ml.environment = rayEnv
aml_run_config_ml.docker = docker_config

src = ScriptRunConfig(
    source_directory='../examples/job',
    script='aml_job.py',
    run_config=aml_run_config_ml,
)

# Submit the script as a run of the "rl_on_aml_job" experiment.
run = Experiment(ws, "rl_on_aml_job").submit(src)


In [None]:
from azureml.widgets import RunDetails
# Cancel the submitted job run.
run.cancel()