## Enabling PyTorch distributed training support with Kaen

Save the latest version of the DC taxi model (as described in chapter 11) to a `model_v1.py` file in the `src` directory using the `%%writefile` magic, as shown in the first line of the following snippet.


In [None]:
!mkdir -p src

In [None]:
%%writefile src/model_v1.py
import sys
import json
import time
import torch as pt
import pytorch_lightning as pl
from distutils.util import strtobool

# Make 64-bit floats the default floating point dtype for PyTorch in this module.
pt.set_default_dtype(pt.float64)
class DcTaxiModel(pl.LightningModule):
    """Feed-forward regression model for the DC taxi data set.

    The model is configured entirely through keyword arguments which are
    stored as hyperparameters (``self.hparams``). The values read by this
    class are:

      seed                      value passed to ``pt.manual_seed``
      num_features              number of input features
      num_hidden_neurons        JSON list with the hidden layer sizes,
                                e.g. "[3, 5, 8]"
      batch_norm_linear_layers  optional flag (parsed with ``strtobool``);
                                when true a BatchNorm1d layer is placed in
                                front of every Linear layer
      optimizer                 'Adam' or 'SGD'
      lr                        learning rate
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()
        pt.manual_seed(int(self.hparams.seed))

        self.step = 0
        self.start_ts = time.perf_counter()
        self.train_val_rmse = pt.tensor(0.)

        #create a list of hidden layer neurons, e.g. [3, 5, 8]
        num_hidden_neurons = json.loads(self.hparams.num_hidden_neurons)

        self.layers = pt.nn.Sequential(
            pt.nn.Linear(int(self.hparams.num_features), num_hidden_neurons[0]),
            pt.nn.ReLU(),
            *self.build_hidden_layers(num_hidden_neurons, pt.nn.ReLU()),
            pt.nn.Linear(num_hidden_neurons[-1], 1)
        )

        if 'batch_norm_linear_layers' in self.hparams \
          and strtobool(self.hparams.batch_norm_linear_layers):
            self.layers = self.batch_norm_linear(self.layers)

    def build_hidden_layers(self, num_hidden_neurons, activation):
        """Return a flat list [Linear, activation, Linear, activation, ...].

        One Linear layer is created for every consecutive pair of sizes in
        ``num_hidden_neurons`` and each is followed by a new instance of the
        class of ``activation`` (constructed without arguments).
        """
        #create all of the linear layers first, then interleave the activations
        linear_layers = [pt.nn.Linear(n_in, n_out)
                         for n_in, n_out in zip(num_hidden_neurons[:-1],
                                                num_hidden_neurons[1:])]

        activation_class = activation.__class__

        hidden_layers = []
        for linear in linear_layers:
            hidden_layers.append(linear)
            hidden_layers.append(activation_class())

        return hidden_layers

    def batch_norm_linear(self, layers):
        """Return a new Sequential with BatchNorm1d inserted before each Linear.

        ``layers`` is split into groups that start at a Linear layer and run
        up to (but not including) the next Linear layer. Every group is
        prefixed with a BatchNorm1d sized by the ``in_features`` of the
        Linear layer that starts the group. Layers positioned before the
        first Linear layer are not part of any group.
        """
        idx_linear = [idx for idx, layer in enumerate(layers)
                      if isinstance(layer, pt.nn.Linear)]
        #sentinel so that the last group extends to the end of the layers
        idx_linear.append(sys.maxsize)

        layer_lists = [list(layers[s:e])
                       for s, e in zip(idx_linear[:-1], idx_linear[1:])]
        batch_norm_layers = [pt.nn.BatchNorm1d(group[0].in_features)
                             for group in layer_lists]

        batch_normed = []
        for bn, group in zip(batch_norm_layers, layer_lists):
            batch_normed.append(bn)
            batch_normed.extend(group)

        return pt.nn.Sequential(*batch_normed)

    def batchToXy(self, batch):
        """Split a batch into features and label.

        The batch is squeezed in place; column 0 is returned as the label
        ``y`` and the remaining columns as the features ``X``.
        """
        batch = batch.squeeze_()
        X, y = batch[:, 1:], batch[:, 0]
        return X, y

    def forward(self, X):
        """Apply the layers to ``X`` and return the squeezed estimate."""
        y_est = self.layers(X)
        return y_est.squeeze_()

    def log(self, k, v, **kwargs):
        """Log metric ``k`` with value ``v`` using the Lightning logger.

        Only the ``on_step``, ``on_epoch``, ``prog_bar`` and ``logger``
        keyword arguments are forwarded. Any other keyword argument, such as
        the ``step`` passed by the training, validation and test steps of
        this class, is accepted but not forwarded. A flag that the caller
        does not provide is left out so that the parent class default
        applies instead of raising a KeyError.
        """
        forwarded = {name: kwargs[name]
                     for name in ('on_step', 'on_epoch', 'prog_bar', 'logger')
                     if name in kwargs}
        super().log(k, v, **forwarded)

    def training_step(self, batch, batch_idx):
        """Compute and return the MSE loss for a training batch and log metrics."""
        self.step += 1

        X, y = self.batchToXy(batch) #unpack batch into features and label

        y_est = self.forward(X)

        loss = pt.nn.functional.mse_loss(y_est, y)
        rmse = loss.sqrt()

        for k,v in {
          "train_step": self.step,
          "train_mse": loss.item(),
          "train_rmse": rmse.item(),
          "train_steps_per_sec": self.step / (time.perf_counter() - self.start_ts),

        }.items():
            self.log(k, v, step = self.step, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        #keep just the value for use by validation_step; detach so that the
        #autograd graph of this step is not held until the next training step
        self.train_val_rmse = rmse.detach()

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and return the MSE loss for a validation batch and log metrics.

        ``train_val_rmse`` is logged as the sum of the most recent training
        RMSE and the RMSE of this validation batch.
        """
        X, y = self.batchToXy(batch)

        with pt.no_grad():
            loss = pt.nn.functional.mse_loss(self.forward(X), y)

        for k,v in {
          "val_mse": loss.item(),
          "val_rmse": loss.sqrt().item(),
          "train_val_rmse": (self.train_val_rmse + loss.sqrt()).item(),
        }.items():
            self.log(k, v, step = self.step, on_step=True, on_epoch=True, prog_bar=True, logger=True)

        return loss

    def test_step(self, batch, batch_idx):
        """Compute the MSE loss for a test batch and log the test metrics."""
        X, y = self.batchToXy(batch)

        with pt.no_grad():
            loss = pt.nn.functional.mse_loss(self.forward(X), y)

        for k,v in {
            "test_mse": loss.item(),
            "test_rmse": loss.sqrt().item(),
        }.items():
            self.log(k, v, step = self.step, on_step=True, on_epoch=True, prog_bar=True, logger=True)

    def configure_optimizers(self):
        """Return the optimizer selected by the ``optimizer`` hyperparameter.

        Raises a KeyError when the hyperparameter is neither 'Adam' nor 'SGD'.
        """
        #note that the 'Adam' setting selects the AdamW implementation
        optimizers = {'Adam': pt.optim.AdamW,
                      'SGD': pt.optim.SGD}
        optimizer = optimizers[self.hparams.optimizer]

        return optimizer(self.layers.parameters(),
                            lr = float(self.hparams.lr))


The entry point (in a `trainer.py` file of the `src` directory) to the process of building and testing this version of the model starts by importing the `DcTaxiModel` class from the `model_v1` module.

In [None]:
%%writefile src/trainer.py
from model_v1 import DcTaxiModel

import os
import time
import kaen
import torch as pt
import numpy as np
import pytorch_lightning as pl
import torch.distributed as dist
from torch.utils.data import DataLoader
from torch.nn.parallel import DistributedDataParallel

from kaen.torch import ObjectStorageDataset as osds
        
def train(model, train_glob, val_glob, test_glob = None):
    """Train, validate and optionally test ``model`` for a single epoch.

    Args:
        model: model instance providing ``hparams`` and ``layers``. The
            ``batch_size`` hyperparameter is required; ``seed`` and
            ``max_batches`` are optional.
        train_glob: location of the training data set.
        val_glob: location of the validation data set.
        test_glob: location of the test data set; testing is skipped
            when None.

    Returns:
        A ``(model, trainer)`` tuple.
    """
    #imported here since the fallback seed needs datetime, which is
    #not among the module level imports
    from datetime import datetime

    hparams = model.hparams

    #set the pseudo-random number generator seed
    seed = int(hparams['seed']) \
                if 'seed' in hparams \
                else int( datetime.now().microsecond )

    np.random.seed(seed)
    pt.manual_seed(seed)

    kaen.torch.init_process_group(model.layers)

    #max_batches is optional, so resolve it once and reuse the value for
    #both the training batch limit and the validation check interval
    max_batches = int( hparams.max_batches ) \
                    if 'max_batches' in hparams else 1

    trainer = pl.Trainer(gpus = pt.cuda.device_count() \
                            if pt.cuda.is_available() else 0,
        max_epochs = 1,
        limit_train_batches = max_batches,
        limit_val_batches = 1,
        num_sanity_val_steps = 1,
        val_check_interval = min(20, max_batches),
        limit_test_batches = 1,
        log_every_n_steps = 1,
        gradient_clip_val=0.5,
        progress_bar_refresh_rate = 0,
        weights_summary = None,)

    batch_size = int(hparams.batch_size)

    #only the training data set is configured with the worker rank and the
    #number of replicas
    train_dl = \
    DataLoader(osds(train_glob,
                    worker = kaen.torch.get_worker_rank(),
                    replicas = kaen.torch.get_num_replicas(),
                    shard_size = batch_size,
                    batch_size = batch_size,
                    storage_options = {'anon': False},
                   ),
               pin_memory = True)

    val_dl = \
    DataLoader(osds(val_glob,
                    batch_size = batch_size,
                    storage_options = {'anon': False},
                   ),
               pin_memory = True)

    trainer.fit(model,
              train_dataloaders = train_dl,
              val_dataloaders = val_dl)

    if test_glob is not None:
        test_dl = \
          DataLoader(osds(test_glob,
                          batch_size = batch_size,
                          storage_options = {'anon': False},
                         ),
                    pin_memory = True)

        trainer.test(model,
                    dataloaders=test_dl)

    return model, trainer

if __name__ == "__main__":
    # Hyperparameters used when the trainer is launched directly as a script.
    script_hparams = {
        "seed": "1686523060",
        "num_features": "8",
        "num_hidden_neurons": "[3, 5, 8]",
        "batch_norm_linear_layers": "1",
        "optimizer": "Adam",
        "lr": "0.03",
        "max_batches": "1",
        "batch_size": str(2 ** 18),
    }

    # The KAEN_OSDS_* environment variables take precedence; otherwise the
    # data sets are read from the URLs below.
    model, trainer = train(
        DcTaxiModel(**script_hparams),
        train_glob = os.environ.get(
            'KAEN_OSDS_TRAIN_GLOB',
            'https://raw.githubusercontent.com/osipov/smlbook/master/train.csv'),
        val_glob = os.environ.get(
            'KAEN_OSDS_VAL_GLOB',
            'https://raw.githubusercontent.com/osipov/smlbook/master/valid.csv'),
        test_glob = os.environ.get(
            'KAEN_OSDS_TEST_GLOB',
            'https://raw.githubusercontent.com/osipov/smlbook/master/valid.csv'),
    )

    print(trainer.callback_metrics)

Run a simple test to confirm that the implementation works as expected.

In [None]:
!python3 src/trainer.py

## Unit testing model training in a local Kaen container

Ensure that you can authenticate with DockerHub, where you can download the base container image. Once you execute the following code snippet in your Kaen Jupyter environment, you will be prompted to enter your DockerHub username, which is then stored in the `DOCKER_HUB_USER` Python variable

In [None]:
DOCKER_HUB_USER = input()
DOCKER_HUB_USER

Next, enter the DockerHub password for your username when prompted based on the following code snippet. Notice that the password is cleared out from the `DOCKER_HUB_PASSWORD` variable after the authentication is finished.

You should see an output with the message `Login Succeeded` if you specified valid DockerHub credentials.

In [None]:
import getpass
DOCKER_HUB_PASSWORD = getpass.getpass()
!echo "{DOCKER_HUB_PASSWORD}" | docker login --username {DOCKER_HUB_USER} --password-stdin
DOCKER_HUB_PASSWORD = None

The base PyTorch Docker image is quite large, at about 1.9 GB. The Kaen base PyTorch image (`kaenai/pytorch-mlflow-aws-base:latest`), which adds binaries with support for AWS and MLFlow, is roughly 2 GB in size, so be prepared for the following download to take a few minutes, depending on the speed of your internet connection.

To execute the download, run

In [None]:
!docker pull kaenai/pytorch-mlflow-aws-base:latest

Once the download completes, you can package your source code to an image derived from `kaenai/pytorch-mlflow-aws-base:latest` using the following `Dockerfile`. Notice that the file simply copies the Python source code to the `/workspace` directory of the image file system.

In [None]:
%%writefile Dockerfile
FROM kaenai/pytorch-mlflow-aws-base:latest
COPY *.py /workspace/

Since the source code files `model_v1.py` and `trainer.py` described earlier in this chapter were saved to a `src` directory, notice that the following command to build your Docker image uses the `src/` directory as the root of the Docker image build process. To ensure that the image that you build can be uploaded to DockerHub, the image is tagged using `{DOCKER_HUB_USER}` as the image tag prefix.

In [None]:
!docker build -t {DOCKER_HUB_USER}/dctaxi:latest -f Dockerfile src/

After the `docker build` command is finished, you can run your newly created Docker container using

In [None]:
!docker run -it {DOCKER_HUB_USER}/dctaxi:latest "python /workspace/trainer.py"

which should produce the output identical to the output of running the code using `python src/trainer.py`.

To push (upload) your newly built image to DockerHub, execute

In [None]:
!docker push {DOCKER_HUB_USER}/dctaxi:latest

## Hyperparameter optimization with Optuna


The entire HPO implementation used in this notebook is shown in the following code snippet. Notice that the snippet saves the implementation source code as `hpo.py` file in your `src` folder

In [None]:
%%writefile src/hpo.py
import optuna
import numpy as np
from kaen.hpo.optuna import BaseOptunaService

class DcTaxiHpoService(BaseOptunaService):
  """Optuna based HPO service for the DC taxi model.

  ``hparams`` defines the hyperparameter search space for a trial and
  ``on_experiment_end`` logs the study visualizations, the study summary
  and the best hyperparameter values to the parent MLFlow run.
  """

  def hparams(self):
    """Return a dictionary of hyperparameter values suggested for the current trial."""
    trial = self._trial

    #define hyperparameters
    return {
        "seed": trial.suggest_int('seed', 0, np.iinfo(np.int32).max - 1),
        "optimizer": trial.suggest_categorical('optimizer', ['Adam']),        
        "lr": trial.suggest_loguniform('lr', 0.001, 0.1),
        "num_hidden_neurons": [trial.suggest_categorical(f"num_hidden_layer_{layer}_neurons", [7, 11, 13, 19, 23]) \
                                for layer in range(trial.suggest_categorical('num_layers', [11, 13, 17, 19]))],
        "batch_size": trial.suggest_categorical('batch_size', [2 ** i for i in range(16, 22)]),
        "max_batches": trial.suggest_int('max_batches', 40, 400, log = True)
    }

  def on_experiment_end(self, experiment, parent_run):
    """Log the study artifacts and the best hyperparameters to the parent run.

    Persisting the visualizations is best effort: every figure is created
    and logged independently, so a failure with one of the figures is
    reported and does not prevent the remaining artifacts from being logged.
    """
    study = self._study
    run_id = parent_run.info.run_id

    hidden_layer_params = ["num_hidden_layer_0_neurons", "num_hidden_layer_1_neurons", "num_hidden_layer_2_neurons"]

    #the figures are created lazily so that an error raised while creating
    #one of the figures is isolated from the other figures
    figures = {
      "plot_param_importances": lambda: optuna.visualization.plot_param_importances(study),
      "plot_parallel_coordinate_all": lambda: optuna.visualization.plot_parallel_coordinate(study, params=["max_batches", "lr"] + hidden_layer_params),
      "plot_parallel_coordinate_l0_l1_l2": lambda: optuna.visualization.plot_parallel_coordinate(study, params=hidden_layer_params),
      "plot_contour_max_batches_lr": lambda: optuna.visualization.plot_contour(study, params=["max_batches", "lr"]),
    }

    for key, create_figure in figures.items():
      try:
        create_figure().write_image(key + ".png")
        self.mlflow_client.log_artifact(run_id = run_id,
                            local_path = key + ".png")
      except Exception:
        #do not intercept KeyboardInterrupt or SystemExit
        print("Failed to correctly persist experiment visualization artifacts")
        import traceback
        traceback.print_exc()

    #log the dataframe with the study summary
    study.trials_dataframe().describe().to_html(experiment.name + ".html")
    self.mlflow_client.log_artifact(run_id = run_id,
                        local_path = experiment.name + ".html")

    #log the best hyperparameters in the parent run
    self.mlflow_client.log_metric(run_id, "loss", study.best_value)
    for k, v in study.best_params.items():
      self.mlflow_client.log_param(run_id, k, v)


With the source code in place, you are ready to package it as a Docker container. Start by pulling a base Kaen container for Optuna and MLFlow:


In [None]:
!docker pull kaenai/optuna-mlflow-hpo-base:latest

and once that's finished, create a Dockerfile for a derived image using the following cell.

Notice that the `KAEN_HPO_SERVICE_PREFIX` environment variable specifies the module prefix for your implementation (`hpo`, which corresponds to the filename `hpo.py`), while the `KAEN_HPO_SERVICE_NAME` environment variable specifies the name of your `DcTaxiHpoService` implementation class.

In [None]:
%%writefile Dockerfile
FROM kaenai/optuna-mlflow-hpo-base:latest
ENV KAEN_HPO_SERVICE_PREFIX=hpo \
    KAEN_HPO_SERVICE_NAME=DcTaxiHpoService

COPY hpo.py /workspace/

Once the `Dockerfile` is saved, build the image by running

!docker build -t {DOCKER_HUB_USER}/dctaxi-hpo:latest -f Dockerfile src/

and push it to DockerHub using

In [None]:
!docker push {DOCKER_HUB_USER}/dctaxi-hpo:latest

## Enabling MLFlow support

Although the base `kaenai/pytorch-mlflow-aws-base:latest` image includes support for MLFlow, the implementation of training in `trainer.py` does not take advantage of the MLFlow experiment management and tracking. Since MLFlow uses the concept of an experiment to organize a collection of HPO trials and runs, Kaen provides a `BaseMLFlowClient` class, which can be used to implement an MLFlow managed experiment for `DcTaxiModel`. The subclasses of `BaseMLFlowClient` are responsible for instantiating the untrained PyTorch model instances using the hyperparameter values that `BaseMLFlowClient` fetches from MLFlow and Optuna.

Start by implementing your `BaseMLFlowClient` subclass named `DcTaxiExperiment`: run the following cell to save the code that trains your model to the `src/experiment.py` file.



In [None]:
%%writefile src/experiment.py
import os
from model_v1 import DcTaxiModel
from trainer import train
from kaen.hpo.client import BaseMLFlowClient

class DcTaxiExperiment(BaseMLFlowClient):
    """MLFlow managed experiment that trains a ``DcTaxiModel`` for a run."""

    def on_run_start(self, run_idx, run):
        """Train a model using the hyperparameters of the MLFlow ``run``.

        The hyperparameters of the run override a set of default values.
        When ``run`` is None, only the defaults are used and the metrics
        are not logged to MLFlow. The data set locations are read from the
        KAEN_OSDS_TRAIN_GLOB, KAEN_OSDS_VAL_GLOB and KAEN_OSDS_TEST_GLOB
        environment variables.
        """
        #a run may be missing (see the hyperparameter handling below), so
        #do not dereference it before checking
        status = run.info.status if run is not None else None
        run_id = run.info.run_id if run is not None else None
        print(f"{run}({status}): starting...")

        #create a set of default hyperparameters
        default_hparams = {"seed": "1686523060",
                        "num_features": "8",
                        "num_hidden_neurons": "[3, 5, 8]",
                        "batch_norm_linear_layers": "1",
                        "optimizer": "Adam",
                        "lr": "0.03",
                        "max_batches": "1",
                        "batch_size": str(2 ** 18),}

        #fetch the MLFlow hyperparameters if available
        mlflow_hparams = run.data.params if run is not None \
                            and run.data is not None else {}

        #override the defaults with the MLFlow hyperparameters
        hparams = {**default_hparams, **mlflow_hparams}

        untrained_model = DcTaxiModel(**hparams)

        def log(self, k, v, **kwargs):
            #without a run there is nowhere to log the metric
            if run_id is None:
                return
            #metrics are logged only when KAEN_RANK is 0
            if self.mlflow_client and 0 == int(os.environ['KAEN_RANK']):
                if 'step' in kwargs and kwargs['step'] is not None:
                    self.mlflow_client.log_metric(run_id, k, v, step = kwargs['step'])
                else:
                    self.mlflow_client.log_metric(run_id, k, v)

        #replace the log method of the model; the function is bound to this
        #experiment (not to the model) so that self.mlflow_client in log
        #refers to the attribute of the experiment
        import types
        untrained_model.log = types.MethodType(log, self)

        model, trainer = train(untrained_model,
                               train_glob = os.environ['KAEN_OSDS_TRAIN_GLOB'],
                               val_glob = os.environ['KAEN_OSDS_VAL_GLOB'],
                               test_glob = os.environ['KAEN_OSDS_TEST_GLOB'])

        print(trainer.callback_metrics)


With the experiment support in place, you are ready to build the updated `dctaxi` image using 


In [None]:
%%writefile Dockerfile
FROM kaenai/pytorch-mlflow-aws-base:latest
COPY * /workspace/
ENV KAEN_HPO_CLIENT_PREFIX=experiment \
    KAEN_HPO_CLIENT_NAME=DcTaxiExperiment

Build your `dctaxi` image using

In [None]:
!docker build -t {DOCKER_HUB_USER}/dctaxi:latest -f Dockerfile src/

and push it to DockerHub using 


In [None]:
!docker push {DOCKER_HUB_USER}/dctaxi:latest

## Using HPO for `DcTaxiModel` in a local Kaen provider


Before provisioning the more expensive cloud provider, it is a good idea to start by provisioning a local Kaen provider so you can unit test your HPO and model training code. You can create a Kaen training "dojo" by executing


In [None]:
!kaen dojo init --provider local

which should return an alphanumeric identifier for the newly created Kaen dojo. 

You can list available Kaen dojos in your workspace using 

In [None]:
!kaen dojo ls


which should print out the ID of the dojo you just created. 

Since you are going to want the identifier of the dojo saved to a Python variable for future use, you can do so using the Jupyter syntax for assigning the output of shell commands to Python variables as follows


In [None]:
[MOST_RECENT_DOJO] = !kaen dojo ls | head -n 1
MOST_RECENT_DOJO

Before a Kaen dojo can be used for training, it should be activated. Activate the dojo specified by the identifier in the `MOST_RECENT_DOJO` variable by running

In [None]:
!kaen dojo activate {MOST_RECENT_DOJO}

Since the Jupyter `!` shell shortcut provides access to Python variables, in the previous code snippet the `{MOST_RECENT_DOJO}` syntax is replaced with the value of the corresponding Python variable.

You can confirm that the dojo is active by inspecting it using

In [None]:
!kaen dojo inspect {MOST_RECENT_DOJO}

which should include an output line with `KAEN_DOJO_STATUS=active`.

Before you can start a training job in the dojo, you need to create one specifying both the dojo and the Kaen image for training. 

To create a job to train the `DcTaxiModel`, execute

In [None]:
!kaen job create --dojo {MOST_RECENT_DOJO} --image {DOCKER_HUB_USER}/dctaxi:latest

Just as with the dojo, you can save the identifier of the job to a Python variable using


In [None]:
[MOST_RECENT_JOB] = !kaen job ls | head -n 1
MOST_RECENT_JOB

Every job in Kaen is configured with dedicated networking settings that you can inspect by running

In [None]:
!kaen job inspect {MOST_RECENT_JOB}

Since you have not yet enabled HPO for this job, the inspected job settings do not include the information about the HPO image used to serve MLFlow experiment management and Optuna hyperparameter values. You can configure the job with a single run of HPO, by executing


In [None]:
!kaen hpo enable \
--image {DOCKER_HUB_USER}/dctaxi-hpo:latest \
--num-runs 1 \
--service-prefix hpo \
--service-name DcTaxiHpoService \
--port 5001 5001 \
{MOST_RECENT_JOB} 

Assuming the `hpo enable` command completes successfully, you can inspect the job again to observe the HPO specific settings


In [None]:
!kaen job inspect {MOST_RECENT_JOB}

Notice that at this time, the output includes the `KAEN_HPO_MANAGER_IP` for the IP address of the internal Docker network (specified by `KAEN_JOB_SUBNET`) that handles the communication across your container instances. 


At this time, the HPO service should be up and running, so you should be able to access the MLFlow user interface by navigating your browser to http://127.0.0.1:5001 which should show a screen similar to the following. Note that you need to open the MLFlow experiment that starts with a `job` prefix on the left side bar of the MLFlow interface before you can explore the details of the HPO experiment. 

In [None]:
import os
os.environ['MOST_RECENT_JOB'] = MOST_RECENT_JOB

# The None values below are placeholders: replace each of them with your own
# value (as a string) before running this cell. os.environ accepts only str
# values, so an assignment of None raises a TypeError.
os.environ['BUCKET_ID'] = None
os.environ['AWS_ACCESS_KEY_ID'] = None
os.environ['AWS_SECRET_ACCESS_KEY'] = None
os.environ['AWS_DEFAULT_REGION'] = None

In [None]:
%%bash
echo $BUCKET_ID
echo $AWS_ACCESS_KEY_ID
echo $AWS_SECRET_ACCESS_KEY
echo $AWS_DEFAULT_REGION
echo $MOST_RECENT_JOB

In [None]:
!kaen job start \
--replicas 1 \
-e KAEN_HPO_JOB_RUNS 1 \
-e AWS_DEFAULT_REGION $AWS_DEFAULT_REGION \
-e AWS_ACCESS_KEY_ID $AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY $AWS_SECRET_ACCESS_KEY \
-e KAEN_OSDS_TRAIN_GLOB "s3://dc-taxi-$BUCKET_ID-$AWS_DEFAULT_REGION/csv/dev/part*.csv" \
-e KAEN_OSDS_VAL_GLOB "s3://dc-taxi-$BUCKET_ID-$AWS_DEFAULT_REGION/csv/test/part*.csv" \
-e KAEN_OSDS_TEST_GLOB "s3://dc-taxi-$BUCKET_ID-$AWS_DEFAULT_REGION/csv/test/part*.csv" \
$MOST_RECENT_JOB

## Training with Kaen AWS provider

To create a Kaen dojo in AWS, you need to use the `--provider aws` setting when running `kaen dojo init`. By default, when you use the `aws` provider, Kaen provisions `t3.micro` instances as both worker and manager nodes in AWS. Although the `t3.micro` instances are low cost defaults suitable for simple demos, for the `DcTaxiModel`, I recommend provisioning `t3.xlarge` instances as follows


In [None]:
!kaen dojo init --provider aws --worker-instance-type t3.xlarge --manager-instance-type t3.xlarge

which upon a successful provisioning should report the dojo ID.

To configure the `MOST_RECENT_DOJO` Python variable, you should execute

In [None]:
[MOST_RECENT_DOJO] = !kaen dojo ls | head -n 1
MOST_RECENT_DOJO

and then activate the dojo using


In [None]:
!kaen dojo activate {MOST_RECENT_DOJO}

Notice that if you provision underpowered AWS node instances (such as `t3.micro`) the activation process could take a while. Once the activation is finished correctly, you should be able to inspect the Dojo using

In [None]:
!kaen dojo inspect {MOST_RECENT_DOJO}

and the output should include a line that starts with `KAEN_DOJO_STATUS=active` and the timestamp of when the activation completed.

Just as with a local provider, to perform training in AWS, you should start by creating a job

In [None]:
!kaen job create --dojo {MOST_RECENT_DOJO} --image {DOCKER_HUB_USER}/dctaxi:latest

Unlike the case of the local provider, running `kaen job create` in the AWS provider may take a while. This is caused by the fact that the `dctaxi` image that you pushed to DockerHub needs to be downloaded to the AWS node in your dojo. After the job is created, you should save the ID of the job to the `MOST_RECENT_JOB` Python variable using


In [None]:
[MOST_RECENT_JOB] = !kaen job ls | head -n 1
os.environ['MOST_RECENT_JOB'] = MOST_RECENT_JOB
MOST_RECENT_JOB

and then enable HPO for the job using

In [None]:
!kaen hpo enable \
--num-runs 1 \
--image {DOCKER_HUB_USER}/dctaxi-hpo:latest \
--service-prefix hpo \
--service-name DcTaxiHpoService \
--port 5001 5001 \
{MOST_RECENT_JOB} 

Once the `kaen hpo enable` operation is finished, you can open the MLFlow user interface by constructing the URL in your notebook using


In [None]:
!echo "http://$(kaen dojo inspect {MOST_RECENT_DOJO} | grep KAEN_DOJO_MANAGER_IP | cut -d '=' -f 2):5001"

and navigating to the URL in your browser. 

Since it may take a few seconds for the MLFlow UI to become available (depending on the performance of your AWS management node instances), you may need to refresh your browser to get access to this interface.

To start the training, the `kaen job start` command is nearly identical to the one you used before (only the `KAEN_HPO_JOB_RUNS` setting is omitted)

In [None]:
!kaen job start \
--replicas 1 \
-e AWS_DEFAULT_REGION $AWS_DEFAULT_REGION \
-e AWS_ACCESS_KEY_ID $AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY $AWS_SECRET_ACCESS_KEY \
-e KAEN_OSDS_TRAIN_GLOB "s3://dc-taxi-$BUCKET_ID-$AWS_DEFAULT_REGION/csv/dev/part*.csv" \
-e KAEN_OSDS_VAL_GLOB "s3://dc-taxi-$BUCKET_ID-$AWS_DEFAULT_REGION/csv/test/part*.csv" \
-e KAEN_OSDS_TEST_GLOB "s3://dc-taxi-$BUCKET_ID-$AWS_DEFAULT_REGION/csv/test/part*.csv" \
$MOST_RECENT_JOB

As in the case with the local provider, you can navigate your browser to the MLFlow UI and monitor the metrics as the model trains.

## When you are done, do not forget to remove the AWS training dojo using

In [None]:
!kaen dojo rm {MOST_RECENT_DOJO}