In [46]:
import json
import os
import subprocess
import time

import mlflow
from azureml.core import Experiment, ScriptRunConfig, Workspace
from azureml.core.authentication import ArmTokenAuthentication
from azureml.core.compute.aks import AksCompute
from azureml.mlflow import get_portal_url

experiment_name = "mlflow_on_azureml_aks"
ws = Workspace.from_config()

# For the remote setting the workspace auth has to be swapped out: wrap the
# ARM token of the current auth object in an ArmTokenAuthentication.
current_auth = ws.service_context.get_auth()
ws._auth = ArmTokenAuthentication(current_auth._get_arm_token())

In [47]:
# Point MLflow at the workspace's tracking URI, then select the experiment.
tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

In [48]:
# Reuse the compute target registered under this name in the workspace if there
# is one; otherwise create an AKS cluster with the default provisioning
# configuration and wait for the creation to complete.
compute_name = "aks-comute-name"  # Typos persist :(
existing_targets = ws.compute_targets
if compute_name in existing_targets:
    aks_compute = existing_targets[compute_name]
else:
    provisioning_config = AksCompute.provisioning_configuration()
    aks_compute = AksCompute.create(ws, compute_name, provisioning_config)
    aks_compute.wait_for_completion()

In [49]:
%%writefile Dockerfile
# Miniconda base image with MLflow and the Azure ML MLflow plugin installed.
# NOTE(review): the base tag `latest` and azureml-mlflow are unpinned; consider
# pinning both so that rebuilding the image later gives the same result.
FROM continuumio/miniconda3:latest

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir azureml-mlflow mlflow==1.4


Writing Dockerfile


In [57]:
%%writefile kubernetes_job_template.yaml
# Kubernetes Job template; the backend config in this notebook points at this
# file through its "kube-job-template-path" entry.
# The quoted "{replaced with ...}" values are placeholders: as their own text
# says, they are substituted with the project name, image URI and entry point
# command when the project is executed.
apiVersion: batch/v1
kind: Job
metadata:
  name: "{replaced with MLflow Project name}"
  namespace: default
spec:
  # TTL (in seconds) applied to the Job once it has finished.
  ttlSecondsAfterFinished: 100
  # Zero retries: a failed pod is not restarted by the Job controller.
  backoffLimit: 0
  template:
    spec:
      containers:
        - name: "{replaced with MLflow Project name}" 
          image: "{replaced with URI of Docker image created during Project execution}"
          command: ["{replaced with MLflow Project entry point command}"]
      # Never restart the container in place after it exits.
      restartPolicy: Never 


Writing kubernetes_job_template.yaml


In [51]:
%%writefile MLproject
# MLflow project definition for the example submitted to AKS in this notebook.
# The project name shows up as the prefix of the created Kubernetes Job name.
name: diffpriv-sql


docker_env:
  # Base image for the project.
  # NOTE(review): presumably this is the image built from the Dockerfile
  # written earlier in this notebook - confirm it is built and tagged with
  # this name before running the project.
  image:  mlflow-docker-example

entry_points:
  main:
    # Single entry point: runs the script written by the run_query.py cell.
    command: "python run_query.py"


Writing MLproject


In [52]:
%%writefile run_query.py
import os
print(os.environ)
import mlflow

with mlflow.start_run():
    mlflow.log_metric("hi", 4)

Writing run_query.py


In [53]:
# Sanity check: list the working directory to confirm that the %%writefile
# cells produced their files. `os` is imported in the notebook's import cell.
os.listdir()

['.ipynb_checkpoints',
 'cache',
 'Dockerfile',
 'execute_on_aks.py',
 'kuberenetes_job_template.yaml',
 'MlflowOnAzureMLAKS.ipynb',
 'MLproject',
 'outputs',
 'run_query.py']

In [54]:
# Assemble the configuration for MLflow's Kubernetes backend and save it as JSON.
repository_uri = "eddeleon/test"
kube_details = aks_compute.serialize()["properties"]
# The kube context name is the last path segment of the cluster's resource id.
kube_context = kube_details["resourceId"].rsplit("/", 1)[-1]
kube_config = {
    "kube-context": kube_context,
    "kube-job-template-path": "kubernetes_job_template.yaml",
    "repository-uri": repository_uri,
}
backend_config = "kubernetes_config_new.json"
with open(backend_config, "w") as stream:
    stream.write(json.dumps(kube_config))

In [55]:
# Switch kubectl to the AKS cluster's context before submitting the project.
# subprocess is used instead of `!kubectl ...` so that a failed switch raises
# here rather than letting the next cells run against whichever context
# happened to be active.
switch_result = subprocess.run(
    ["kubectl", "config", "use-context", kube_context],
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    universal_newlines=True,
)
# Show kubectl's own message (stdout and stderr combined) in the cell output.
print(switch_result.stdout, end="")
# Raises CalledProcessError when kubectl exited with a non-zero status.
switch_result.check_returncode()

Switched to context "aks-comute-name30b94132".


In [60]:
# Submit the project in the current directory to the Kubernetes backend and
# time the submission end to end.
# NOTE(review): the run is recorded under the literal experiment "test", not
# under `experiment_name` - confirm that this is intended.
# perf_counter() is used for the elapsed time because, unlike time.time(), it
# is not affected by wall-clock adjustments.
start_time = time.perf_counter()
mlflow_run = mlflow.projects.run("./",
                          experiment_name="test",
                          backend="kubernetes",
                          backend_config=backend_config)
duration = time.perf_counter() - start_time
print("Duration: {} s".format(duration))
# Link to the run in the Azure ML portal.
print(get_portal_url(mlflow.get_run(mlflow_run.run_id)))

2020/03/23 01:07:47 INFO mlflow.projects: === Building docker image eddeleon/test:d4d9198 ===
2020/03/23 01:07:49 INFO mlflow.projects: Temporary docker context file C:\Users\eddeleon\AppData\Local\Temp\tmpgaoj3cpv was not deleted.
2020/03/23 01:07:50 INFO mlflow.projects.kubernetes: === Pushing docker image eddeleon/test:d4d9198 ===
2020/03/23 01:08:09 INFO mlflow.projects: === Created directory C:\Users\eddeleon\AppData\Local\Temp\tmp11elwctx for downloading remote URIs passed to arguments of type 'path' ===
2020/03/23 01:08:09 INFO mlflow.projects.kubernetes: === Creating Job diffpriv-sql-2020-03-23-01-08-09-282310 ===
2020/03/23 01:08:10 INFO mlflow.projects.kubernetes: Job started.
2020/03/23 01:08:16 INFO mlflow.projects.kubernetes: None
2020/03/23 01:08:16 INFO mlflow.projects: === Run (ID 'cf5fece4-3101-4f73-b1cf-987eb2cdd4c8') succeeded ===


Duration: 38.10667896270752 s
https://mlworkspace.azure.ai/portal/subscriptions/ad203158-bc5d-4e72-b764-2607833a71dc/resourceGroups/akannava/providers/Microsoft.MachineLearningServices/workspaces/akannava/experiments/test/runs/cf5fece4-3101-4f73-b1cf-987eb2cdd4c8


In [61]:
# Baseline for comparison with the AKS submission: submit the same script as a
# local Azure ML run and time it. ScriptRunConfig is imported in the notebook's
# import cell.
experiment = Experiment(ws, experiment_name)
src = ScriptRunConfig(source_directory=".", script="run_query.py")
src.run_config.framework = 'python'
# Dependencies are marked as user-managed, i.e. the Python environment is
# expected to be provided by the user.
src.run_config.environment.python.user_managed_dependencies = True
# perf_counter() is used for the elapsed time because, unlike time.time(), it
# is not affected by wall-clock adjustments.
start = time.perf_counter()
local_run = experiment.submit(src)
local_run.wait_for_completion()
duration = time.perf_counter() - start
print("Duration: {} s".format(duration))
print(local_run.get_portal_url())

Duration: 33.14914894104004 s
https://ml.azure.com/experiments/mlflow_on_azureml_aks/runs/mlflow_on_azureml_aks_1584940105_257fc3ba?wsid=/subscriptions/ad203158-bc5d-4e72-b764-2607833a71dc/resourcegroups/akannava/workspaces/akannava


For the first few runs, the AKS submission took ~30 s; after a few more submissions it is taking ~40 s. A local run takes ~30 s.

In [29]:
# List the pods in every namespace of the cluster in the current kubectl context.
!kubectl get pods --all-namespaces

NAMESPACE     NAME                                            READY   STATUS      RESTARTS   AGE
default       azureml-ba-5d44c5d5f4-mkgff                     1/1     Running     0          5d22h
default       azureml-fe-8697d5d7c-4mg9n                      2/2     Running     0          5d22h
default       azureml-fe-8697d5d7c-8tcrh                      2/2     Running     0          5d22h
default       azureml-fe-8697d5d7c-x9nk6                      2/2     Running     0          5d22h
default       blobfuse-flexvol-installer-2ttmf                1/1     Running     0          5d22h
default       blobfuse-flexvol-installer-dj69d                1/1     Running     0          5d22h
default       blobfuse-flexvol-installer-x5mhp                1/1     Running     0          5d22h
default       diffpriv-sql-2020-03-17-02-17-45-788809-79tcs   0/1     Error       0          5d22h
default       diffpriv-sql-2020-03-17-02-17-45-788809-f8tpd   0/1     Error       0          5d22h
default     

In [30]:
# Print the logs of one job pod.
# NOTE(review): the pod name is hard-coded from one specific run; replace it
# with a current name (e.g. from `kubectl get pods`) before re-running.
!kubectl logs diffpriv-sql-2020-03-23-00-53-27-660548-6mtlk

environ({'PATH': '/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin', 'HOSTNAME': 'diffpriv-sql-2020-03-23-00-53-27-660548-6mtlk', 'MLFLOW_EXPERIMENT_ID': '938b7e40-d00c-45f0-8bab-2f0b3d1d4c3d', 'MLFLOW_RUN_ID': '640b58b9-0c8d-4d12-9e1b-a921fc4fe613', 'MLFLOW_TRACKING_URI': 'azureml://eastus2.experiments.azureml.net/history/v1.0/subscriptions/ad203158-bc5d-4e72-b764-2607833a71dc/resourceGroups/akannava/providers/Microsoft.MachineLearningServices/workspaces/akannava?auth-type=ArmTokenAuthentication&auth=eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiIsIng1dCI6IllNRUxIVDBndmIwbXhvU0RvWWZvbWpxZmpZVSIsImtpZCI6IllNRUxIVDBndmIwbXhvU0RvWWZvbWpxZmpZVSJ9.eyJhdWQiOiJodHRwczovL21hbmFnZW1lbnQuY29yZS53aW5kb3dzLm5ldC8iLCJpc3MiOiJodHRwczovL3N0cy53aW5kb3dzLm5ldC83MmY5ODhiZi04NmYxLTQxYWYtOTFhYi0yZDdjZDAxMWRiNDcvIiwiaWF0IjoxNTg0OTM4MzY2LCJuYmYiOjE1ODQ5MzgzNjYsImV4cCI6MTU4NDk0MjI2NiwiX2NsYWltX25hbWVzIjp7Imdyb3VwcyI6InNyYzEifSwiX2NsYWltX3NvdXJjZXMiOnsic3JjMSI6eyJlbmRwb2ludCI6Imh0dHBzOi8vZ3Jhc