Merged
Changes from all 47 commits
97d7ff3
build_train to use build vs release
dtzar Nov 18, 2019
1d374b8
retrieve pipeline id, agentless task to run
dtzar Nov 18, 2019
4353c9f
refactor register model
dtzar Nov 18, 2019
1bca186
add model helper util
dtzar Nov 18, 2019
53f45d8
refactor evaluate script
dtzar Nov 18, 2019
4b036a1
CI cleanup local dev
dtzar Nov 18, 2019
738f037
Fix linting
dtzar Nov 19, 2019
0855beb
temp disable unit test
dtzar Nov 19, 2019
74689be
replace base_name var
dtzar Nov 19, 2019
d83338b
add registration validation logic
dtzar Nov 19, 2019
8d4ff0f
Workspace svc connection to var
dtzar Nov 19, 2019
976c869
Add new env vars to example
dtzar Nov 19, 2019
eb20d03
fix build_id flow
dtzar Nov 19, 2019
390acad
use buildID vs release, tag vs properties
dtzar Nov 19, 2019
d2e650a
fix lint
dtzar Nov 19, 2019
c8a897f
fix tagging syntax
dtzar Nov 19, 2019
88318bc
local agent, eval tweaks
dtzar Nov 19, 2019
e34a7f7
local no container test
dtzar Nov 19, 2019
76ec635
Revert to hosted agents
dtzar Nov 19, 2019
0962627
move dotenv import under condition
dtzar Nov 19, 2019
5511ac5
fix run_id logic
dtzar Nov 19, 2019
43678c9
fix parent.id
dtzar Nov 19, 2019
7842091
move model helper
dtzar Nov 19, 2019
f94823d
log to train also, fix run_id
dtzar Nov 19, 2019
2bcd6c9
fix paths, exp name
dtzar Nov 20, 2019
be6a66b
better name for validation step
dtzar Nov 20, 2019
9d2a840
fix lint
dtzar Nov 20, 2019
98eebf1
upload file to parent run id
dtzar Nov 20, 2019
ce36d76
fail pipeline if exception thrown to reg
dtzar Nov 20, 2019
2462e33
fix tag suffix
dtzar Nov 20, 2019
8ab1d34
Merge branch 'master' into evalregrefresh
dtzar Nov 20, 2019
5fb45a8
aml workspace svc cleanup
dtzar Nov 20, 2019
156f28e
add register script path
dtzar Nov 20, 2019
0639646
fix firstRegistration var
dtzar Nov 20, 2019
9bfa9c2
use public AzureML task
dtzar Nov 20, 2019
60705db
Fix trigger to delete environment
dtzar Nov 20, 2019
2b8edd5
debugging
Nov 21, 2019
9619bd9
debugging
Nov 21, 2019
f64f2ca
Debugging
Nov 21, 2019
bf32389
Debugging
Nov 21, 2019
e0c65bd
debugging
Nov 21, 2019
cd1da4f
debugging
Nov 21, 2019
53f9aad
Debugging
Nov 21, 2019
eff9483
Linting
Nov 21, 2019
eada62f
Switch build status badge to aidemos AzDO
dtzar Nov 21, 2019
85ed8ad
Merge branch 'evalregrefresh' of github.com:microsoft/MLOpsPython int…
dtzar Nov 21, 2019
d652f87
consistent exception handling
dtzar Nov 21, 2019
8 changes: 4 additions & 4 deletions .env.example
@@ -7,13 +7,13 @@ SP_APP_ID = ''
 SP_APP_SECRET = ''
 RESOUCE_GROUP = 'mlops-rg'
 
-# Mock build/release ID for local testing - update ReleaseID each "release"
+# Mock build/release ID for local testing
 BUILD_BUILDID = '001'
 RELEASE_RELEASEID = '001'
 
 # Azure ML Workspace Variables
-WORKSPACE_NAME = ''
+WORKSPACE_NAME = 'aml-workspace'
 EXPERIMENT_NAME = ''
 SCRIPT_FOLDER = './'
 
 # AML Compute Cluster Config
 AML_COMPUTE_CLUSTER_NAME = 'train-cluster'
@@ -36,4 +36,4 @@ SOURCES_DIR_TRAIN = 'code'
 DB_CLUSTER_ID = ''
 
 # Optional. Container Image name for image creation
-IMAGE_NAME = 'ml-trained'
+IMAGE_NAME = 'mltrained'
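For context, scripts in this PR pick these values up through python-dotenv when run outside Azure DevOps. A minimal sketch of that local-development path, mirroring the load_dotenv() pattern used in code/evaluate/evaluate_model.py below (the '000' fallback is an illustrative assumption, not repo behavior):

# Sketch: how values from a .env file (copied from .env.example) reach a
# script during local runs. Assumes python-dotenv is installed.
import os

from dotenv import load_dotenv

load_dotenv()  # reads key/value pairs from a local .env file

# BUILD_BUILDID mocks the Azure DevOps build number for local testing;
# the '000' default here is an assumption for illustration only.
build_id = os.environ.get("BUILD_BUILDID", "000")
workspace_name = os.environ.get("WORKSPACE_NAME")
print(f"Using workspace {workspace_name}, mock build {build_id}")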
40 changes: 35 additions & 5 deletions .pipelines/azdo-ci-build-train.yml
@@ -21,7 +21,7 @@ stages:
   jobs:
   - job: "Model_CI_Pipeline"
     displayName: "Model CI Pipeline"
-    pool:
+    pool:
       vmImage: 'ubuntu-latest'
     container: mcr.microsoft.com/mlops/python:latest
     timeoutInMinutes: 0
@@ -37,17 +37,47 @@
 - stage: 'Trigger_AML_Pipeline'
   displayName: 'Train, evaluate, register model via previously published AML pipeline'
   jobs:
-  - job: "Invoke_Model_Pipeline"
+  - job: "Get_Pipeline_ID"
     condition: and(succeeded(), eq(coalesce(variables['auto-trigger-training'], 'true'), 'true'))
-    displayName: "Invoke Model Pipeline and evaluate results to register"
-    pool:
+    displayName: "Get Pipeline ID for execution"
+    pool:
       vmImage: 'ubuntu-latest'
     container: mcr.microsoft.com/mlops/python:latest
     timeoutInMinutes: 0
     steps:
     - script: |
-        python $(Build.SourcesDirectory)/ml_service/pipelines/run_train_pipeline.py
-      displayName: 'Trigger Training Pipeline'
+        source $(Build.SourcesDirectory)/tmp.sh
+        echo "##vso[task.setvariable variable=AMLPIPELINEID;isOutput=true]$AMLPIPELINE_ID"
+      name: 'getpipelineid'
+      displayName: 'Get Pipeline ID'
+      env:
+        SP_APP_SECRET: '$(SP_APP_SECRET)'
+  - job: "Run_ML_Pipeline"
+    dependsOn: "Get_Pipeline_ID"
+    displayName: "Trigger ML Training Pipeline"
+    pool: server
+    variables:
+      AMLPIPELINE_ID: $[ dependencies.Get_Pipeline_ID.outputs['getpipelineid.AMLPIPELINEID'] ]
+    steps:
+    - task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
+      displayName: 'Invoke ML pipeline'
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        PipelineId: '$(AMLPIPELINE_ID)'
+        ExperimentName: '$(EXPERIMENT_NAME)'
+        PipelineParameters: '"model_name": "sklearn_regression_model.pkl"'
+  - job: "Training_Run_Report"
+    dependsOn: "Run_ML_Pipeline"
+    displayName: "Determine if evaluation succeeded and new model is registered"
+    pool:
+      vmImage: 'ubuntu-latest'
+    container: mcr.microsoft.com/mlops/python:latest
+    timeoutInMinutes: 0
+    steps:
+    - script: |
+        python $(Build.SourcesDirectory)/code/register/register_model.py --build_id $(Build.BuildId) --validate True
+      displayName: 'Check if new model registered'
+      env:
+        SP_APP_SECRET: '$(SP_APP_SECRET)'
     - task: CopyFiles@2
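The handoff between these jobs works through an Azure DevOps output variable: the Get_Pipeline_ID script job echoes a ##vso[task.setvariable ...;isOutput=true] logging command, and the agentless (pool: server) Run_ML_Pipeline job maps it in via dependencies...outputs. The contents of tmp.sh are not part of this diff, so the following is only a hedged Python sketch of what such a lookup step might do, assuming the azureml-sdk's PublishedPipeline.list and a TRAINING_PIPELINE_NAME variable like the one defined in .pipelines/azdo-variables.yml:

# Hypothetical sketch of a "get pipeline ID" step; tmp.sh's real logic is
# not shown in this PR, so treat the names below as assumptions.
import os

from azureml.core import Workspace
from azureml.pipeline.core import PublishedPipeline

ws = Workspace.from_config()  # assumes a workspace config file is available
target_name = os.environ.get("TRAINING_PIPELINE_NAME", "Training-Pipeline")

# Find a published pipeline with the expected name.
matches = [p for p in PublishedPipeline.list(ws) if p.name == target_name]
if not matches:
    raise LookupError(f"No published pipeline named {target_name}")

# Emit the Azure DevOps logging command that downstream jobs read via
# dependencies.Get_Pipeline_ID.outputs['getpipelineid.AMLPIPELINEID'].
print(f"##vso[task.setvariable variable=AMLPIPELINEID;isOutput=true]{matches[0].id}")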
4 changes: 2 additions & 2 deletions .pipelines/azdo-variables.yml
@@ -24,7 +24,7 @@ variables:
   value: '1'
 # AML Pipeline Config
 - name: TRAINING_PIPELINE_NAME
-  value: 'Training Pipeline'
+  value: 'Training-Pipeline'
 - name: MODEL_PATH
   value: ''
 - name: EVALUATE_SCRIPT_PATH
@@ -34,7 +34,7 @@
 - name: SOURCES_DIR_TRAIN
   value: code
 - name: IMAGE_NAME
-  value: ''
+  value: 'mltrained'
 # Optional. Used by a training pipeline with R on Databricks
 - name: DB_CLUSTER_ID
   value: ''
2 changes: 1 addition & 1 deletion README.md
@@ -12,7 +12,7 @@ description: "Code which demonstrates how to set up and operationalize an MLOps
 # MLOps with Azure ML
 
 
-[![Build Status](https://dev.azure.com/customai/DevopsForAI-AML/_apis/build/status/Build%20%26%20Train?branchName=master)](https://dev.azure.com/customai/DevopsForAI-AML/_build/latest?definitionId=34&branchName=master)
+[![Build Status](https://aidemos.visualstudio.com/MLOps/_apis/build/status/microsoft.MLOpsPython-CI?branchName=master)](https://aidemos.visualstudio.com/MLOps/_build/latest?definitionId=127&branchName=master)
 
 
 MLOps will help you to understand how to build the Continuous Integration and Continuous Delivery pipeline for a ML/AI project. We will be using the Azure DevOps Project for build and release/deployment pipelines along with Azure ML services for model retraining pipeline, model management and operationalization.
151 changes: 90 additions & 61 deletions code/evaluate/evaluate_model.py
@@ -24,90 +24,119 @@
 POSSIBILITY OF SUCH DAMAGE.
 """
 import os
-from azureml.core import Model, Run
+from azureml.core import Model, Run, Workspace, Experiment
 import argparse
+from azureml.core.authentication import ServicePrincipalAuthentication
+import traceback
 
 
-# Get workspace
 run = Run.get_context()
-exp = run.experiment
-ws = run.experiment.workspace
+if (run.id.startswith('OfflineRun')):
+    from dotenv import load_dotenv
+    # For local development, set values in this section
+    load_dotenv()
+    workspace_name = os.environ.get("WORKSPACE_NAME")
+    experiment_name = os.environ.get("EXPERIMENT_NAME")
+    resource_group = os.environ.get("RESOURCE_GROUP")
+    subscription_id = os.environ.get("SUBSCRIPTION_ID")
+    tenant_id = os.environ.get("TENANT_ID")
+    model_name = os.environ.get("MODEL_NAME")
+    app_id = os.environ.get('SP_APP_ID')
+    app_secret = os.environ.get('SP_APP_SECRET')
+    build_id = os.environ.get('BUILD_BUILDID')
+    service_principal = ServicePrincipalAuthentication(
+        tenant_id=tenant_id,
+        service_principal_id=app_id,
+        service_principal_password=app_secret)
+
+    aml_workspace = Workspace.get(
+        name=workspace_name,
+        subscription_id=subscription_id,
+        resource_group=resource_group,
+        auth=service_principal
+    )
+    ws = aml_workspace
+    exp = Experiment(ws, experiment_name)
+    run_id = "e78b2c27-5ceb-49d9-8e84-abe7aecf37d5"
+else:
+    exp = run.experiment
+    ws = run.experiment.workspace
+    run_id = 'amlcompute'
 
 parser = argparse.ArgumentParser("evaluate")
 parser.add_argument(
-    "--release_id",
+    "--build_id",
     type=str,
-    help="The ID of the release triggering this pipeline run",
+    help="The Build ID of the build triggering this pipeline run",
+)
+parser.add_argument(
+    "--run_id",
+    type=str,
+    help="Training run ID",
 )
 parser.add_argument(
     "--model_name",
     type=str,
     help="Name of the Model",
     default="sklearn_regression_model.pkl",
 )
-args = parser.parse_args()
-
-print("Argument 1: %s" % args.release_id)
-print("Argument 2: %s" % args.model_name)
+args = parser.parse_args()
+if (args.build_id is not None):
+    build_id = args.build_id
+if (args.run_id is not None):
+    run_id = args.run_id
+if (run_id == 'amlcompute'):
+    run_id = run.parent.id
 model_name = args.model_name
-release_id = args.release_id
+metric_eval = "mse"
+run.tag("BuildId", value=build_id)
 
-# Paramaterize the matrics on which the models should be compared
+# Paramaterize the matrices on which the models should be compared
 # Add golden data set on which all the model performance can be evaluated
 
-all_runs = exp.get_runs(
-    properties={"release_id": release_id, "run_type": "train"},
-    include_children=True
-)
-new_model_run = next(all_runs)
-new_model_run_id = new_model_run.id
-print(f'New Run found with Run ID of: {new_model_run_id}')
-
 try:
     # Get most recently registered model, we assume that
     # is the model in production.
     # Download this model and compare it with the recently
     # trained model by running test with same data set.
     model_list = Model.list(ws)
-    production_model = next(
-        filter(
-            lambda x: x.created_time == max(
-                model.created_time for model in model_list),
-            model_list,
-        )
-    )
-    production_model_run_id = production_model.tags.get("run_id")
-    run_list = exp.get_runs()
+    if (len(model_list) > 0):
+        production_model = next(
+            filter(
+                lambda x: x.created_time == max(
+                    model.created_time for model in model_list),
+                model_list,
+            )
+        )
+        production_model_run_id = production_model.run_id
 
-    # Get the run history for both production model and
-    # newly trained model and compare mse
-    production_model_run = Run(exp, run_id=production_model_run_id)
-    new_model_run = Run(exp, run_id=new_model_run_id)
+        # Get the run history for both production model and
+        # newly trained model and compare mse
+        production_model_run = Run(exp, run_id=production_model_run_id)
+        new_model_run = run.parent
+        print("Production model run is", production_model_run)
 
-    production_model_mse = production_model_run.get_metrics().get("mse")
-    new_model_mse = new_model_run.get_metrics().get("mse")
-    print(
-        "Current Production model mse: {}, New trained model mse: {}".format(
-            production_model_mse, new_model_mse
-        )
-    )
+        production_model_mse = \
+            production_model_run.get_metrics().get(metric_eval)
+        new_model_mse = new_model_run.get_metrics().get(metric_eval)
+        if (production_model_mse is None or new_model_mse is None):
+            print("Unable to find", metric_eval, "metrics, "
+                  "exiting evaluation")
+            run.parent.cancel()
+        else:
+            print(
+                "Current Production model mse: {}, "
+                "New trained model mse: {}".format(
+                    production_model_mse, new_model_mse
+                )
+            )
 
-    promote_new_model = False
-    if new_model_mse < production_model_mse:
-        promote_new_model = True
-        print("New trained model performs better, thus it will be registered")
+            if (new_model_mse < production_model_mse):
+                print("New trained model performs better, "
+                      "thus it should be registered")
+            else:
+                print("New trained model metric is less than or equal to "
+                      "production model so skipping model registration.")
+                run.parent.cancel()
+    else:
+        print("This is the first model, "
+              "thus it should be registered")
 except Exception:
-    promote_new_model = True
-    print("This is the first model to be trained, \
-          thus nothing to evaluate for now")
-
-
-# Writing the run id to /aml_config/run_id.json
-if promote_new_model:
-    model_path = os.path.join('outputs', model_name)
-    new_model_run.register_model(
-        model_name=model_name,
-        model_path=model_path,
-        properties={"release_id": release_id})
-    print("Registered new model!")
+    traceback.print_exc(limit=None, file=None, chain=True)
+    print("Something went wrong trying to evaluate. Exiting.")
+    raise
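Registration itself, and the --validate True check invoked by the Training_Run_Report job, live in code/register/register_model.py, which is outside this diff. Below is a hedged Python sketch of what such a validation pass could look like, assuming models are tagged with the triggering BuildId as the evaluate script's run.tag call suggests; the function and its behavior are assumptions, not the repo's actual implementation:

# Hypothetical validation sketch; register_model.py's real logic is not
# part of this diff. Assumes registered models carry a BuildId tag.
import sys

from azureml.core import Model, Workspace


def validate_registration(ws: Workspace, build_id: str) -> None:
    # Model.list supports filtering by [key, value] tag pairs.
    models = Model.list(ws, tags=[["BuildId", build_id]])
    if not models:
        # Fail the pipeline job when evaluation cancelled registration.
        sys.exit(f"No model registered for build {build_id}")
    print(f"Found {len(models)} model(s) for build {build_id}")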