diff --git a/.env.example b/.env.example
index b7d0d5b5..2f62dad7 100644
--- a/.env.example
+++ b/.env.example
@@ -45,4 +45,8 @@ IMAGE_VERSION = ''
 # ACI Config
 ACI_CPU_CORES = ''
 ACI_MEM_GB = ''
-ACI_DESCRIPTION = ''
\ No newline at end of file
+ACI_DESCRIPTION = ''
+
+# Optional. Used by the training pipelines with R on Databricks
+DB_CLUSTER_ID = ''
+DATABRICKS_COMPUTE_NAME = ''
\ No newline at end of file
diff --git a/.pipelines/azdo-ci-build-train.yml b/.pipelines/azdo-ci-build-train.yml
index 1b34b892..efecb7f8 100644
--- a/.pipelines/azdo-ci-build-train.yml
+++ b/.pipelines/azdo-ci-build-train.yml
@@ -15,17 +15,36 @@ variables:
 steps:
+- template: azdo-base-pipeline.yml
+
 - bash: |
-   # Invoke the Python script that builds and publishes a training pipeline
+   # Invoke the Python script that builds and publishes a training pipeline (Python on ML Compute)
    python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline.py
   failOnStderr: 'false'
   env:
     SP_APP_SECRET: '$(SP_APP_SECRET)'
-  displayName: 'Publish Azure Machine Learning Pipeline'
+  displayName: 'Publish Azure Machine Learning Pipeline (Python on ML Compute)'
   enabled: 'true'
 
+- bash: |
+   # Invoke the Python script that builds and publishes a training pipeline (R on ML Compute)
+   python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline_with_r.py
+  failOnStderr: 'false'
+  env:
+    SP_APP_SECRET: '$(SP_APP_SECRET)'
+  displayName: 'Publish Azure Machine Learning Pipeline (R on ML Compute)'
+  enabled: 'false'
+
+- bash: |
+   # Invoke the Python script that builds and publishes a training pipeline (R on Databricks)
+   python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py
+  failOnStderr: 'false'
+  env:
+    SP_APP_SECRET: '$(SP_APP_SECRET)'
+  displayName: 'Publish Azure Machine Learning Pipeline (R on Databricks)'
+  enabled: 'false'
+
 - task: CopyFiles@2
   displayName: 'Copy Files to: $(Build.ArtifactStagingDirectory)'
   inputs:
diff --git a/code/training/R/r_train.r b/code/training/R/r_train.r
new file mode 100644
index 00000000..c19a58be
--- /dev/null
+++ b/code/training/R/r_train.r
@@ -0,0 +1,41 @@
+print(R.version.string)
+
+# COMMAND ----------
+
+path <- "weight_data.csv"
+print(paste("Reading file from", path))
+
+routes <- read.csv(path, header = TRUE)
+
+# The predictor vector (height).
+x <- routes$height
+# The response vector (weight).
+y <- routes$weight
+# Apply the lm() function.
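+# lm(y ~ x) fits an ordinary least squares line: predicted weight = intercept + slope * height.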
+model <- lm(y ~ x)
+
+# COMMAND ----------
+
+routes
+
+# COMMAND ----------
+
+# Make predictions for two new heights
+df_test_heights <- data.frame(x = as.numeric(c(115, 20)))
+result <- predict(model, df_test_heights)
+print(result)
+
+# COMMAND ----------
+
+# Save the model as an RDS file in the working directory
+model_path <- "model.rds"
+saveRDS(model, model_path)
+
+# COMMAND ----------
+
+# View model details
+print(model)
+
+# COMMAND ----------
+
+print("Completed")
\ No newline at end of file
diff --git a/code/training/R/train_with_r.py b/code/training/R/train_with_r.py
new file mode 100644
index 00000000..e2c7d295
--- /dev/null
+++ b/code/training/R/train_with_r.py
@@ -0,0 +1,3 @@
+import os
+
+os.system("Rscript r_train.r && ls -ltr model.rds")
diff --git a/code/training/R/train_with_r_on_databricks.py b/code/training/R/train_with_r_on_databricks.py
new file mode 100644
index 00000000..1a120bd0
--- /dev/null
+++ b/code/training/R/train_with_r_on_databricks.py
@@ -0,0 +1,15 @@
+import os
+import argparse
+
+parser = argparse.ArgumentParser("train")
+parser.add_argument(
+    "--AZUREML_SCRIPT_DIRECTORY_NAME",
+    type=str,
+    help="DBFS folder holding the training scripts",
+)
+
+args, unknown = parser.parse_known_args()
+folder = args.AZUREML_SCRIPT_DIRECTORY_NAME
+
+os.system("cd " + "/dbfs/" + folder
+          + " && Rscript r_train.r && ls -ltr model.rds")
diff --git a/code/training/R/weight_data.csv b/code/training/R/weight_data.csv
new file mode 100644
index 00000000..cc441ee9
--- /dev/null
+++ b/code/training/R/weight_data.csv
@@ -0,0 +1,30 @@
+height,weight
+79,174
+63,250
+75,223
+75,130
+70,120
+76,239
+63,129
+64,185
+59,246
+80,241
+79,217
+65,212
+74,242
+71,223
+61,167
+78,148
+75,229
+75,116
+75,182
+72,237
+72,160
+79,169
+67,219
+61,202
+65,168
+79,181
+81,214
+78,216
+59,245
diff --git a/docs/code_description.md b/docs/code_description.md
index d93ef077..bd094ce0 100644
--- a/docs/code_description.md
+++ b/docs/code_description.md
@@ -20,8 +20,10 @@
 
 ### ML Services
 
-- `ml_service/pipelines/build_train_pipeline.py` : builds and publishes an ML training pipeline.
-- `ml_service/pipelines/run_train_pipeline.py` : invokes a published ML training pipeline via REST API.
+- `ml_service/pipelines/build_train_pipeline.py` : builds and publishes an ML training pipeline. It uses Python on ML Compute.
+- `ml_service/pipelines/build_train_pipeline_with_r.py` : builds and publishes an ML training pipeline. It uses R on ML Compute.
+- `ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py` : builds and publishes an ML training pipeline. It uses R on Databricks Compute.
+- `ml_service/pipelines/run_train_pipeline.py` : invokes a published ML training pipeline (Python on ML Compute) via REST API.
 - `ml_service/util` : contains common utility functions used to build and publish an ML training pipeline.
 
 ### Code
@@ -29,6 +31,10 @@
 - `code/training/train.py` : a training step of an ML training pipeline.
 - `code/evaluate/evaluate_model.py` : an evaluating step of an ML training pipeline which registers a new trained model if evaluation shows the new model is more performant than the previous one.
 - `code/evaluate/register_model.py` : (LEGACY) registers a new trained model if evaluation shows the new model is more performant than the previous one.
+- `code/training/R/r_train.r` : trains a model with R on a sample dataset (weight_data.csv).
+- `code/training/R/train_with_r.py` : a Python wrapper (ML pipeline step) that invokes the R training script on ML Compute.
+- `code/training/R/train_with_r_on_databricks.py` : a Python wrapper (ML pipeline step) that invokes the R training script on Databricks Compute.
+- `code/training/R/weight_data.csv` : a sample dataset used by the R script (r_train.r) to train the model.
 
 ### Scoring
 - code/scoring/score.py : a scoring script which is about to be packed into a Docker Image along with a model while being deployed to QA/Prod environment.
diff --git a/docs/getting_started.md b/docs/getting_started.md
index ffe175b9..2d041de8 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -33,23 +33,25 @@ Please name your variable group **``devopsforai-aml-vg``** as we are using this
 
 The variable group should contain the following variables:
 
-| Variable Name               | Suggested Value              |
-| --------------------------- | ---------------------------- |
-| AML_COMPUTE_CLUSTER_CPU_SKU | STANDARD_DS2_V2              |
-| AML_COMPUTE_CLUSTER_NAME    | train-cluster                |
-| BASE_NAME                   | [unique base name]           |
-| EVALUATE_SCRIPT_PATH        | evaluate/evaluate_model.py   |
-| EXPERIMENT_NAME             | mlopspython                  |
-| LOCATION                    | centralus                    |
-| MODEL_NAME                  | sklearn_regression_model.pkl |
-| REGISTER_SCRIPT_PATH        | register/register_model.py   |
-| SOURCES_DIR_TRAIN           | code                         |
-| SP_APP_ID                   |                              |
-| SP_APP_SECRET               |                              |
-| SUBSCRIPTION_ID             |                              |
-| TENANT_ID                   |                              |
-| TRAIN_SCRIPT_PATH           | training/train.py            |
-| TRAINING_PIPELINE_NAME      | training-pipeline            |
+| Variable Name               | Suggested Value                     |
+| --------------------------- | ----------------------------------- |
+| AML_COMPUTE_CLUSTER_CPU_SKU | STANDARD_DS2_V2                     |
+| AML_COMPUTE_CLUSTER_NAME    | train-cluster                       |
+| BASE_NAME                   | [unique base name]                  |
+| DB_CLUSTER_ID               | [Optional Databricks cluster Id]    |
+| DATABRICKS_COMPUTE_NAME     | [Optional Databricks compute name]  |
+| EVALUATE_SCRIPT_PATH        | evaluate/evaluate_model.py          |
+| EXPERIMENT_NAME             | mlopspython                         |
+| LOCATION                    | centralus                           |
+| MODEL_NAME                  | sklearn_regression_model.pkl        |
+| REGISTER_SCRIPT_PATH        | register/register_model.py          |
+| SOURCES_DIR_TRAIN           | code                                |
+| SP_APP_ID                   |                                     |
+| SP_APP_SECRET               |                                     |
+| SUBSCRIPTION_ID             |                                     |
+| TENANT_ID                   |                                     |
+| TRAIN_SCRIPT_PATH           | training/train.py                   |
+| TRAINING_PIPELINE_NAME      | training-pipeline                   |
 
 Mark **SP_APP_SECRET** variable as a secret one.
@@ -108,6 +110,8 @@ and checkout a published training pipeline in the **mlops-AML-WS** workspace in
 
 Great, you now have the build pipeline setup, you can either manually trigger it or it gets automatically triggered everytime there is a change in the master branch. The pipeline performs linting, unit testing, builds and publishes an **ML Training Pipeline** in an **ML Workspace**
 
+**Note:** The build pipeline contains disabled steps that build and publish ML training pipelines which use R. Enable these steps if you want to try this approach. For the pipeline that trains a model with R on Databricks, you must manually create a Databricks cluster and attach it to the ML Workspace as a compute target (the DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables must be set).
+
 ### 7. Train the Model
 
 The next step is to invoke the training pipeline created in the previous step. It can be done with a **Release Pipeline**.
Click on the Pipelines/Releases menu, and then **New pipeline**, and then click on "Empty Job" on the "Select a template" window that pops to the right:
diff --git a/environment_setup/Dockerfile b/environment_setup/Dockerfile
index b6b3be6a..5e7b7581 100644
--- a/environment_setup/Dockerfile
+++ b/environment_setup/Dockerfile
@@ -8,6 +8,7 @@ LABEL org.label-schema.vendor = "Microsoft" \
 
 COPY environment_setup/requirements.txt /setup/
 
-RUN apt-get update && apt-get install gcc -y && pip install --upgrade -r /setup/requirements.txt
+RUN apt-get update && apt-get install gcc -y && pip install --upgrade -r /setup/requirements.txt && \
+    conda install -y -c r r-essentials
 
 CMD ["python"]
\ No newline at end of file
diff --git a/environment_setup/requirements.txt b/environment_setup/requirements.txt
index 23880c0c..8a086c4d 100644
--- a/environment_setup/requirements.txt
+++ b/environment_setup/requirements.txt
@@ -1,5 +1,6 @@
 pytest==4.3.0
 requests>=2.22
+azureml>=0.2
 azureml-sdk>=1.0
 python-dotenv>=0.10.3
 flake8
diff --git a/ml_service/pipelines/build_train_pipeline_with_r.py b/ml_service/pipelines/build_train_pipeline_with_r.py
new file mode 100644
index 00000000..7eae2c98
--- /dev/null
+++ b/ml_service/pipelines/build_train_pipeline_with_r.py
@@ -0,0 +1,78 @@
+from azureml.pipeline.steps import PythonScriptStep
+from azureml.pipeline.core import Pipeline  # , PipelineData
+from azureml.core.runconfig import RunConfiguration, CondaDependencies
+# from azureml.core import Datastore
+import os
+import sys
+from dotenv import load_dotenv
+sys.path.append(os.path.abspath("./ml_service/util"))  # NOQA: E402
+from workspace import get_workspace
+from attach_compute import get_compute
+
+
+def main():
+    load_dotenv()
+    workspace_name = os.environ.get("BASE_NAME")+"-AML-WS"
+    resource_group = os.environ.get("BASE_NAME")+"-AML-RG"
+    subscription_id = os.environ.get("SUBSCRIPTION_ID")
+    tenant_id = os.environ.get("TENANT_ID")
+    app_id = os.environ.get("SP_APP_ID")
+    app_secret = os.environ.get("SP_APP_SECRET")
+    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
+    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME")
+    build_id = os.environ.get("BUILD_BUILDID")
+    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
+
+    # Get the Azure Machine Learning workspace
+    aml_workspace = get_workspace(
+        workspace_name,
+        resource_group,
+        subscription_id,
+        tenant_id,
+        app_id,
+        app_secret)
+    print(aml_workspace)
+
+    # Get the Azure Machine Learning compute cluster
+    aml_compute = get_compute(
+        aml_workspace,
+        compute_name,
+        vm_size)
+    if aml_compute is not None:
+        print(aml_compute)
+
+    run_config = RunConfiguration(conda_dependencies=CondaDependencies.create(
+        conda_packages=['numpy', 'pandas',
+                        'scikit-learn', 'tensorflow', 'keras'],
+        pip_packages=['azure', 'azureml-core',
+                      'azure-storage',
+                      'azure-storage-blob'])
+    )
+    run_config.environment.docker.enabled = True
+    run_config.environment.docker.base_image = "mcr.microsoft.com/mlops/python"
+
+    train_step = PythonScriptStep(
+        name="Train Model",
+        script_name="train_with_r.py",
+        compute_target=aml_compute,
+        source_directory="code/training/R",
+        runconfig=run_config,
+        allow_reuse=False,
+    )
+    print("Step Train created")
+
+    steps = [train_step]
+
+    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
+    train_pipeline.validate()
+    published_pipeline = train_pipeline.publish(
+        name=pipeline_name + "_with_R",
+        description="Model training/retraining pipeline",
+        version=build_id
+    )
+    print(f'Published pipeline: {published_pipeline.name}')
+    print(f'for build {published_pipeline.version}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py b/ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py
new file mode 100644
index 00000000..95de9e55
--- /dev/null
+++ b/ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py
@@ -0,0 +1,70 @@
+from azureml.pipeline.core import Pipeline
+import os
+import sys
+from dotenv import load_dotenv
+sys.path.append(os.path.abspath("./ml_service/util"))  # NOQA: E402
+from workspace import get_workspace
+from attach_compute import get_compute
+from azureml.pipeline.steps import DatabricksStep
+
+
+def main():
+    load_dotenv()
+    workspace_name = os.environ.get("BASE_NAME")+"-AML-WS"
+    resource_group = os.environ.get("BASE_NAME")+"-AML-RG"
+    subscription_id = os.environ.get("SUBSCRIPTION_ID")
+    tenant_id = os.environ.get("TENANT_ID")
+    app_id = os.environ.get("SP_APP_ID")
+    app_secret = os.environ.get("SP_APP_SECRET")
+    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_CPU_SKU")
+    compute_name = os.environ.get("DATABRICKS_COMPUTE_NAME")
+    db_cluster_id = os.environ.get("DB_CLUSTER_ID")
+    build_id = os.environ.get("BUILD_BUILDID")
+    pipeline_name = os.environ.get("TRAINING_PIPELINE_NAME")
+
+    # Get the Azure Machine Learning workspace
+    aml_workspace = get_workspace(
+        workspace_name,
+        resource_group,
+        subscription_id,
+        tenant_id,
+        app_id,
+        app_secret)
+    print(aml_workspace)
+
+    # Get the Databricks compute target attached to the workspace
+    aml_compute = get_compute(
+        aml_workspace,
+        compute_name,
+        vm_size)
+    if aml_compute is not None:
+        print(aml_compute)
+
+    train_step = DatabricksStep(
+        name="DBPythonInLocalMachine",
+        num_workers=1,
+        python_script_name="train_with_r_on_databricks.py",
+        source_directory="code/training/R",
+        run_name='DB_Python_R_demo',
+        existing_cluster_id=db_cluster_id,
+        compute_target=aml_compute,
+        allow_reuse=False
+    )
+
+    print("Step Train created")
+
+    steps = [train_step]
+
+    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
+    train_pipeline.validate()
+    published_pipeline = train_pipeline.publish(
+        name=pipeline_name + "_with_R_on_DB",
+        description="Model training/retraining pipeline",
+        version=build_id
+    )
+    print(f'Published pipeline: {published_pipeline.name}')
+    print(f'for build {published_pipeline.version}')
+
+
+if __name__ == '__main__':
+    main()
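
For the manual step called out in the `docs/getting_started.md` note above (attaching a Databricks cluster to the ML Workspace as a compute target), the following is a minimal sketch using the `azureml-sdk`, not part of this change itself. It assumes the same `.env` variables used by the build scripts; the Databricks resource group, workspace name, and access token are placeholders you must supply.

```python
# Sketch: attach an existing Databricks workspace to the AML workspace as a
# compute target named DATABRICKS_COMPUTE_NAME, so DatabricksStep can use it.
import os

from azureml.core import Workspace
from azureml.core.compute import ComputeTarget, DatabricksCompute

ws = Workspace.get(
    name=os.environ.get("BASE_NAME") + "-AML-WS",
    subscription_id=os.environ.get("SUBSCRIPTION_ID"),
    resource_group=os.environ.get("BASE_NAME") + "-AML-RG",
)

compute_name = os.environ.get("DATABRICKS_COMPUTE_NAME")
if compute_name in ws.compute_targets:
    print(f"Compute target {compute_name} is already attached")
else:
    attach_config = DatabricksCompute.attach_configuration(
        resource_group="<databricks-resource-group>",  # placeholder
        workspace_name="<databricks-workspace-name>",  # placeholder
        access_token="<databricks-access-token>",      # placeholder
    )
    databricks_compute = ComputeTarget.attach(ws, compute_name, attach_config)
    databricks_compute.wait_for_completion(show_output=True)
```

Once attached, set DB_CLUSTER_ID to the Id of a cluster in that Databricks workspace and enable the corresponding build step.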