using custom image

microsoft · May 4, 2019 · 0628959 · 0628959
1 parent c353638
commit 0628959
Show file tree

Hide file tree

Showing 3 changed files with 55 additions and 18 deletions.
diff --git a/pipelines_build.py b/pipelines_build.py
@@ -9,17 +9,22 @@
 from azureml.pipeline.steps import PythonScriptStep
 from azureml.core.compute import AmlCompute
 from azureml.core.compute import ComputeTarget
-from azureml.core.compute_target import ComputeTargetException
+# from azureml.core.compute_target import ComputeTargetException
 from azureml.core.runconfig import CondaDependencies, RunConfiguration
 from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
 from azureml.pipeline.steps import HyperDriveStep
 from azureml.pipeline.core import PublishedPipeline
 from azureml.train.hyperdrive import choice, loguniform
-from azureml.train.dnn import TensorFlow
+# from azureml.train.dnn import TensorFlow
+from azureml.train.estimator import Estimator
 from azure.storage.blob import BlockBlobService
 from azureml.core.runconfig import DEFAULT_GPU_IMAGE, DEFAULT_CPU_IMAGE
 from azureml.core.authentication import ServicePrincipalAuthentication
 from azureml.core import ScriptRunConfig
+from azureml.core.container_registry import ContainerRegistry
+
+from azureml.core import VERSION
+print("azureml.core.VERSION", VERSION)
 
 def build_pipeline(dataset, ws, config):
     print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))
@@ -48,7 +53,7 @@ def build_pipeline(dataset, ws, config):
     try:
         cpu_compute_target = AmlCompute(ws, cpu_compute_name)
         print("found existing compute target: %s" % cpu_compute_name)
-    except ComputeTargetException:
+    except:# ComputeTargetException:
         print("creating new compute target")
 
         provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', 
@@ -66,10 +71,10 @@ def build_pipeline(dataset, ws, config):
     try:
         gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
         print("found existing compute target: %s" % gpu_compute_name)
-    except ComputeTargetException:
+    except: # ComputeTargetException:
         print('Creating a new compute target...')
         provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', 
-                                                                    max_nodes=5,
+                                                                    max_nodes=10,
                                                                     idle_seconds_before_scaledown=1800)
 
         # create the cluster
@@ -84,7 +89,7 @@ def build_pipeline(dataset, ws, config):
 
     # conda dependencies for compute targets
     cpu_cd = CondaDependencies.create(conda_packages=["py-opencv=3.4.2"], pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D', pip_packages=["azure-storage-blob==1.5.0", "hickle==3.4.3", "requests==2.21.0", "sklearn", "pandas==0.24.2", "azureml-sdk", "numpy==1.16.2", "pillow==6.0.0"])
-    # gpu_cd = CondaDependencies.create(pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D', pip_packages=["azureml-sdk", "horovod==0.13.5", "keras==2.0.8", "theano==1.0.4", "tensorflow==1.8.0", "tensorflow-gpu==1.8.0", "hickle==3.4.3", "matplotlib==3.0.3", "seaborn==0.9.0", "requests==2.21.0", "bs4==0.0.1", "imageio==2.5.0", "sklearn", "pandas==0.24.2", "azureml-sdk", "numpy==1.16.2"])
+    # gpu_cd = CondaDependencies(".", "conda_dependencies.yml")# .create(pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D', pip_packages=["azureml-sdk", "horovod==0.13.5", "keras==2.0.8", "theano==1.0.4", "tensorflow==1.8.0", "tensorflow-gpu==1.8.0", "hickle==3.4.3", "matplotlib==3.0.3", "seaborn==0.9.0", "requests==2.21.0", "bs4==0.0.1", "imageio==2.5.0", "sklearn", "pandas==0.24.2", "azureml-sdk", "numpy==1.16.2"])
 
     # Runconfigs
     cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
@@ -93,6 +98,11 @@ def build_pipeline(dataset, ws, config):
     cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
     cpu_compute_run_config.environment.spark.precache_packages = False
 
+
+    # run_config = RunConfiguration.load('.','gpu')
+
+    # script_run_config = ScriptRunConfig(run_config = run_config, source_directory='./scripts')
+
     # gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
     # gpu_compute_run_config.environment.docker.enabled = True
     # gpu_compute_run_config.environment.docker.gpu_support = True
@@ -149,16 +159,37 @@ def build_pipeline(dataset, ws, config):
     print("data_prep created")
 
 
-    # est = TensorFlow(source_directory=script_folder,
+
+    acr = ContainerRegistry()
+    acr.address = config['acr_address']
+    acr.username = config['acr_username']
+    acr.password = config['acr_password']
+    # {'address':config['acr_address'], 'username':config['acr_username'], 'password':config['acr_password']})
+    # acr = ContainerRegistry(address=config['acr_address'], username=config['acr_username'], password=config['acr_password'])
+    # acr = ContainerRegistry(config['acr_address'], config['acr_username'], config['acr_password'])
+
+    est = Estimator(source_directory=script_folder,
+                    compute_target=gpu_compute_target,
+                    # pip_packages=["hickle==3.4.3"],#"horovod==0.13.5", 'keras==2.0.8', 'matplotlib', 'hickle'],
+                    # conda_packages=["tensorflow==1.8.0", "tensorflow-gpu==1.8.0",]
+                    entry_script='train.py', 
+                    use_gpu=True,
+                    node_count=1,
+                    custom_docker_image = "wopauli_1.8-gpu:4",
+                    image_registry_details=acr,
+                    user_managed=True
+                    )
+
+    # est = Estimator(source_directory=script_folder,
     #                 compute_target=gpu_compute_target,
-    #                 pip_packages=['keras==2.0.8', 'theano', 'tensorflow==1.8.0', 'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod==0.13.5', 'hickle'],
+    #                 # pip_packages=["horovod==0.13.5", 'keras==2.0.8', 'matplotlib', 'hickle'],
+    #                 # conda_packages=["tensorflow==1.8.0", "tensorflow-gpu==1.8.0",]
     #                 entry_script='train.py', 
     #                 use_gpu=True,
-    #                 node_count=1)
-
-    run_config = RunConfiguration.load('.','gpu')
-
-    script_run_config = ScriptRunConfig(run_config = run_config, source_directory='./scripts')
+    #                 node_count=1,
+    #                 custom_docker_image = "prednetws0551092507.azurecr.io/wopauli_1.8-gpu:2",
+    #                 user_managed=True
+    #                 )
 
     ps = RandomParameterSampling(
         {
@@ -167,14 +198,14 @@ def build_pipeline(dataset, ws, config):
             '--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"), #, "48, 96"),
             '--learning_rate': loguniform(-6, -1),
             '--lr_decay': loguniform(-9, -1),
-            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "1", "2", "3"),
+            '--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
             '--transfer_learning': choice("True", "False")
         }
     )
 
     policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=20)
 
-    hdc = HyperDriveConfig(run_config=script_run_config, 
+    hdc = HyperDriveConfig(estimator=est, # run_config=script_run_config, 
                             hyperparameter_sampling=ps, 
                             policy=policy, 
                             primary_metric_name='val_loss', 

diff --git a/pipelines_master.py b/pipelines_master.py
@@ -11,7 +11,7 @@
 from azureml.core.compute import AmlCompute
 from azureml.core.runconfig import DEFAULT_GPU_IMAGE, DEFAULT_CPU_IMAGE
 from azureml.core.compute import ComputeTarget
-from azureml.core.compute_target import ComputeTargetException
+# from azureml.core.compute_target import ComputeTargetException
 from azureml.core.runconfig import CondaDependencies, RunConfiguration
 from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveRunConfig, PrimaryMetricGoal
 from azureml.pipeline.steps import HyperDriveStep
@@ -22,6 +22,9 @@
 import json
 
 
+from azureml.core import VERSION
+print("azureml.core.VERSION", VERSION)
+
 base_dir = '.'
 
 config_json = os.path.join(base_dir, 'config.json')
@@ -52,7 +55,7 @@
 try:
         cpu_compute_target = AmlCompute(ws, cpu_compute_name)
         print("found existing compute target: %s" % cpu_compute_name)
-except ComputeTargetException:
+except: # ComputeTargetException:
     print("creating new compute target")
 
     provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', 
@@ -84,7 +87,7 @@
 shutil.copy(os.path.join(base_dir, 'data_preparation.py'), script_folder)
 shutil.copy(os.path.join(base_dir, 'model_registration.py'), script_folder)
 shutil.copy(os.path.join(base_dir, 'config.json'), script_folder)
-shutil.copytree(os.path.join(base_dir, '.azureml'), os.path.join(script_folder, '.azureml'))
+# shutil.copytree(os.path.join(base_dir, '.azureml'), os.path.join(script_folder, '.azureml'))
 
 hash_paths = os.listdir(script_folder)
 

diff --git a/train.py b/train.py
@@ -5,6 +5,7 @@
 
 '''
 
+
 print("loading standard modules")
 import os
 import numpy as np
@@ -92,6 +93,8 @@ def str2bool(v):
 # initiate logging if we are running remotely
 if remote_execution:
     print("Running on remote compute target:", remote_execution)
+    from azureml.core import VERSION
+    print("azureml.core.VERSION", VERSION)
     from azureml.core import Run
 
     # start an Azure ML run