Skip to content

Commit

Permalink
using custom image
Browse files (browse the repository at this point in the history)
  • Loading branch information
Wolfgang Pauli committed May 4, 2019
1 parent c353638 commit 0628959
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 18 deletions.
61 changes: 46 additions & 15 deletions pipelines_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,22 @@
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
# from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import CondaDependencies, RunConfiguration
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.pipeline.steps import HyperDriveStep
from azureml.pipeline.core import PublishedPipeline
from azureml.train.hyperdrive import choice, loguniform
from azureml.train.dnn import TensorFlow
# from azureml.train.dnn import TensorFlow
from azureml.train.estimator import Estimator
from azure.storage.blob import BlockBlobService
from azureml.core.runconfig import DEFAULT_GPU_IMAGE, DEFAULT_CPU_IMAGE
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core import ScriptRunConfig
from azureml.core.container_registry import ContainerRegistry

from azureml.core import VERSION
print("azureml.core.VERSION", VERSION)

def build_pipeline(dataset, ws, config):
print("building pipeline for dataset %s in workspace %s" % (dataset, ws.name))
Expand Down Expand Up @@ -48,7 +53,7 @@ def build_pipeline(dataset, ws, config):
try:
cpu_compute_target = AmlCompute(ws, cpu_compute_name)
print("found existing compute target: %s" % cpu_compute_name)
except ComputeTargetException:
except:# ComputeTargetException:
print("creating new compute target")

provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
Expand All @@ -66,10 +71,10 @@ def build_pipeline(dataset, ws, config):
try:
gpu_compute_target = AmlCompute(workspace=ws, name=gpu_compute_name)
print("found existing compute target: %s" % gpu_compute_name)
except ComputeTargetException:
except: # ComputeTargetException:
print('Creating a new compute target...')
provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',
max_nodes=5,
max_nodes=10,
idle_seconds_before_scaledown=1800)

# create the cluster
Expand All @@ -84,7 +89,7 @@ def build_pipeline(dataset, ws, config):

# conda dependencies for compute targets
cpu_cd = CondaDependencies.create(conda_packages=["py-opencv=3.4.2"], pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D', pip_packages=["azure-storage-blob==1.5.0", "hickle==3.4.3", "requests==2.21.0", "sklearn", "pandas==0.24.2", "azureml-sdk", "numpy==1.16.2", "pillow==6.0.0"])
# gpu_cd = CondaDependencies.create(pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D', pip_packages=["azureml-sdk", "horovod==0.13.5", "keras==2.0.8", "theano==1.0.4", "tensorflow==1.8.0", "tensorflow-gpu==1.8.0", "hickle==3.4.3", "matplotlib==3.0.3", "seaborn==0.9.0", "requests==2.21.0", "bs4==0.0.1", "imageio==2.5.0", "sklearn", "pandas==0.24.2", "azureml-sdk", "numpy==1.16.2"])
# gpu_cd = CondaDependencies(".", "conda_dependencies.yml")# .create(pip_indexurl='https://azuremlsdktestpypi.azureedge.net/sdk-release/Candidate/604C89A437BA41BD942B4F46D9A3591D', pip_packages=["azureml-sdk", "horovod==0.13.5", "keras==2.0.8", "theano==1.0.4", "tensorflow==1.8.0", "tensorflow-gpu==1.8.0", "hickle==3.4.3", "matplotlib==3.0.3", "seaborn==0.9.0", "requests==2.21.0", "bs4==0.0.1", "imageio==2.5.0", "sklearn", "pandas==0.24.2", "azureml-sdk", "numpy==1.16.2"])

# Runconfigs
cpu_compute_run_config = RunConfiguration(conda_dependencies=cpu_cd)
Expand All @@ -93,6 +98,11 @@ def build_pipeline(dataset, ws, config):
cpu_compute_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
cpu_compute_run_config.environment.spark.precache_packages = False


# run_config = RunConfiguration.load('.','gpu')

# script_run_config = ScriptRunConfig(run_config = run_config, source_directory='./scripts')

# gpu_compute_run_config = RunConfiguration(conda_dependencies=gpu_cd)
# gpu_compute_run_config.environment.docker.enabled = True
# gpu_compute_run_config.environment.docker.gpu_support = True
Expand Down Expand Up @@ -149,16 +159,37 @@ def build_pipeline(dataset, ws, config):
print("data_prep created")


# est = TensorFlow(source_directory=script_folder,

acr = ContainerRegistry()
acr.address = config['acr_address']
acr.username = config['acr_username']
acr.password = config['acr_password']
# {'address':config['acr_address'], 'username':config['acr_username'], 'password':config['acr_password']})
# acr = ContainerRegistry(address=config['acr_address'], username=config['acr_username'], password=config['acr_password'])
# acr = ContainerRegistry(config['acr_address'], config['acr_username'], config['acr_password'])

est = Estimator(source_directory=script_folder,
compute_target=gpu_compute_target,
# pip_packages=["hickle==3.4.3"],#"horovod==0.13.5", 'keras==2.0.8', 'matplotlib', 'hickle'],
# conda_packages=["tensorflow==1.8.0", "tensorflow-gpu==1.8.0",]
entry_script='train.py',
use_gpu=True,
node_count=1,
custom_docker_image = "wopauli_1.8-gpu:4",
image_registry_details=acr,
user_managed=True
)

# est = Estimator(source_directory=script_folder,
# compute_target=gpu_compute_target,
# pip_packages=['keras==2.0.8', 'theano', 'tensorflow==1.8.0', 'tensorflow-gpu==1.8.0', 'matplotlib', 'horovod==0.13.5', 'hickle'],
# # pip_packages=["horovod==0.13.5", 'keras==2.0.8', 'matplotlib', 'hickle'],
# # conda_packages=["tensorflow==1.8.0", "tensorflow-gpu==1.8.0",]
# entry_script='train.py',
# use_gpu=True,
# node_count=1)

run_config = RunConfiguration.load('.','gpu')

script_run_config = ScriptRunConfig(run_config = run_config, source_directory='./scripts')
# node_count=1,
# custom_docker_image = "prednetws0551092507.azurecr.io/wopauli_1.8-gpu:2",
# user_managed=True
# )

ps = RandomParameterSampling(
{
Expand All @@ -167,14 +198,14 @@ def build_pipeline(dataset, ws, config):
'--stack_sizes': choice("48, 96, 192", "36, 72, 144", "12, 24, 48"), #, "48, 96"),
'--learning_rate': loguniform(-6, -1),
'--lr_decay': loguniform(-9, -1),
'--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "1", "2", "3"),
'--freeze_layers': choice("0, 1, 2", "1, 2, 3", "0, 1", "1, 2", "2, 3", "0", "3"),
'--transfer_learning': choice("True", "False")
}
)

policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1, delay_evaluation=20)

hdc = HyperDriveConfig(run_config=script_run_config,
hdc = HyperDriveConfig(estimator=est, # run_config=script_run_config,
hyperparameter_sampling=ps,
policy=policy,
primary_metric_name='val_loss',
Expand Down
9 changes: 6 additions & 3 deletions pipelines_master.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from azureml.core.compute import AmlCompute
from azureml.core.runconfig import DEFAULT_GPU_IMAGE, DEFAULT_CPU_IMAGE
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
# from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import CondaDependencies, RunConfiguration
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveRunConfig, PrimaryMetricGoal
from azureml.pipeline.steps import HyperDriveStep
Expand All @@ -22,6 +22,9 @@
import json


from azureml.core import VERSION
print("azureml.core.VERSION", VERSION)

base_dir = '.'

config_json = os.path.join(base_dir, 'config.json')
Expand Down Expand Up @@ -52,7 +55,7 @@
try:
cpu_compute_target = AmlCompute(ws, cpu_compute_name)
print("found existing compute target: %s" % cpu_compute_name)
except ComputeTargetException:
except: # ComputeTargetException:
print("creating new compute target")

provisioning_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
Expand Down Expand Up @@ -84,7 +87,7 @@
shutil.copy(os.path.join(base_dir, 'data_preparation.py'), script_folder)
shutil.copy(os.path.join(base_dir, 'model_registration.py'), script_folder)
shutil.copy(os.path.join(base_dir, 'config.json'), script_folder)
shutil.copytree(os.path.join(base_dir, '.azureml'), os.path.join(script_folder, '.azureml'))
# shutil.copytree(os.path.join(base_dir, '.azureml'), os.path.join(script_folder, '.azureml'))

hash_paths = os.listdir(script_folder)

Expand Down
3 changes: 3 additions & 0 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
'''


print("loading standard modules")
import os
import numpy as np
Expand Down Expand Up @@ -92,6 +93,8 @@ def str2bool(v):
# initiate logging if we are running remotely
if remote_execution:
print("Running on remote compute target:", remote_execution)
from azureml.core import VERSION
print("azureml.core.VERSION", VERSION)
from azureml.core import Run

# start an Azure ML run
Expand Down

0 comments on commit 0628959

Please sign in to comment.