# Get workspace and authenticate

In [4]:
import json
import azureml.core
from azureml.core import Workspace, Datastore, Dataset

# Record the installed azureml.core version alongside the notebook output.
print(azureml.core.VERSION)

1.10.0


In [5]:
# Resolve the workspace handle via Workspace.from_config() and echo its identifying fields.
# NOTE(review): the subscription id ends up in the saved cell output; clear it before sharing.
ws = Workspace.from_config()
print(f"{ws.name} {ws.subscription_id} {ws.resource_group}")

magaml 907c8efc-c2c8-4c49-a4e1-aeb880e10c88 aml


In [6]:
# Bare expression: the workspace object's repr becomes the cell output.
ws

Workspace.create(name='magaml', subscription_id='907c8efc-c2c8-4c49-a4e1-aeb880e10c88', resource_group='aml')

# Query Key Vault

In [7]:
# Get the workspace's default Key Vault and check that the secret can be read.
# Only the length is printed, so the secret value itself never appears in the output.
kv = ws.get_default_keyvault()
print(len(kv.get_secret('adfserviceprincipal')))

32


# Create a datastore

In [8]:
# Display the datastores currently registered in the workspace, keyed by name.
ws.datastores

{'images_datastore': {
   "name": "images_datastore",
   "container_name": "sampledata",
   "account_name": "pipelinedata",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'azureml_globaldatasets': {
   "name": "azureml_globaldatasets",
   "container_name": "globaldatasets",
   "account_name": "mmstorageaustraliaeast",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'godzilla': {
   "name": "godzilla",
   "container_name": "batchml",
   "account_name": "godzillasinastorage",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'workspacefilestore': {
   "name": "workspacefilestore",
   "container_name": "azureml-filestore-c744e648-3d1a-4b53-9b48-f8dc50fd0d3f",
   "account_name": "magaml1375920863",
   "protocol": "https",
   "endpoint": "core.windows.net"
 },
 'workspaceblobstore': {
   "name": "workspaceblobstore",
   "container_name": "azureml-blobstore-c744e648-3d1a-4b53-9b48-f8dc50fd0d3f",
   "account_name": "magaml1375920863",
   "protocol":

In [9]:
# Walk every registered datastore and print its key, its concrete class,
# and the connection fields exposed on the datastore object.
print(type(ws.datastores.items()))

for store_key, store in ws.datastores.items():
    print(store_key)
    print(type(store))
    for field in ("name", "container_name", "account_name", "protocol", "endpoint"):
        print(getattr(store, field))

    

<class 'dict_items'>
images_datastore
<class 'azureml.data.azure_storage_datastore.AzureBlobDatastore'>
images_datastore
sampledata
pipelinedata
https
core.windows.net
azureml_globaldatasets
<class 'azureml.data.azure_storage_datastore.AzureBlobDatastore'>
azureml_globaldatasets
globaldatasets
mmstorageaustraliaeast
https
core.windows.net
godzilla
<class 'azureml.data.azure_storage_datastore.AzureBlobDatastore'>
godzilla
batchml
godzillasinastorage
https
core.windows.net
workspacefilestore
<class 'azureml.data.azure_storage_datastore.AzureFileDatastore'>
workspacefilestore
azureml-filestore-c744e648-3d1a-4b53-9b48-f8dc50fd0d3f
magaml1375920863
https
core.windows.net
workspaceblobstore
<class 'azureml.data.azure_storage_datastore.AzureBlobDatastore'>
workspaceblobstore
azureml-blobstore-c744e648-3d1a-4b53-9b48-f8dc50fd0d3f
magaml1375920863
https
core.windows.net


In [10]:
# Identifiers of the storage target behind the datastore used in this notebook.
datastore_name = 'godzilla'  # name the datastore is registered under
account_name = 'godzillasinastorage'  # storage account
filesystem_name = 'batchml'  # container / filesystem inside that account

In [11]:
# Show which datastore the workspace reports as its default.
Datastore.get_default(workspace=ws)

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-c744e648-3d1a-4b53-9b48-f8dc50fd0d3f",
  "account_name": "magaml1375920863",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [12]:

# Both Key Vault secrets hold JSON documents; parse each one into a dict.
serviceprincipal = json.loads( kv.get_secret('magamlworkspace'))
# Print the key names only, to confirm the expected fields exist without showing values.
print(serviceprincipal.keys())
# NOTE(review): this writes the service principal's client id into the saved cell output;
# consider removing it (or clearing the output) before sharing the notebook.
print(serviceprincipal['clientid'])

# Connection details for the storage account, used when registering the blob datastore.
storageaccesskey = json.loads(kv.get_secret('godzillakey'))
print(storageaccesskey.keys())

dict_keys(['clientid', 'secret', 'tenant'])
a97b40c8-f436-4941-8b81-a2db2ed8d1bf
dict_keys(['accountname', 'key', 'container'])


In [13]:
# Disabled alternative, kept for reference: register the storage as an ADLS Gen2 datastore
# authenticated with the service principal instead of an account key.
# Datastore.register_azure_data_lake_gen2(
#     workspace=ws,
#     datastore_name=datastore_name,
#     account_name=account_name, # ADLS Gen2 account name
#     filesystem=filesystem_name, # ADLS Gen2 filesystem
#     tenant_id=serviceprincipal['tenant'], # tenant id of service principal
#     client_id=serviceprincipal['clientid'], # client id of service principal
#     client_secret=serviceprincipal['secret'] # the secret of service principal
#     )

In [14]:
# Register the blob container under the name held in `datastore_name`.
# Container, account name and account key are read from the parsed Key Vault secret,
# so no credential is written into the notebook itself.
Datastore.register_azure_blob_container(
    workspace=ws, 
    datastore_name=datastore_name, 
    container_name=storageaccesskey['container'], 
    account_name=storageaccesskey['accountname'], 
    account_key=storageaccesskey['key'],
    # presumably replaces an existing registration with the same name, which would make
    # the cell safe to re-run — confirm against the SDK documentation
    overwrite=True
    )

{
  "name": "godzilla",
  "container_name": "batchml",
  "account_name": "godzillasinastorage",
  "protocol": "https",
  "endpoint": "core.windows.net"
}

In [15]:
# Look the datastore up by name to get the handle used when building datasets.
godzilla_datastore = Datastore.get(ws, datastore_name)

# Create and register the environment


In [16]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_GPU_IMAGE

# pip packages installed into the environment (the tensorflow-gpu pin is currently disabled).
pip_requirements = [
    # "tensorflow-gpu==1.15.2",
    "azureml-core",
    "azureml-dataprep[fuse]",
]
cd = CondaDependencies.create(pip_packages=pip_requirements)

# Build the named environment on top of the SDK's default GPU base image.
env = Environment(name="MAG-ParallelRunEnv")
env.docker.base_image = DEFAULT_GPU_IMAGE
env.python.conda_dependencies = cd

# Registering is the last expression, so the registered definition is shown as the cell output.
env.register(workspace=ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-cuda10.0-cudnn7-ubuntu16.04:20200723.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "MAG-ParallelRunEnv",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
       

# Create and register the images dataset



FileDataset can be used as input of an experiment run. It can also be registered to workspace with a specified name and be retrieved by that name later.

FileDataset can be subsetted by invoking different subsetting methods available on this class. The result of subsetting is always a new FileDataset.

The actual data loading happens when FileDataset is asked to deliver the data into another storage mechanism (e.g. files downloaded or mounted to local path).

In [13]:
# Get-or-create the registered FileDataset that points at the 'images' path of the datastore.
dataset_path = (godzilla_datastore, 'images')
dataset_name = 'anpr_images'

# ws.datasets is looked up with .get(), so a missing registration yields None
# instead of raising.
anpr_images = ws.datasets.get(dataset_name)
if anpr_images is None:
    # First run in this workspace: build the dataset from the datastore path and register it
    # so later runs (and other notebooks) can fetch it by name.
    print(f'dataset:{dataset_name} does not exist, create and register')
    anpr_images = Dataset.File.from_files(path=dataset_path)
    anpr_images = anpr_images.register(workspace=ws, name=dataset_name)
    print(anpr_images)

In [14]:
# List the file paths the dataset resolves to.
anpr_images.to_path()

['/2020/07/28/20200601_213247.jpg',
 '/2020/07/28/T4636_2.jpg',
 '/20200601_213236.jpg',
 '/20200601_213247 - Copy.jpg',
 '/20200601_213247.jpg',
 '/Cursive_Writing_on_Notebook_paper.jpg',
 '/T46336_1.jpg',
 '/T4636_2.jpg',
 '/TC31930_1.jpg',
 '/TC31930_2.jpg',
 '/YJE00U.jpg',
 '/bike.jpg',
 '/hc1427.jpg',
 '/hsv39.jpg',
 '/mcl650.jpg']

In [15]:
# Disabled example: mount the dataset and list the files under the mount point.
# NOTE(review): `os` is not imported in the cells visible here; add `import os` before enabling.
# with anpr_images.mount() as mount_context: 
#     os.listdir(mount_context.mount_point)

In [16]:
# # You can also use the start and stop methods
# mount_context = anpr_images.mount()
# print(mount_context.mount_point)
# mount_context.start()  # this will mount the file streams
# mount_context.stop()  # this will unmount the file streams

## Abstract dataset operations

Exploring what we get with file datasets.


In [17]:
# Confirm the concrete class of the dataset object.
print(type(anpr_images))

<class 'azureml.data.file_dataset.FileDataset'>


In [18]:
# Wrap the dataset as a named input; 'foo' is the name attached to it.
named_input = anpr_images.as_named_input('foo')

In [19]:
# Show the mode and name attributes carried by the named input.
print(f"{named_input.mode} {named_input.name}")

direct foo


In [20]:
# Display every dataset registered in the workspace, keyed by name.
ws.datasets

{'label_ds': DatasetRegistration(id='cec5662f-6eec-42e2-aabe-110e9c94bb70', name='label_ds', version=1, description='', tags={}), 'input_images': DatasetRegistration(id='c602c25f-cec0-4f30-9bf4-33eb44fcbfb9', name='input_images', version=1, description='', tags={}), 'anpr_images': DatasetRegistration(id='a89fea95-9f2e-4df4-b469-449e1c5c2286', name='anpr_images', version=1, description='', tags={}), 'ojsalesdata': DatasetRegistration(id='a99450ca-5802-4de5-ab60-cab59e63bfb0', name='ojsalesdata', version=1, description='', tags={'opendatasets': 'sample-oj-sales-simulated'})}

In [21]:
# Fetch the registered dataset back from the workspace by name, reusing `dataset_name`
# instead of repeating the 'anpr_images' literal so the two lookups cannot drift apart.
ds_anpr = ws.datasets[dataset_name]

In [22]:
# Check what class the workspace lookup returns for the registered dataset.
type(ds_anpr)

azureml.data.file_dataset.FileDataset

In [23]:
# List the file paths of the dataset fetched from the workspace, for comparison with the earlier listing.
ds_anpr.to_path()

['/2020/07/28/20200601_213247.jpg',
 '/2020/07/28/T4636_2.jpg',
 '/20200601_213236.jpg',
 '/20200601_213247 - Copy.jpg',
 '/20200601_213247.jpg',
 '/Cursive_Writing_on_Notebook_paper.jpg',
 '/T46336_1.jpg',
 '/T4636_2.jpg',
 '/TC31930_1.jpg',
 '/TC31930_2.jpg',
 '/YJE00U.jpg',
 '/bike.jpg',
 '/hc1427.jpg',
 '/hsv39.jpg',
 '/mcl650.jpg']