In [1]:
import json
import azureml.core
from azureml.core import Workspace, Datastore, Dataset, Environment, Experiment
from azureml.data import FileDataset
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter, PipelineRun
from azureml.core.compute import ComputeTarget, AmlCompute


# Show which azureml-core SDK version this kernel has loaded.
print(azureml.core.VERSION)
# Label the dot-separated parts of the version string; the values remain strings.
version = {
    label: component
    for label, component in zip(('major', 'minor', 'patch'), azureml.core.VERSION.split('.'))
}
# Workspace handle used by every later cell.
ws = Workspace.from_config()

1.10.0



# Get dataset by ID

In [3]:
# Id of a previously saved dataset in this workspace.
saved_id = 'b3d66173-5608-41b4-b4d4-4b7bd188a2ee'
# Look the dataset up by that id; as the last expression its repr is displayed.
Dataset.get_by_id(id=saved_id, workspace=ws)

{
  "source": [
    "('godzilla', 'images/2020/08/10')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "b3d66173-5608-41b4-b4d4-4b7bd188a2ee",
    "name": null,
    "version": null,
    "workspace": "Workspace.create(name='magaml', subscription_id='907c8efc-c2c8-4c49-a4e1-aeb880e10c88', resource_group='aml')"
  }
}

# enumerate datastores and datasets

In [19]:
# Datastores attached to the workspace, one numbered line each.
print('>>>>> datastores')
for position, datastore_name in enumerate(ws.datastores):
    print(position, datastore_name)

# Datasets as exposed through the workspace's `datasets` collection.
print('>>>>> datasets using datasets collection')
for position, dataset_name in enumerate(ws.datasets):
    print(position, dataset_name)

# The same listing obtained through Dataset.get_all, for comparison.
print('>>>>> datasets using get_all')
for position, dataset_name in enumerate(Dataset.get_all(workspace=ws)):
    print(position, dataset_name)

>>>>> datastores
0 images_datastore
1 azureml_globaldatasets
2 godzilla
3 workspacefilestore
4 workspaceblobstore
>>>>> datasets using datasets collection
0 label_ds
1 input_images
2 anpr_images
3 ojsalesdata
>>>>> datasets using get_all
0 label_ds
1 input_images
2 anpr_images
3 ojsalesdata


In [None]:
# The comprehension had no iterable (a SyntaxError); iterate the workspace's
# dataset collection, the same collection the enumeration cell above walks.
print([dataset for dataset in ws.datasets])

In [21]:
# dataset_id = '69bfe260-7f14-4de7-a33b-7bf894858e4c'

# Handle to the 'godzilla' datastore registered in the workspace.
datastore_o = ws.datastores['godzilla']

# Dataset looked up by its registered name.
dataset_o = Dataset.get_by_name(name='anpr_images', workspace=ws)

# Display the file paths contained in the dataset.
dataset_o.to_path()


['/2020/07/28/20200601_213247.jpg',
 '/2020/07/28/T4636_2.jpg',
 '/2020/08/10/20160924_063249081_iOS.jpg',
 '/2020/08/10/20160924_063332113_iOS.jpg',
 '/2020/08/10/20160924_063342621_iOS.jpg',
 '/2020/08/10/20160924_063825264_iOS.jpg',
 '/2020/08/10/20160924_064218462_iOS.jpg',
 '/2020/08/10/20160924_064241679_iOS.jpg',
 '/2020/08/10/20160924_064251486_iOS.jpg',
 '/20200601_213236.jpg',
 '/20200601_213247 - Copy.jpg',
 '/20200601_213247.jpg',
 '/Cursive_Writing_on_Notebook_paper.jpg',
 '/T46336_1.jpg',
 '/T4636_2.jpg',
 '/TC31930_1.jpg',
 '/TC31930_2.jpg',
 '/YJE00U.jpg',
 '/bike.jpg',
 '/hc1427.jpg',
 '/hsv39.jpg',
 '/mcl650.jpg']

In [22]:
# Wrap the dataset in a consumption config named 'ds_config', then switch it to mount mode.
dsconfig = DatasetConsumptionConfig(name='ds_config', dataset=dataset_o)
dsconfig = dsconfig.as_mount()
# Display the dataset the config wraps.
dsconfig.dataset

{
  "source": [
    "('godzilla', 'images')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "a89fea95-9f2e-4df4-b469-449e1c5c2286",
    "name": "anpr_images",
    "version": 1,
    "workspace": "Workspace.create(name='magaml', subscription_id='907c8efc-c2c8-4c49-a4e1-aeb880e10c88', resource_group='aml')"
  }
}

In [25]:
# Re-display the dataset held by the consumption config (same expression as the cell above).
dsconfig.dataset

{
  "source": [
    "('godzilla', 'images')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "a89fea95-9f2e-4df4-b469-449e1c5c2286",
    "name": "anpr_images",
    "version": 1,
    "workspace": "Workspace.create(name='magaml', subscription_id='907c8efc-c2c8-4c49-a4e1-aeb880e10c88', resource_group='aml')"
  }
}

In [55]:
import pandas as pd 
import os
import base64
from pathlib import Path 

# File paths contained in the dataset, one entry per file.
filename = dataset_o.to_path()

df = pd.DataFrame({'filename': filename})

# Split every path into its file name and its parent directory.
df['basename'] = df.filename.apply(os.path.basename)
df['dirname'] = df.filename.apply(os.path.dirname)
# Base64 token of the UTF-8 encoded path, stored as text rather than bytes so a
# CSV export writes the bare token instead of a "b'...'" bytes repr.
# base64.b64decode accepts ASCII str as well as bytes, so decoding still works.
df['encoded'] = df.filename.apply(lambda fn: base64.b64encode(fn.encode()).decode('ascii'))

In [60]:
# Keep only the rows whose directory is /2020/08/10 and write them to a CSV file.
(
    df
    .query('dirname=="/2020/08/10"')
    .to_csv('20200810_images.csv')
)

In [58]:
# Round-trip check: decode every base64 token back to its original path string.
df['encoded'].apply(lambda token: base64.b64decode(token).decode())

0            /2020/07/28/20200601_213247.jpg
1                    /2020/07/28/T4636_2.jpg
2     /2020/08/10/20160924_063249081_iOS.jpg
3     /2020/08/10/20160924_063332113_iOS.jpg
4     /2020/08/10/20160924_063342621_iOS.jpg
5     /2020/08/10/20160924_063825264_iOS.jpg
6     /2020/08/10/20160924_064218462_iOS.jpg
7     /2020/08/10/20160924_064241679_iOS.jpg
8     /2020/08/10/20160924_064251486_iOS.jpg
9                       /20200601_213236.jpg
10               /20200601_213247 - Copy.jpg
11                      /20200601_213247.jpg
12    /Cursive_Writing_on_Notebook_paper.jpg
13                             /T46336_1.jpg
14                              /T4636_2.jpg
15                            /TC31930_1.jpg
16                            /TC31930_2.jpg
17                               /YJE00U.jpg
18                                 /bike.jpg
19                               /hc1427.jpg
20                                /hsv39.jpg
21                               /mcl650.jpg
Name: enco

In [None]:
# Scratch cell (never executed): a bare reference to the str.encode method; nothing is called.
str.encode

# pipelines

In [8]:
# Experiment handle under which the pipeline runs are grouped.
exp = Experiment(name='MAG-batch-paramdataset', workspace=ws)


In [None]:
# Experiment exposes its runs through get_runs(), which yields Run objects lazily;
# materialise them into a list so the cell displays something useful.
list(exp.get_runs())