In [1]:
from utils import *

In [2]:
from azureml.core import Workspace, Experiment
# Load the workspace handle via Workspace.from_config(); no explicit arguments are passed here.
ws = Workspace.from_config()
# Experiment handle named 'access_dataset' in that workspace; a later cell submits a run through it.
exp = Experiment(workspace=ws, name='access_dataset')

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [3]:
from azureml.core import Dataset

# Remote CSV (HTTPS URL) that backs the dataset used in this notebook.
path = 'https://dprepdata.blob.core.windows.net/demo/Titanic.csv'

# Build a tabular dataset straight from the delimited file; nothing is
# registered in the workspace at this point.
ds = Dataset.Tabular.from_delimited_files(path=path)

# Bare last expression so the notebook renders the dataset's repr.
ds

{
  "source": [
    "https://dprepdata.blob.core.windows.net/demo/Titanic.csv"
  ],
  "definition": [
    "GetFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ]
}

In [4]:
# Register the dataset in workspace `ws` under the name "titanic".
# `ds` is rebound to the value returned by register(); its repr (the cell output)
# then includes a "registration" section with id, name, version and workspace.
# NOTE(review): `ds = ds.register(...)` overwrites its own input, so re-running this
# cell calls register() on the already-registered object — confirm that is acceptable.
ds = ds.register(workspace=ws, name="titanic")
ds

{
  "source": [
    "https://dprepdata.blob.core.windows.net/demo/Titanic.csv"
  ],
  "definition": [
    "GetFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "c6fd257f-136b-414e-ae0b-030fb7e8d713",
    "name": "titanic",
    "version": 1,
    "workspace": "Workspace.create(name='mldemows', subscription_id='292890d4-aa6d-4d5e-a085-97c80db3c30a', resource_group='mldemo')"
  }
}

In [5]:
# List all datasets registered in the workspace `ws`.
# The result displays as a mapping from dataset name to its registration record
# (id, name, version, description, tags).
datasets = Dataset.get_all(ws)
datasets

{'titanic': DatasetRegistration(id='c6fd257f-136b-414e-ae0b-030fb7e8d713', name='titanic', version=1, description='', tags={}), 'Melbourne Housing Dataset': DatasetRegistration(id='0ddfd6e9-565e-4f19-93f6-6701bf3a2e1c', name='Melbourne Housing Dataset', version=2, description='Data Cleansing 2 - replaced missing values', tags={})}

In [6]:
# Retrieve the registered dataset by name and rebind `ds` to it.
# No version argument is passed in this call.
# NOTE(review): presumably this resolves to the latest registered version — confirm against the SDK docs.
ds = Dataset.get_by_name(ws, "titanic")
ds

{
  "source": [
    "https://dprepdata.blob.core.windows.net/demo/Titanic.csv"
  ],
  "definition": [
    "GetFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "c6fd257f-136b-414e-ae0b-030fb7e8d713",
    "name": "titanic",
    "version": 1,
    "workspace": "Workspace.create(name='mldemows', subscription_id='292890d4-aa6d-4d5e-a085-97c80db3c30a', resource_group='mldemo')"
  }
}

In [7]:
# Load the dataset into a pandas DataFrame; as the cell's last expression it is displayed.
# NOTE(review): this materialises the whole dataset just to show it — consider `.head()`
# on the result if only a preview is needed.
ds.to_pandas_dataframe()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [8]:
from azureml.core import ScriptRunConfig

# Command-line arguments for the script: the dataset is handed over as a
# named input called 'titanic'.
script_args = ['--input', ds.as_named_input('titanic')]

# Run configuration: which script to run, from which directory, with which
# arguments, in the environment returned by get_current_env().
src = ScriptRunConfig(
    source_directory="code",
    script='access_data_from_dataset.py',
    arguments=script_args,
    environment=get_current_env(),
)

# Submit the configuration to the experiment and stream the logs until the
# run finishes; the value returned by the last call is shown as the cell output.
run = exp.submit(src)
run.wait_for_completion(show_output=True)

RunId: access_dataset_1669285661_97bbf8d4
Web View: https://ml.azure.com/runs/access_dataset_1669285661_97bbf8d4?wsid=/subscriptions/292890d4-aa6d-4d5e-a085-97c80db3c30a/resourcegroups/mldemo/workspaces/mldemows&tid=03ff20ec-51f3-415c-9462-b61ddcf1ce16

Streaming azureml-logs/70_driver_log.txt

[2022-11-24T10:27:43.978972] Entering context manager injector.
[2022-11-24T10:27:46.883331] context_manager_injector.py Command line Options: Namespace(inject=['ProjectPythonPath:context_managers.ProjectPythonPath', 'RunHistory:context_managers.RunHistory', 'TrackUserError:context_managers.TrackUserError'], invocation=['access_data_from_dataset.py', '--input', 'c6fd257f-136b-414e-ae0b-030fb7e8d713'])
Script type = None
[2022-11-24T10:27:46.925413] Entering Run History Context Manager.
[2022-11-24T10:27:50.399114] Current directory: /private/var/folders/bg/qstlmb2n6cncqw_g_cy3jgt00000gn/T/azureml_runs/access_dataset_1669285661_97bbf8d4
[2022-11-24T10:27:50.419730] Preparing to call script [acces

{'runId': 'access_dataset_1669285661_97bbf8d4',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2022-11-24T10:27:43.463581Z',
 'endTimeUtc': '2022-11-24T10:29:09.809287Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '28a5cfb1-b435-4516-b99c-f67d90e9bde2',
  'azureml.git.repository_uri': 'https://github.com/mahtabsyed/Azure-Machine-Learning-Demo.git',
  'mlflow.source.git.repoURL': 'https://github.com/mahtabsyed/Azure-Machine-Learning-Demo.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': 'e2772270a70f5af53dce1f0881404bd020cd1f8f',
  'mlflow.source.git.commit': 'e2772270a70f5af53dce1f0881404bd020cd1f8f',
  'azureml.git.dirty': 'False'},
 'inputDatasets': [{'dataset': {'id': 'c6fd257f-136b-414e-ae0b-030fb7e8d713'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'titanic', 'mechanism': 'Direct'}}, {'dataset': {'id': 'c6fd257f-136b-414e-ae0b-030fb7e8d713'}, 'consu