# Working with AML Datasets & Data in general

In [4]:
## Check core SDK version number
import os
import shutil

import azureml.core
import mlflow
from azureml.core import (Datastore, Dataset, Environment, Experiment, ScriptRunConfig,
                          Workspace)
from azureml.core.authentication import InteractiveLoginAuthentication
from IPython.display import display



print("[INFO] SDK version:", azureml.core.VERSION)

## due to a different tenant -> typically the customer tenant
# interactive_auth = InteractiveLoginAuthentication(tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")

## connect to the workspace and report where we ended up
ws = Workspace.from_config()
workspace_details = (ws.name, ws.resource_group, ws.location, ws.subscription_id)
print("[SUCCESS] LOGGED IN: ", *workspace_details, sep=' @ ')

## point the MLflow tracking backend at the AML workspace
aml_tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(aml_tracking_uri)

## sanity check: the active tracking URI should be an AML endpoint
is_wired_to_aml = "experiments.azureml.net" in mlflow.get_tracking_uri()
print("[INFO] MLFlow wired to AML:", is_wired_to_aml)

[INFO] SDK version: 1.17.0
[SUCCESS] LOGGED IN:  @ mlops-demo @ mlops-rg @ westeurope @ 6ee947fa-0d77-4915-bf68-4a83a8bec2a4
[INFO] MLFlow wired to AML: True


## Config

In [5]:
## AML compute and experiment names (not referenced by the cells visible in this section)
aml_compute = "aml-cluster"
aml_experiment = "mlflow-azureml"

## key used to look up the datastore in ws.datastores (alternative: "mmaadlsgen2_test")
aml_ds = "aml_data"

## name under which the tabular dataset is registered and later retrieved
# aml_dset = 'noa_weather'
# aml_dset = "oj_sample_data"
aml_dset = "diabetes_multiple"

## local folder that receives the downloaded sample files
loc_data = "data/demo_data"

In [6]:
## remove any previously downloaded sample files so the download below starts clean;
## pure-Python replacement for `rm -rf`: portable, no shell interpolation of the path,
## and a missing folder is silently ignored
shutil.rmtree(loc_data, ignore_errors=True)

In [7]:
## Enumerate the datastores registered in the current workspace, one "name (type)" line each
datastores = ws.datastores
for name in datastores:
    datastore = datastores[name]
    print(f"{name} ({datastore.datastore_type})")



mmaadlsgen2_test (AzureDataLakeGen2)
modeldata (AzureBlob)
dbxstorage_delta (AzureBlob)
aml_data (AzureBlob)
azureml_globaldatasets (AzureBlob)
covidhack_file (AzureFile)
covidhack_blob (AzureBlob)
covidhack (AzureFile)
workspacefilestore (AzureFile)
workspaceblobstore (AzureBlob)


In [8]:
## pick the datastore configured in `aml_ds` from the workspace's registered datastores
## and echo its name and type as a quick confirmation
ds = ws.datastores[aml_ds]
print(f"[INFO] Datastore: {ds.name}, type: {ds.datastore_type}")

[INFO] Datastore: aml_data, type: AzureBlob


## Data

### File Dataset

```python
from_files(path, validate=True)
```

In [9]:
## create a FileDataset from the files in the datastore that match the glob 'diabetes/diabetes*.csv'
## (the commented-out alternatives select the whole 'diabetes' folder, an explicit list of files,
##  or the files of a different sample dataset)

# datastore_paths = [(ds, 'diabetes')]
# datastore_paths = [(ds, 'diabetes/diabetes0.csv'),(ds, 'diabetes/diabetes1.csv'),(ds, 'diabetes/diabetes2.csv')]
datastore_paths = [(ds, 'diabetes/diabetes*.csv')]
# datastore_paths = [(ds, 'ojs/Store140*.csv')]

fds = Dataset.File.from_files(path=datastore_paths)

In [10]:
type(fds)

azureml.data.file_dataset.FileDataset

In [11]:
## list the paths of the files that the FileDataset's path pattern resolved to
fds.to_path()

['/diabetes0.csv',
 '/diabetes1.csv',
 '/diabetes2.csv',
 '/diabetes3.csv',
 '/diabetes4.csv',
 '/diabetes5.csv',
 '/diabetes6.csv',
 '/diabetes7.csv',
 '/diabetes8.csv']

In [12]:
## fetch only the first three files as a local sample, overwriting any existing copies
sample_fds = fds.take(3)
sample_fds.download(target_path=os.path.join("./", loc_data), overwrite=True)

['/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/demo_data/diabetes0.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/demo_data/diabetes1.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/demo_data/diabetes2.csv']

In [13]:
## check downloaded files: collect the names of the regular files in the local sample folder
tmppath = os.path.join("./", loc_data)
with os.scandir(tmppath) as entries:
    onlyfiles = [entry.name for entry in entries if entry.is_file()]
print(onlyfiles)

['diabetes0.csv', 'diabetes1.csv', 'diabetes2.csv']


### Tabular Dataset

```python
from_delimited_files(path, separator=',', header=<PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS: 3>, encoding=<FileEncoding.UTF8: 0>, quoting=False, infer_column_types=True, skip_rows=0, skip_mode=<SkipLinesBehavior.NO_ROWS: 0>, comment=None, include_path=False, archive_options=None, partition_format=None)

```

In [14]:
## create a TabularDataset by parsing the delimited files in the datastore that match
## the glob 'diabetes/diabetes*.csv' (default parsing options of from_delimited_files are used)

# datastore_paths = [(ds, 'diabetes')]
# datastore_paths = [(ds, 'diabetes/diabetes0.csv'),(ds, 'diabetes/diabetes1.csv'),(ds, 'diabetes/diabetes2.csv')]
datastore_paths = [(ds, 'diabetes/diabetes*.csv')]
# datastore_paths = [(ds, 'ojs/Store140*.csv')]

tds = Dataset.Tabular.from_delimited_files(path=datastore_paths)

In [15]:
pdf = tds.to_pandas_dataframe()

In [16]:
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
AGE    442 non-null int64
SEX    442 non-null int64
BMI    442 non-null float64
BP     442 non-null float64
S1     442 non-null int64
S2     442 non-null float64
S3     442 non-null float64
S4     442 non-null float64
S5     442 non-null float64
S6     442 non-null int64
Y      442 non-null int64
dtypes: float64(6), int64(5)
memory usage: 38.1 KB


In [None]:
## register the tabular dataset in the workspace under `aml_dset` so later sessions can
## retrieve it by name; create_new_version=True is passed so re-registering is allowed

wtds = tds.register(
    workspace=ws,
    name=aml_dset,
    description='Sample: Diabetes data from Azure Open Datasets',
    create_new_version=True,
)

In [None]:
# wtds_profile = wtds.get_profile(ws)

### Get data from an existing dataset

In [6]:
## retrieve the registered dataset by name and preview its first 10 records;
## take(10) limits the dataset before it is materialised, so only the preview rows
## are loaded into pandas instead of the whole dataset
wtds = Dataset.get_by_name(ws, name=aml_dset)
wtds.take(10).to_pandas_dataframe()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135
5,23,1,22.6,89.0,139,64.8,61.0,2.0,4.1897,68,97
6,36,2,22.0,90.0,160,99.6,50.0,3.0,3.9512,82,138
7,66,2,26.2,114.0,255,185.0,56.0,4.55,4.2485,92,63
8,60,2,32.1,83.0,179,119.4,42.0,4.0,4.4773,94,110
9,29,1,30.0,85.0,180,93.4,43.0,4.0,5.3845,88,310


In [7]:
## request the dataset profile from the workspace
## NOTE(review): in the recorded run this call raised AzureMLException
## ("Unable to fetch profile results. Please submit a new profile run.") —
## presumably no profile run has completed for this dataset yet; confirm before relying on wtds_profile
wtds_profile = wtds.get_profile(ws)



AzureMLException: AzureMLException:
	Message: Unable to fetch profile results. Please submit a new profile run.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Unable to fetch profile results. Please submit a new profile run."
    }
}

## Troubleshooting - Profile

In [8]:
## troubleshooting: repeat the preview with the registered "noa_weather" dataset;
## take(10) limits the dataset before it is materialised, so only the preview rows
## are loaded into pandas instead of the whole dataset
wtds = Dataset.get_by_name(ws, name="noa_weather")
wtds.take(10).to_pandas_dataframe()


Unnamed: 0,index,usaf,wban,datetime,latitude,longitude,elevation,windAngle,windSpeed,temperature,...,precipTime,precipDepth,snowDepth,stationName,countryOrRegion,p_k,year,day,version,month
0,9,29580,99999,2018-01-03 05:20:00,61.045,28.144,106.0,180.0,3.1,1.0,...,,,,LAPPEENRANTA,FI,029580-99999,2018,3,1.0,1
1,33,726228,94740,2018-01-02 11:51:00,44.385,-74.207,507.0,0.0,0.0,-28.9,...,6.0,0.0,,ADIRONDACK REGIONAL ARPT,US,726228-94740,2018,2,1.0,1
2,76,999999,94074,2018-01-01 03:55:00,40.807,-104.755,1643.0,,,-12.4,...,,,,NUNN 7 NNE,US,999999-94074,2018,1,1.0,1
3,86,999999,3067,2018-01-03 12:55:00,38.87,-100.963,875.0,,,-10.4,...,,,,OAKLEY 19 SSW,US,999999-03067,2018,3,1.0,1
4,102,999999,94077,2018-01-02 19:50:00,42.425,-103.736,1343.0,,,-3.8,...,,,,HARRISON 20 SSE,US,999999-94077,2018,2,1.0,1
5,108,999999,3067,2018-01-03 15:20:00,38.87,-100.963,875.0,,,-6.8,...,,,,OAKLEY 19 SSW,US,999999-03067,2018,3,1.0,1
6,131,11120,99999,2018-01-01 11:00:00,65.45,12.217,9.0,90.0,4.0,3.1,...,,,,BRONNOY,NO,011120-99999,2018,1,1.0,1
7,136,999999,94077,2018-01-01 15:20:00,42.425,-103.736,1343.0,,,-16.1,...,,,,HARRISON 20 SSE,US,999999-94077,2018,1,1.0,1
8,187,82840,99999,2018-01-03 10:00:00,39.485,-0.475,56.0,260.0,5.7,15.2,...,,,,VALENCIA,SP,082840-99999,2018,3,1.0,1
9,215,486980,99999,2018-01-01 23:30:00,1.35,103.994,7.0,60.0,2.6,25.0,...,,,,SINGAPORE CHANGI INTL,SN,486980-99999,2018,1,1.0,1


In [9]:
## request the profile of the "noa_weather" dataset
## NOTE(review): in the recorded run this call raised UserErrorException because
## profile_result.json was not found among the profile action's files (none were listed)
wtds.get_profile(ws)



UserErrorException: UserErrorException:
	Message: File with path actions/f6403fbe-8044-45ef-8cf4-88ab1ae27bab/profile_result.json was not found,
available files include: .
	InnerException None
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "message": "File with path actions/f6403fbe-8044-45ef-8cf4-88ab1ae27bab/profile_result.json was not found,\navailable files include: ."
    }
}

In [10]:
type(wtds)

azureml.data.tabular_dataset.TabularDataset

In [11]:
wtds

{
  "source": [
    "('aml_data', 'weather/weather*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "d5e42aec-9043-4a6a-8f6b-6c3fa235a686",
    "name": "noa_weather",
    "version": 1,
    "description": "Weather dataset - sample of few days",
    "workspace": "Workspace.create(name='mlops-demo', subscription_id='6ee947fa-0d77-4915-bf68-4a83a8bec2a4', resource_group='mlops-rg')"
  }
}

In [12]:
!pip list

Package                               Version
------------------------------------- -------------------
absl-py                               0.11.0
adal                                  1.2.5
aiohttp                               3.7.2
aiohttp-cors                          0.7.0
aioredis                              1.3.1
alembic                               1.4.1
ansiwrap                              0.8.4
antlr4-python3-runtime                4.7.2
applicationinsights                   0.11.9
argcomplete                           1.12.1
argon2-cffi                           20.1.0
astor                                 0.8.1
astroid                               2.4.2
async-generator                       1.10
async-timeout                         3.0.1
atari-py                              0.2.6
attrs                                 20.2.0
autopep8                              1.5.4
azure-appconfiguration                1.1.1
azure-batch                        

In [14]:
os.getcwd()

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos'

In [15]:
## troubleshooting: dump the processing steps behind the dataset definition
## NOTE(review): _dataflow and _steps are private attributes — debugging aid only,
## not a stable API to build on
print(wtds._dataflow._steps)

[Step {
  id: 7504e214-b9e2-49c5-8665-9186621d7d09
  type: Microsoft.DPrep.GetDatastoreFilesBlock,
  arguments:  { 'datastores': [ { 'datastoreName': 'aml_data',
                      'path': 'weather/weather*.csv',
                      'resourceGroup': 'mlops-rg',
                      'subscription': '6ee947fa-0d77-4915-bf68-4a83a8bec2a4',
                      'workspaceName': 'mlops-demo'}]}
}, Step {
  id: 66e7d2cb-f2cd-431d-8044-d0dc2dc3d0ba
  type: Microsoft.DPrep.ParseDelimitedBlock,
  arguments:  { 'columnHeadersMode': 3,
    'fileEncoding': 0,
    'handleQuotedLineBreaks': False,
    'preview': False,
    'separator': ',',
    'skipRows': 0,
    'skipRowsMode': 0}
}, Step {
  id: e1018fa0-64f5-4b5d-85a6-afced147cda0
  type: Microsoft.DPrep.DropColumnsBlock,
  arguments:  {'columns': {'details': {'selectedColumns': ['Path']}, 'type': 0}}
}, Step {
  id: 51944314-aeb4-4918-a6ec-0f901be58ac3
  type: Microsoft.DPrep.SetColumnTypesBlock,
  arguments:  { 'columnConversion': [ { 'c