# Working with AML Datasets & Data in general

In [1]:
## Check core SDK version number
import os
import shutil

import azureml.core
import mlflow
from azureml.core import (Datastore, Dataset, Environment, Experiment, ScriptRunConfig,
                          Workspace)
from azureml.core.authentication import InteractiveLoginAuthentication
from IPython.display import display



## report the installed Azure ML SDK version
print("[INFO] SDK version:", azureml.core.VERSION)

## due to a different tenant -> typically the customer tenant
# interactive_auth = InteractiveLoginAuthentication(tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")

## connect to the workspace via Workspace.from_config()
## NOTE(review): presumably this reads a local workspace config file -- confirm where it lives
ws = Workspace.from_config()

## join only the workspace details with " @ "; passing the label through `sep` as well
## used to print a stray separator right after "LOGGED IN:"
workspace_details = " @ ".join(
    map(str, (ws.name, ws.resource_group, ws.location, ws.subscription_id))
)
print("[SUCCESS] LOGGED IN:", workspace_details)

## set mlflow backend to AML
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

## sanity check: the active tracking URI should contain the AML experiments host name
print("[INFO] MLFlow wired to AML:", "experiments.azureml.net" in mlflow.get_tracking_uri())

[INFO] SDK version: 1.17.0
[SUCCESS] LOGGED IN:  @ mlops-demo @ mlops-rg @ westeurope @ 6ee947fa-0d77-4915-bf68-4a83a8bec2a4
[INFO] MLFlow wired to AML: True


## Config

In [2]:
## names of the Azure ML resources this notebook works with
aml_compute = "aml-cluster"  # NOTE(review): presumably the compute target name -- not used in the cells shown here
aml_ds = "aml_data"  # key looked up in ws.datastores; alternative: "mmaadlsgen2_test"
aml_experiment = "mlflow-azureml"  # passed to mlflow.set_experiment

## name under which the dataset is registered (other candidates kept for reference)
# aml_dset = 'noa_weather'
# aml_dset = "oj_sample_data"
aml_dset = "diabetes_multiple"

## local folder (relative to the notebook) that receives downloaded sample files
loc_data = "data/demo_data"

In [3]:
## start from a clean local data folder; done in pure Python so the cell does not depend
## on a POSIX shell and never interpolates the path into a shell command.
## ignore_errors=True keeps the "no error if the folder is missing" behaviour of `rm -rf`.
shutil.rmtree(loc_data, ignore_errors=True)

In [4]:
## Enumerate the datastores registered in the current workspace, one "name (type)" line each
datastores = ws.datastores
for store_name in datastores:
    store = datastores[store_name]
    print("{} ({})".format(store_name, store.datastore_type))



mmaadlsgen2_test (AzureDataLakeGen2)
modeldata (AzureBlob)
dbxstorage_delta (AzureBlob)
aml_data (AzureBlob)
azureml_globaldatasets (AzureBlob)
covidhack_file (AzureFile)
covidhack_blob (AzureBlob)
covidhack (AzureFile)
workspacefilestore (AzureFile)
workspaceblobstore (AzureBlob)


In [5]:
## pick the datastore named by `aml_ds` and report which one was selected
ds = ws.datastores[aml_ds]
print("[INFO] Datastore: {}, type: {}".format(ds.name, ds.datastore_type))

[INFO] Datastore: aml_data, type: AzureBlob


## Data

### File Dataset

```python
from_files(path, validate=True)
```

In [6]:
## Build a FileDataset from the datastore files that match the glob pattern below.
## Alternative path selections, kept for reference:
# datastore_paths = [(ds, 'diabetes')]
# datastore_paths = [(ds, 'diabetes/diabetes0.csv'),(ds, 'diabetes/diabetes1.csv'),(ds, 'diabetes/diabetes2.csv')]
# datastore_paths = [(ds, 'ojs/Store140*.csv')]
datastore_paths = [
    (ds, 'diabetes/diabetes*.csv'),
]

fds = Dataset.File.from_files(path=datastore_paths)

In [7]:
## list the paths of the files that the FileDataset definition resolved to
fds.to_path()

['/diabetes0.csv',
 '/diabetes1.csv',
 '/diabetes2.csv',
 '/diabetes3.csv',
 '/diabetes4.csv',
 '/diabetes5.csv',
 '/diabetes6.csv',
 '/diabetes7.csv',
 '/diabetes8.csv']

In [8]:
## fetch only a small sample (take(3)) of the dataset into the local data folder,
## overwriting files left over from a previous run
local_target = os.path.join("./", loc_data)
fds.take(3).download(target_path=local_target, overwrite=True)

['/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/demo_data/diabetes0.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/demo_data/diabetes1.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/demo_data/diabetes2.csv']

In [9]:
## check downloaded files: collect the names of the regular files in the local data folder
tmppath = os.path.join("./", loc_data)
with os.scandir(tmppath) as entries:
    onlyfiles = [entry.name for entry in entries if entry.is_file()]
print(onlyfiles)

['diabetes0.csv', 'diabetes1.csv', 'diabetes2.csv']


### Tabular Dataset

```python
from_delimited_files(path, separator=',', header=<PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS: 3>, encoding=<FileEncoding.UTF8: 0>, quoting=False, infer_column_types=True, skip_rows=0, skip_mode=<SkipLinesBehavior.NO_ROWS: 0>, comment=None, include_path=False, archive_options=None, partition_format=None)

```

In [10]:
## Build a TabularDataset from the delimited (CSV) files that match the glob pattern below.
## Alternative path selections, kept for reference:
# datastore_paths = [(ds, 'diabetes')]
# datastore_paths = [(ds, 'diabetes/diabetes0.csv'),(ds, 'diabetes/diabetes1.csv'),(ds, 'diabetes/diabetes2.csv')]
# datastore_paths = [(ds, 'ojs/Store140*.csv')]
diabetes_glob = 'diabetes/diabetes*.csv'
datastore_paths = [(ds, diabetes_glob)]

tds = Dataset.Tabular.from_delimited_files(path=datastore_paths)

In [11]:
## materialize the tabular dataset as a pandas DataFrame
pdf = tds.to_pandas_dataframe()

In [12]:
## summarize the loaded frame: columns, non-null counts, dtypes and memory usage
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
AGE    442 non-null int64
SEX    442 non-null int64
BMI    442 non-null float64
BP     442 non-null float64
S1     442 non-null int64
S2     442 non-null float64
S3     442 non-null float64
S4     442 non-null float64
S5     442 non-null float64
S6     442 non-null int64
Y      442 non-null int64
dtypes: float64(6), int64(5)
memory usage: 38.1 KB


In [13]:
## register the tabular dataset in the workspace under `aml_dset` so it can be fetched by name later
## NOTE(review): create_new_version=True presumably adds a new version when the name already exists -- confirm

wtds = tds.register(
    workspace=ws,
    name=aml_dset,
    description='Sample: Diabetes data from Azure Open Datasets',
    create_new_version=True,
)

In [None]:
# wtds_profile = wtds.get_profile(ws)

### Get data from an existing dataset

In [14]:
## look the registered dataset up by name and preview its first ten rows
wtds = Dataset.get_by_name(workspace=ws, name=aml_dset)
wtds_df = wtds.to_pandas_dataframe()
wtds_df.head(10)

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135
5,23,1,22.6,89.0,139,64.8,61.0,2.0,4.1897,68,97
6,36,2,22.0,90.0,160,99.6,50.0,3.0,3.9512,82,138
7,66,2,26.2,114.0,255,185.0,56.0,4.55,4.2485,92,63
8,60,2,32.1,83.0,179,119.4,42.0,4.0,4.4773,94,110
9,29,1,30.0,85.0,180,93.4,43.0,4.0,5.3845,88,310


## Training

In [None]:
## select, by name, the MLflow experiment configured in `aml_experiment`
mlflow.set_experiment(aml_experiment)
