## Init

In [1]:
## Check core SDK version number
import azureml.core
import mlflow
import os

from azureml.core import (Datastore, Dataset, Environment, Experiment, ScriptRunConfig,
                          Workspace)
from azureml.core.authentication import InteractiveLoginAuthentication
from IPython.display import display



print("[INFO] SDK version:", azureml.core.VERSION)

## optional: explicit interactive login for a workspace in a different tenant
## (typically a customer tenant)
## NOTE(review): if enabled, interactive_auth presumably has to be handed to
## Workspace.from_config(auth=...) -- confirm before uncommenting
# interactive_auth = InteractiveLoginAuthentication(tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")

## load the workspace described by the local config file
ws = Workspace.from_config()

## join only the workspace coordinates with ' @ '; passing sep=' @ ' to print()
## would also insert a stray separator right after the label
ws_coordinates = " @ ".join(map(str, (ws.name, ws.resource_group, ws.location, ws.subscription_id)))
print("[SUCCESS] LOGGED IN:", ws_coordinates)

## set mlflow backend to AML
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

## sanity check: the active tracking URI should point at the AML experiments endpoint
print("[INFO] MLFlow wired to AML:", "experiments.azureml.net" in mlflow.get_tracking_uri())

[INFO] SDK version: 1.17.0
[SUCCESS] LOGGED IN:  @ mlops-demo @ mlops-rg @ westeurope @ 6ee947fa-0d77-4915-bf68-4a83a8bec2a4
[INFO] MLFlow wired to AML: True


## Config

In [8]:
## --- Azure ML resource names -------------------------------------------------
## presumably the compute cluster name; not referenced in the cells shown here
aml_compute = "aml-cluster"

## key of the registered datastore to read from (alternative: "mmaadlsgen2_test")
aml_ds = "aml_data"

## name under which the tabular dataset is registered and later fetched
## (alternative: 'noa_weather')
aml_dset = "oj_sample_data"

## experiment name handed to mlflow.set_experiment
aml_experiment = "mlflow-azureml"

## --- local paths -------------------------------------------------------------
## folder, relative to the working directory, that receives downloaded files
loc_data = "data/demo_data"

In [3]:
## Print every datastore registered in the current workspace as "<name> (<type>)"
datastores = ws.datastores
for name in datastores:
    datastore = datastores[name]
    print(f"{name} ({datastore.datastore_type})")



mmaadlsgen2_test (AzureDataLakeGen2)
modeldata (AzureBlob)
dbxstorage_delta (AzureBlob)
aml_data (AzureBlob)
azureml_globaldatasets (AzureBlob)
covidhack_file (AzureFile)
covidhack_blob (AzureBlob)
covidhack (AzureFile)
workspacefilestore (AzureFile)
workspaceblobstore (AzureBlob)


In [4]:
## set the datastore: look up the one configured in `aml_ds` among the
## datastores registered in the workspace
ds = ws.datastores[aml_ds]
## report which datastore was selected and what type it is
print(f"[INFO] Datastore: {ds.name}, type: {ds.datastore_type}")

[INFO] Datastore: aml_data, type: AzureBlob


## Data

### File Dataset

```python
from_files(path, validate=True)
```

In [5]:
## create a FileDataset from the datastore files matching the glob 'ojs/Store140*.csv'

## alternative path specs kept for reference: the whole 'ojs' folder, or an explicit file list
# datastore_paths = [(ds, 'ojs')]
# datastore_paths = [(ds, 'ojs/Store140_dominicks.csv'),(ds, 'ojs/Store141_dominicks.csv'),(ds, 'ojs/Store142_dominicks.csv')]
datastore_paths = [(ds, 'ojs/Store140*.csv')]

## each entry is a (datastore, path-within-datastore) tuple
fds = Dataset.File.from_files(path=datastore_paths)

In [6]:
## show the file paths the FileDataset resolves to, i.e. the files matching the path pattern
fds.to_path()

['/Store140_dominicks.csv',
 '/Store140_minute.maid.csv',
 '/Store140_tropicana.csv']

In [9]:
## download only a sample (up to 3 files) of the dataset into the local data
## folder, overwriting existing files; the cell displays the returned paths
sample_fds = fds.take(3)
sample_fds.download(target_path=os.path.join("./", loc_data), overwrite=True)

['/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/demo_data/Store140_dominicks.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/demo_data/Store140_minute.maid.csv',
 '/mnt/batch/tasks/shared/LS_root/mounts/clusters/ntb-dev2/code/Users/mimarusa/_DEMO_/azureml-demos/data/demo_data/Store140_tropicana.csv']

In [10]:
## check downloaded files: collect the names of regular files (not directories)
## sitting directly in the local data folder
tmppath = os.path.join("./", loc_data)
onlyfiles = []
for entry in os.listdir(tmppath):
    if os.path.isfile(os.path.join(tmppath, entry)):
        onlyfiles.append(entry)
print(onlyfiles)

['Store140_dominicks.csv', 'Store140_minute.maid.csv', 'Store140_tropicana.csv']


### Tabular Dataset

```python
from_delimited_files(path, separator=',', header=<PromoteHeadersBehavior.ALL_FILES_HAVE_SAME_HEADERS: 3>, encoding=<FileEncoding.UTF8: 0>, quoting=False, infer_column_types=True, skip_rows=0, skip_mode=<SkipLinesBehavior.NO_ROWS: 0>, comment=None, include_path=False, archive_options=None, partition_format=None)

```

In [60]:
## create a tabular dataset from the delimited files matching the glob 'ojs/Store14*.csv'

## alternative path specs kept for reference: the whole 'ojs' folder, or an explicit file list
# datastore_paths = [(ds, 'ojs')]
# datastore_paths = [(ds, 'ojs/Store140_dominicks.csv'),(ds, 'ojs/Store141_dominicks.csv'),(ds, 'ojs/Store142_dominicks.csv')]
datastore_paths = [(ds, 'ojs/Store14*.csv')]

## each entry is a (datastore, path-within-datastore) tuple; parsing options are left at their defaults
tds = Dataset.Tabular.from_delimited_files(path=datastore_paths)

In [61]:
## load the tabular dataset into a pandas DataFrame
pdf = tds.to_pandas_dataframe()

In [62]:
## summarize the DataFrame: row count, columns, non-null counts, dtypes and memory usage
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1089 entries, 0 to 1088
Data columns (total 7 columns):
WeekStarting    1089 non-null datetime64[ns]
Store           1089 non-null int64
Brand           1089 non-null object
Quantity        1089 non-null int64
Advert          1089 non-null int64
Price           1089 non-null float64
Revenue         1089 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(3), object(1)
memory usage: 59.7+ KB


In [64]:
## register the tabular dataset in the workspace under the name in `aml_dset`
## so that it can be fetched by name later; create_new_version=True asks for a
## new version to be created under that name

wtds = tds.register(
    workspace=ws,
    name=aml_dset,
    description='Sample: OJ Sales Simulated Data - sample of 3 stores',
    create_new_version=True,
)

In [None]:
# wtds_profile = wtds.get_profile(ws)

### Get data from an existing dataset

In [65]:
## fetch the registered dataset from the workspace by the name in `aml_dset`
wtds = Dataset.get_by_name(ws, name=aml_dset)
## preview the first 10 rows as a pandas DataFrame
wtds.to_pandas_dataframe().head(10)

Unnamed: 0,WeekStarting,Store,Brand,Quantity,Advert,Price,Revenue
0,1990-06-14,140,dominicks,9493,1,2.27,21549.11
1,1990-06-21,140,dominicks,12343,1,2.39,29499.77
2,1990-06-28,140,dominicks,15962,1,2.49,39745.38
3,1990-07-05,140,dominicks,19816,1,2.62,51917.92
4,1990-07-12,140,dominicks,11806,1,2.58,30459.48
5,1990-07-19,140,dominicks,14841,1,2.2,32650.2
6,1990-07-26,140,dominicks,19140,1,2.61,49955.4
7,1990-08-02,140,dominicks,17697,1,2.45,43357.65
8,1990-08-09,140,dominicks,15790,1,2.19,34580.1
9,1990-08-16,140,dominicks,10158,1,2.67,27121.86


## Training

In [None]:
## set the active mlflow experiment to the name configured in `aml_experiment`
mlflow.set_experiment(aml_experiment)
