In [15]:
from azureml.core import Workspace

# Local folder that the %%writefile cells write the pipeline step scripts into;
# it is also passed as `source_directory` for every pipeline step.
src_folder = 'steps'
# Name of the compute target the pipeline steps are submitted to.
cluster_name = 'cpu-cluster'
# Name of the Azure ML environment attached to the steps' run configuration.
env_name = "data-drift-env"


# Load the workspace handle that every later cell uses.
ws = Workspace.from_config()
print('Ready to work with', ws.name)

Ready to work with evolve-ml


In [16]:
%%writefile -a $src_folder/utils.py

from contextlib import contextmanager
import os
import shutil


@contextmanager
def temp_directory(dir_name = 'temp', **kwds):
    """Provide a scratch directory for the duration of a ``with`` block.

    The directory is created on entry (an already existing directory is reused)
    and removed, together with everything in it, on exit.

    Args:
        dir_name: Path of the directory to create and later delete.
        **kwds: Accepted for backward compatibility; currently unused.

    Yields:
        The ``dir_name`` that was passed in.

    Raises:
        OSError: If the directory cannot be created.
        Exception: Anything raised inside the ``with`` body propagates to the
            caller after the directory has been cleaned up.
    """
    os.makedirs(dir_name, exist_ok=True)

    try:
        yield dir_name
    finally:
        # Errors from the with-body must reach the caller: swallowing them here
        # would make a failed pipeline step look successful.
        # Cleanup is best effort so it never masks the original error, and so a
        # body that already removed the directory does not fail on exit.
        shutil.rmtree(dir_name, ignore_errors=True)

Appending to steps/utils.py


In [17]:
%%writefile $src_folder/collect-data-drift-output.py

import argparse
from azureml.core import Workspace, Dataset, Datastore, Run
from azureml.core.run import _OfflineRun
from azureml.data.dataset_factory import DataType

# Datastore that holds the data drift metrics files.
DATASTORE_NAME = 'workspaceblobstore'
# Name under which the FileDataset of raw metrics files is registered.
FILE_DATASET_NAME = 'datadrift_file_results'
# Glob (relative to the datastore root) selecting the metrics json files.
# Plain string: the previous f-prefix had no placeholders to interpolate.
json_file_path = 'datadrift/metrics/**/output_*.json'


def parse_args():
    """Parse the command line of the collect step.

    No options are declared yet, so the returned namespace carries no
    attributes; unknown flags are still rejected by argparse.

    Returns:
        argparse.Namespace: The parsed (currently empty) arguments.
    """
    arg_parser = argparse.ArgumentParser()
    parsed = arg_parser.parse_args()
    return parsed


args = parse_args()
print(f'Arguments: {args.__dict__}')


# When the run context is an _OfflineRun, fall back to the local workspace
# config; otherwise use the workspace of the run's experiment.
# isinstance() is the idiomatic type check and also accepts subclasses.
run = Run.get_context()
ws = Workspace.from_config() if isinstance(run, _OfflineRun) else run.experiment.workspace

# Create a FileDataset over the data drift metrics json files in the datastore
# and register it (as a new version) so other steps can look it up by name.
dstore = Datastore.get(ws, DATASTORE_NAME)
file_dataset = Dataset.File.from_files(path=(dstore, json_file_path))
file_dataset.register(ws, FILE_DATASET_NAME, create_new_version=True)

#TODO: 
## add filter dataset
## add arguments instead of constants


Overwriting steps/collect-data-drift-output.py


In [31]:
%%writefile $src_folder/transform-data-drift-output.py

import argparse
import json
import bigjson
import os
import utils
from azureml.core import Workspace, Dataset, Datastore, Run
from azureml.core.run import _OfflineRun

# Datastore the converted files are uploaded to.
DATASTORE_NAME = 'workspaceblobstore'
# Path inside the datastore that receives the converted jsonl files.
DATASTORE_PATH_PREFIX = 'datadrift_results'
# Local scratch directory used for the download/convert/upload round trip.
TEMP_DIRECTORY = 'temp'

def parse_args():
    """Parse the command line of the transform step.

    No options are declared yet, so the returned namespace carries no
    attributes; unknown flags are still rejected by argparse.

    Returns:
        argparse.Namespace: The parsed (currently empty) arguments.
    """
    arg_parser = argparse.ArgumentParser()
    parsed = arg_parser.parse_args()
    return parsed


args = parse_args()
print(f'Arguments: {args.__dict__}')

# Name of the registered FileDataset that holds the raw metrics json files.
# NOTE(review): must stay equal to FILE_DATASET_NAME in
# collect-data-drift-output.py; the steps do not share a constants module yet.
FILE_DATASET_NAME = 'datadrift_file_results'

# When the run context is an _OfflineRun, fall back to the local workspace
# config; otherwise use the workspace of the run's experiment.
run = Run.get_context()
ws = Workspace.from_config() if isinstance(run, _OfflineRun) else run.experiment.workspace
dstore = Datastore.get(ws, DATASTORE_NAME)

# `file_dataset` was previously used without being defined (NameError).
# Each step runs as its own script, so the dataset is looked up by name.
file_dataset = Dataset.get_by_name(ws, FILE_DATASET_NAME)

with utils.temp_directory(TEMP_DIRECTORY):
    # Download the json files referenced by the dataset into the temp directory.
    json_file_paths = file_dataset.download(TEMP_DIRECTORY, overwrite=True)

    # Convert every json file into a jsonl file next to it.
    for json_path in json_file_paths:
        # Same path, '.jsonl' extension.
        jsonl_path = os.path.splitext(json_path)[0] + '.jsonl'

        # bigjson reads the input in streaming mode; each element of the
        # top-level json value becomes one line of the jsonl output.
        with open(json_path, 'rb') as f, open(jsonl_path, 'w') as jsonl_file:
            for data in bigjson.load(f):
                jsonl_file.write(json.dumps(data.to_python()) + "\n")

        # Remove the source json so only jsonl files get uploaded.
        os.remove(json_path)

    # Upload the jsonl files to the datastore. The pipeline is re-run on a
    # schedule and converts every metrics file each time, so files uploaded by
    # an earlier run are overwritten.
    output_dataset = Dataset.File.upload_directory(
        TEMP_DIRECTORY,
        target=(dstore, DATASTORE_PATH_PREFIX),
        overwrite=True,
    )

#TODO: 
## move to util
## add arguments instead of constants
## pass file_dataset from previous step


Overwriting steps/transform-data-drift-output.py


In [19]:
%%writefile $src_folder/save-data-drift-output.py

import argparse

from azureml.core import Dataset, Datastore, Run, Workspace
from azureml.core.run import _OfflineRun
from azureml.data.dataset_factory import DataType

# Datastore that holds the converted jsonl files.
DATASTORE_NAME = 'workspaceblobstore'
# Name under which the resulting TabularDataset is registered.
TABULAR_DATASET_NAME = 'datadrift_tabular_results'
# Partition columns extracted from each file path. The format has to describe
# the path through to the end of the file name, so the extension must be
# '.jsonl' like the files selected by jsonl_file_path (it used to say '.json').
PARTITION_FORMAT = '{DATADRIFT_ID}/{PARTITION_DATE:yyyy/MM/dd}/output_{RUN_ID}.jsonl'
# Path inside the datastore where the converted files are stored.
DATASTORE_PATH_PREFIX = 'datadrift_results'
# Glob (relative to the datastore root) selecting the converted jsonl files.
jsonl_file_path = DATASTORE_PATH_PREFIX + '/**/output_*.jsonl'


def parse_args():
    """Parse the command line of the save step.

    No options are declared yet, so the returned namespace carries no
    attributes; unknown flags are still rejected by argparse.

    Returns:
        argparse.Namespace: The parsed (currently empty) arguments.
    """
    arg_parser = argparse.ArgumentParser()
    parsed = arg_parser.parse_args()
    return parsed


args = parse_args()
print(f'Arguments: {args.__dict__}')


# When the run context is an _OfflineRun, fall back to the local workspace
# config (this branch needs `Workspace` to be imported); otherwise use the
# workspace of the run's experiment.
run = Run.get_context()
ws = Workspace.from_config() if isinstance(run, _OfflineRun) else run.experiment.workspace

# Create a TabularDataset from the converted jsonl files and register it
# as a new version under TABULAR_DATASET_NAME.
dstore = Datastore.get(ws, DATASTORE_NAME)
output_dataset = Dataset.Tabular.from_json_lines_files(
    path=(dstore, jsonl_file_path),
    partition_format=PARTITION_FORMAT,
)
output_dataset.register(ws, TABULAR_DATASET_NAME, create_new_version=True)

#TODO: 
## add arguments instead of constants

Overwriting steps/save-data-drift-output.py


In [2]:
# One-off setup: set to True to (re)register and build the environment in the
# workspace, then switch it back so a normal "Run All" does not rebuild it.
REGISTER_ENVIRONMENT = False

if REGISTER_ENVIRONMENT:
    from azureml.core.environment import Environment
    from azureml.core.conda_dependencies import CondaDependencies

    myenv = Environment(name=env_name)
    conda_dep = CondaDependencies()

    # bigjson is imported by the transform step script.
    conda_dep.add_pip_package('bigjson')
    conda_dep.add_pip_package('azureml-defaults')

    # Adds dependencies to PythonSection of myenv
    myenv.python.conda_dependencies = conda_dep

    myenv.register(ws).build(ws)

In [20]:
from azureml.core import RunConfiguration, ComputeTarget, Environment

# Run configuration shared by all pipeline steps: it carries the registered
# environment looked up by name in the workspace.
run_config = RunConfiguration()
run_config.environment = Environment.get(workspace=ws, name=env_name)

# Compute target (looked up by name) that the steps are submitted to.
compute_target = ComputeTarget(workspace=ws, name=cluster_name)


In [33]:
from azureml.pipeline.core import PipelineData, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep
from azureml.core import ComputeTarget

#datadir_param = PipelineData('datadir', is_directory=True)
#evolve_param = PipelineData('evolve')
#azure_param = PipelineData('azure')
#compare_param = PipelineData('compare')

#collection_param = PipelineParameter(name="collection", default_value='test_datasets')
#repo_param = PipelineParameter(name="repo", default_value='test')

# Step 1: register a FileDataset over the raw data drift metrics json files.
collect_step = PythonScriptStep(
    name='collect data drift output',
    source_directory=src_folder,
    script_name='collect-data-drift-output.py',
    compute_target=compute_target,
    runconfig=run_config,
    allow_reuse=False,
)

# Step 2: convert the json files to jsonl and upload them to the datastore.
transform_step = PythonScriptStep(
    name='transform data drift output',
    source_directory=src_folder,
    script_name='transform-data-drift-output.py',
    compute_target=compute_target,
    runconfig=run_config,
    allow_reuse=False,
)

# Step 3: register a TabularDataset over the uploaded jsonl files.
save_step = PythonScriptStep(
    name='save data drift output',
    source_directory=src_folder,
    script_name='save-data-drift-output.py',
    compute_target=compute_target,
    runconfig=run_config,
    allow_reuse=False,
)

# The steps exchange data through the datastore and registered datasets, not
# through pipeline inputs/outputs, so the pipeline cannot infer their order.
# Without explicit dependencies they may run concurrently; enforce
# collect -> transform -> save.
transform_step.run_after(collect_step)
save_step.run_after(transform_step)

print("Pipeline steps defined")

Pipeline steps defined


In [34]:
from azureml.core import Experiment
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Assemble the three steps into a pipeline.
pipeline_steps = [collect_step, transform_step, save_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline under a dedicated experiment.
# NOTE(review): the experiment name contains a typo ('exeriment'); it is kept
# unchanged here because renaming it would start a new experiment.
experiment = Experiment(workspace=ws, name='data-drift-output-exeriment')
pipeline_run = experiment.submit(pipeline)
print("Pipeline submitted for execution.")

# Show the run widget, then block until the run has finished while streaming
# its output into the cell.
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)


Pipeline is built.
Created step collect data drift output [b7e380ae][82efed4b-f05c-4261-a3cb-d81e385e4525], (This step will run and generate new outputs)Created step transform data drift output [7ab0f089][2e347405-0cb1-4e7e-b4f1-d4d7ecc466ba], (This step will run and generate new outputs)

Created step save data drift output [dc169218][a39e30c6-23ae-450f-be09-79425a09c71d], (This step will run and generate new outputs)
Submitted PipelineRun 4ed29cd0-bc79-4295-873c-2cb4d3bde788
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4ed29cd0-bc79-4295-873c-2cb4d3bde788?wsid=/subscriptions/06c3d5bb-46a2-4d92-9508-c018d06f6452/resourcegroups/evolve-team-rg/workspaces/evolve-ml&tid=72a43063-967e-43c8-8121-0823266b2701
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 4ed29cd0-bc79-4295-873c-2cb4d3bde788
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/4ed29cd0-bc79-4295-873c-2cb4d3bde788?wsid=/subscriptions/06c3d5bb-46a2-4d92-9508-c018d06f6452/resourcegroups/evolve-team-rg/workspaces/evolve-ml&tid=72a43063-967e-43c8-8121-0823266b2701
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: df55eae4-d8b8-4943-8ee4-9718a7444070
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/df55eae4-d8b8-4943-8ee4-9718a7444070?wsid=/subscriptions/06c3d5bb-46a2-4d92-9508-c018d06f6452/resourcegroups/evolve-team-rg/workspaces/evolve-ml&tid=72a43063-967e-43c8-8121-0823266b2701
StepRun( save data drift output ) Status: Running

StepRun(save data drift output) Execution Summary
StepRun( save data drift output ) Status: Finished
{'runId': 'df55eae4-d8b8-4943-8ee4-9718a7444070', 'target': 'cpu-cluster', 'status': 'Completed', 'startTimeUtc': '2022-04-13T20:16:27.653719Z', 'endTimeUtc': '2022-04-13T20:17:09.48341

'Finished'

In [None]:
from azureml.pipeline.core import PipelineRun
from azureml.core import Experiment

# Publish the pipeline from the run.
# PipelineRun takes the experiment and the run id as two separate arguments;
# the run id was previously passed to Experiment(...) by a misplaced parenthesis.
submitted_pipeline_run = PipelineRun(experiment=experiment, run_id=pipeline_run.id)
published_pipeline = submitted_pipeline_run.publish_pipeline(
    name='data-drift-output-pipeline',
    description='collect, transform and save datadrift output into dataset',
    version='1.0',
    continue_on_step_failure=False,
)

# This cell publishes the pipeline; scheduling happens in a later cell.
print('Pipeline published.')

In [None]:
from azureml.pipeline.core import ScheduleRecurrence, Schedule

# Run the published pipeline once every day.
daily = ScheduleRecurrence(frequency='Day', interval=1)

pipeline_schedule = Schedule.create(
    ws,
    name='data_drift_output_schedule',
    description='update data drift output every day',
    pipeline_id=published_pipeline.id,
    experiment_name='schedule_data_drift_output_pipeline',
    recurrence=daily,
)

print('Pipeline scheduled.')