In [13]:
from azureml.core import Workspace

src_folder = 'steps'
cluster_name = 'cpu-cluster'
env_name = 'data-drift-env'

DATASTORE_NAME = 'workspaceblobstore'
FILE_DATASET_NAME = 'datadrift_file_results'
json_file_path = f'datadrift/metrics/**/output_*.json'


ws = Workspace.from_config()
print('Ready to work with', ws.name)

Ready to work with evolve-ml


In [15]:
%%writefile -a $src_folder/utils.py

from contextlib import contextmanager
import os
import shutil


@contextmanager
def temp_directory(dir_name = 'temp', **kwds):
    
    os.makedirs(dir_name, exist_ok=True)
    
    try:
        # print(client.server_info())

        yield dir_name

    except Exception:
        print(f"Unable to create '{dir_name}'")

    finally:
        shutil.rmtree(dir_name)

Writing steps/utils.py


In [10]:
%%writefile $src_folder/collect-data-drift-output.py

import argparse
from azureml.core import Workspace, Dataset, Datastore, Run
from azureml.core.run import _OfflineRun
from azureml.data.dataset_factory import DataType



def parse_args():
    parser = argparse.ArgumentParser()
    #parser.add_argument('--output', dest='output', required=True)
    #parser.add_argument('--datadir', dest='datadir', required=True)

    return parser.parse_args()


args = parse_args()
print(f'Arguments: {args.__dict__}')


run = Run.get_context()
ws = Workspace.from_config() if type(run) == _OfflineRun else run.experiment.workspace

# Crate FileDataSet based on datadrift metrics which are saved in datastore as json files
dstore = Datastore.get(ws, DATASTORE_NAME)
file_dataset = Dataset.File.from_files(path=(dstore,json_file_path))
file_dataset.register(ws, FILE_DATASET_NAME, create_new_version=True)

#TODO: 
## add filter dataset
## add arguments instead of constants


Overwriting steps/collect-data-drift-output.py


In [16]:
%%writefile $src_folder/transform-data-drift-output.py

import argparse
import json
import bigjson
import os
import utils
from azureml.core import Workspace, Dataset, Datastore, Run
from azureml.core.run import _OfflineRun

TEMP_DIRECTORY = 'temp'

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
    parser.add_argument('--transformed-data', type=str, dest='transformed_data', default='transformed_data', help='Folder for results')
    return parser.parse_args()

# Get parameters
args = parse_args()
print(f'Arguments: {args.__dict__}')

save_folder = args.prepped_data
os.makedirs(save_folder, exist_ok=True)

# Get the experiment run context
run = Run.get_context()
file_dataset = run.input_datasets['raw_data']
ws = Workspace.from_config() if type(run) == _OfflineRun else run.experiment.workspace

with utils.temp_directory(TEMP_DIRECTORY):
    # Download json files defined by the dataset to temp directory
    json_file_paths = file_dataset.download(f'{TEMP_DIRECTORY}', overwrite=True)

    # Convert json files to jsonl files (in local directory) 
    for json_path in json_file_paths:
        
        # Read json file in streaming mode
        with open(json_path, 'rb') as f:
            json_data = bigjson.load(f)
            # Replace file name extension
            jsonl_path = os.path.splitext(json_path)[0]+'.jsonl'

            # Open jsonl file  
            with open(jsonl_path, 'w') as jsonl_file:
                # Iterates over input json
                for data in json_data:
                    # Converts json to a Python dict  
                    dict_data = data.to_python()
                    
                    # Saves the data to jsonl file
                    jsonl_file.write(json.dumps(dict_data)+"\n")
                    
        # Delete json file
        os.remove(json_path)

    # Upload jsonl files to datastore
    print("Saving Transformed Data...")
    output_dataset = Dataset.File.upload_directory(f'{TEMP_DIRECTORY}', target=save_folder)

#TODO: 
## move to util


Overwriting steps/transform-data-drift-output.py


In [17]:
%%writefile $src_folder/save-data-drift-output.py

import argparse
from azureml.core import Dataset, Datastore, Run
from azureml.core.run import _OfflineRun
from azureml.data.dataset_factory import DataType

PARTITION_FORMAT = '{DATADRIFT_ID}/{PARTITION_DATE:yyyy/MM/dd}/output_{RUN_ID}.json'


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--transformed-data", type=str, dest='transformed_data', help='transformed data')
    return parser.parse_args()

args = parse_args()
print(f'Arguments: {args.__dict__}')
transformed_data = args.transformed_data


run = Run.get_context()
ws = Workspace.from_config() if type(run) == _OfflineRun else run.experiment.workspace

# Crate TabularDataSet based on converted jsonl files
output_dataset = Dataset.Tabular.from_json_lines_files(path=transformed_data, partition_format=PARTITION_FORMAT)
output_dataset = output_dataset.register_on_complete(name='datadrift_results_pipeline', description = 'datadrift results pipeline')


Overwriting steps/save-data-drift-output.py


In [2]:
if False:
    from azureml.core.environment import Environment
    from azureml.core.conda_dependencies import CondaDependencies

    myenv = Environment(name=env_name)
    conda_dep = CondaDependencies()

    conda_dep.add_pip_package('bigjson')
    conda_dep.add_pip_package('azureml-defaults')

    # Adds dependencies to PythonSection of myenv
    myenv.python.conda_dependencies=conda_dep

    myenv.register(ws).build(ws)

In [18]:
from azureml.core import RunConfiguration, ComputeTarget, Environment

run_config = RunConfiguration()
run_config.environment = Environment.get(ws, env_name)
compute_target = ComputeTarget(ws, cluster_name)


In [22]:
from azureml.pipeline.core import PipelineData, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep
from azureml.core import ComputeTarget, Datastore, Dataset
from azureml.data import OutputFileDatasetConfig

#datadir_param = PipelineData('datadir', is_directory=True)
#evolve_param = PipelineData('evolve')
#azure_param = PipelineData('azure')
#compare_param = PipelineData('compare')

#collection_param = PipelineParameter(name="collection", default_value='test_datasets')
#repo_param = PipelineParameter(name="repo", default_value='test')


# Get data-drift output dataset
dstore = Datastore.get(ws, DATASTORE_NAME)
metrics_ds = Dataset.File.from_files(path=(dstore,json_file_path)) # add filter dataset

# Create an OutputFileDatasetConfig (temporary Data Reference) for data passed from step 1 to step 2
transformed_data = OutputFileDatasetConfig("transformed_data")

# Step 1, Run the data transofrm script (from json to jsonl)
transform_step = PythonScriptStep(
    name='transform data drift output',
    source_directory=src_folder,
    script_name='transform-data-drift-output.py',
    arguments = ['--input-data', metrics_ds.as_named_input('raw_data'),
                '--transformed-data', transformed_data],
    compute_target=compute_target, 
    runconfig=run_config, 
    allow_reuse=False,    
)

# Step 2, Run the saving transformed data script
save_step = PythonScriptStep(
    name='save data drift output',
    source_directory=src_folder,
    script_name='save-data-drift-output.py',
    arguments = ['--transformed-data', transformed_data.as_input()],
    compute_target=compute_target, 
    runconfig=run_config,    
    allow_reuse=False,
)

print("Pipeline steps defined")

Pipeline steps defined


In [23]:
from azureml.core import Experiment
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps = [transform_step, save_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'data-drift-output-exeriment')
pipeline_run = experiment.submit(pipeline) 
print("Pipeline submitted for execution.")

RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)


Pipeline is built.
Created step transform data drift output [f7993cf1][8ea49e87-4cc2-4f06-a540-485a5a392a8b], (This step will run and generate new outputs)
Created step save data drift output [b445b89f][29f5add6-d329-41c9-a84f-0b7caeeeef17], (This step will run and generate new outputs)
Submitted PipelineRun 15cde99d-5f52-4df1-9ec4-0ed0467abfae
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/15cde99d-5f52-4df1-9ec4-0ed0467abfae?wsid=/subscriptions/06c3d5bb-46a2-4d92-9508-c018d06f6452/resourcegroups/evolve-team-rg/workspaces/evolve-ml&tid=72a43063-967e-43c8-8121-0823266b2701
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 15cde99d-5f52-4df1-9ec4-0ed0467abfae
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/15cde99d-5f52-4df1-9ec4-0ed0467abfae?wsid=/subscriptions/06c3d5bb-46a2-4d92-9508-c018d06f6452/resourcegroups/evolve-team-rg/workspaces/evolve-ml&tid=72a43063-967e-43c8-8121-0823266b2701
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: cd6a389c-63ea-4c06-b85a-3db8e60c0f24
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/cd6a389c-63ea-4c06-b85a-3db8e60c0f24?wsid=/subscriptions/06c3d5bb-46a2-4d92-9508-c018d06f6452/resourcegroups/evolve-team-rg/workspaces/evolve-ml&tid=72a43063-967e-43c8-8121-0823266b2701
StepRun( transform data drift output ) Status: Running

StepRun(transform data drift output) Execution Summary
StepRun( transform data drift output ) Status: Failed

{
  "error": {
    "code": "UserError",
    "severity": null,
    "message": "AzureMLCompute job failed.\nExecutionFailed: [REDACTED]\n\texit_codes: 2",
    "messageFormat

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "{'code': ExecutionFailed, 'message': [{\"exit_code\":2,\"error_message\":\"Execution failed with error: [stderr]usage: transform-data-drift-output.py [-h] [--input-data RAW_DATASET_ID]\\n[stderr]                                      [--prepped-data PREPPED_DATA]\\n[stderr]transform-data-drift-output.py: error: unrecognized arguments: --transformed-data /mnt/azureml/cr/j/0ca39415c7584373a369225e2ec25425/cap/data-capability/wd/transformed_data\\nCleaning up all outstanding Run operations, waiting 300.0 seconds\\n0 items cleaning up...\\nCleanup took 7.152557373046875e-07 seconds\\n[stderr]Traceback (most recent call last):\\n[stderr]  File \\\"transform-data-drift-output.py\\\", line 19, in <module>\\n[stderr]    args = parse_args()\\n[stderr]  File \\\"transform-data-drift-output.py\\\", line 16, in parse_args\\n[stderr]    return parser.parse_args()\\n[stderr]  File \\\"/azureml-envs/azureml_d9438b93de534f7f3a68847348170eaf/lib/python3.6/argparse.py\\\", line 1733, in parse_args\\n[stderr]    self.error(msg % ' '.join(argv))\\n[stderr]  File \\\"/azureml-envs/azureml_d9438b93de534f7f3a68847348170eaf/lib/python3.6/argparse.py\\\", line 2389, in error\\n[stderr]    self.exit(2, _('%(prog)s: error: %(message)s\\\\n') % args)\\n[stderr]  File \\\"/azureml-envs/azureml_d9438b93de534f7f3a68847348170eaf/lib/python3.6/argparse.py\\\", line 2376, in exit\\n[stderr]    _sys.exit(status)\\n[stderr]SystemExit: 2\\n[stderr]\\n\",\"process_name\":\"/azureml-envs/azureml_d9438b93de534f7f3a68847348170eaf/bin/python\",\"error_file\":\"user_logs/std_log.txt\"}], 'target': , 'category': UserError, 'error_details': [{'key': exit_codes, 'value': 2}, ], 'inner_error': null}",
        "messageParameters": {},
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"{'code': ExecutionFailed, 'message': [{\\\"exit_code\\\":2,\\\"error_message\\\":\\\"Execution failed with error: [stderr]usage: transform-data-drift-output.py [-h] [--input-data RAW_DATASET_ID]\\\\n[stderr]                                      [--prepped-data PREPPED_DATA]\\\\n[stderr]transform-data-drift-output.py: error: unrecognized arguments: --transformed-data /mnt/azureml/cr/j/0ca39415c7584373a369225e2ec25425/cap/data-capability/wd/transformed_data\\\\nCleaning up all outstanding Run operations, waiting 300.0 seconds\\\\n0 items cleaning up...\\\\nCleanup took 7.152557373046875e-07 seconds\\\\n[stderr]Traceback (most recent call last):\\\\n[stderr]  File \\\\\\\"transform-data-drift-output.py\\\\\\\", line 19, in <module>\\\\n[stderr]    args = parse_args()\\\\n[stderr]  File \\\\\\\"transform-data-drift-output.py\\\\\\\", line 16, in parse_args\\\\n[stderr]    return parser.parse_args()\\\\n[stderr]  File \\\\\\\"/azureml-envs/azureml_d9438b93de534f7f3a68847348170eaf/lib/python3.6/argparse.py\\\\\\\", line 1733, in parse_args\\\\n[stderr]    self.error(msg % ' '.join(argv))\\\\n[stderr]  File \\\\\\\"/azureml-envs/azureml_d9438b93de534f7f3a68847348170eaf/lib/python3.6/argparse.py\\\\\\\", line 2389, in error\\\\n[stderr]    self.exit(2, _('%(prog)s: error: %(message)s\\\\\\\\n') % args)\\\\n[stderr]  File \\\\\\\"/azureml-envs/azureml_d9438b93de534f7f3a68847348170eaf/lib/python3.6/argparse.py\\\\\\\", line 2376, in exit\\\\n[stderr]    _sys.exit(status)\\\\n[stderr]SystemExit: 2\\\\n[stderr]\\\\n\\\",\\\"process_name\\\":\\\"/azureml-envs/azureml_d9438b93de534f7f3a68847348170eaf/bin/python\\\",\\\"error_file\\\":\\\"user_logs/std_log.txt\\\"}], 'target': , 'category': UserError, 'error_details': [{'key': exit_codes, 'value': 2}, ], 'inner_error': null}\",\n        \"messageParameters\": {},\n        \"details\": []\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}

In [None]:
from azureml.pipeline.core import PipelineRun
from azureml.core import Experiment

# Publish the pipeline from the run
submitted_pipeline_run = PipelineRun(experiment=Experiment(experiment, run_id=pipeline_run.id))
published_pipeline = submitted_pipeline_run.publish_pipeline(name='data-drift-output-pipeline',
    description='collect, transform and save datadrift output into dataset',
    version='1.0',
    continue_on_step_failure=False)

print('Pipeline scheduled.')

In [None]:
from azureml.pipeline.core import ScheduleRecurrence, Schedule

# Schedules a daily run of a published pipeline
daily = ScheduleRecurrence(frequency='Day', interval=1)
pipeline_schedule = Schedule.create(ws, name='data_drift_output_schedule',
                                        description='update data drift output every day',
                                        pipeline_id=published_pipeline.id,
                                        experiment_name='schedule_data_drift_output_pipeline',
                                        recurrence=daily)

print('Pipeline scheduled.')