In [28]:
import os
import pandas as pd
from azureml.core.model import Model
from azureml.core import Workspace
from azureml.core import Experiment

from azureml.core.dataset import Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.data.output_dataset_config import OutputFileDatasetConfig
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import RunConfiguration

In [29]:
# Workspace handle built from the local Azure ML config; the cells below use
# `ws` to reach the datastore, compute targets, experiments and datasets.
ws = Workspace.from_config()

In [30]:
# All pipeline inputs and outputs are addressed relative to the workspace's
# default datastore.
datastore = ws.get_default_datastore()

# Pipeline input: the credit CSV on the datastore, handed to the step under the
# name "input_dataset".
dataset = Dataset.Tabular.from_delimited_files(
    path=[(datastore, 'german_credit_dataset.csv')]
)
input_data = DatasetConsumptionConfig(name="input_dataset", dataset=dataset)

# Hand-off location between the preprocessing and scoring steps. `{run-id}` is
# passed through as a literal placeholder for Azure ML to resolve at run time.
intermediate_data = OutputFileDatasetConfig(
    name='intermediate_dataset',
    destination=(datastore, 'intermediate/{run-id}'),
)

# Final scoring output; register_on_complete returns the config that also
# registers the written files as the 'batch-scoring-results' dataset.
result_data = OutputFileDatasetConfig(
    name='result_dataset',
    destination=(datastore, 'result/{run-id}'),
)
result_data = result_data.register_on_complete('batch-scoring-results')




In [31]:
compute_name = 'batch-comp'

# Reuse the cluster if the workspace already has one under this name;
# otherwise provision it and block until provisioning finishes.
if compute_name in ws.compute_targets:
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
else:
    # Deliberately NOT called `config`: the pipeline steps read `config` as
    # their RunConfiguration, and binding the provisioning settings to the same
    # name would let a skipped or out-of-order cell hand them the wrong object.
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_DS11_V2",
        vm_priority="lowpriority",
        min_nodes=1,
        max_nodes=2,
    )

    compute_target = ComputeTarget.create(
        workspace=ws,
        name=compute_name,
        provisioning_configuration=provisioning_config,
    )
    # Wait (up to 20 minutes) so later cells never submit to a half-created cluster.
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

In [32]:
# Python environment for the pipeline scripts: default conda dependencies plus
# a pinned scikit-learn installed through pip.
conda_dep = CondaDependencies()
conda_dep.add_pip_package("scikit-learn==0.22")
# `config` is what both pipeline steps below pass as `runconfig`.
config = RunConfiguration(conda_dependencies=conda_dep)

In [33]:
# Settings common to both steps: same cluster, same run configuration, same
# folder holding the step scripts.
shared_step_settings = dict(
    compute_target=compute_target,
    runconfig=config,
    source_directory='./batch_scripts',
)

# Step 1: consumes the input dataset and writes to the intermediate location,
# whose path is passed to the script via --intermediate-data-path.
preprocessing_step = PythonScriptStep(
    name='preprocessing_step',
    script_name="preprocessing_step.py",
    arguments=['--intermediate-data-path', intermediate_data],
    inputs=[input_data],
    outputs=[intermediate_data],
    allow_reuse=True,
    **shared_step_settings,
)

# Step 2: consumes the intermediate output and writes the scored results; it
# depends on step 1 through the shared `intermediate_data` object.
scoring_step = PythonScriptStep(
    name='scoring_step',
    script_name="scoring_step.py",
    arguments=[
        '--intermediate-data-path', intermediate_data,
        '--result-data-path', result_data,
    ],
    inputs=[intermediate_data],
    outputs=[result_data],
    **shared_step_settings,
)

In [34]:
# Assemble the two steps into a pipeline and submit it under the
# 'batch-score' experiment.
scoring_pipeline = Pipeline(workspace=ws, steps=[preprocessing_step, scoring_step])
experiment = Experiment(workspace=ws, name='batch-score')
pipeline_run = experiment.submit(scoring_pipeline)

# Block until the run ends; left as the cell's last expression so the final
# status is displayed.
pipeline_run.wait_for_completion(show_output=False)

Created step preprocessing_step [46669229][2ab4e371-9029-4c4b-99ba-6826efe70af0], (This step is eligible to reuse a previous run's output)
Created step scoring_step [c5b3306f][d9f1bb8f-b498-43c5-824d-4e91a91d7113], (This step will run and generate new outputs)
Submitted PipelineRun 0c709be1-179b-4216-9058-94594bb17c4e
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/batch-score/runs/0c709be1-179b-4216-9058-94594bb17c4e?wsid=/subscriptions/823af982-da0d-47e1-8124-3c00e4053556/resourcegroups/jrie_test/workspaces/holdev
PipelineRunId: 0c709be1-179b-4216-9058-94594bb17c4e
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/batch-score/runs/0c709be1-179b-4216-9058-94594bb17c4e?wsid=/subscriptions/823af982-da0d-47e1-8124-3c00e4053556/resourcegroups/jrie_test/workspaces/holdev
{'runId': '0c709be1-179b-4216-9058-94594bb17c4e', 'status': 'Completed', 'startTimeUtc': '2020-12-18T17:06:22.004849Z', 'endTimeUtc': '2020-12-18T17:07:58.472964Z', 'properties

'Finished'

In [37]:
# Fetch the newest version of the dataset registered by the scoring step.
dataset = Dataset.get_by_name(workspace=ws, name='batch-scoring-results', version="latest")

# download() yields the local paths of the fetched files; only the first one
# is previewed here.
df_path = dataset.download(target_path='data/batch_scoring_results', overwrite=True)
scored_preview = pd.read_csv(df_path[0])
scored_preview.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk,prediction
0,0,0,67,male,2,own,,little,1169,6,radio/TV,good,1
1,1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad,0
2,2,2,49,male,1,own,little,,2096,12,education,good,1
3,3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good,0
4,4,4,53,male,2,free,little,little,4870,24,car,bad,1


In [36]:
# Load the newest registered version of the source credit dataset into pandas
# and preview its first rows for comparison with the scored output.
dataset = Dataset.get_by_name(workspace=ws, name='german_credit_dataset', version="latest")
ds_df = dataset.to_pandas_dataframe()
ds_df.head()


Unnamed: 0,Sno,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
