# Basic scoring pipeline #

This notebook is a companion to `create_basic_training_pipeline.ipynb`; it demonstrates how to use the trained model for scoring.

In [None]:
from azureml.core import Workspace, Dataset
from azureml.core.datastore import Datastore
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import PipelineData
from azureml.pipeline.core import Pipeline
from azureml.core import Experiment
from msrest.exceptions import HttpOperationError

Because we are going to list all the files and send them to the Form Recognizer one by one, we need a few more parameters compared to the training pipeline:

- storage_name: a storage name that contains input data
- storage_key: a storage key to get access to the storage with input data
- container_name: the name of the container that contains folder with input data

In [None]:
# --- Azure ML workspace coordinates (fill in before running) ---
subscription_id = "<provide it here>"
resource_group = "<provide it here>"
wrksp_name = "<provide it here>"

# --- Compute cluster used for scoring ---
compute_name = "mycluster"
vm_size = "Standard_F2s_v2"
vm_priority = "lowpriority"
min_nodes, max_nodes = 0, 4

# --- Form Recognizer endpoint and key (fill in before running) ---
fr_endpoint = "<provide it here>"
fr_key = "<provide it here>"

# --- Blob storage that holds the input data (fill in name/key before running) ---
storage_name = "<provide it here>"
storage_key = "<provide it here>"
container_name = "data"

# --- Names under which the datastore and dataset are registered in AML ---
datastore_name = "data_ds"
scoring_ds_name = "basic_scoring"

# --- Folder used as the source directory of the scoring step ---
project_folder = "basic_scoring_steps"

Get a reference to the workspace. If it doesn't exist, there is no point in creating a new one, because it would not contain any trained models anyway.

In [None]:
# Look up the existing Azure ML workspace. Every later cell uses
# `aml_workspace`, so there is nothing useful to do if the lookup fails.
try:
    aml_workspace = Workspace.get(
        name=wrksp_name,
        subscription_id=subscription_id,
        resource_group=resource_group)
    print("Found the existing Workspace")
except Exception as e:
    # Report the actual error (the cause is not necessarily a missing
    # workspace) and re-raise: swallowing it would only postpone the failure
    # to a NameError on `aml_workspace` in the following cells.
    print(f"Unable to get the workspace '{wrksp_name}': {e}")
    raise

Create a compute cluster for scoring

In [None]:
# Reuse the compute target called `compute_name` when the workspace already
# has one; otherwise provision a new AmlCompute cluster and wait for it.
if compute_name in aml_workspace.compute_targets:
    compute_target = aml_workspace.compute_targets[compute_name]
    if compute_target and isinstance(compute_target, AmlCompute):
        print(f"Found existing compute target {compute_name} so using it")
    else:
        # The existing target is still used (as before), but say so explicitly
        # instead of staying silent about the unexpected type.
        print(
            f"Compute target {compute_name} exists but is not an AmlCompute "
            f"cluster (got {type(compute_target).__name__}); using it as is")
else:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        vm_priority=vm_priority,
        min_nodes=min_nodes,
        max_nodes=max_nodes,
    )

    compute_target = ComputeTarget.create(
        aml_workspace, compute_name, compute_config)
    # Block until provisioning finishes, streaming progress to the cell output.
    compute_target.wait_for_completion(show_output=True)

We need to mount the blob storage with the input data to the scoring compute cluster. To do that, we register the blob container as a datastore in AML.

In [None]:
# Get the datastore registered under `datastore_name`; if the lookup fails
# with HttpOperationError, register the blob container under that name.
try:
    blob_datastore = Datastore.get(aml_workspace, datastore_name)
    print(f"Found Blob Datastore with name: {datastore_name}")
except HttpOperationError:
    blob_datastore = Datastore.register_azure_blob_container(
        workspace=aml_workspace,
        datastore_name=datastore_name,
        account_name=storage_name,
        container_name=container_name,
        account_key=storage_key)
    # Only report a registration when one actually happened (this message used
    # to be printed even when the existing datastore had been found).
    print(f"Registered blob datastore with name: {datastore_name}")

Now we can create a dataset that contains all the input files (it is not strictly necessary for this example, but it is very useful for a parallel step).

In [None]:
# Build a file dataset from the "Test" path of the blob datastore and register
# it in the workspace as a new version under `scoring_ds_name`.
scoring_file_path = blob_datastore.path("Test")
scoring_file_dataset = Dataset.File.from_files(
    path=scoring_file_path,
    validate=True,
).register(
    workspace=aml_workspace,
    name=scoring_ds_name,
    create_new_version=True,
)
print("Dataset has been registered")

We need pipeline data object to store all outputs from scoring

In [None]:
# Pipeline output named "scoring_output", backed by the blob datastore.
scoring_output = PipelineData(name="scoring_output", datastore=blob_datastore)

There is just one step here: scoring. It lists all the files and scores them one by one.

In [None]:
# The only step of the pipeline: run score.py from `project_folder` on the
# compute target, with the file dataset passed as the named input
# "scoring_files" and `scoring_output` as the step output.
# NOTE(review): fr_key is passed to the script as a plain argument - confirm
# this is acceptable, or pass the secret another way (e.g. Key Vault).
scoring_input = scoring_file_dataset.as_named_input("scoring_files")
scoring_arguments = [
    "--output", scoring_output,
    "--fr_endpoint", fr_endpoint,
    "--fr_key", fr_key,
]
scoring_step = PythonScriptStep(
    name="scoring",
    script_name="score.py",
    source_directory=project_folder,
    compute_target=compute_target,
    arguments=scoring_arguments,
    inputs=[scoring_input],
    outputs=[scoring_output],
)

In [None]:
# The pipeline consists of a single step: the scoring step.
steps = [scoring_step]

Create a pipeline object with one step only

In [None]:
# Assemble the pipeline from the list of steps inside the workspace.
pipeline = Pipeline(
    steps=steps,
    workspace=aml_workspace,
)

Execute the pipeline

In [None]:
# Submit the pipeline under the 'scoring_basic_exp' experiment and block until
# the run has finished.
scoring_experiment = Experiment(workspace=aml_workspace, name='scoring_basic_exp')
pipeline_run = scoring_experiment.submit(pipeline)
pipeline_run.wait_for_completion()

Register the pipeline as a reusable entity in AML

In [None]:
# Publish the pipeline so it can be reused from AML; the published pipeline is
# left as the last expression so the cell still displays it.
published_pipeline = pipeline.publish(
    description="Scoring data using form recognizer single model",
    name="basic_scoring",
)
published_pipeline