# Create a pipeline for batch inferencing
Imagine a health clinic that takes patient measurements all day, saving the details for each patient in a separate file. Then overnight, the diabetes prediction model can be used to process all of the day's patient data as a batch, generating predictions that will be waiting the following morning so that the clinic can follow up with patients who are predicted to be at risk of diabetes. That's what you'll implement in this project.

## Connect to workspace

In [1]:
import azureml.core
from azureml.core import Workspace

# Connect to the Azure ML workspace described by the local config.json
ws = Workspace.from_config()
sdk_version = azureml.core.VERSION
print('Ready to use Azure ML {} to work with {}'.format(sdk_version, ws.name))

Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception unknown locale: UTF-8.


UserErrorException: UserErrorException:
	Message: We could not find config.json in: /Users/xiaoyingliu/Desktop/On_Projects or in its parent directories. Please provide the full path to the config file or ensure that config.json exists in the parent directories.
	InnerException None
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "message": "We could not find config.json in: /Users/xiaoyingliu/Desktop/On_Projects or in its parent directories. Please provide the full path to the config file or ensure that config.json exists in the parent directories."
    }
}

## Train and Register a Model

In [2]:
from azureml.core import Experiment
from azureml.core import Model
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Train a decision tree on the diabetes data inside an Azure ML experiment
# run, log its accuracy and AUC, then register the pickled model in the
# workspace under the name 'diabetes_model'.

# Create an Azure ML experiment in your workspace
experiment = Experiment(workspace=ws, name="diabetes-training")
run = experiment.start_logging()
print("Starting experiment:", experiment.name)

# load the diabetes dataset
print("Loading Data...")
diabetes = pd.read_csv('data/diabetes.csv')

# Separate features and labels
feature_columns = ['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
                   'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age']
X, y = diabetes[feature_columns].values, diabetes['Diabetic'].values

# Split data into training set and test set (70% / 30%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a decision tree model
print('Training a decision tree model')
model = DecisionTreeClassifier().fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', acc)
# np.float was removed in NumPy 1.24; the builtin float is the equivalent
run.log('Accuracy', float(acc))

# calculate AUC from the predicted probability of the positive class
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
print('AUC: ' + str(auc))
run.log('AUC', float(auc))

# Save the trained model locally and attach it to the run's outputs
model_file = 'diabetes_model.pkl'
joblib.dump(value=model, filename=model_file)
run.upload_file(name='outputs/' + model_file, path_or_stream='./' + model_file)

# Complete the run
run.complete()

# Register the model, recording the logged metrics as model properties
metrics = run.get_metrics()
run.register_model(model_path='outputs/diabetes_model.pkl', model_name='diabetes_model',
                   tags={'Training context': 'Inline Training'},
                   properties={'AUC': metrics['AUC'], 'Accuracy': metrics['Accuracy']})

print('Model trained and registered.')

NameError: name 'ws' is not defined

## Generate and upload batch data
Since we don't actually have a fully staffed clinic with patients from whom to get new data for this course, you'll generate a random sample from our diabetes CSV file and use those to test the pipeline. Then you'll upload that data to a datastore in the Azure Machine Learning workspace and register a dataset for it.

In [3]:
from azureml.core import Dataset, Datastore
import pandas as pd
import os

# Build a 100-patient sample, write one CSV file per patient, upload the
# files to the workspace's default datastore and register them as a file
# dataset named 'batch-data'.

# Load the diabetes data
# NOTE(review): the recorded run of this cell failed because this file was
# missing locally - confirm the path before running.
diabetes = pd.read_csv('data/diabetes2.csv')
# Get a 100-item sample of the feature columns (not the diabetic label)
feature_columns = ['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
                   'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age']
sample = diabetes[feature_columns].sample(n=100).values

# Create a folder
batch_folder = './batch-data'
os.makedirs(batch_folder, exist_ok=True)
print("Folder created!")

# Save each sample as a separate file (1.csv, 2.csv, ...)
print("Saving files...")
for i, row in enumerate(sample, start=1):
    fname = str(i) + '.csv'
    row.tofile(os.path.join(batch_folder, fname), sep=",")
print("files saved!")

# Upload the files to the default datastore. The dataset registered below
# points at 'batch-data/' on that datastore, so without this upload it
# would reference a path that holds none of the generated files.
print("Uploading files to datastore...")
default_ds = ws.get_default_datastore()
default_ds.upload(src_dir=batch_folder, target_path='batch-data', overwrite=True, show_progress=True)

# Register a dataset for the input data
batch_data_set = Dataset.File.from_files(path=(default_ds, 'batch-data/'), validate=False)
try:
    batch_data_set = batch_data_set.register(workspace=ws,
                                             name='batch-data',
                                             description='batch data',
                                             create_new_version=True)
except Exception as ex:
    # Registration failure is reported but not fatal: the unregistered
    # dataset object is still bound to batch_data_set.
    print(ex)

print("Done!")

FileNotFoundError: File b'data/diabetes2.csv' does not exist

## Create Compute
We'll need a compute context for the pipeline, so we'll create an Azure Machine Learning compute cluster in your workspace (or use an existing one if you have created it previously)

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "your-compute-cluster"

# Reuse the named compute target when the workspace already has it;
# otherwise provision a new cluster (at most 2 nodes) and wait for it.
try:
    inference_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    try:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_DS11_V2',
            max_nodes=2,
        )
        inference_cluster = ComputeTarget.create(
            workspace=ws,
            name=cluster_name,
            provisioning_configuration=compute_config,
        )
        inference_cluster.wait_for_completion(show_output=True)
    except Exception as err:
        # Provisioning problems are printed rather than raised
        print(err)

NameError: name 'ws' is not defined

## Create Pipeline
Our pipeline will need Python code to perform the batch inferencing, so let's create a folder where we can keep all the files used by the pipeline:
Next we'll create a Python script to do the actual work, and save it in the pipeline folder:
Next we'll define a run context that includes the dependencies required by the script

You're going to use a pipeline to run the batch prediction script, generate predictions from the input data, and save the results as a text file in the output folder. To do this, you can use a ParallelRunStep, which enables the batch data to be processed in parallel and the results collated in a single output file named parallel_run_step.txt.

 It's time to put the step into a pipeline, and run it.

In [None]:
import os

# Create a folder for the pipeline's script files.
# os.makedirs with exist_ok=True makes the cell safe to re-run.
experiment_folder = 'batch-pipeline'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

In [None]:
%%writefile $experiment_folder/batch_diabetes.py
import os
import numpy as np
from azureml.core import Model
import joblib

def init():
    """Load the registered 'diabetes_model' into the module-level ``model``.

    Intended to run when the pipeline step is initialized, so that ``run``
    can reuse the loaded model for every mini-batch.
    """
    global model
    model = joblib.load(Model.get_model_path('diabetes_model'))
    
    
def run(mini_batch):
    """Score every file in a mini-batch with the model loaded by ``init``.

    Args:
        mini_batch: iterable of paths to comma-delimited files, each holding
            the feature values for one observation.

    Returns:
        A list with one ``"<file name>: <prediction>"`` string per input file.
    """
    results = []
    for f in mini_batch:
        # Read the comma-delimited values into a 1-D array
        data = np.genfromtxt(f, delimiter=',')
        # The model expects a 2-D array: one row, all features
        prediction = model.predict(data.reshape(1, -1))
        # One "name: label" entry per file; ':' is the separator used when
        # the collated output is read back as a two-column table
        results.append('{}: {}'.format(os.path.basename(f), prediction[0]))
    return results

In [None]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core.runconfig import CondaDependencies

# Dependencies required by the scoring script:
# - scikit-learn for the model itself
# - azureml-core and azureml-dataprep[fuse] for parallel pipeline steps
cd = CondaDependencies.create(pip_packages=['scikit-learn', 'azureml-defaults', 'azureml-core', 'azureml-dataprep[fuse]'])

# Environment the batch step runs in: the packages above on the default CPU image
batch_env = Environment(name='batch_environment')
batch_env.python.conda_dependencies = cd
batch_env.docker.enabled = True
batch_env.docker.base_image = DEFAULT_CPU_IMAGE
print('configuration ready.')

In [5]:
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
from azureml.pipeline.core import PipelineData

# Where the collated predictions are written on the default datastore
default_ds = ws.get_default_datastore()
output_dir = PipelineData(name='inferences',
                          datastore=default_ds,
                          output_path_on_compute='diabetes/results')

# How the scoring script is executed: which script and environment to use,
# how many files go into each mini-batch, and how many nodes share the work.
parallel_run_config = ParallelRunConfig(
    source_directory=experiment_folder,
    entry_script="batch_diabetes.py",
    mini_batch_size="5",
    error_threshold=10,
    output_action="append_row",
    environment=batch_env,
    compute_target=inference_cluster,
    node_count=2)

# The pipeline step itself: feeds the registered batch dataset to the
# scoring script and writes the results to output_dir.
parallelrun_step = ParallelRunStep(
    name='batch-score-diabetes',
    parallel_run_config=parallel_run_config,
    inputs=[batch_data_set.as_named_input('diabetes_batch')],
    output=output_dir,
    arguments=[],
    allow_reuse=True
)

print('Steps defined')

ValueError: unknown locale: UTF-8

In [6]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline

# Wrap the single batch-scoring step in a pipeline and submit it as a run
# of the 'batch_prediction_pipeline' experiment, then block until it ends.
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])
pipeline_run = Experiment(
    workspace=ws,
    name='batch_prediction_pipeline',
).submit(pipeline)
print('Running pipeline...')
pipeline_run.wait_for_completion(show_output=True)

NameError: name 'ws' is not defined

When the pipeline has finished running, the resulting predictions will have been saved in the outputs of the experiment run associated with the first step
in the pipeline. They can be retrieved as follows:

In [None]:
import os
import shutil

import pandas as pd

# Start from a clean local folder so stale results are never picked up
shutil.rmtree('diabetes-results', ignore_errors=True)

# Download the 'inferences' output of the pipeline run's first child step
prediction_run = next(pipeline_run.get_children())
prediction_output = prediction_run.get_output_data('inferences')
prediction_output.download(local_path='diabetes-results')

# Locate the collated output file; os.walk yields (root, dirs, files)
result_file = None
for root, dirs, files in os.walk('diabetes-results'):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file)

if result_file is None:
    raise FileNotFoundError('parallel_run_step.txt not found under diabetes-results')

# cleanup output format: each row is "<file name>:<prediction>"
df = pd.read_csv(result_file, delimiter=':', header=None)
df.columns = ['File', 'Prediction']

# display the first 20 results
df.head(20)

## Publish the Pipeline and use its REST interface

Now that you have a working pipeline for batch inferencing, you can publish it and use a REST endpoint to run it from an application.

In [7]:
# Publish the pipeline, trigger it through its REST endpoint, wait for the
# resulting run to finish, then download and display its predictions.
import os
import shutil

import pandas as pd

published_pipeline = pipeline_run.publish_pipeline(
    name='Diabetes_Parallel_Batch_Pipeline', description='Batch scoring of diabetes data', version='1.0')
published_pipeline

# endpoint is a property holding the REST URL, not a method
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)

# make a REST call over http with an authentication header
from azureml.core.authentication import InteractiveLoginAuthentication
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print('Authentication header ready')

import requests
response = requests.post(rest_endpoint, headers=auth_header, json={"ExperimentName": "Batch_Pipeline_via_REST"})
# Fail with the HTTP error itself rather than a KeyError on the missing 'Id'
response.raise_for_status()
run_id = response.json()['Id']
run_id

# With run_id, we can view the experiment as it runs
from azureml.pipeline.core.run import PipelineRun
published_pipeline_run = PipelineRun(ws.experiments["Batch_Pipeline_via_REST"], run_id)
print('Running pipeline...')
published_pipeline_run.wait_for_completion(show_output=True)

# show results: start from a clean local folder
shutil.rmtree("diabetes-results", ignore_errors=True)

prediction_run = next(published_pipeline_run.get_children())
prediction_output = prediction_run.get_output_data("inferences")
prediction_output.download(local_path="diabetes-results")

# Locate the collated output file among the downloaded files
result_file = None
for root, dirs, files in os.walk("diabetes-results"):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file)

if result_file is None:
    raise FileNotFoundError('parallel_run_step.txt not found under diabetes-results')

# cleanup output format
df = pd.read_csv(result_file, delimiter=":", header=None)
df.columns = ["File", "Prediction"]

# Display the first 20 results
df.head(20)

NameError: name 'pipeline_run' is not defined