### Check if Azure ML is working

In [1]:
import azureml.core

# Importing the SDK and reading its version confirms the package is usable
# before anything else in the notebook depends on it.
sdk_version = azureml.core.VERSION
print("Ready to use Azure ML", sdk_version)

Ready to use Azure ML 1.5.0


### Connect to your workspace

In [2]:
from azureml.core import Workspace

# The three coordinates that identify the workspace to connect to.
workspace_coordinates = dict(
    name='LEAF_ESP_workspace',
    subscription_id='8e8a0cc2-ae82-484e-84df-2abb7bf63bc1',
    resource_group='LEAF_ESP_resource_group',
)

# `ws` is the workspace handle the remaining cells work against.
ws = Workspace.get(**workspace_coordinates)
print(ws.name, "loaded")

Performing interactive authentication. Please follow the instructions on the terminal.




Interactive authentication successfully completed.
LEAF_ESP_workspace loaded


### View the resources within your workspace

The sample data had already been imported into my workspace before running this notebook, but it can just as easily be imported through the SDK.

In [3]:
from azureml.core import ComputeTarget, Datastore, Dataset

# Print a short inventory of the compute targets, datastores and datasets
# that are available in the workspace.

print("Compute Targets:")
# Read the compute-target mapping once and walk its (name, object) pairs.
# Looking the property up again inside the loop would rebuild the mapping
# for every compute target (one extra listing per iteration).
for compute_name, compute in ws.compute_targets.items():
    print("\t", compute.name, ':', compute.type)

print("Datastores:")
for datastore_name in ws.datastores:
    datastore = Datastore.get(ws, datastore_name)
    print("\t", datastore.name, ':', datastore.datastore_type)

print("Datasets:")
# Snapshot the names first, then fetch each registered dataset by name.
for dataset_name in list(ws.datasets.keys()):
    dataset = Dataset.get_by_name(ws, dataset_name)
    print("\t", dataset.name)

Compute Targets:
	 LEAF-ESP-compute : ComputeInstance
	 LEAF-cluster : AmlCompute
Datastores:
	 leaf_data : AzureBlob
	 workspaceblobstore : AzureBlob
	 workspacefilestore : AzureFile
Datasets:
	 leaf_test
	 leaf_train


In [4]:
import os
import shutil

# Display the current working directory (last expression of the cell).
os.getcwd()

'C:\\Users\\nickm\\Desktop\\Cognizant\\Python Repository\\Cloud Technologies\\Azure'

In [5]:
# Folder that holds the experiment files; created if it does not exist yet
training_folder = 'leaf-training'
os.makedirs(training_folder, exist_ok=True)

# Stage the training data inside the experiment folder as leaf_train.csv
train_csv_source = 'C:\\Users\\nickm\\Desktop\\Cognizant\\Python Repository\\Evolutionary AI\\esp-xde\\notebooks\\1.7\\data\\train.csv'
train_csv_target = os.path.join(training_folder, "leaf_train.csv")
shutil.copy(train_csv_source, train_csv_target)

'leaf-training\\leaf_train.csv'

In [6]:
# Stage the test data inside the experiment folder as leaf_test.csv
test_csv_source = 'C:\\Users\\nickm\\Desktop\\Cognizant\\Python Repository\\Evolutionary AI\\esp-xde\\notebooks\\1.7\\data\\test.csv'
test_csv_target = os.path.join(training_folder, "leaf_test.csv")
shutil.copy(test_csv_source, test_csv_target)

'leaf-training\\leaf_test.csv'

### Create the predictor model training script which will later be referenced as a pipeline step 

In [45]:
%%writefile $training_folder/leaf_training.py
# Import libraries
from azureml.core import Run
import argparse
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Let Pandas display all the columns. We have 49 columns in our dataset.
pd.set_option('display.max_columns', 50)

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument('--output_folder', type=str, dest='output_folder', default="predictor_model", help='output folder')
args = parser.parse_args()
output_folder = args.output_folder

# Get the experiment run context
run = Run.get_context()

# load the diabetes data (passed as an input dataset)
print("Loading Data...")
cwd = os.getcwd()
train_df = pd.read_csv(cwd+'\\leaf-training\\leaf_train.csv').drop(columns='Unnamed: 0')
test_df = pd.read_csv(cwd+'\\leaf-training\\leaf_test.csv').drop(columns='Unnamed: 0')
train_df.head()

# print(f"Train: {train_df.shape[0]} rows, {train_df.shape[1]} columns")
# print(f"Test: {test_df.shape[0]} rows, {test_df.shape[1]} columns")

OUTCOME_CATEGORIES = ['Very bad', 'Bad', 'Neutral', 'Good', 'Very good']
def set_outcome_categories(df):
    df["Cost"] = pd.Categorical(df["Cost"], categories=OUTCOME_CATEGORIES,ordered=True)
    df["Schedule"] = pd.Categorical(df["Schedule"], categories=OUTCOME_CATEGORIES,ordered=True)
    df["Quality"] = pd.Categorical(df["Quality"], categories=OUTCOME_CATEGORIES,ordered=True)

set_outcome_categories(train_df)
set_outcome_categories(test_df)

# Train
train_X_df = train_df.drop(['Cost','Schedule','Quality'], axis=1)
train_Y_df = train_df[['Cost','Schedule','Quality']]
# 3 labels: cost, schedule and quality
train_cost_df = train_df[['Cost']]
train_schedule_df = train_df[['Schedule']]
train_quality_df = train_df[['Quality']]

# Test
test_X_df = test_df.drop(['Cost','Schedule','Quality'], axis=1)
test_Y_df = test_df[['Cost','Schedule','Quality']]
# 3 labels: cost, schedule and quality
test_cost_df = test_df[['Cost']]
test_schedule_df = test_df[['Schedule']]
test_quality_df = test_df[['Quality']]

def encode_dataset(reference_df, to_encode_df):
    """
    Encodes the passed dataset and makes it contains the same columns, in the same order, as the reference one.
    See https://stackoverflow.com/questions/41335718/keep-same-dummy-variable-in-training-and-testing-data
    """
    encoded_df = pd.get_dummies(to_encode_df)
    # Get missing columns in the encoded dataset
    missing_cols = set(reference_df.columns ) - set(encoded_df.columns )
    # Add missing columns in encoded set with default value equal to 0
    for c in missing_cols:
        encoded_df[c] = 0
    # Ensure columns in the encoded set are in the same order as in the reference set
    encoded_df = encoded_df[reference_df.columns]
    return encoded_df

# These dataframes MUST contain all the possible values
# It becomes the "reference" in terms of encoded columns
encoded_train_X_df = pd.get_dummies(train_X_df)

# Test set. Make sure we have the SAME columns as in the train set
encoded_test_X_df = encode_dataset(encoded_train_X_df, test_X_df)

# train_X_df.head()

#encoded_test_X_df.head()

# train_Y_df.head()

encoded_context_actions_column_names = list(encoded_train_X_df.columns)

import csv

with open(cwd+"\\leaf-training\\encoded_context_actions_column_names.csv", "w") as csv_file:
     wr = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
     wr.writerow(encoded_context_actions_column_names)

from sklearn.multioutput import MultiOutputClassifier
# n_jobs=-1 means using all processors
rfc = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, n_jobs=-1,  random_state=45, bootstrap=False))
rfc.fit(encoded_train_X_df, train_Y_df)

# Convert y to a 1d array
train_cost_df = train_cost_df.values[:,0]
rfc_cost = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rfc_cost.fit(encoded_train_X_df, train_cost_df)

# Convert y to a 1d array
train_schedule_df = train_schedule_df.values[:,0]
rfc_schedule = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rfc_schedule.fit(encoded_train_X_df, train_schedule_df)

# Convert y to a 1d array
train_quality_df = train_quality_df.values[:,0]
rfc_quality = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rfc_quality.fit(encoded_train_X_df, train_quality_df)

preds = rfc_cost.predict(encoded_test_X_df)
preds2 = rfc_quality.predict(encoded_test_X_df)
preds3 = rfc_schedule.predict(encoded_test_X_df)
# preds

print('cost model accuracy: ',accuracy_score(test_cost_df, preds))
print('quality model accuracy: ',accuracy_score(test_quality_df, preds2))
print('schedule model accuracy: ',accuracy_score(test_schedule_df, preds3))

# Save the trained models
os.makedirs(output_folder, exist_ok=True)
output_path1 = output_folder + "/all_preds_model.pkl"
joblib.dump(value=rfc, filename=output_path1)
output_path2 = output_folder + "/cost_model.pkl"
joblib.dump(value=rfc_cost, filename=output_path2)
output_path3 = output_folder + "/quality_model.pkl"
joblib.dump(value=rfc_quality, filename=output_path3)
output_path4 = output_folder + "/schedule_model.pkl"
joblib.dump(value=rfc_schedule, filename=output_path4)

run.complete()

Writing leaf-training/leaf_training.py


The script for the second step of the pipeline loads the models from the folder where they were saved and then registers them in the workspace. It takes a single `model_folder` parameter containing the path where the models were saved.

In [47]:
%%writefile $training_folder/register_predictor.py
# Import libraries
import argparse
import joblib
from azureml.core import Workspace, Model, Run

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument('--model_folder', type=str, dest='model_folder', default="predictor_model", help='model location')
args = parser.parse_args()
model_folder = args.model_folder

# Get the experiment run context
run = Run.get_context()

# load the model
print("Loading all outcome model from " + model_folder)
model_file1 = model_folder + "/all_preds_model.pkl"
all_outcome_model = joblib.load(model_file1)

print("Loading cost model from " + model_folder)
model_file2 = model_folder + "/cost_model.pkl"
cost_model = joblib.load(model_file2)

print("Loading quality model from " + model_folder)
model_file3 = model_folder + "/quality_model.pkl"
quality_model = joblib.load(model_file3)

print("Loading schedule model from " + model_folder)
model_file4 = model_folder + "/schedule_model.pkl"
schedule_model = joblib.load(model_file4)

Model.register(workspace=run.experiment.workspace,
               model_path = model_file1,
               model_name = 'all_outcome_model',
               tags={'Training context':'Pipeline'})
Model.register(workspace=run.experiment.workspace,
               model_path = model_file2,
               model_name = 'cost_model',
               tags={'Training context':'Pipeline'})
Model.register(workspace=run.experiment.workspace,
               model_path = model_file3,
               model_name = 'quality_model',
               tags={'Training context':'Pipeline'})
Model.register(workspace=run.experiment.workspace,
               model_path = model_file4,
               model_name = 'schedule_model',
               tags={'Training context':'Pipeline'})

run.complete()

Writing leaf-training/register_predictor.py


### Prepare a compute environment for the pipeline steps
In this exercise, you'll use the same compute for both steps, but it's important to realize that each step is run independently; so you could specify different compute contexts for each step if appropriate.

First, get the compute target. A cluster has already been created and can be seen in Studio, so for our purposes we will simply reuse it. The code below also provisions a new cluster if the existing one cannot be found (for example, after it has been deleted).

In [48]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "LEAF-cluster"

try:
    # Look the cluster up by name in the workspace
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The lookup failed, so create the cluster with the configuration below
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS1_V2',
        max_nodes=2,
    )
    pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# In both cases, wait for the compute target and show its progress output
pipeline_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
