# Zillow Compete
## Predict housing prices using regression techniques

This notebook is a Zillow Compete clone which uses Azure Machine Learning components throughout. This notebook covers:

-   Workspace utilization
-   Azure ML Compute provisioning
-   Machine Learning Pipelines
    -   Data Ingestions
    -   Data Preparation
    -   Data Transform
    -   Train, Evaluate, and Register a Model
-  ML Model Deployment
-  CI/CD pipelines (if time permits)


In [None]:
import pandas as pd
import numpy as np
import os


In [None]:
# Read the local training and test CSV files into pandas DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

### Import ML Packages

In [None]:
from azureml.core import Workspace              # connect to workspace
from azureml.core import Experiment             # connect/create experiments
from azureml.core import ComputeTarget          # connect to compute
from azureml.core import Environment            # manage e.g. Python environments
from azureml.core import Datastore, Dataset     # work with data
from azureml.core.model import Model            # work with model
import mlflow                                   # work with mlflow


# Connect to the workspace described by the local workspace configuration.
ws = Workspace.from_config()
exp_name = "Zillow-Regr-Exp"
exp = Experiment(ws, exp_name)

# Point MLflow at the workspace *before* selecting the experiment; otherwise
# set_experiment() resolves the experiment against the default local store
# instead of the Azure ML tracking server.
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(exp_name)

In [None]:
# Print the version of the installed Azure ML SDK.
import azureml.core
print("SDK version:", azureml.core.VERSION)

## Getting Datastore, Blobstore and Filestore in the workspace
In this step, we will define the datastore, blobstore and file store

In [None]:
# Retrieve the workspace's default datastore
datastore = ws.get_default_datastore()

# Get the blob storage associated with the workspace (looked up by name)
def_blob_store = Datastore(ws, "workspaceblobstore")

In [None]:
# Upload the local training CSV to the "train-dataset" folder of the blob
# datastore, overwriting any file that is already there.
def_blob_store.upload_files(
    ["../data/train.csv"],
    target_path="train-dataset",
    overwrite=True)

In [None]:
# Upload the local test CSV to the "test-dataset" folder of the blob
# datastore, overwriting any file that is already there.
def_blob_store.upload_files(
    ["../data/test.csv"],
    target_path="test-dataset",
    overwrite=True)

In [None]:
from azureml.core import Dataset
ws = Workspace.from_config()
datastore = Datastore.get(ws, 'workspaceblobstore')

# Register the training data only once; later runs reuse the registered dataset.
if 'house_prices_train' not in ws.datasets:
    # The path is relative to the datastore root, so it must match the
    # target_path ("train-dataset") that train.csv was uploaded to -- not the
    # local "../data" folder.
    zillow_housing_dataset = Dataset.Tabular.from_delimited_files(
        [(datastore, 'train-dataset/train.csv')])
    zillow_housing_dataset.register(workspace=ws,
                                    name='house_prices_train',
                                    description='housing training data',
                                    create_new_version=True)


In [None]:

# Fetch the registered training dataset from the workspace by name.
zillow_housing_dataset = Dataset.get_by_name(ws, 'house_prices_train')

In [None]:
# Show the names of the datasets registered in the workspace.
ws.datasets.keys()

In [None]:
# Display the dataset object fetched above.
zillow_housing_dataset

## Create an Environment
Create a Docker-based environment with scikit-learn

In [None]:
import hashlib
from azureml.core.runconfig import DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Conda and pip packages to install into the environment.
conda_dep = CondaDependencies.create(
        conda_packages=['pandas','scikit-learn'], 
        pip_packages=['azureml-sdk[automl,explain]', 'azureml-dataprep[fuse,pandas]'], 
        pin_sdk_version=False)

# Environment object carrying those dependencies; it is passed to the
# InferenceConfig later in the notebook.
myenv = Environment('Zillow-Regr-AutoML')
myenv.python.conda_dependencies = conda_dep

# Enable Docker
# NOTE(review): docker_config is not referenced by any other cell visible here.
docker_config = DockerConfiguration(use_docker=True)

## Provision a Compute Target


In [None]:
from socket import timeout
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the AmlCompute cluster to look up or create
cluster_name = "train-clu"

try:
    # Reuse the cluster when the workspace already has one with this name
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Not found: provision a low-priority cluster that scales between 0 and 4 nodes
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D14',
        vm_priority='lowpriority',
        min_nodes=0,
        max_nodes=4,
        idle_seconds_before_scaledown=120,
    )
    cluster = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config,
    )

# Block (for up to 20 minutes) until the cluster reports completion
cluster.wait_for_completion(show_output=True, timeout_in_minutes=20)

## Configure the training run
Make sure remote training cluster has all the dependencies that are required by the training steps. 

In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Run configuration shared by the Python script steps of the pipeline.
aml_run_config = RunConfiguration()
# Run on the training cluster created or found above (named "train-clu")
aml_run_config.target = cluster

# Switch between an environment fetched from the workspace by name and an
# environment built from the package list below.
USE_CURATED_ENV = False
if USE_CURATED_ENV :
    # NOTE(review): no cell visible here registers "Zillow-AutoML-Env" -- confirm it exists.
    curated_environment = Environment.get(workspace=ws, name="Zillow-AutoML-Env")
    aml_run_config.environment = curated_environment
else:
    aml_run_config.environment.python.user_managed_dependencies = False
    
    # Add some packages relied on by data prep step
    # (same package list as the 'Zillow-Regr-AutoML' environment defined earlier)
    aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
        conda_packages=['pandas','scikit-learn'], 
        pip_packages=['azureml-sdk[automl,explain]', 'azureml-dataprep[fuse,pandas]'], 
        pin_sdk_version=False)

## Preparing data for Auto ML regression
In this step, we do the data preparation, such as dropping columns that won't be used for prediction. This can be extended further to do complete data preparation.

In [None]:
import os
# Create a folder for pipeline step files.
# The %%writefile cells below write their scripts under this folder, and the
# pipeline steps use it as their source directory.
aml_dir = "../scripts/"
os.makedirs(aml_dir, exist_ok=True)
print(aml_dir)

Now let's create the first script, which will read data from the Zillow housing tabular dataset and save it as a Parquet file. This is also the place to apply simple pre-processing, such as removing any rows with missing data and normalizing the numeric features so they're on a similar scale.

The script includes an argument named *--output_path* which references the path where the resulting data should be saved.

In [None]:
%%writefile $aml_dir/dataprep.py
from azureml.core import Run

import pandas as pd 
import numpy as np 
import pyarrow as pa
import pyarrow.parquet as pq
import argparse
from azureml.core import Run
import mlflow
import pickle
import os

mlflow.autolog()

# Constants
RANDOM_SEED=42

# Get the parameters
parser = argparse.ArgumentParser()
parser.add_argument('--output_path', dest='output_path', required=True)
args = parser.parse_args()
zillow_housing_dataset = Run.get_context().input_datasets['house_prices_train']

# Get the experiment/job run context
run = Run.get_context()

# Load the data (passed as an input dataset)
df_train = zillow_housing_dataset.to_pandas_dataframe()

# Save the prepped data
print('Saving Data...')
os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
pq.write_table(pa.Table.from_pandas(df_train), args.output_path)

# End the run
print(f"Wrote test to {args.output_path} and train to {args.output_path}")
run.complete()

## Define Data preparation step for pipeline
In this step we are defining data preparation step with the python file created earlier for data preparation

In [None]:
from azureml.pipeline.core import PipelineData, Pipeline
from azureml.pipeline.steps import PythonScriptStep

# Intermediate pipeline output that receives the file written by dataprep.py;
# as_dataset() exposes it as a dataset for downstream steps.
prepped_data_path = PipelineData("house_prices_train",datastore,"direct").as_dataset()

# Pipeline step that runs dataprep.py on the training cluster.
# The registered training dataset is passed as the named input
# "house_prices_train", which is the key the script reads from input_datasets.
dataprep_step = PythonScriptStep(
    name="dataprep_step",
    source_directory=aml_dir, 
    script_name="dataprep.py", 
    compute_target=cluster, 
    runconfig=aml_run_config,
    arguments=["--output_path", prepped_data_path],
    inputs=[zillow_housing_dataset.as_named_input("house_prices_train")],
    outputs=[prepped_data_path],
    allow_reuse=True
)

## Send data to AutoML Step
The snippet below creates a high-performing PipelineOutputTabularDataset from the PipelineOutputFileDataset output of the data preparation step.

In [None]:
# Interpret the data prep output as Parquet; file_extension=None is passed so
# files are not filtered by extension (TODO confirm against the SDK docs).
prepped_train_data = prepped_data_path.parse_parquet_files(file_extension=None)

## Specify Automated ML Outputs
The outputs of the AutoMLStep are the final metric scores of the higher-performing model and that model itself. To use these outputs in further pipeline steps, prepare PipelineData objects to receive them.

In [None]:
from azureml.pipeline.core import TrainingOutput

# Output that receives the metrics produced by the AutoML training step.
metrics_data = PipelineData(name='metrics_data',
                           datastore=datastore,
                           pipeline_output_name='metrics_output',
                           training_output=TrainingOutput(type='Metrics'))
# Output that receives the model produced by the AutoML training step; it is
# the input of the model registration step.
model_data = PipelineData(name='best_model_data',
                           datastore=datastore,
                           pipeline_output_name='model_output',
                           training_output=TrainingOutput(type='Model'))

## Configure and Create Automated ML Pipeline Step
Once the inputs and outputs are defined, it's time to create the AutoMLConfig and AutoMLStep. The details of the configuration will depend on your task; in this case, it is regression to predict the 'SalePrice' label.

In [None]:
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep
import logging

# Settings forwarded to AutoMLConfig as keyword arguments.
# The commented-out "primary_metric" entries are alternative metrics that were
# tried; exactly one must be active.
automl_settings = {
       "n_cross_validations":5,
    #    "primary_metric": 'r2_score',
       "primary_metric": 'normalized_root_mean_squared_error',
      #  "primary_metric":   'normalized_mean_absolute_error',
       "enable_early_stopping": True,
       "experiment_timeout_hours": 1,
       "max_concurrent_iterations": 4,
       "max_cores_per_iteration": -1,
       "verbosity": logging.INFO
   }

# Regression task trained on the prepped data, predicting the 'SalePrice' column.
automl_config = AutoMLConfig(task = 'regression',
                               path = aml_dir,
                               compute_target = cluster,
                               training_data = prepped_train_data,
                               featurization = 'auto',
                               debug_log = 'automated_ml_errors.log',
                               label_column_name = 'SalePrice',
                               **automl_settings
                               )

# Pipeline step that runs AutoML and fills the metrics and model outputs.
train_step = AutoMLStep(name='automl_training_step',
    automl_config=automl_config,
    passthru_automl_config=False,
    outputs=[metrics_data,model_data],
    allow_reuse=True)

## Register the model created by automated ML
The last step in a basic ML pipeline is registering the created model. By adding the model to the workspace's model registry, it will be available in the portal and can be versioned. To register the model, write another PythonScriptStep that takes the model_data output of the AutoMLStep (the first and second cells below perform these steps).


In [None]:
%%writefile $aml_dir/register_model.py
from azureml.core.model import Model, Dataset
from azureml.core.run import Run, _OfflineRun
from azureml.core import Workspace
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--model_name", required=True)
parser.add_argument("--model_path", required=True)
args = parser.parse_args()

print(f"model_name : {args.model_name}")
print(f"model_path: {args.model_path}")

run = Run.get_context()
ws = Workspace.from_config() if type(run) == _OfflineRun else run.experiment.workspace

model = Model.register(workspace=ws,
                       model_path=args.model_path,
                       model_name=args.model_name)

print("Registered version {0} of model {1}".format(model.version, model.name))

In [None]:
from azureml.pipeline.core.graph import PipelineParameter

# The model name with which to register the trained model in the workspace.
# Exposed as a pipeline parameter so it can be overridden per submission.
model_name = PipelineParameter("model_name", default_value="Zillow-SalesPrices-Regr-mdl")

# Pipeline step that runs register_model.py with the AutoML model output as
# its input, on the same cluster and run configuration as the data prep step.
register_step = PythonScriptStep(script_name="register_model.py",
                                       name="register_model_step",
                                       source_directory=aml_dir,
                                       allow_reuse=True,
                                       arguments=["--model_name", model_name, "--model_path", model_data],
                                       inputs=[model_data],
                                       compute_target=cluster,
                                       runconfig=aml_run_config)

## Create and run the automated ML pipeline
Creating and running a pipeline that contains the AutoML Step

In [None]:
from azureml.pipeline.core import Pipeline
from azureml.core import Workspace,Experiment,Run
from azureml.widgets import RunDetails
# Raise the snapshot size limit applied to the source directory on submit.
# The module is imported explicitly so this cell does not rely on `azureml`
# having been bound (and this private submodule loaded) by an earlier cell.
import azureml._restclient.snapshots_client
azureml._restclient.snapshots_client.SNAPSHOT_MAX_SIZE_BYTES = 20000000000

# Construct the pipeline
pipeline = Pipeline(ws, [dataprep_step, train_step, register_step])
print('Pipeline is built.')

# Create an experiment and run the pipeline.
# regenerate_outputs=True forces every step to run again instead of reusing
# results from previous runs.
experiment = Experiment(workspace=ws, name='zillow-regr-automl-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=False)

When the pipeline has finished, you can examine the metrics recorded by its child runs.

In [None]:
# Print every metric logged by each child run of the pipeline run.
for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name, metric_value in metrics.items():
        print('\t', metric_name, ":", metric_value)

## Writing entry script
Write the entry script that will be used to predict on my model.

In [None]:
%%writefile $aml_dir/score.py

import json
import logging
import os
import pickle
import numpy as np
import pandas as pd
import joblib

import azureml.automl.core
from azureml.automl.core.shared import logging_utilities, log_server
from azureml.telemetry import INSTRUMENTATION_KEY

from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType
from inference_schema.parameter_types.pandas_parameter_type import PandasParameterType
from inference_schema.parameter_types.standard_py_parameter_type import StandardPythonParameterType

# Sample objects handed to the inference-schema decorators on run().
# data_sample is a one-row DataFrame giving the name and dtype of every input
# column the service accepts (the 'SalePrice' label column is not among them).
data_sample = PandasParameterType(pd.DataFrame({"Id": pd.Series([0], dtype="int64"), "MSSubClass": pd.Series([0], dtype="int64"), "MSZoning": pd.Series(["example_value"], dtype="object"), "LotFrontage": pd.Series(["example_value"], dtype="object"), "LotArea": pd.Series([0], dtype="int64"), "Street": pd.Series(["example_value"], dtype="object"), "Alley": pd.Series(["example_value"], dtype="object"), "LotShape": pd.Series(["example_value"], dtype="object"), "LandContour": pd.Series(["example_value"], dtype="object"), "Utilities": pd.Series(["example_value"], dtype="object"), "LotConfig": pd.Series(["example_value"], dtype="object"), "LandSlope": pd.Series(["example_value"], dtype="object"), "Neighborhood": pd.Series(["example_value"], dtype="object"), "Condition1": pd.Series(["example_value"], dtype="object"), "Condition2": pd.Series(["example_value"], dtype="object"), "BldgType": pd.Series(["example_value"], dtype="object"), "HouseStyle": pd.Series(["example_value"], dtype="object"), "OverallQual": pd.Series([0], dtype="int64"), "OverallCond": pd.Series([0], dtype="int64"), "YearBuilt": pd.Series([0], dtype="int64"), "YearRemodAdd": pd.Series([0], dtype="int64"), "RoofStyle": pd.Series(["example_value"], dtype="object"), "RoofMatl": pd.Series(["example_value"], dtype="object"), "Exterior1st": pd.Series(["example_value"], dtype="object"), "Exterior2nd": pd.Series(["example_value"], dtype="object"), "MasVnrType": pd.Series(["example_value"], dtype="object"), "MasVnrArea": pd.Series([0.0], dtype="float64"), "ExterQual": pd.Series(["example_value"], dtype="object"), "ExterCond": pd.Series(["example_value"], dtype="object"), "Foundation": pd.Series(["example_value"], dtype="object"), "BsmtQual": pd.Series(["example_value"], dtype="object"), "BsmtCond": pd.Series(["example_value"], dtype="object"), "BsmtExposure": pd.Series(["example_value"], dtype="object"), "BsmtFinType1": pd.Series(["example_value"], dtype="object"), "BsmtFinSF1": pd.Series([0], dtype="int64"), 
"BsmtFinType2": pd.Series(["example_value"], dtype="object"), "BsmtFinSF2": pd.Series([0], dtype="int64"), "BsmtUnfSF": pd.Series([0], dtype="int64"), "TotalBsmtSF": pd.Series([0], dtype="int64"), "Heating": pd.Series(["example_value"], dtype="object"), "HeatingQC": pd.Series(["example_value"], dtype="object"), "CentralAir": pd.Series([False], dtype="bool"), "Electrical": pd.Series(["example_value"], dtype="object"), "1stFlrSF": pd.Series([0], dtype="int64"), "2ndFlrSF": pd.Series([0], dtype="int64"), "LowQualFinSF": pd.Series([0], dtype="int64"), "GrLivArea": pd.Series([0], dtype="int64"), "BsmtFullBath": pd.Series([0], dtype="int64"), "BsmtHalfBath": pd.Series([0], dtype="int64"), "FullBath": pd.Series([0], dtype="int64"), "HalfBath": pd.Series([0], dtype="int64"), "BedroomAbvGr": pd.Series([0], dtype="int64"), "KitchenAbvGr": pd.Series([0], dtype="int64"), "KitchenQual": pd.Series(["example_value"], dtype="object"), "TotRmsAbvGrd": pd.Series([0], dtype="int64"), "Functional": pd.Series(["example_value"], dtype="object"), "Fireplaces": pd.Series([0], dtype="int64"), "FireplaceQu": pd.Series(["example_value"], dtype="object"), "GarageType": pd.Series(["example_value"], dtype="object"), "GarageYrBlt": pd.Series(["example_value"], dtype="object"), "GarageFinish": pd.Series(["example_value"], dtype="object"), "GarageCars": pd.Series([0], dtype="int64"), "GarageArea": pd.Series([0], dtype="int64"), "GarageQual": pd.Series(["example_value"], dtype="object"), "GarageCond": pd.Series(["example_value"], dtype="object"), "PavedDrive": pd.Series(["example_value"], dtype="object"), "WoodDeckSF": pd.Series([0], dtype="int64"), "OpenPorchSF": pd.Series([0], dtype="int64"), "EnclosedPorch": pd.Series([0], dtype="int64"), "3SsnPorch": pd.Series([0], dtype="int64"), "ScreenPorch": pd.Series([0], dtype="int64"), "PoolArea": pd.Series([0], dtype="int64"), "PoolQC": pd.Series(["example_value"], dtype="object"), "Fence": pd.Series(["example_value"], dtype="object"), "MiscFeature": 
pd.Series(["example_value"], dtype="object"), "MiscVal": pd.Series([0], dtype="int64"), "MoSold": pd.Series([0], dtype="int64"), "YrSold": pd.Series([0], dtype="int64"), "SaleType": pd.Series(["example_value"], dtype="object"), "SaleCondition": pd.Series(["example_value"], dtype="object")}))
# Shape of the 'Inputs' request field: a dict holding the frame under 'data'
input_sample = StandardPythonParameterType({'data': data_sample})

# Shape of the response: a dict holding the predictions under 'Results'
result_sample = NumpyParameterType(np.array([0]))
output_sample = StandardPythonParameterType({'Results':result_sample})
# Sample for the 'GlobalParameters' request field
sample_global_parameters = StandardPythonParameterType(1.0)

# Create the logger before the telemetry setup so that `logger` is always
# defined for init(), even when the setup below fails.
logger = logging.getLogger('azureml.automl.core.scoring_script_v2')
try:
    log_server.enable_telemetry(INSTRUMENTATION_KEY)
    log_server.set_verbosity('INFO')
except Exception:
    # Telemetry is best effort: record the failure and keep the service usable.
    logger.warning("Telemetry setup failed; continuing without it.", exc_info=True)


def init():
    """Load model.pkl from AZUREML_MODEL_DIR into the global ``model``.

    The two path components above model.pkl are reported to the log server
    as the model name and model version. A failure to load the model is
    logged with its traceback and re-raised.
    """
    global model
    model_path = os.path.join(os.getenv('AZUREML_MODEL_DIR'), 'model.pkl')
    parts = os.path.normpath(model_path).split(os.sep)
    dimensions = {'model_name': parts[-3], 'model_version': parts[-2]}
    log_server.update_custom_dimensions(dimensions)
    try:
        logger.info("Loading model from path.")
        model = joblib.load(model_path)
        logger.info("Loading successful.")
    except Exception as load_error:
        logging_utilities.log_traceback(load_error, logger)
        raise

@input_schema('Inputs', input_sample)
@input_schema('GlobalParameters', sample_global_parameters, convert_to_provided_type=False)
@output_schema(output_sample)
def run(Inputs, GlobalParameters=1.0):
    """Score a request with the model loaded by init().

    Args:
        Inputs: mapping whose 'data' entry is passed to ``model.predict``.
        GlobalParameters: accepted as part of the request schema but not
            used inside this function.

    Returns:
        A dict with the predictions as a list under the 'Results' key.
    """
    data = Inputs['data']
    result = model.predict(data)
    return {'Results':result.tolist()}


## Create the InferenceConfig
Create the inference config that will be used when deploying the model

In [None]:
from azureml.core.model import InferenceConfig

# The Environment object fully describes the image and its dependencies, so
# the legacy `runtime` argument is not passed alongside it: the SDK documents
# `environment` and the individual image parameters as mutually exclusive.
inf_config = InferenceConfig(entry_script='../scripts/score.py', environment=myenv)

## Provision the AKS cluster

In [None]:
from azureml.core.compute import ComputeTarget, AksCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your AKS cluster
aks_name = 'aks-clu' 

# Reuse the AKS cluster when the workspace already has one with this name;
# otherwise create it.
try:
    aks_target = ComputeTarget(workspace=ws, name=aks_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Use the default configuration (can also provide parameters to customize)
    prov_config = AksCompute.provisioning_configuration()

    # Create the cluster
    aks_target = ComputeTarget.create(workspace = ws, 
                                    name = aks_name, 
                                    provisioning_configuration = prov_config)

# Only wait when the cluster does not already report "Succeeded"
if aks_target.get_status() != "Succeeded":
    aks_target.wait_for_completion(show_output=True)

## Deploy web service to AKS

In [None]:
# Set the web service configuration
from azureml.core.webservice import Webservice, AksWebservice

# Deployment configuration with all settings left at the SDK defaults
aks_config = AksWebservice.deploy_configuration()

In [None]:
# Look up the registered model by the name used as the default of the
# pipeline's "model_name" parameter.
model = Model(ws, 'Zillow-SalesPrices-Regr-mdl')

In [None]:
%%time
# Name of the web service to create on the AKS cluster
aks_service_name ='zillow-alpha-service'

# Deploy the registered model with the scoring script/environment from
# inf_config onto the AKS cluster.
aks_service = Model.deploy(workspace=ws,
                           name=aks_service_name,
                           models=[model],
                           inference_config=inf_config,
                           deployment_config=aks_config,
                           deployment_target=aks_target)

# Block until the deployment finishes, then report the resulting state
aks_service.wait_for_deployment(show_output = True)
print(aks_service.state)

## Test the web service using the run method

In [None]:
# Fetch the registered test dataset. No earlier cell registers
# 'house_prices_test', so register it from the uploaded blob
# ("test-dataset/test.csv") when the workspace does not have it yet.
if 'house_prices_test' not in ws.datasets:
    Dataset.Tabular.from_delimited_files(
        [(datastore, 'test-dataset/test.csv')]
    ).register(workspace=ws,
               name='house_prices_test',
               description='housing test data',
               create_new_version=True)

test_data = Dataset.get_by_name(ws, 'house_prices_test')

In [None]:
%%time
import json

# A TabularDataset cannot be sliced directly: load the first two rows into
# pandas and keep the second one (the same row the [1:2] slice refers to).
test_rows = test_data.take(2).to_pandas_dataframe().iloc[1:2]

# Build the request in the shape declared by the scoring script's schema:
# {"Inputs": {"data": [<records>]}, "GlobalParameters": <float>}.
# to_json() is used so missing values are serialised as JSON null.
payload = {
    "Inputs": {"data": json.loads(test_rows.to_json(orient="records"))},
    "GlobalParameters": 1.0,
}
test_sample = json.dumps(payload)
test_sample = bytes(test_sample,encoding = 'utf8')

prediction = aks_service.run(input_data = test_sample)
print(prediction)