Copyright (c) Microsoft Corporation. All rights reserved.

![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/NotebookVM/tutorials/regression-part2-automated-ml.png)

# Tutorial: Catboost Demo

In this tutorial, you use Azure Machine Learning to create a regression model with the CatBoost Python library. You will:

* Download, transform, and clean data using Azure Open Datasets
* Train a CatBoost regression model
* Deploy your model with ACI & AKS

## Download and prepare data

In [None]:
pip install Catboost

In [None]:
user = 'cat'

In [None]:
import pandas as pd
from azureml.core import Dataset
from datetime import datetime
from dateutil.relativedelta import relativedelta
from catboost import CatBoostRegressor

In [None]:
df_train = pd.read_csv('./train.csv')

df_train.head(5)

In [None]:
df_train.dtypes

In [None]:
df_train.isnull().sum()

In [None]:
# Fill missing values by assigning the result back to the column.
# Calling fillna(inplace=True) on a column selection is chained in-place
# mutation: recent pandas versions warn about it, and with copy-on-write
# enabled it leaves df_train unchanged.
df_train['Item_Weight'] = df_train['Item_Weight'].fillna(df_train['Item_Weight'].mean())
df_train['Outlet_Size'] = df_train['Outlet_Size'].fillna('unavailable')

df_train.head(5)

In [None]:
df_train.isnull().sum()

In [None]:
df_train.describe()

In [None]:
df_train.head(2)

In [None]:
from azureml.core.workspace import Workspace
ws = Workspace.from_config()

In [None]:
# os is needed here, but the notebook does not import it until a later cell;
# import it locally so the cells run top to bottom without a NameError.
import os

cwd = os.getcwd()
print(cwd)

# Write the cleaned training frame to ./register/<user>-bigmart-train.csv
dataset_name = user + '-bigmart-train.csv'
print(dataset_name)
dataset_dir = './register/'
os.makedirs(dataset_dir, exist_ok=True)
file_path = os.path.join(dataset_dir, dataset_name)
df_train.to_csv(file_path, index=False)

# Upload the whole register/ folder to the workspace's default datastore
from azureml.core.datastore import Datastore
ds = Datastore.get_default(ws)
ds.upload('register/', target_path='data/prepped', overwrite=True)

In [None]:
from azureml.core.dataset import Dataset

# The registered dataset name; the uploaded file is that name plus ".csv".
dataset_name = user + '-bigmart-train'
dataset_file = dataset_name + '.csv'

# Build a tabular dataset from the CSV uploaded under data/prepped/.
dataset = Dataset.Tabular.from_delimited_files(
    path=ds.path('data/prepped/' + dataset_file)
)

# Register it in the workspace, creating a new version on each run.
dataset.register(workspace=ws, name=dataset_name, create_new_version=True)

In [None]:
print(dataset_name)
train_dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
label = "Item_Outlet_Sales"

## Configure workspace


Create a workspace object from the existing workspace. A [Workspace](https://docs.microsoft.com/python/api/azureml-core/azureml.core.workspace.workspace?view=azure-ml-py) is a class that accepts your Azure subscription and resource information. It also creates a cloud resource to monitor and track your model runs. `Workspace.from_config()` reads the file **config.json** and loads the authentication details into an object named `ws`. `ws` is used throughout the rest of the code in this tutorial.

In [None]:
from azureml.core.workspace import Workspace
ws = Workspace.from_config()

ws

Here we will save into a register folder the data set that we are going to register for later use. Notice that we have now created a new folder that holds the dataset we would like to use.

### Train model

Create an experiment object in your workspace. An experiment acts as a container for your individual runs. 

In [None]:
from azureml.core.experiment import Experiment
experiment = Experiment(ws, user + "-catboost-exp")

### Create Training Script

In [None]:
import os

# Folder (under the current working directory) that receives train.py.
script_folder = os.path.join(os.getcwd(), "train")
os.makedirs(script_folder, exist_ok=True)
print(script_folder)

### Write Training file

Be sure to update the train.py file below to **write your user name**.

This train script will create a trained model that has been saved to your run outputs folder.

In [None]:
%%writefile $script_folder/train.py

import os
import sys
import argparse
import joblib
import pandas as pd
import numpy as np

from azureml.core import Run
from azureml.core.run import Run
from azureml.core import Dataset
from azureml.core import Workspace

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from catboost import CatBoostRegressor

def getRuntimeArgs():
    """Parse the command-line arguments passed to the training script.

    Returns:
        argparse.Namespace: the parsed arguments; ``data_user`` holds the
        value given for ``--data_user``.

    Raises:
        SystemExit: raised by argparse when ``--data_user`` is missing.
    """
    parser = argparse.ArgumentParser()
    # main() concatenates data_user into the dataset name, so a missing value
    # (None) would only surface later as a TypeError. Reject it up front.
    parser.add_argument(
        '--data_user',
        type=str,
        required=True,
        help="user prefix of the registered '<data_user>-bigmart-train' dataset",
    )
    args = parser.parse_args()
    print("Argument: %s" % args.data_user)
    return args

def main():
    """Run the training job end to end.

    Reads ``--data_user`` through getRuntimeArgs(), fetches the registered
    dataset ``<data_user>-bigmart-train`` from the run's workspace, writes a
    CSV copy to ./dataset/dataset.csv, trains with model_train() and saves the
    fitted model to ./outputs/cat-model.
    """
    print('In main')

    print('About to get args')
    runtime_args = getRuntimeArgs()

    print("Argument 1: %s" % runtime_args.data_user)

    run = Run.get_context()

    print("got run context")

    dataset_dir = './dataset/'
    os.makedirs(dataset_dir, exist_ok=True)
    ws = run.experiment.workspace
    print(ws)

    dataset_name = runtime_args.data_user + '-bigmart-train'
    print('dataset name:' + dataset_name)

    # Fetch the registered TabularDataset and materialize it as a DataFrame,
    # keeping a CSV copy next to the script.
    frame = Dataset.get_by_name(ws, name=dataset_name).to_pandas_dataframe()
    frame.to_csv(os.path.join(dataset_dir, 'dataset.csv'), index=False)

    trained = model_train(frame, run)

    # Copying to the "outputs" directory automatically uploads it to Azure ML.
    output_dir = './outputs/'
    os.makedirs(output_dir, exist_ok=True)
    trained.save_model(os.path.join(output_dir, 'cat-model'))

def model_train(ds_df, run):
    """Fit a CatBoost regressor on the given data and log its scores.

    Args:
        ds_df: DataFrame holding the feature columns plus the
            'Item_Outlet_Sales' target column.
        run: run object; only its ``log(name, value)`` method is used.

    Returns:
        The fitted CatBoostRegressor.
    """
    y_raw = ds_df['Item_Outlet_Sales']
    X_raw = ds_df.drop('Item_Outlet_Sales', axis=1)

    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2, random_state=0)

    # Every column that is not float-typed is passed to CatBoost as a
    # categorical feature. The builtin `float` is used because `np.float` was
    # only an alias for it, deprecated in NumPy 1.20 and removed in 1.24.
    categorical_features_indices = np.where(X_raw.dtypes != float)[0]

    model = CatBoostRegressor(iterations=50, depth=2, learning_rate=0.1, loss_function='RMSE')
    model.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_test, y_test), plot=False)

    # Capture metrics
    # NOTE(review): for a regressor, score() is presumably R^2 rather than a
    # classification accuracy; the "accuracy" labels are kept so the logged
    # metric names stay the same. Confirm against the CatBoost docs.
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    print("Training accuracy: %.3f" % train_acc)
    print("Test data accuracy: %.3f" % test_acc)

    # Log to Azure ML
    run.log('Train accuracy', train_acc)
    run.log('Test accuracy', test_acc)

    return model

if __name__ == "__main__":
    main()

### Create your compute

In [None]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import ComputeTargetException

print(user)
compute_name = user + "-cluster"
print(compute_name)

# Reuse the compute target if the workspace already has one with this name;
# otherwise provision a cluster and wait for it to be ready.
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
except ComputeTargetException:
    config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D13",
        min_nodes=0,
        max_nodes=1,
    )
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=compute_name,
        provisioning_configuration=config,
    )
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=40,
    )

### Create your Run Config

In [None]:
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration

# pip packages installed into the training environment, added in this order.
dependencies = CondaDependencies()
for pip_requirement in ('numpy==1.17.0', 'joblib==0.14.1', 'scikit-learn', 'Catboost'):
    dependencies.add_pip_package(pip_requirement)

# Create a Run Configuration that targets the cluster and runs in Docker
# with the dependencies above.
run_config = RunConfiguration()
run_config.target = compute_name
run_config.environment.python.conda_dependencies = dependencies
run_config.environment.docker.enabled = True

### Select your training script and create a ScriptRunConfig
A ScriptRunConfig object packages together the environment from a RunConfiguration along with your model training script. This object can then be submitted to your experiment and model training will commence on your remote cluster. 

In this sample, we have put the training script in a separate directory which is targeted for training. This separation allows for a snapshot of just the relevant pieces of code to be stored with the Run in your AML workspace. The <code>train.py</code> file here accesses your registered dataset, trains a model, and saves the trained model to the run's <code>outputs</code> folder; the model itself is registered later from this notebook.

ScriptRunConfiguration documentation: https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.scriptrunconfig?view=azure-ml-py

In [None]:
from azureml.core import ScriptRunConfig

# Package ./train/train.py together with its --data_user argument ...
src = ScriptRunConfig(
    source_directory='./train',
    script='train.py',
    arguments=['--data_user', user],
)
# ... and attach the run configuration (compute target + environment).
src.run_config = run_config

### Submit the training run
Here, the ScriptRunConfiguration is submitted as a run which triggers your model training operation. The cluster you defined above is automatically spun up and the training procedures outlined in ./train/train.py begin. That file contains all the code needed to train and save a pickled version of your trained model. The code below will display the output logs from your training job - you can also monitor training progress inside AML studio.

Note: As you iterate on your model, you should modify the code inside ./train/train.py. The model parameters there were adjusted for rapid training and should not be used for a production scenario.

In [None]:
from azureml.widgets import RunDetails

# Submit the packaged script to the experiment, show the run widget, and
# block until the run finishes while streaming its logs.
run = experiment.submit(src)
RunDetails(run).show()
run.wait_for_completion(show_output=True)

In [None]:
import os
# Create a local "score" folder under the current working directory.
# This rebinds script_folder, which previously pointed at the "train" folder.
# NOTE(review): the scoring script below is written to ./score.py and the
# InferenceConfig later uses source_directory='.', so this folder does not
# appear to be used by the cells visible here. Confirm before relying on it.
script_folder = os.path.join(os.getcwd(), "score")
print(script_folder)
os.makedirs(script_folder, exist_ok=True)

In [None]:
%%writefile score.py

import json
import os
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.standard_py_parameter_type import StandardPythonParameterType

def init():
    """Populate the module-level ``model`` with the saved CatBoost regressor.

    The model file "cat-model" is looked up inside the directory named by the
    AZUREML_MODEL_DIR environment variable.
    """
    global model

    # Update to your model's filename
    saved_name = "cat-model"

    # AZUREML_MODEL_DIR is injected by AML
    base_dir = os.environ.get('AZUREML_MODEL_DIR')

    print("Model dir:", base_dir)
    print("Model filename:", saved_name)

    regressor = CatBoostRegressor()
    regressor.load_model(os.path.join(base_dir, saved_name))
    model = regressor

# Example request record used to generate the service's input schema.
# It lists only the feature columns: the target 'Item_Outlet_Sales' is what
# the model predicts, so it must not be advertised as an input.
input_sample = [{
        "Item_Identifier" : "FDA15",
        "Item_Weight":9.3,
        "Item_Fat_Content": "Low Fat",
        "Item_Visibility": 0.016047,
        "Item_Type" : "Dairy",
        "Item_MRP" : 249.8092,
        "Outlet_Identifier": "OUT049",
        "Outlet_Establishment_Year": 1999, 
        "Outlet_Size": "Medium", 
        "Outlet_Location_Type": "Tier 1", 
        "Outlet_Type": "Supermarket Type1"
        }
]
# Example response; shaped like the dict that run() returns on success.
output_sample = {"result": [8880.0]}

# This will automatically unmarshall the data parameter in the HTTP request
@input_schema('data', StandardPythonParameterType(input_sample))
@output_schema(StandardPythonParameterType(output_sample))
def run(data):
    """Score a batch of records with the loaded model.

    Args:
        data: list of dicts, one per record, keyed by feature name.

    Returns:
        ``{"result": [...]}`` with one prediction per record, or the error
        message as a string if scoring fails.
    """
    try:
        input_df = pd.DataFrame(data)
        # The training script drops the 'Item_Outlet_Sales' target before
        # fitting, so discard it when a caller sends it along; otherwise the
        # model receives a feature it was never trained on.
        input_df = input_df.drop(columns=["Item_Outlet_Sales"], errors="ignore")
        predictions = model.predict(input_df)

        return {"result": predictions.tolist()}
    except Exception as e:
        # Best effort: report the failure to the caller as text instead of
        # raising (existing contract of this endpoint).
        return str(e)

In [None]:
from azureml.core.model import Model

# Register the model file the training run saved under outputs/.
model_name = user + '-cat-boost'
trained_model = run.register_model(
    model_name=model_name,
    model_path='outputs/cat-model',
    tags={'Model Type': 'cat-boost regression'},
)

In [None]:
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig

# Inference environment: pip packages for scoring plus a pinned scikit-learn
# from conda.
env = Environment('tutorial-env')
cd = CondaDependencies.create(
    pip_packages=[
        'azureml-dataprep[pandas,fuse]>=1.1.14',
        'azureml-defaults',
        'catboost',
        'inference-schema',
    ],
    conda_packages=['scikit-learn==0.22.1'],
)
env.python.conda_dependencies = cd

# Register environment to re-use later
env.register(workspace=ws)

### Model Deployment

 You can register this model and deploy it to an endpoint by defining an inferencing configuration and providing a scoring script. Here the model is deployed to an Azure Container Instance which provides an API endpoint that can be used to make predictions with your model. We utilize an authentication strategy here which requires a key to be provided with any requests sent to the API. These keys can be rotated as needed and allow only approved users to access your endpoint.
 
 Azure Container Instance documentation: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-azure-container-instance

Azure Container Instances are typically lower cost and useful for dev/test purposes during model development, though we recommend deploying to an Azure Kubernetes Service cluster for production purposes.

Below, an InferenceConfig is created which uses the same python dependencies that were used during model training, and references the scoring script located at <code>./score.py</code>. This script loads the trained model upon initialization, and facilitates transforming data submitted to the API endpoint, making predictions with the model, and returning formatted results to the user.

In [None]:
from azureml.core.webservice import AciWebservice

# ACI deployment settings: 1 CPU core, 1 GB of memory.
# auth_enabled=False means this endpoint is deployed WITHOUT key
# authentication; requests to it need no Authorization header.
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    auth_enabled=False,
    tags={"data": "mart", "method": "cat-boost"},
    description='cat-boost-demo',
)

In [None]:
model_name

In [None]:
workspaces = Environment.list(workspace=ws)

In [None]:
print(workspaces)

### Register your model and deploy to an authenticated endpoint 

Model registration documentation: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-and-where

In [None]:
%%time
from azureml.core.webservice import Webservice
from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core import Workspace
from azureml.core.model import Model

ws = Workspace.from_config()
print('model_name:'+ model_name)

model = Model(ws, model_name)


myenv = Environment.get(workspace=ws, name="tutorial-env", version=None)
inference_config = InferenceConfig(source_directory='.', entry_script="score.py", environment=myenv)


### Deploy your model to ACI

In [None]:
# Deploy the registered model as "<model_name>-service4" using the ACI
# configuration, then block until the deployment finishes.
service = Model.deploy(
    workspace=ws,
    name=model_name + '-service4',
    models=[model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)
service.wait_for_deployment(show_output=True)

In [None]:
print('Scoring API available at: {}'.format(service.serialize()['scoringUri']))

In [None]:
import json
#label = "Item_Outlet_Sales"

# One sample record (features only) wrapped in the {"data": [...]} envelope
# that the scoring script's run(data) expects.
input_payload = json.dumps(
    {
        "data": [
            {
                "Item_Identifier": "FDA15",
                "Item_Weight": 9.3,
                "Item_Fat_Content": "Low Fat",
                "Item_Visibility": 0.016047,
                "Item_Type": "Dairy",
                "Item_MRP": 249.8092,
                "Outlet_Identifier": "OUT049",
                "Outlet_Establishment_Year": 1999,
                "Outlet_Size": "Medium",
                "Outlet_Location_Type": "Tier 1",
                "Outlet_Type": "Supermarket Type1",
            }
        ]
    }
)

# Call the deployed service through the SDK and show its response.
output = service.run(input_payload)

print(output)

In [None]:
import requests
import json


# Call the ACI scoring endpoint directly over HTTP with one sample record.
endpoint_url = service.serialize()['scoringUri']
headers = {
    "Content-Type": "application/json"
}
body =  {
  "data": [{
        "Item_Identifier" : "FDA15",
        "Item_Weight":9.3,
        "Item_Fat_Content": "Low Fat",
        "Item_Visibility": 0.016047,
        "Item_Type" : "Dairy",
        "Item_MRP" : 249.8092,
        "Outlet_Identifier": "OUT049",
        "Outlet_Establishment_Year": 1999, 
        "Outlet_Size": "Medium", 
        "Outlet_Location_Type": "Tier 1", 
        "Outlet_Type": "Supermarket Type1",
        }
]
}
# requests has no default timeout, so set one to keep the cell from hanging
# indefinitely on an unresponsive endpoint.
r = requests.post(endpoint_url, headers=headers, data=json.dumps(body), timeout=60)
# Surface HTTP errors (4xx/5xx) directly instead of failing later while
# trying to parse a non-JSON error body.
r.raise_for_status()
results = r.json()
results

### Retrieve access keys for your API endpoint
An authentication mechanism was added to the API endpoint that requires a unique key be provided with any requests to the API. These keys can be programmatically retrieved by users who have access to the AML workspace, or retrieved manually from the workspace. It is worth noting, these keys can be rotated at your discretion and old keys will no longer work.

Webservice documentation: https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.webservice(class)?view=azure-ml-py

## Deploy to AKS

Deploy AKS Cluster

In [None]:
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.core.compute import ComputeTarget, AksCompute

# Choose a name for your AKS cluster
aks_name = 'aks-mm'

# Provisioning settings for the AKS cluster: a single Standard_D3_v2 agent
# node in the eastus region.
# NOTE(review): the next cell rebuilds prov_config with default settings
# before creating the cluster, so these values are not the ones used there.
prov_config = AksCompute.provisioning_configuration(vm_size = "Standard_D3_v2",
                                                       agent_count = 1,
                                                       location = "eastus")

In [None]:
# Attach to the AKS cluster if the workspace already has one with this name;
# otherwise create it.
try:
    aks_target = ComputeTarget(workspace=ws, name=aks_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # Use the default configuration (can also provide parameters to customize).
    # This replaces any prov_config defined earlier in the notebook.
    prov_config = AksCompute.provisioning_configuration()

    # Create the cluster
    aks_target = ComputeTarget.create(
        workspace=ws,
        name=aks_name,
        provisioning_configuration=prov_config,
    )

# Wait only when the cluster is not yet reported as provisioned.
if not aks_target.get_status() == "Succeeded":
    aks_target.wait_for_completion(show_output=True)

In [None]:
from azureml.core.webservice import AksWebservice, Webservice
from azureml.core.model import Model

# Deploy the model to the AKS cluster as "cat-boost-sv", replacing any
# existing service of that name, then report its state and logs.
deployment_config = AksWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)
service = Model.deploy(
    workspace=ws,
    name="cat-boost-sv",
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config,
    deployment_target=aks_target,
    overwrite=True,
)
service.wait_for_deployment(show_output=True)
print(service.state)
print(service.get_logs())

In [None]:
# Fetch the two authentication keys of the AKS web service.
key1, Key2 = service.get_keys()
# Show only a short prefix: notebook outputs are often saved and shared, and
# printing the whole key would leak a working credential.
print(key1[:4] + '...')
selected_key = key1

In [None]:
import requests
import json

# Call the AKS scoring endpoint over HTTP with two sample records,
# authenticating with the service key selected above.
endpoint_url = service.serialize()['scoringUri']
headers = {
    "Authorization": "Bearer {}".format(selected_key),
    "Content-Type": "application/json"
}
body =  {
  "data": [{
        "Item_Identifier" : "FDA15",
        "Item_Weight":9.3,
        "Item_Fat_Content": "Low Fat",
        "Item_Visibility": 0.016047,
        "Item_Type" : "Dairy",
        "Item_MRP" : 249.8092,
        "Outlet_Identifier": "OUT049",
        "Outlet_Establishment_Year": 1999, 
        "Outlet_Size": "Medium", 
        "Outlet_Location_Type": "Tier 1", 
        "Outlet_Type": "Supermarket Type1",
        },
       {
        "Item_Identifier" : "FDA15",
        "Item_Weight":9.3,
        "Item_Fat_Content": "Low Fat",
        "Item_Visibility": 0.016047,
        "Item_Type" : "Dairy",
        "Item_MRP" : 249.8092,
        "Outlet_Identifier": "OUT049",
        "Outlet_Establishment_Year": 1999, 
        "Outlet_Size": "Medium", 
        "Outlet_Location_Type": "Tier 1", 
        "Outlet_Type": "Supermarket Type1",
        }
]
}
# requests has no default timeout, so set one to keep the cell from hanging
# indefinitely on an unresponsive endpoint.
r = requests.post(endpoint_url, headers=headers, data=json.dumps(body), timeout=60)
# Surface HTTP errors (e.g. 401 for a bad key) directly instead of failing
# later while trying to parse a non-JSON error body.
r.raise_for_status()
results = r.json()
results