In [1]:
from azureml.core import Datastore

In [2]:
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core import Workspace, Dataset


from azureml.pipeline.core import Pipeline

from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

from azureml.core import Dataset, ComputeTarget
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep
from azureml.core import Experiment 
import os

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
# Authenticate interactively against the tenant named by the AML_TENANT_ID
# environment variable, then load the workspace from the local config file.
interactive_auth = InteractiveLoginAuthentication(tenant_id=os.environ["AML_TENANT_ID"])
ws = Workspace.from_config(auth=interactive_auth)
ws

Workspace.create(name='pipelines', subscription_id='65a1016d-0f67-45d2-b838-b8f373d6d52e', resource_group='laobri-ml')

In [4]:
compute_name = "cpu-compute3"
if compute_name not in ws.compute_targets:
    # The named compute target is not in the workspace yet: provision a
    # cluster that can scale between 0 and 1 STANDARD_D2_V2 nodes.
    print('creating a new compute target...')
    cluster_spec = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        min_nodes=0,
        max_nodes=1,
    )
    compute_target = ComputeTarget.create(ws, compute_name, cluster_spec)

    # Wait (up to 20 minutes) for provisioning to finish before continuing.
    compute_target.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )

    # Show the result
    print(compute_target.get_status().serialize())

In [5]:
# Get a handle to the cluster by name; this works whether the cluster was
# just created above or already existed in the workspace.
compute = AmlCompute(workspace=ws, name=compute_name)
print(compute)

AmlCompute(workspace=Workspace.create(name='pipelines', subscription_id='65a1016d-0f67-45d2-b838-b8f373d6d52e', resource_group='laobri-ml'), name=cpu-compute3, id=/subscriptions/65a1016d-0f67-45d2-b838-b8f373d6d52e/resourceGroups/laobri-ml/providers/Microsoft.MachineLearningServices/workspaces/pipelines/computes/cpu-compute3, type=AmlCompute, provisioning_state=Succeeded, location=westus, tags=None)


In [6]:
# The workspace's default datastore; later passed to PipelineData as the
# storage location for the pipeline's intermediate output.
datastore = ws.get_default_datastore()

In [7]:
# Packages the remote step environment needs: conda packages for the data
# preparation script, pip packages for the Azure ML SDK and dataset support.
conda_deps = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn', 'pyarrow'],
    pip_packages=['azureml-sdk', 'azureml-dataprep[fuse,pandas]'],
    pin_sdk_version=False,
)

# Run configuration shared by the pipeline steps, targeting the cluster.
aml_run_config = RunConfiguration()
aml_run_config.target = compute

python_env = aml_run_config.environment.python
# False => the environment is built from the conda dependencies given here
# rather than being supplied ready-made by the user.
python_env.user_managed_dependencies = False
python_env.conda_dependencies = conda_deps

## Step 0: Grab an open dataset and register it

This is the baseline data. If the `Dataset` is not already registered in the workspace, create and register it. This step is not part of the Pipeline.

In [8]:
if 'titanic_ds' not in ws.datasets:
    # Nothing registered under this name yet: build a TabularDataset from the
    # public Titanic CSV files and register it in the workspace.
    csv_urls = [
        'https://dprepdata.blob.core.windows.net/demo/Titanic.csv',
        'https://dprepdata.blob.core.windows.net/demo/Titanic2.csv',
    ]
    tabular_ds = Dataset.Tabular.from_delimited_files(path=csv_urls)
    tabular_ds.register(
        workspace=ws,
        name='titanic_ds',
        description='new titanic training data',
        create_new_version=True,
    )

# Always fetch the registered dataset by name, whether or not it was just created.
titanic_ds = Dataset.get_by_name(ws, 'titanic_ds')

In [9]:
# Show which dataset class was returned for 'titanic_ds'.
type(titanic_ds)

azureml.data.tabular_dataset.TabularDataset

In [10]:
if 'titanic_files_ds' not in ws.datasets:
    # Create a FileDataset (references to the raw files, not parsed rows) from
    # the Titanic CSV files and register it in the workspace.
    web_paths = ['https://dprepdata.blob.core.windows.net/demo/Titanic.csv',
                 'https://dprepdata.blob.core.windows.net/demo/Titanic2.csv']
    # Bound to its own name: reusing `titanic_ds` here would replace the
    # TabularDataset that the dataprep step takes as input with a FileDataset,
    # which the script's to_pandas_dataframe() call cannot handle.
    titanic_files_ds = Dataset.File.from_files(path=web_paths)

    titanic_files_ds.register(workspace=ws,
                              name='titanic_files_ds',
                              description='File Dataset of titanic training data',
                              create_new_version=True)

## Step 1: Dataprep

In [11]:
%%writefile dataprep.py
from azureml.core import Run

import pandas as pd 
import numpy as np 
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
import argparse

RANDOM_SEED=42

def prepare_age(df, random_state=None):
    """Impute missing ``Age`` values and replace ``Age`` with a 5-level ``Age_Group``.

    Missing ages are filled with random integers drawn uniformly from
    ``[int(mean - std), int(mean + std))`` of the ages that are present.
    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a numeric ``Age`` column (NaN marks missing values).
        random_state: Seed for the imputation draw. Defaults to the
            module-level ``RANDOM_SEED`` so repeated runs give identical output.

    Returns:
        The same DataFrame, with ``Age`` dropped and an integer ``Age_Group``
        column (quintile index 0-4) added.

    Raises:
        ValueError: If values are missing but no age is present to estimate
            the distribution from.
    """
    age = df["Age"]
    missing = age.isna()
    n_missing = int(missing.sum())

    # Only draw random ages when something actually needs filling.
    if n_missing:
        mean = age.mean()
        if pd.isna(mean):
            raise ValueError("cannot impute Age: every Age value is missing")
        std = age.std()
        if pd.isna(std):
            # A single known age has no sample standard deviation.
            std = 0.0

        # randint needs integer bounds with low < high.
        low, high = int(mean - std), int(mean + std)
        if high <= low:
            high = low + 1

        # A seeded generator keeps the imputed ages reproducible.
        seed = RANDOM_SEED if random_state is None else random_state
        rng = np.random.RandomState(seed)

        filled = age.copy()
        filled[missing] = rng.randint(low, high, size=n_missing)
        age = filled

    df["Age"] = age.astype(int)

    # Quantize age into 5 classes
    df['Age_Group'] = pd.qcut(df['Age'], 5, labels=False)
    df.drop(columns=['Age'], inplace=True)
    return df

def prepare_fare(df):
    """Replace ``Fare`` with a 5-level ``Fare_Group`` (quintile index 0-4).

    Missing fares are treated as 0 before binning. The frame is modified in
    place and also returned.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the df['Fare'] selection: with pandas copy-on-write that selection is a
    # temporary, so the in-place fill would never reach `df`.
    df['Fare'] = df['Fare'].fillna(0)
    df['Fare_Group'] = pd.qcut(df['Fare'], 5, labels=False)
    df.drop(columns=['Fare'], inplace=True)
    return df

def prepare_genders(df):
    """Encode ``Sex`` as an integer: male=0, female=1, anything else=2.

    Missing values and labels outside the mapping are encoded as the
    "unknown" code. The frame is modified in place and also returned.
    """
    genders = {"male": 0, "female": 1, "unknown": 2}
    encoded = df['Sex'].map(genders)
    # Assign back rather than fillna(inplace=True) on a column selection,
    # which does not update `df` under pandas copy-on-write.
    df['Sex'] = encoded.fillna(genders["unknown"]).astype(int)
    return df

def prepare_embarked(df):
    """Encode ``Embarked`` as an integer: S=0, C=1, Q=2, unknown=3.

    Empty strings, missing values and port codes outside the mapping are all
    encoded as "U" (unknown), so the resulting column has no NaN. The frame
    is modified in place and also returned.
    """
    ports = {"S": 0, "C": 1, "Q": 2, "U": 3}
    # Build the cleaned column and assign it back; replace/fillna with
    # inplace=True on a column selection does not update `df` under pandas
    # copy-on-write.
    cleaned = df['Embarked'].fillna('U').replace('', 'U')
    df['Embarked'] = cleaned.map(ports).fillna(ports['U']).astype(int)
    return df
    
# `os` is required for the output-directory handling below; it is imported
# here so the script does not fail with a NameError when it reaches that point.
import os

# The pipeline passes the destination for the prepared data on the command line.
parser = argparse.ArgumentParser()
parser.add_argument('--output_path', dest='output_path', required=True)
args = parser.parse_args()

# Load the named input dataset of this run and drop the columns not used as features.
titanic_ds = Run.get_context().input_datasets['titanic_ds']
df = titanic_ds.to_pandas_dataframe().drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
df = prepare_embarked(prepare_genders(prepare_fare(prepare_age(df))))

# Create the parent directory only when the path has one; os.makedirs('')
# raises for a bare file name.
output_dir = os.path.dirname(args.output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
pq.write_table(pa.Table.from_pandas(df), args.output_path)

print(f"Wrote prepped data to {args.output_path}")

Overwriting dataprep.py


In [12]:
# Intermediate pipeline output named "titanic_train", backed by `datastore`
# and converted with as_dataset() so later steps can consume it as a dataset.
prepped_data_path = PipelineData("titanic_train", datastore).as_dataset()

In [13]:
# The script reads this input via Run.get_context().input_datasets['titanic_ds'],
# so the input name must match that key.
titanic_input = titanic_ds.as_named_input("titanic_ds")

# Pipeline step that runs dataprep.py on the cluster and writes its result to
# the intermediate output passed as --output_path.
dataprep_step = PythonScriptStep(
    script_name="dataprep.py",
    name="dataprep",
    arguments=["--output_path", prepped_data_path],
    inputs=[titanic_input],
    outputs=[prepped_data_path],
    compute_target=compute,
    runconfig=aml_run_config,
)

## Step 2: Train with AutoMLStep

In [14]:
# Interpret the parquet output of the dataprep step as a tabular dataset.
prepped_data_potds = prepped_data_path.parse_parquet_files(file_extension=None)

train_model_folder = './scripts/trainmodel'
# Change iterations to a reasonable number (50) to get better accuracy
automl_settings = {
    "iteration_timeout_minutes" : 10,
    "iterations" : 2,
    "primary_metric" : 'AUC_weighted',
    "n_cross_validations" : 2
}

# Pass the full dataset plus the name of the label column. The separate
# X= / y= arguments are deprecated in AutoMLConfig in favour of
# training_data / label_column_name.
automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automated_ml_errors.log',
                             path = train_model_folder,
                             compute_target = compute,
                             run_configuration = aml_run_config,
                             featurization = 'auto',
                             training_data = prepped_data_potds,
                             label_column_name = 'Survived',
                             **automl_settings)

print("AutoML config created.")



AutoML config created.


In [15]:
# Wrap the AutoML configuration in a pipeline step. The step name is what the
# later cell matches on when it looks up this step's run.
train_step = AutoMLStep(
    name='AutoML_Classification',
    automl_config=automl_config,
    passthru_automl_config=False,
    allow_reuse=True,
)
print("train_step created.")

train_step created.


## Step 3: Register the model

## Submit it

In [16]:

# Experiment(workspace, name) returns the existing experiment with that name
# or creates it, so there is no need to list every experiment in the
# workspace and then look the new one up by key.
experiment = Experiment(workspace=ws, name='titanic_automl')

In [17]:
# Assemble the pipeline from the data preparation step and the AutoML training step.
pipeline = Pipeline(ws, [dataprep_step, train_step])

In [18]:
# Submit the pipeline as a run of the experiment.
run = experiment.submit(pipeline, show_output=True)

Created step dataprep [0237676b][3ce3ad59-16b1-4b97-b50d-105fb39b9464], (This step will run and generate new outputs)
Created step AutoML_Classification [ac25146d][5d8d606e-4425-4104-912e-e80036db4c6d], (This step will run and generate new outputs)
Submitted PipelineRun 346582f4-d0c3-4813-9ec7-951d6e793136
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/346582f4-d0c3-4813-9ec7-951d6e793136?wsid=/subscriptions/65a1016d-0f67-45d2-b838-b8f373d6d52e/resourcegroups/laobri-ml/workspaces/pipelines


In [19]:
# Wait for the pipeline run to complete, then display its run id.
run.wait_for_completion()
run.id

PipelineRunId: 346582f4-d0c3-4813-9ec7-951d6e793136
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/346582f4-d0c3-4813-9ec7-951d6e793136?wsid=/subscriptions/65a1016d-0f67-45d2-b838-b8f373d6d52e/resourcegroups/laobri-ml/workspaces/pipelines
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 47ba9fcc-c5d6-4b71-8751-14db926bbca6
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/47ba9fcc-c5d6-4b71-8751-14db926bbca6?wsid=/subscriptions/65a1016d-0f67-45d2-b838-b8f373d6d52e/resourcegroups/laobri-ml/workspaces/pipelines
StepRun( dataprep ) Status: NotStarted
StepRun( dataprep ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_fca2571acf4b6ad479fea6272f49fbd8f62b9b19bb54862876817458092b0ba5_d.txt
2020-04-25T02:38:13Z Starting output-watcher...
2020-04-25T02:38:13Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
Login Succeeded
Using default tag: latest
lates




StepRunId: 43a5ccc3-8432-4258-8c7d-eff101fb7e2e
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/43a5ccc3-8432-4258-8c7d-eff101fb7e2e?wsid=/subscriptions/65a1016d-0f67-45d2-b838-b8f373d6d52e/resourcegroups/laobri-ml/workspaces/pipelines
StepRun( AutoML_Classification ) Status: NotStarted
StepRun( AutoML_Classification ) Status: Queued
StepRun( AutoML_Classification ) Status: Running





StepRun(AutoML_Classification) Execution Summary
StepRun( AutoML_Classification ) Status: Finished
{'runId': '43a5ccc3-8432-4258-8c7d-eff101fb7e2e', 'target': 'cpu-compute3', 'status': 'Completed', 'startTimeUtc': '2020-04-25T02:40:50.110036Z', 'endTimeUtc': '2020-04-25T02:45:15.169786Z', 'properties': {'azureml.runsource': 'azureml.StepRun', 'ContentSnapshotId': 'dae51c9b-e0b5-4a15-8530-c7d3d422ade6', 'StepType': 'AutoMLStep', 'azureml.pipelinerunid': '346582f4-d0c3-4813-9ec7-951d6e793136', 'num_iterations': '2', 'training_type': 'TrainFull', 'acquisition_function': 'EI', 'metrics': 'accuracy', 'primary_metric': 'AUC_weighted', 'train_split': '0', 'MaxTimeSeconds': '600', 'acquisition_parameter': '0', 'num_cross_validation': '2', 'target': 'cpu-compute3', 'RawAMLSettingsString': "{'name':'titanic_automl','subscription_id':'65a1016d-0f67-45d2-b838-b8f373d6d52e','resource_group':'laobri-ml','workspace_name':'pipelines','path':'./scripts/trainmodel','iterations':2,'data_script':None,'pr



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '346582f4-d0c3-4813-9ec7-951d6e793136', 'status': 'Completed', 'startTimeUtc': '2020-04-25T02:37:36.238214Z', 'endTimeUtc': '2020-04-25T02:45:22.308356Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}'}, 'inputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://pipelines8090722083.blob.core.windows.net/azureml/ExperimentRun/dcid.346582f4-d0c3-4813-9ec7-951d6e793136/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=P9MVXKWBvD8dlCPm8ObRgd01uD3ZSty5DUOWKHc%2BZnk%3D&st=2020-04-25T02%3A37%3A09Z&se=2020-04-25T10%3A47%3A09Z&sp=r', 'logs/azureml/stderrlogs.txt': 'https://pipelines8090722083.blob.core.windows.net/azureml/ExperimentRun/dcid.346582f4-d0c3-4813-9ec7-951d6e793136/logs/azureml/stderrlogs.txt?sv=2019-02-02&sr=b&sig=EnlJHcVa8znVhoi7hOi%2FlpUIMgEIPYa50x86ZpwJfuk%3D&st=2020-04-25T02%3A37%3A09Z&se=2020-04-25T10%3A47%3A0

'346582f4-d0c3-4813-9ec7-951d6e793136'

In [20]:
# Find the child run that belongs to the AutoML step (matched by step name).
automl_run = next(
    child for child in run.get_children()
    if child.name == 'AutoML_Classification'
)

# The step exposes its metrics and its model as named outputs.
step_outputs = automl_run.get_outputs()
metrics = step_outputs['default_metrics_AutoML_Classification']
model = step_outputs['default_model_AutoML_Classification']

# Download both outputs into the current directory.
metrics.get_port_data_reference().download('.')
model.get_port_data_reference().download('.')

1

In [21]:
# Display the metrics output handle of the AutoML step.
metrics

<azureml.pipeline.core.run.StepRunOutput at 0x1049b7f90>

In [22]:
# Show the class of the model output handle.
type(model)

azureml.pipeline.core.run.StepRunOutput

In [23]:
# Display the model output handle of the AutoML step.
model

<azureml.pipeline.core.run.StepRunOutput at 0x1318cb810>