In [22]:
from azureml.core import Datastore

In [23]:
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core import Workspace, Dataset


from azureml.pipeline.core import Pipeline

from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

from azureml.core import Dataset, ComputeTarget
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep
from azureml.core import Experiment 
import os

In [24]:
ws = Workspace.from_config(auth=InteractiveLoginAuthentication(tenant_id=os.environ["AML_TENANT_ID"]))
ws

Workspace.create(name='pipelines', subscription_id='65a1016d-0f67-45d2-b838-b8f373d6d52e', resource_group='laobri-ml')

In [25]:
compute_name = "cpu-compute3"
if not compute_name in ws.compute_targets :
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D2_V2",
                                                                min_nodes=0,
                                                                max_nodes=1)
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # Show the result
    print(compute_target.get_status().serialize())


In [26]:
compute = AmlCompute(ws, compute_name)
print(compute)

AmlCompute(workspace=Workspace.create(name='pipelines', subscription_id='65a1016d-0f67-45d2-b838-b8f373d6d52e', resource_group='laobri-ml'), name=cpu-compute3, id=/subscriptions/65a1016d-0f67-45d2-b838-b8f373d6d52e/resourceGroups/laobri-ml/providers/Microsoft.MachineLearningServices/workspaces/pipelines/computes/cpu-compute3, type=AmlCompute, provisioning_state=Succeeded, location=westus, tags=None)


In [27]:
datastore = ws.get_default_datastore()

In [28]:
aml_run_config = RunConfiguration()
aml_run_config.target = compute

# Use conda_dependencies.yml to create a conda environment in the Docker image for execution
aml_run_config.environment.python.user_managed_dependencies = False

# Specify CondaDependencies obj, add necessary packages
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['pandas','scikit-learn', 'pyarrow'], 
    pip_packages=['azureml-sdk', 'azureml-dataprep[fuse,pandas]'], 
    pin_sdk_version=False)

## Step 0: Grab an open dataset and register it

This is baseline data. If the `Dataset` does not exist, create and register it. Not a part of the Pipeline.

In [29]:
if not 'titanic_ds' in ws.datasets.keys() :
    # create a TabularDataset from Titanic training data
    web_paths = ['https://dprepdata.blob.core.windows.net/demo/Titanic.csv',
                 'https://dprepdata.blob.core.windows.net/demo/Titanic2.csv']
    titanic_ds = Dataset.Tabular.from_delimited_files(path=web_paths)

    titanic_ds.register(workspace = ws,
                                     name = 'titanic_ds',
                                     description = 'new titanic training data',
                                     create_new_version = True)

titanic_ds = Dataset.get_by_name(ws, 'titanic_ds')

In [30]:
type(titanic_ds)

azureml.data.tabular_dataset.TabularDataset

In [31]:
if not 'titanic_files_ds' in ws.datasets.keys() :
    # create a TabularDataset from Titanic training data
    web_paths = ['https://dprepdata.blob.core.windows.net/demo/Titanic.csv',
                 'https://dprepdata.blob.core.windows.net/demo/Titanic2.csv']
    titanic_ds = Dataset.File.from_files(path=web_paths)

    titanic_ds.register(workspace = ws,
                                     name = 'titanic_files_ds',
                                     description = 'File Dataset of titanic training data',
                                     create_new_version = True)

# Scratch to understand Parquet

In [32]:
df = titanic_ds.to_pandas_dataframe()

In [33]:
import pyarrow as pa 
import pyarrow.parquet as pq

In [44]:
pa_table = pa.Table.from_pandas(df)
pa_table

pyarrow.Table
PassengerId: int64
Survived: int64
Pclass: int64
Name: string
Sex: string
Age: double
SibSp: int64
Parch: int64
Ticket: string
Fare: double
Cabin: string
Embarked: string

In [45]:
pq.write_table(pa_table, "titanic.parquet")

## Step 1: Dataprep

In [67]:
%%writefile dataprep.py
from azureml.core import Run

import pandas as pd 
import numpy as np 
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
import argparse

RANDOM_SEED=42

def prepare_age(df):
    # Fill in missing Age values from distribution of present Age values 
    mean = df["Age"].mean()
    std = df["Age"].std()
    is_null = df["Age"].isnull().sum()
    # compute enough (== is_null().sum()) random numbers between the mean, std
    rand_age = np.random.randint(mean - std, mean + std, size = is_null)
    # fill NaN values in Age column with random values generated
    age_slice = df["Age"].copy()
    age_slice[np.isnan(age_slice)] = rand_age
    df["Age"] = age_slice
    df["Age"] = df["Age"].astype(int)
    
    # Quantize age into 5 classes
    df['Age_Group'] = pd.qcut(df['Age'],5, labels=False)
    df.drop(['Age'], axis=1, inplace=True)
    return df

def prepare_fare(df):
    df['Fare'].fillna(0, inplace=True)
    df['Fare_Group'] = pd.qcut(df['Fare'],5,labels=False)
    df.drop(['Fare'], axis=1, inplace=True)
    return df 

def prepare_genders(df):
    genders = {"male": 0, "female": 1, "unknown": 2}
    df['Sex'] = df['Sex'].map(genders)
    df['Sex'].fillna(2, inplace=True)
    df['Sex'] = df['Sex'].astype(int)
    return df

def prepare_embarked(df):
    df['Embarked'].replace('', 'U', inplace=True)
    df['Embarked'].fillna('U', inplace=True)
    ports = {"S": 0, "C": 1, "Q": 2, "U": 3}
    df['Embarked'] = df['Embarked'].map(ports)
    return df
    
parser = argparse.ArgumentParser()
parser.add_argument('--train_path', dest='train_path', required=True)
parser.add_argument('--test_path', dest='test_path', required=True)
args = parser.parse_args()
    
titanic_ds = Run.get_context().input_datasets['titanic_ds']
df = titanic_ds.to_pandas_dataframe().drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
df = prepare_embarked(prepare_genders(prepare_fare(prepare_age(df))))

#train, test = train_test_split(df, test_size = 0.2)

# Write outpout
os.makedirs(os.path.dirname(args.train_path), exist_ok=True)
#os.makedirs(os.path.dirname(args.test_path), exist_ok=True)

pq.write_table(pa.Table.from_pandas(df), args.train_path)
#pq.write_table(pa.Table.from_pandas(test), args.test_path)

print(f"Wrote test to {args.train_path} and train to {args.test_path}")

Overwriting dataprep.py


In [68]:
train_path = PipelineData("titanic_train", datastore).as_dataset()
test_path = PipelineData("titanic_test", datastore).as_dataset()

In [69]:
dataprep_step = PythonScriptStep(
    name="dataprep", 
    script_name="dataprep.py", 
    compute_target=compute, 
    runconfig=aml_run_config,
    arguments=["--train_path", train_path, "--test_path", test_path],
    inputs=[titanic_ds.as_named_input("titanic_ds")],
    outputs=[train_path, test_path]
)

### Step 2: Train with AutoMLStep

In [70]:
# type(train_path) == PipelineOutputFileDataset
# type(train_potds) == PipelineOutputTabularDataset
train_potds = train_path.parse_parquet_files(file_extension=None)
#test_potds = test_path.parse_parquet_files(file_extension=None)

X = train_potds.drop_columns('Survived')
y = train_potds.keep_columns('Survived')
#test_X = test_potds.drop_columns('Survived')
#test_y = test_potds.keep_columns('Survived')

train_model_folder = './scripts/trainmodel'
# Change iterations to a reasonable number (50) to get better accuracy
automl_settings = {
    "iteration_timeout_minutes" : 10,
    "iterations" : 2,
    "primary_metric" : 'AUC_weighted',
    "n_cross_validations" : 2
}

automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automated_ml_errors.log',
                             path = train_model_folder,
                             compute_target = compute,
                             run_configuration = aml_run_config,
                             featurization = 'auto',
                             X = X,
                             y = y,
                             #X_valid = test_X,
                             #y_valid = test_y,
                             **automl_settings)
                             
print("AutoML config created.")



AutoML config created.


In [71]:
train_step = AutoMLStep(name='AutoML_Classification',
                                 automl_config=automl_config,
                                 passthru_automl_config=False,
                                 allow_reuse=True)
print("train_step created.")

train_step created.


## Submit it

In [72]:

if not 'titanic_automl' in ws.experiments.keys() :
    Experiment(ws, 'titanic_automl')
experiment = ws.experiments['titanic_automl']

In [73]:
pipeline = Pipeline(ws, [dataprep_step, train_step])

In [74]:
run = experiment.submit(pipeline, show_output=True)

Created step dataprep [046e7a47][07e3bc16-c21c-4645-b8fe-20f5c23ff544], (This step will run and generate new outputs)
Created step AutoML_Classification [00f1e9cb][783a685e-f34d-4e57-8d11-6068cd20b735], (This step will run and generate new outputs)
Submitted PipelineRun 9e4baf6c-532f-4b45-8a45-1f276f53b0f7
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/9e4baf6c-532f-4b45-8a45-1f276f53b0f7?wsid=/subscriptions/65a1016d-0f67-45d2-b838-b8f373d6d52e/resourcegroups/laobri-ml/workspaces/pipelines


In [94]:
run.wait_for_completion()
run.id

PipelineRunId: 9e4baf6c-532f-4b45-8a45-1f276f53b0f7
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/9e4baf6c-532f-4b45-8a45-1f276f53b0f7?wsid=/subscriptions/65a1016d-0f67-45d2-b838-b8f373d6d52e/resourcegroups/laobri-ml/workspaces/pipelines

PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '9e4baf6c-532f-4b45-8a45-1f276f53b0f7', 'status': 'Completed', 'startTimeUtc': '2020-04-24T21:53:25.999204Z', 'endTimeUtc': '2020-04-24T22:11:55.315969Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{}'}, 'inputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://pipelines8090722083.blob.core.windows.net/azureml/ExperimentRun/dcid.9e4baf6c-532f-4b45-8a45-1f276f53b0f7/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=2%2FwRmhWqeDOTvsnNY97%2FhYRK4yebP7%2BtGO%2FuOkIFnzM%3D&st=2020-04-24T22%3A04%3A25Z&se=2020-04-25T06%3A14%3A25Z&sp=r', 'logs/azur

'9e4baf6c-532f-4b45-8a45-1f276f53b0f7'

In [None]:
run.pipeline_outputs

In [96]:
metrics_output = run.get_pipeline_output('default_metrics_AutoML_Classification')
num_file_downloaded = metrics_output.download('.', show_progress=True)

ErrorResponseException: (BadRequest) PipelineRun output with name default_metrics_AutoML_Classification does not exist.
Parameter name: pipelineRunOutputName

In [91]:
child_runs[0].find_step_run("AutoML_classification")

AttributeError: 'StepRun' object has no attribute 'find_step_run'