In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="quick-starts-ws-130509")
exp = Experiment(workspace=ws, name="my-experiment")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-130509
Azure region: southcentralus
Subscription id: e8f628b3-bb5b-4edf-947a-8637ca6ea7c2
Resource group: aml-quickstarts-130509


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

# Choose a name for your CPU cluster
cpu_cluster_name = "cpucluster"

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
#from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive import uniform, choice
from azureml.core import ScriptRunConfig
from azureml.train.estimator import Estimator
from shutil import copyfile

import os

# Specify parameter sampler
ps = RandomParameterSampling({    
    "--C": uniform(1.0, 1.5),
    "--max_iter": choice(1000,3000)
})


# Specify a Policy
policy = BanditPolicy(evaluation_interval=2, slack_factor = 0.1)


if "training" not in os.listdir():
    os.mkdir("./training")
copyfile('train.py', "./training/train.py")
    
# Create a SKLearn estimator for use with train.py
est = Estimator(source_directory = 'training',
                conda_packages=['scikit-learn'],
                entry_script = 'train.py',
                compute_target=cpu_cluster)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hdc = HyperDriveConfig(estimator = est,
                       hyperparameter_sampling = ps,
                       policy = policy,
                       primary_metric_name = 'Accuracy',
                       primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                       max_total_runs=20,
                       max_concurrent_runs=4)

'Estimator' is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or an Azure ML curated environment.


In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.
hdr = exp.submit(config=hdc)



In [5]:
RunDetails(hdr).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [6]:
import joblib
# Get your best run and save the model from that run.
best_run = hdr.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameters_values = best_run.get_details()['runDefinition']['arguments']

print("Best run id: " , best_run.id),
print("Accuracy : " , best_run.get_metrics()['Accuracy']),
print("Params : " , parameters_values)

Best run id:  HD_f352acf5-7474-41ee-80c0-67e2163ecca0_10
Accuracy :  0.9103186646433991
Params :  ['--C', '1.2347051061064243', '--max_iter', '3000']


In [7]:
best_run.download_file(name='outputs/model.pkl', output_file_path='outputs')

In [3]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(path="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv")

In [4]:
from train import clean_data

# Use the clean_data function to clean your data.
x, y = clean_data(ds)

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [6]:
x_train['y'] = y_train.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [7]:
cpu_cluster

AmlCompute(workspace=Workspace.create(name='quick-starts-ws-130509', subscription_id='e8f628b3-bb5b-4edf-947a-8637ca6ea7c2', resource_group='aml-quickstarts-130509'), name=cpucluster, id=/subscriptions/e8f628b3-bb5b-4edf-947a-8637ca6ea7c2/resourceGroups/aml-quickstarts-130509/providers/Microsoft.MachineLearningServices/workspaces/quick-starts-ws-130509/computes/cpucluster, type=AmlCompute, provisioning_state=Succeeded, location=southcentralus, tags=None)

In [10]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    training_data=x_train,
    label_column_name='y',
    n_cross_validations=5)

In [11]:
# Submit your automl run

from azureml.core.experiment import Experiment

experiment = Experiment(ws, "automl_experiment")
run = experiment.submit(config=automl_config, show_output=True)

No run_configuration provided, running on local with default configuration
Running on local machine
Parent Run ID: AutoML_9bf0ef19-9929-424f-9038-0bbddb36d098

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/Auto

In [12]:
# Retrieve and save your best automl model.
best_run_automl = run.get_best_child()

In [13]:
best_run_automl.download_file(name='outputs/model.pkl', output_file_path='outputs/model_auto.pkl')

In [14]:
cpu_cluster.delete()