In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: quick-starts-ws-134212
Azure region: southcentralus
Subscription id: d4ad7261-832d-46b2-b093-22156001df5b
Resource group: aml-quickstarts-134212


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

amlcompute_cluster_name= "cpu-cluster"
compute_config = AmlCompute.provisioning_configuration(vm_size = "Standard_D2_V2", max_nodes= 4 )
aml_compute= ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
aml_compute.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive import normal, uniform, choice
import os


# Specify parameter sampler
ps = ps = RandomParameterSampling(parameter_space={"C": uniform(0.0, 1.0), 
                                 "max_iter": choice(50,100,150,200,250)})
    

# Specify a Policy
policy = BanditPolicy(evaluation_interval=1, slack_factor= 0.1, delay_evaluation= 5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory= "./",entry_script="train.py", compute_target= aml_compute)                                                     

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(estimator= est,
                                    hyperparameter_sampling= ps, 
                                    policy= policy,
                                    primary_metric_name= "Accuracy", 
                                    primary_metric_goal= PrimaryMetricGoal.MAXIMIZE, 
                                    max_total_runs= 40,
                                    max_concurrent_runs=4)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

hdr= exp.submit(hyperdrive_config)
RunDetails(hdr).show()
hdr.wait_for_completion(show_output= True)
assert(hdr.get_status() =="completed")



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_d67155dc-f6c8-4924-90da-2785a6cb8371
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_d67155dc-f6c8-4924-90da-2785a6cb8371?wsid=/subscriptions/d4ad7261-832d-46b2-b093-22156001df5b/resourcegroups/aml-quickstarts-134212/workspaces/quick-starts-ws-134212

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-10T12:40:06.272883][API][INFO]Experiment created<END>\n""<START>[2021-01-10T12:40:06.806806][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-01-10T12:40:06.987290][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-01-10T12:40:07.6688963Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_d67155dc-f6c8-4924-90da-2785a6cb8371
Web View: https://ml.azure.com/experiments/udacity-project/runs/HD_d67155dc-f6c8-4924-90da-2785a6cb8371?wsid=/subscriptions/d4ad7

AssertionError: 

In [6]:
import joblib
# Get your best run and save the model from that run.

best_run= hdr.get_best_run_by_primary_metric()
print(best_run.get_details()['runDefinition']['arguments'])



['--C', '0.48392209547769616', '--max_iter', '200']


In [7]:
print(best_run.get_file_names())

['azureml-logs/55_azureml-execution-tvmps_21e89a159af33e906c0841e5d0726c1a0856cbf3161fa5b337af366b90d501e0_d.txt', 'azureml-logs/65_job_prep-tvmps_21e89a159af33e906c0841e5d0726c1a0856cbf3161fa5b337af366b90d501e0_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_21e89a159af33e906c0841e5d0726c1a0856cbf3161fa5b337af366b90d501e0_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'logs/azureml/105_azureml.log', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/model.joblib']


In [7]:
model = best_run.register_model(model_name='hyperdrive_bestmodel', model_path='./outputs/model.joblib')

In [8]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds= TabularDatasetFactory.from_delimited_files('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv' )

In [9]:
from train import clean_data
from sklearn.model_selection import train_test_split

# Use the clean_data function to clean your data.
x, y = clean_data(ds)
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= 0.2, random_state= 10)

In [11]:
clean_df= x_train.copy(deep=True)
clean_df['y']= y_train
local_path= './prepared.csv'
data_csv= clean_df.to_csv(local_path)


In [12]:
datastore= ws.get_default_datastore()

In [13]:

datastore.upload(src_dir= './', target_path= './')

Uploading an estimated of 6 files
Uploading ./prepared.csv
Uploaded ./prepared.csv, 1 files out of an estimated total of 6
Uploading ./train.py
Uploaded ./train.py, 2 files out of an estimated total of 6
Uploading ./udacity-project.ipynb
Uploaded ./udacity-project.ipynb, 3 files out of an estimated total of 6
Uploading ./.ipynb_checkpoints/train-checkpoint.py
Uploaded ./.ipynb_checkpoints/train-checkpoint.py, 4 files out of an estimated total of 6
Uploading ./.ipynb_checkpoints/udacity-project-checkpoint.ipynb
Uploaded ./.ipynb_checkpoints/udacity-project-checkpoint.ipynb, 5 files out of an estimated total of 6
Uploading ./__pycache__/train.cpython-36.pyc
Uploaded ./__pycache__/train.cpython-36.pyc, 6 files out of an estimated total of 6
Uploaded 6 files


$AZUREML_DATAREFERENCE_workspaceblobstore

In [14]:
clean_dataset= TabularDatasetFactory.from_delimited_files(datastore.path('./prepared.csv'))

In [15]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    compute_target= aml_compute,
    experiment_timeout_minutes=30,
    task= "classification",
    primary_metric= 'accuracy',
    training_data=clean_dataset,
    label_column_name="y",
    n_cross_validations=5)

In [16]:
# Submit your automl run
remote_run= exp.submit(automl_config,show_output= False)

Running on remote.


In [17]:
# Retrieve and save your best automl model.
remote_run.wait_for_completion(show_output=True)
best_run, fitted_model = remote_run.get_output()
print(best_run)
print(fitted_model)



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of 

In [19]:
# Deleting Compute Cluster

aml_compute.delete()

Current provisioning state of AmlCompute is "Deleting"

