In [1]:
import os
import azureml.core
from azureml.core import Workspace, Experiment, Datastore, Dataset
from azureml.widgets import RunDetails
from azureml.core import ScriptRunConfig

In [2]:
azureml.core.VERSION

'1.28.0'

In [4]:
from azureml.core import Workspace, Experiment

ws= Workspace.from_config()
#ws = Workspace.get(name="udacity-project")
exp = Experiment(workspace=ws, name="udacity-project")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

#run = exp.start_logging()

Workspace name: udacity_pipeline
Azure region: centralindia
Subscription id: 8bb0c18e-6b4b-4695-bd30-aff7b7ff6815
Resource group: udacity


In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
amlcompute_cluster_name = "cpu-cluster"
compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
aml_compute = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)
aml_compute.wait_for_completion(show_output=True)


Creating.........
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [9]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, choice
import os
from azureml.core import Environment

sklearn_env = Environment.get(workspace=ws, name='AzureML-sklearn-0.24-ubuntu18.04-py37-cuda11-gpu')

# Specify parameter sampler
ps= RandomParameterSampling({
    "C": uniform(0.8, 1.2),
    "max_iter": choice(75, 100, 120, 130)
    })

# Specify a Policy
policy = BanditPolicy(evaluation_interval= 4, slack_factor = 0.1)


# Create a SKLearn estimator for use with train.py
sk_est = ScriptRunConfig(
			source_directory= "./",
			compute_target= aml_compute,
			script= "train.py",
			environment= sklearn_env
			)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(ps,
										run_config= sk_est,
										primary_metric_name= "Accuracy",
										primary_metric_goal= PrimaryMetricGoal.MAXIMIZE,
										max_total_runs= 100,
										policy= policy
										)

In [10]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

exp_run_obj= exp.submit(hyperdrive_config)

In [11]:
RunDetails(exp_run_obj).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [12]:
for run in exp.get_runs():
    if run.status=='Completed' :
        print(type(run))
        exp_run_obj= run

<class 'azureml.train.hyperdrive.run.HyperDriveRun'>


In [13]:
import joblib

# Get your best run and save the model from that run.

if not os.path.exists("./outputs"):
	os.makedirs("./outputs")

best_run= exp_run_obj.get_best_run_by_primary_metric()
best_run_results= best_run.get_metrics()
print("Best Accuracy= {}".format(best_run_results['Accuracy']))
#model= best_run.register_model(model_name= "scikit_model", 
#						model_path= 'outputs/scikit_model.joblib')

Best Accuracy= 0.9132018209408195


In [14]:
best_run.get_file_names()

['azureml-logs/55_azureml-execution-tvmps_51990ec368e51b19c03cacb484bb3cb6b0659bf826e330802e927ff4ee50f5b8_d.txt',
 'azureml-logs/65_job_prep-tvmps_51990ec368e51b19c03cacb484bb3cb6b0659bf826e330802e927ff4ee50f5b8_d.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_51990ec368e51b19c03cacb484bb3cb6b0659bf826e330802e927ff4ee50f5b8_d.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'logs/azureml/95_azureml.log',
 'logs/azureml/dataprep/backgroundProcess.log',
 'logs/azureml/dataprep/backgroundProcess_Telemetry.log',
 'logs/azureml/job_prep_azureml.log',
 'logs/azureml/job_release_azureml.log',
 'outputs/model-C=0.8667554916594451--max_iter=130.joblib']

In [15]:
best_run.download_file(best_run.get_file_names()[-1])

In [16]:
best_run.register_model(model_name= 'hyperdrive_best', 
                        model_path= "./outputs/model-C=0.8667554916594451--max_iter=130.joblib")

Model(workspace=Workspace.create(name='udacity_pipeline', subscription_id='8bb0c18e-6b4b-4695-bd30-aff7b7ff6815', resource_group='udacity'), name=hyperdrive_best, id=hyperdrive_best:1, version=1, tags={}, properties={})

In [17]:
best_run.get_details()

{'runId': 'HD_6d8e2d09-af61-418e-8872-64b778c6a0f3_59',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-06-21T06:28:57.742536Z',
 'endTimeUtc': '2021-06-21T06:29:58.082795Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': 'd1ea2222-1eb2-4080-a768-a481185fe735',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [],
 'outputDatasets': [],
 'runDefinition': {'script': 'train.py',
  'command': '',
  'useAbsolutePath': False,
  'arguments': ['--C', '0.8667554916594451', '--max_iter', '130'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'cpu-cluster',
  'dataReferences': {},
  'data': {},
  'outputData': {},
  'jobName': None,
  'maxRunDurationSeconds': 2592000,
  'nodeCount': 1,
  'priority': None,
  'credentialPassthrough': False,
  'identity': None,
  'environment': {'name': 'AzureML-sklearn-0.

In [6]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core import Dataset
import pandas as pd

# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
data_url= "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
az_dataset= Dataset.Tabular.from_delimited_files(path= data_url)
print(az_dataset.to_pandas_dataframe().head())

   age          job  marital    education  default housing loan    contact  \
0   57   technician  married  high.school       no      no  yes   cellular   
1   55      unknown  married      unknown  unknown     yes   no  telephone   
2   33  blue-collar  married     basic.9y       no      no   no   cellular   
3   36       admin.  married  high.school       no      no   no  telephone   
4   27    housemaid  married  high.school       no     yes   no   cellular   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         1      failure         -1.8   
1   may         thu  ...         2    999         0  nonexistent          1.1   
2   may         fri  ...         1    999         1      failure         -1.8   
3   jun         fri  ...         4    999         0  nonexistent          1.4   
4   jul         fri  ...         2    999         0  nonexistent          1.4   

   cons.price.idx  cons.conf.idx  euribor3m 

In [7]:
from train import clean_data

# Use the clean_data function to clean your data.
X_df, y_df= clean_data(az_dataset)
X_df["y"]= y_df

In [8]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
default_storage= ws.get_default_datastore()
print(default_storage)
automl_data= Dataset.Tabular.register_pandas_dataframe(X_df, default_storage, name= "X_df")
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data= automl_data,
    label_column_name='y',
    compute_target= aml_compute,
    max_cores_per_iteration= 4,
    max_concurrent_iterations=4
    )

Method register_pandas_dataframe: This is an experimental method, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-6f6a9452-df06-4978-a929-8304a0fddb33",
  "account_name": "udacitypipelin8889280928",
  "protocol": "https",
  "endpoint": "core.windows.net"
}
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/2f5328af-3b63-427d-8495-90862f902b50/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [9]:
automl_data.to_pandas_dataframe().shape

(32950, 40)

In [10]:
# Submit your automl run
exp_automl = Experiment(workspace=ws, name="Automl-Experiment")
#exp.start_logging()
automl_run = exp_automl.submit(automl_config)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
Automl-Experiment,AutoML_3c655ad7-c5b7-40bb-9407-8d2c7455b3c1,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


In [11]:
RunDetails(automl_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [13]:
best_run, fitted_model= automl_run.get_output()
print(best_run)
print("\n\n")
print(fitted_model)

Package:azureml-automl-runtime, training version:1.30.0, current version:1.28.0.post2
Package:azureml-core, training version:1.30.0, current version:1.28.0
Package:azureml-dataset-runtime, training version:1.30.0, current version:1.28.0
Package:azureml-defaults, training version:1.30.0, current version:1.28.0
Package:azureml-interpret, training version:1.30.0, current version:1.28.0
Package:azureml-mlflow, training version:1.30.0, current version:1.28.0
Package:azureml-pipeline-core, training version:1.30.0, current version:1.28.0
Package:azureml-telemetry, training version:1.30.0, current version:1.28.0
Package:azureml-train-automl-client, training version:1.30.0, current version:1.28.0
Package:azureml-train-automl-runtime, training version:1.30.0, current version:1.28.0


Run(Experiment: Automl-Experiment,
Id: AutoML_3c655ad7-c5b7-40bb-9407-8d2c7455b3c1_89,
Type: azureml.scriptrun,
Status: Completed)



Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=False, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/moun...
    gpu_training_param_dict={'processing_unit_type': 'cpu'}
), random_state=0, reg_alpha=0.4166666666666667, reg_lambda=1.0416666666666667, subsample=0.5, tree_method='auto'))], verbose=False))], flatten_transform=None, weights=[0.16666666666666666, 0.25, 0.16666666666666666, 0.16666666666666666, 0.08333333333333333, 0.08333333333333333, 0.08333333333333333]))],
         verbose=False)


In [14]:
best_run.get_file_names()

['accuracy_table',
 'automl_driver.py',
 'azureml-logs/55_azureml-execution-tvmps_672bd27119c50910e1e0108e51d47cf67c5c3e37af6a92a4daa4bbbcf1c8b785_d.txt',
 'azureml-logs/65_job_prep-tvmps_672bd27119c50910e1e0108e51d47cf67c5c3e37af6a92a4daa4bbbcf1c8b785_d.txt',
 'azureml-logs/70_driver_log.txt',
 'azureml-logs/75_job_post-tvmps_672bd27119c50910e1e0108e51d47cf67c5c3e37af6a92a4daa4bbbcf1c8b785_d.txt',
 'azureml-logs/process_info.json',
 'azureml-logs/process_status.json',
 'confusion_matrix',
 'explanation/8bb668e8/classes.interpret.json',
 'explanation/8bb668e8/expected_values.interpret.json',
 'explanation/8bb668e8/features.interpret.json',
 'explanation/8bb668e8/global_names/0.interpret.json',
 'explanation/8bb668e8/global_rank/0.interpret.json',
 'explanation/8bb668e8/global_values/0.interpret.json',
 'explanation/8bb668e8/local_importance_values.interpret.json',
 'explanation/8bb668e8/per_class_names/0.interpret.json',
 'explanation/8bb668e8/per_class_rank/0.interpret.json',
 'explan

In [16]:
best_run.download_file('outputs/model.pkl', "outputs/auto_ml.pkl")
best_run.download_file('outputs/conda_env_v_1_0_0.yml', "outputs/auto_ml_conda_env_v_1_0_0.yml")
best_run.download_file('outputs/env_dependencies.json', "outputs/env_dependencies.json")

In [17]:
best_run.register_model(model_name= 'hyperdrive_best', 
                        model_path= "./outputs/model.pkl")

Model(workspace=Workspace.create(name='udacity_pipeline', subscription_id='8bb0c18e-6b4b-4695-bd30-aff7b7ff6815', resource_group='udacity'), name=hyperdrive_best, id=hyperdrive_best:2, version=2, tags={}, properties={})

In [18]:
aml_compute.delete()

Current provisioning state of AmlCompute is "Deleting"

