In [1]:
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()
ws.get_details()

#ws = Workspace.get(name='udacity-project')
#ws = ws.name
exp = Experiment(workspace=ws, name="udacity-project-06")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = exp.start_logging()

Workspace name: udacity-exercises
Azure region: westeurope
Subscription id: 8c3570f2-6a79-4490-a50a-4126862dd3a4
Resource group: udacity-exercises


In [2]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# TODO: Create compute cluster
# Use vm_size = "Standard_D2_V2" in your provisioning configuration.
# max_nodes should be no greater than 4.

### YOUR CODE HERE ###
project_clusterName = "proj-comcluster"

print(ws.name)

try:
    project_computeTarget = ComputeTarget(workspace = ws, name = project_clusterName)
    print('compute cluster: ' + project_clusterName)
except ComputeTargetException:
    print('a new compute target will be provision, pls wait...')
    project_clusterConfig = AmlCompute.provisioning_configuration(vm_size = 'Standard_D2_V2', max_nodes = 4) # new cluster specification

    project_computeTarget = ComputeTarget.create(ws, project_clusterName, project_clusterConfig) #create a new cluster
    project_computeTarget.wait_for_completion(show_output = True) # wait for cluster operation to complete
    print(project_computeTarget.get_status().serialize()) # get status of the new cluster 

udacity-exercises
compute cluster: proj-comcluster


In [3]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform
from azureml.train.hyperdrive import choice
import os

# Specify parameter sampler
### YOUR CODE HERE ###
ps = RandomParameterSampling(
    {'--C': uniform(1.0,2.0), '--max_iter': choice(100, 110, 120)}
)


# Specify a Policy
### YOUR CODE HERE ###
policy = BanditPolicy(slack_amount = 0.1,delay_evaluation = 1)


if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
### YOUR CODE HERE ###
est = SKLearn(source_directory='.', compute_target = project_computeTarget,
        entry_script='train.py')

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
### YOUR CODE HERE ###
hyperdrive_config = HyperDriveConfig(estimator = est,
                              hyperparameter_sampling = ps,
                              policy = policy,
                              primary_metric_name = 'accuracy',
                              primary_metric_goal = PrimaryMetricGoal.MAXIMIZE,
                              max_total_runs = 16,
                              max_concurrent_runs = 4)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.
'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


In [4]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

### YOUR CODE HERE ###
hdr_run = exp.submit(hyperdrive_config)
RunDetails(hdr_run).show()



_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [7]:
import joblib
# Get your best run and save the model from that run.

### YOUR CODE HERE ###
best_run_model = hdr_run.get_best_run_by_primary_metric()
best_run_metrics = hdr_run.get_metrics()
parameter_values = hdr_run.get_details()

model_name = 'Logistic_Reg_best_model' + '.pkl'
joblib.dump(best_run_model, model_name)

['Logistic_Reg_best_model.pkl']

In [10]:
from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.core import Dataset
# Create TabularDataset using TabularDatasetFactory
# Data is available at: 
# "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

### YOUR CODE HERE ###
ds_path = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"

ds = Dataset.Tabular.from_delimited_files(path=ds_path) 

In [11]:
#from trainautoml import clean_data
from train import auto_clean_data
from sklearn.model_selection import train_test_split

# Use the clean_data function to clean your data.
### YOUR DATA OBJECT HERE ###
#x = clean_data(ds)
x = auto_clean_data(ds)
x_train, x_test = train_test_split(x, test_size=0.1, random_state=11)

label = "y"

In [12]:
from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
# NOTE: DO NOT CHANGE THE experiment_timeout_minutes PARAMETER OR YOUR INSTANCE WILL TIME OUT.
# If you wish to run the experiment longer, you will need to run this notebook in your own
# Azure tenant, which will incur personal costs.
automl_config = AutoMLConfig(
    experiment_timeout_minutes = 30,
    task = 'classification',
    primary_metric = 'accuracy',
    training_data = x_train,
    label_column_name = label,
    n_cross_validations = 10)

In [13]:
from azureml.widgets import RunDetails
# Submit your automl run

### YOUR CODE HERE ###
run_automl = exp.submit(config = automl_config, show_output = True)

RunDetails(run_automl).show()
run_automl.wait_for_completion(show_output = True)

No run_configuration provided, running on local with default configuration
Running in the active local environment.


Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project-06,AutoML_99efb6c6-bcf3-46c7-b98b-d705c6e7162a,automl,Preparing,Link to Azure Machine Learning studio,Link to Documentation


Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetBalancing. Performing class balancing sweeping
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias toward

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project-06,AutoML_99efb6c6-bcf3-46c7-b98b-d705c6e7162a,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation




****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|3322                             |yes                              |29655                                 |
+---------------------------------+---------------------------------+--------------------------------------+

********************************************

{'runId': 'AutoML_99efb6c6-bcf3-46c7-b98b-d705c6e7162a',
 'target': 'local',
 'status': 'Completed',
 'startTimeUtc': '2021-06-04T15:31:49.441107Z',
 'endTimeUtc': '2021-06-04T16:04:27.989959Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '10',
  'target': 'local',
  'DataPrepJsonString': None,
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.28.0", "azureml-train": "1.28.0", "azureml-train-restclients-hyperdrive": "1.28.0", "azureml-train-core": "1.28.0", "azureml-train-automl": "1.28.0", "azureml-train-automl-runtime": "1.28.0", "azureml-train-automl-client": "1.28.0", "azureml-tensorboard": "1.28.0", "azureml-telemetry": "1.28.0", "azureml-sdk": "1.28.0", "azureml-samples": "0+unkno

In [19]:
# Retrieve and save your best automl model.
import joblib
### YOUR CODE HERE ###
best_run_automl_model, fitted_automl_model  = run_automl.get_output()

print(fitted_automl_model)

model_name = best_run_automl_model.properties['model_name']
#model_name = 'Logistic_autoML_model' + '.pkl'
joblib.dump(best_run_automl_model, model_name)

Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=False, enable_feature_sweeping=True, feature_sweeping_config={}, feature_sweeping_timeout=86400, featurization_config=None, force_text_dnn=False, is_cross_validation=True, is_onnx_compatible=False, observer=None, task='classification', working_dir='/mnt/batch/tasks/shared/LS_root/mount...
), random_state=0, reg_alpha=0, reg_lambda=0.5208333333333334, subsample=0.6, tree_method='auto'))], verbose=False)), ('9', Pipeline(memory=None, steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('logisticregression', LogisticRegression(C=2.559547922699533, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='saga', tol=0.0001, verbose=0, warm_start=False))], verbose=False)), ('4', Pipeline(memory=None, steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('randomforestclassifier', Rando

TypeError: can't pickle _thread.RLock objects

In [4]:
# code to delete the cluster
project_computeTarget.delete()