<b>Install necessary Python updates as well as AutoML and analytic libraries (approx. 7 minutes)</b>

In [None]:
!pip install --upgrade pip --quiet
!pip install --upgrade google-cloud-automl --quiet
!pip install --upgrade google-cloud-logging --quiet

from google.cloud import storage, automl_v1beta1 as automl
import json
import pandas as pd
from pandas import json_normalize
import time
from datetime import datetime
import random 
import string
from google.cloud import logging
import re

# time.gmtime() is used so the printed time really is UTC, as the label claims
# (datetime.fromtimestamp() without a tz returns the machine's local time).
print(f'Requirement set at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')

<b>Create necessary variables (approx. 1 minute)</b>

In [None]:
# --- Notebook configuration ---------------------------------------------------
# Six lower-case alphanumeric characters, regenerated on every run; appended to
# the dataset and model display names so they are unique.
RANDOM = ''.join(random.choice(string.ascii_letters + string.digits) for _ in range(6)).lower() # RANDOM ID GENERATED
PROJECT_ID = 'auto-modeling-v402' # Project Identifier
REGION = 'us-central1' # Bucket region, not to be changed
RAW_SB = PROJECT_ID + '_raw_data' # Bucket holding the raw training data
MODELS_SB = PROJECT_ID + '_models_bucket' # Bucket receiving exported models
GCS_URI = 'gs://' + MODELS_SB # This must be an existing Cloud Storage
DISPLAY_NAME = 'bank_dataset' + '_' + RANDOM # Dataset name
MODEL_DN = 'bank_model' + '_' + RANDOM # Model name
TARGET_COLUMN = 'y' # Select a column to be the target
TRAIN_BUDGET = 2000 # Training budget in milli_node_hour (between 1000 and 72000)
# Derived from RAW_SB so the training-data location follows PROJECT_ID instead of
# silently pointing at a hard-coded bucket.
gcs_input_uris = ['gs://' + RAW_SB + '/raw_bank.csv'] # Dataset in CSV format with requirements: https://cloud.google.com/automl-tables/docs/prepare
MODEL_EXPORT = "tf_saved_model" # Model format for the export

# Summary of the prerequisites the user has to verify by hand.
# time.gmtime() makes the printed time really UTC, matching the label.
# The \x1b[31m ... \x1b[0m escape sequences wrap each value in ANSI colour codes.
print(f'Variables set at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}, double check following:')
print(f'   Project \x1b[31m{PROJECT_ID}\x1b[0m must exist...')
print(f'   Cloud Storage Bucket \x1b[31m{GCS_URI}\x1b[0m must exist...')
print(f'   Cloud Storage Bucket \x1b[31mgs://{RAW_SB}\x1b[0m must exist...')
print(f'   Training data \x1b[31m{gcs_input_uris}\x1b[0m must exist in a supported CSV format...')
print(f'   AutoML Tables Dataset name \x1b[31m{DISPLAY_NAME}\x1b[0m must be unique...')
print(f'   AutoML Tables Model name \x1b[31m{MODEL_DN}\x1b[0m must be unique...')

<b>Create the client instances (approx. 1 minute)</b>

In [None]:
# Service clients shared by the rest of the notebook.
log_client = logging.Client(project=PROJECT_ID)        # Cloud Logging, used to read AutoML logs
storage_client = storage.Client(project=PROJECT_ID)    # Cloud Storage
# GCS helper for the Tables client, bound to the raw-data bucket.
tables_gcs_client = automl.GcsClient(client=storage_client, bucket_name=RAW_SB)
automl_client = automl.AutoMlClient()
prediction_client = automl.PredictionServiceClient()
# High-level AutoML Tables client built on top of the clients created above.
tables_client = automl.TablesClient(
    project=PROJECT_ID,
    region=REGION,
    client=automl_client,
    gcs_client=tables_gcs_client,
    prediction_client=prediction_client,
)

# The message now describes what this cell did (it previously repeated the
# "variables set" text of the configuration cell); time.gmtime() makes the
# printed time really UTC.
print(f'Clients created at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')

<b>Create datasets in AutoML (approx. 1 minute)</b>

In [None]:
# Reuse the dataset named DISPLAY_NAME if it can be retrieved, otherwise create it.
# `new_dataset` records which path was taken; the import cell only imports data
# when it is True.
new_dataset = False
try:
    dataset = tables_client.get_dataset(dataset_display_name=DISPLAY_NAME)
    print(f'Dataset retrieved at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
except Exception as lookup_error:
    # `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate; the reason for the fallback is shown rather than hidden.
    print(f'Dataset {DISPLAY_NAME} could not be retrieved ({lookup_error!r}), creating it...')
    new_dataset = True
    dataset = tables_client.create_dataset(DISPLAY_NAME)
    print(f'Dataset created at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')


<b>Import data into AutoML datasets from GCS (approx. 5 minutes)</b>

In [None]:
# Import the CSV file(s) listed in gcs_input_uris into the dataset. The import is
# only attempted for a dataset created in this session (new_dataset is True).
print(f'Getting dataset import ready at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
if new_dataset:
    try:
        import_data_operation = tables_client.import_data(dataset=dataset, gcs_input_uris=gcs_input_uris)
        print('Dataset import operation: {}'.format(import_data_operation))
        # Block until the import operation has finished.
        import_data_operation.result()
        print(f'Dataset import ready at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
    except Exception as import_error:
        # Still best-effort (the cell does not raise), but the real cause is
        # reported instead of always claiming the data "was already imported".
        print(f'Dataset import did not complete at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}: {import_error!r}')
else:
    # Pre-existing dataset: nothing to import.
    print(f'Dataset was already imported at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')

<b>Set target columns and update nullable columns</b>

In [None]:
# Mark TARGET_COLUMN as the prediction target and flag every other column as nullable.
print(f'Getting dataset configuration ready at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
try:
    table = tables_client.set_target_column(dataset=dataset, column_spec_display_name=TARGET_COLUMN)

    # Keyword arguments are used (as in set_target_column above) so the calls do
    # not depend on the positional parameter order of the TablesClient methods.
    for col in tables_client.list_column_specs(dataset=dataset):
        if col.display_name == TARGET_COLUMN:
            # The target column is left as it is.
            continue
        tables_client.update_column_spec(
            dataset=dataset,
            column_spec_display_name=col.display_name,
            type_code=col.data_type.type_code,  # keep the current type, only change nullability
            nullable=True,
        )

    print(f'Dataset configuration ready at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
except Exception as config_error:
    # Still best-effort, but the underlying error is shown so that a problem other
    # than an unfinished import is not misreported.
    print(f'Dataset import not finished: try again in a few minutes, {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
    print(f'   Reason: {config_error!r}')

<b>Train the model (approx. 2 hours)</b>

In [None]:
# Reuse the model named MODEL_DN if it can be retrieved, otherwise train a new one.
# MODEL_ID (last segment of the model resource name) is set on BOTH paths; it was
# previously only assigned when a new model was trained, so later cells failed
# with a NameError whenever the model already existed.
print(f'Getting model trained at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
model = None
try:
    model = tables_client.get_model(model_display_name=MODEL_DN)
except Exception as lookup_error:
    # `except Exception` lets KeyboardInterrupt / SystemExit propagate.
    print(f'Model {MODEL_DN} could not be retrieved ({lookup_error!r}), training it...')
    response = tables_client.create_model(
        MODEL_DN,
        dataset=dataset,
        train_budget_milli_node_hours=TRAIN_BUDGET,
        exclude_column_spec_names=[TARGET_COLUMN]
    )
    print('Training model operation: {}'.format(response.operation))
    # Block until training has finished, then fetch the trained model by name.
    response.result()
    model = tables_client.get_model(model_display_name=MODEL_DN)

MODEL_ID = model.name.split("/")[-1]
print('Training model identifier: ' + MODEL_ID)

print(f'Model training ready at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')

<b>Generate evaluation metrics for analysis</b>

In [None]:
# Fetch the first evaluation listed for the model and show its headline
# classification metrics as a small table.
model_full_id = automl_client.model_path(PROJECT_ID, REGION, MODEL_ID)

# Take the first evaluation, if any. `evaluation` is always (re)assigned here, so
# an empty listing can no longer fall through to a value left over from an
# earlier run of this cell.
evaluation = next(iter(automl_client.list_model_evaluations(model_full_id, "")), None)
if evaluation is None:
    raise RuntimeError(f'No model evaluation found for model {MODEL_ID}')

# The evaluation id is the part of the resource name after "<MODEL_ID>/modelEvaluations/".
model_evaluation_id = evaluation.name.split("{}/modelEvaluations/".format(MODEL_ID))[1].split("\n")[0]
model_evaluation_full_id = automl_client.model_evaluation_path(PROJECT_ID, REGION, MODEL_ID, model_evaluation_id)
response = automl_client.get_model_evaluation(model_evaluation_full_id)

metrics = response.classification_evaluation_metrics
data = [
    ['AUC PR', metrics.au_prc],
    ['AUC ROC', metrics.au_roc],
    ['Log Loss', metrics.log_loss],
]
df = pd.DataFrame(data, columns=['Metric', 'Value'])
df

<b>Retrieve models and hyperparameter metrics for analysis (approx. 10 minutes)</b>

In [None]:
# Collect the hyperparameters logged for the trained model and show them as a
# DataFrame with the "Model type" column first.
print(f'Retrieving model hyperparameters configuration ready at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
model_hyperparameters = []

# initialize logger for AutoML Model logs
logger = log_client.logger('automl.googleapis.com%2Fmodel')

# list TablesModelStructure (e.g. hyperparameters of final model) entries for the specified model
# (named `log_filter` so the builtin `filter` is not shadowed for the rest of the session)
log_filter = f'resource.labels.job_id="{MODEL_ID}" jsonPayload.@type="type.googleapis.com/google.cloud.automl.master.TablesModelStructure"'

for entry in logger.list_entries(filter_=log_filter):
    # Missing payload sections are treated as empty instead of raising.
    payload = entry.to_api_repr().get("jsonPayload") or {}
    for model_parameter in payload.get("modelParameters") or []:
        model_hyperparameters.append(dict(model_parameter.get("hyperparameters") or {}))

print("Display as Pandas dataframe sorted by model type")
print("Note: one AutoML model can be a combination of multiple models with different hyperparameters!")
df = pd.DataFrame(model_hyperparameters)
if 'Model type' in df.columns:
    df = df.sort_values(by=["Model type"], ascending=True)
    # Move "Model type" to the front, keeping the other columns in order.
    columns = list(df.columns)
    columns.insert(0, columns.pop(columns.index('Model type')))
    df = df.reindex(columns=columns)
else:
    # No matching log entries were returned: show the unsorted (empty) frame
    # instead of failing with a KeyError on the missing column.
    print(f'No "Model type" hyperparameter entries found in the logs for model {MODEL_ID}')
df

<b>Retrieve trials tuning metrics for analysis (approx. 10 minutes)</b>

In [None]:
# Collect the hyperparameters and objective value of every logged tuning trial
# and show them as a DataFrame sorted by objective, best first.
print(f'Getting trials tunning configuration ready at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
tunning_trials_parameters = []

# initialize logger for AutoML Trials logs
logger = log_client.logger('automl.googleapis.com%2Ftuning')

# list TuningTrial entries (one per tuning trial) for the specified model
# (named `log_filter` so the builtin `filter` is not shadowed for the rest of the session)
log_filter = f'resource.labels.job_id="{MODEL_ID}" jsonPayload.@type="type.googleapis.com/google.cloud.automl.master.TuningTrial"'

for entry in logger.list_entries(filter_=log_filter):
    # Convert the entry once; missing payload sections are treated as empty.
    payload = entry.to_api_repr().get("jsonPayload") or {}
    trials_parameters = (payload.get("modelStructure") or {}).get("modelParameters") or []
    trainingObjectivePoint = (payload.get("trainingObjectivePoint") or {}).get("value")

    for trials_parameter in trials_parameters:
        # One row per model parameter set, tagged with the trial's objective value.
        hyperparameters = {"Objective": trainingObjectivePoint}
        hyperparameters.update(trials_parameter.get("hyperparameters") or {})
        tunning_trials_parameters.append(hyperparameters)

print("Display as Pandas dataframe sorted descending by trainingObjectivePoint")
df = pd.DataFrame(tunning_trials_parameters)
if df.empty:
    # No matching log entries were returned: show the empty frame instead of
    # failing with a KeyError on the missing "Objective" column.
    print(f'No tuning trial entries found in the logs for model {MODEL_ID}')
else:
    df = df.sort_values(by=["Objective"], ascending=False)
    if 'Model type' in df.columns:
        # Move "Model type" to the front, keeping the other columns in order.
        columns = list(df.columns)
        columns.insert(0, columns.pop(columns.index('Model type')))
        df = df.reindex(columns=columns)
df

<b>Export model to GCS (approx. 5 minutes)</b>

In [None]:
# Export the trained model (MODEL_ID) to Cloud Storage in the MODEL_EXPORT format.
# The trained `model` handle is no longer reset to None by this cell.
print(f'Getting model exported at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
EXPORT_SUFFIX = "/AUTOML_raw"
try:
    # Append the export folder only once so that re-running this cell does not
    # produce gs://.../AUTOML_raw/AUTOML_raw/...
    if not GCS_URI.endswith(EXPORT_SUFFIX):
        GCS_URI = GCS_URI + EXPORT_SUFFIX
    model_full_id = automl_client.model_path(PROJECT_ID, REGION, MODEL_ID)
    gcs_destination = automl.types.GcsDestination(output_uri_prefix=GCS_URI)
    output_config = automl.types.ModelExportOutputConfig(gcs_destination=gcs_destination, model_format=MODEL_EXPORT)
    response = automl_client.export_model(model_full_id, output_config)
    # export_model starts a long-running operation; wait for it to finish so the
    # "exported" message below is only printed once the export has completed.
    response.result()
    print(f'Model exported at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
except Exception as export_error:
    # Still best-effort, but the underlying error is shown so that a problem other
    # than an unfinished training is not misreported.
    print(f'Model not ready, try again in few hours, at {time.strftime("%Y-%m-%d, %H:%M:%S UTC", time.gmtime())}')
    print(f'   Reason: {export_error!r}')