In [25]:
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

 Author: Jose Brache \
 Email: jbrache@google.com \
<img src="img/google-cloud-icon.jpg" alt="Drawing" style="width: 200px;"/>

### Create training application package

The easiest (and recommended) way to create a training application package uses gcloud to package and upload the application when you submit your training job. This method allows you to create a very simple file structure. For this tutorial, the file structure of your training application package should appear similar to the following:

```
setup.py
config/
    config.yaml
trainer/
    __init__.py
    task.py
    model.py
    metadata.py
    util.py
```




**Set Your Application Name, Task Name, and Directories.**

In [1]:
# Names and locations of the training application package that the
# following cells write to disk.
TASK_TYPE = "custom-py-pkg"
TASK_NAME = TASK_TYPE
TASK_DIR = "./" + TASK_NAME
PYTHON_PACKAGE_APPLICATION_DIR = TASK_NAME + "/trainer"

# Echo the resolved values so the reader can confirm them before files are written.
print("Task Name:      {}".format(TASK_NAME))
print("Task Directory: {}".format(TASK_DIR))
print("Python Package Directory: {}".format(PYTHON_PACKAGE_APPLICATION_DIR))

Task Name:      custom-py-pkg
Task Directory: ./custom-py-pkg
Python Package Directory: custom-py-pkg/trainer


In [2]:
# Create the trainer package directory (including missing parents) and an
# empty __init__.py inside it.
!mkdir -p $PYTHON_PACKAGE_APPLICATION_DIR
!touch $PYTHON_PACKAGE_APPLICATION_DIR/__init__.py

In [3]:
%%writefile {TASK_DIR}/setup.py
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from setuptools import find_packages
from setuptools import setup

# REQUIRED_PACKAGES = [
#     'tensorflow==2.1.0',
#     'numpy==1.18.0',
#     'pandas==1.2.1',
#     'scipy==1.4.1',
#     'scikit-learn==0.22',
#     'google-cloud-storage==1.23.0',
#     'xgboost==1.3.3',
#     'cloudml-hypertune',
#     ]
 
# Packaging metadata for the training application.
# Packages are discovered with find_packages() rather than listed by hand.
setup(
    name='trainer',
    version='0.1',
    install_requires=(),    # no dependencies are declared; pass install_requires=REQUIRED_PACKAGES (see the commented-out list above) to declare them
    packages=find_packages(),
    include_package_data=True,
    description='Trainer package for scikit-learn Task'
)

Writing ./custom-py-pkg/setup.py


In [4]:
%%writefile {PYTHON_PACKAGE_APPLICATION_DIR}/__init__.py
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.



Overwriting custom-py-pkg/trainer/__init__.py


Create your training code. (The example shown here uses scikit-learn to train an Isolation Forest anomaly detector on structured data.)

In [5]:
%%writefile {PYTHON_PACKAGE_APPLICATION_DIR}/util.py
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Hold utility functions."""

import os

import pandas as pd
import tensorflow as tf

from sklearn import model_selection as ms
#from sklearn.externals import joblib
import joblib
from trainer import metadata

def data_train_test_split(data_df, test_size=0.2, random_state=None):
    """Split a DataFrame into training and validation features and targets.

    The target column is metadata.TARGET_NAME, which must name a column of
    data_df. Features are metadata.FEATURE_NAMES when that is set, otherwise
    every column except the target.

    Args:
      data_df: (pandas.DataFrame) DataFrame the splitting is performed on.
      test_size: (Optional) passed to sklearn's train_test_split; the
        default of 0.2 matches the previously hard-coded value.
      random_state: (Optional) passed to sklearn's train_test_split so the
        split can be made reproducible; None keeps the previous unseeded
        behaviour.
    Returns:
      A tuple (x_train, y_train, x_val, y_val). The feature parts are the
      `.values` of the split feature frames; the target parts are returned
      as produced by train_test_split.
    """

    if metadata.FEATURE_NAMES is None:
        # Use all the columns as features, except for the target column
        feature_names = list(data_df.columns)
        feature_names.remove(metadata.TARGET_NAME)
        features = data_df[feature_names]
    else:
        # Only use metadata.FEATURE_NAMES
        features = data_df[metadata.FEATURE_NAMES]
    target = data_df[metadata.TARGET_NAME]

    x_train, x_val, y_train, y_val = ms.train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state)
    return x_train.values, y_train, x_val.values, y_val


def read_df_from_bigquery(full_table_path, project_id=None, num_samples=None):
    """Run metadata.BASE_QUERY against a BigQuery table and return the rows.

    Args:
      full_table_path: (string) full path of the table containing training
        data in the format of [project_id.dataset_name.table_name]; it is
        substituted into the `{table}` placeholder of metadata.BASE_QUERY.
      project_id: (string, Optional) project ID forwarded to pandas.read_gbq.
      num_samples: (int, Optional) when truthy, appended to the query as a
        LIMIT clause; None or 0 leaves the query unlimited.
    Returns:
      pandas.DataFrame
    """
    limit_clause = ' LIMIT {}'.format(num_samples) if num_samples else ''
    sql = metadata.BASE_QUERY.format(table=full_table_path) + limit_clause

    # Standard SQL dialect; no credentials are passed explicitly here.
    return pd.read_gbq(sql, project_id=project_id, dialect='standard')


def read_df_from_gcs(file_pattern):
    """Read every CSV file matching file_pattern into one DataFrame.

    When metadata.CSV_COLUMNS is None, each file is read with pandas'
    default header handling. Otherwise the files are read as header-less
    and metadata.CSV_COLUMNS supplies the column names.

    Args:
      file_pattern: (string) pattern of the files containing training data.
        For example: [gs://bucket/folder_name/prefix]
    Returns:
      pandas.DataFrame: the per-file frames concatenated in glob order
      (each file's own row index is kept).
    Raises:
      ValueError: if no file matches file_pattern.
    """
    matched_paths = tf.io.gfile.glob(file_pattern)
    if not matched_paths:
        # pd.concat([]) would raise a ValueError that does not mention the
        # pattern; raise the same exception type with an actionable message.
        raise ValueError(
            'No files matched the pattern: {}'.format(file_pattern))

    df_list = []
    for filepath in matched_paths:
        # Files are read through a GFile handle rather than a local copy.
        with tf.io.gfile.GFile(filepath, 'r') as f:
            if metadata.CSV_COLUMNS is None:
                df_list.append(pd.read_csv(f))
            else:
                df_list.append(pd.read_csv(f, names=metadata.CSV_COLUMNS,
                                           header=None))

    return pd.concat(df_list)


def upload_to_gcs(local_path, gcs_path):
    """Copy a file to a destination path via tf.io.gfile.

    Args:
      local_path: (string) path of the file to copy
      gcs_path: (string) destination path, e.g. on Google Cloud Storage
    Returns:
      None
    """
    # overwrite=False is the library default, spelled out here: an existing
    # destination is not replaced.
    tf.io.gfile.copy(local_path, gcs_path, overwrite=False)


def dump_object(object_to_dump, output_path):
    """Serialize an object with joblib and write it to output_path.

    Args:
      object_to_dump: Python object to be serialized
      output_path: (string) output path which can be Google Cloud Storage
    Returns:
      None
    """
    parent_dir = os.path.dirname(output_path)
    # Only create the parent directory when the target does not exist yet,
    # and never call makedirs with an empty path (bare file name).
    if parent_dir and not tf.io.gfile.exists(output_path):
        tf.io.gfile.makedirs(parent_dir)

    # joblib writes bytes, so the file is opened in binary mode.
    with tf.io.gfile.GFile(output_path, 'wb') as wf:
        joblib.dump(object_to_dump, wf)


def boolean_mask(columns, target_columns):
    """Flag which entries of columns appear in target_columns.

    Args:
      columns: (List[string]), list of all columns considered.
      target_columns: (List[string]), columns whose position
        should be marked True.
    Returns:
      List[bool], one entry per element of columns, in the same order.
    """
    wanted = set(target_columns)
    mask = []
    for column_name in columns:
        mask.append(column_name in wanted)
    return mask

Writing custom-py-pkg/trainer/util.py


In [6]:
%%writefile {PYTHON_PACKAGE_APPLICATION_DIR}/metadata.py
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Dataset metadata."""

# Column names applied to header-less CSV input.
# If the input CSV file has a header row, set CSV_COLUMNS = None instead;
# otherwise list the target and feature names in file order.
CSV_COLUMNS = ['dimension_1', 'dimension_2']

# Name of the target column, e.g. TARGET_NAME = 'tip'.
# Read by util.data_train_test_split; None means no target is configured.
TARGET_NAME = None

# Features used for training. With None, every available column except the
# target column is used, e.g.
# FEATURE_NAMES = ['trip_miles', 'trip_seconds', 'fare', 'trip_start_month',
#                  'trip_start_hour', 'trip_start_day']
FEATURE_NAMES = None

# File name of the exported model; use 'model.joblib' when the model is
# serialized with joblib.
MODEL_FILE_NAME = 'model.joblib'

# Set to True if you want to tune some hyperparameters.
# (The identifier's spelling is kept as-is so existing references still work.)
HYPERPARAMTER_TUNING = False

# Query template used only when the dataset is read from BigQuery;
# `{table}` is replaced with the full table path.
BASE_QUERY = '''
    SELECT
      *
    FROM
      `{table}`
  '''

Writing custom-py-pkg/trainer/metadata.py


In [7]:
%%writefile {PYTHON_PACKAGE_APPLICATION_DIR}/model.py
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""ML model definitions."""

import numpy as np
from sklearn.ensemble import IsolationForest

# This is just a minimal example; a more powerful TensorFlow example is available at:
# https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/self-paced-labs/ai-platform-qwikstart/ai_platform_qwik_start.ipynb

def get_estimator(arguments):
    """Build an untrained Isolation Forest estimator for anomaly detection.

    Args:
      arguments: parsed command-line arguments passed on from task.py; the
        `max_samples` and `random_state_seed` attributes are read.
    Returns:
      An sklearn.ensemble.IsolationForest instance that still needs to be
      fitted.
    """
    forest_params = {
        # max_samples: "auto", int or float, default="auto".
        # The number of samples to draw from X to train each base estimator.
        'max_samples': arguments.max_samples,
        # random_state: int, RandomState instance or None, default=None.
        # Controls the pseudo-randomness of the feature and split-value
        # selection for each branching step and each tree in the forest.
        'random_state': arguments.random_state_seed,
    }
    return IsolationForest(**forest_params)

Writing custom-py-pkg/trainer/model.py


In [8]:
%%writefile {PYTHON_PACKAGE_APPLICATION_DIR}/task.py
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Executes model training and evaluation."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import logging
import os

#import hypertune
import numpy as np
from datetime import datetime
from sklearn.ensemble import IsolationForest

from trainer import model
from trainer import util
from trainer import metadata

def train_and_evaluate(estimator, dataset, output_dir):
    """Fit the estimator on the dataset and export the fitted model.

    Args:
      estimator: object exposing `fit`; it is fitted on `dataset` and then
        serialized
      dataset: training data handed to `estimator.fit` unchanged
      output_dir: (string), directory that the trained model will be exported
        to, under the file name metadata.MODEL_FILE_NAME
    Returns:
      None
    """
    # The whole dataset is used for fitting; no train/validation split and
    # no evaluation metrics are produced here.
    estimator.fit(dataset)

    util.dump_object(
        estimator,
        os.path.join(output_dir, metadata.MODEL_FILE_NAME))

def run_experiment(arguments):
    """Load the training data, build the estimator and run training.

    Args:
      arguments: parsed command-line arguments; `input` and `job_dir` are
        read here and the whole object is forwarded to model.get_estimator.
    Returns:
      None
    """
    logging.info('Arguments: %s', arguments)

    # Read the CSV data matching --input and convert it to a NumPy array.
    training_data = util.read_df_from_gcs(arguments.input).to_numpy()

    # Build the (unfitted) estimator, then fit and export it to --job-dir.
    train_and_evaluate(
        model.get_estimator(arguments),
        training_data,
        arguments.job_dir)

def parse_args(argv=None):
    """Parses command-line arguments.

    Args:
      argv: (List[string], Optional) argument strings to parse. When None
        (the default), argparse reads them from sys.argv[1:], which is the
        original behaviour. Passing a list lets the parser be driven from a
        notebook or a test.
    Returns:
      argparse.Namespace with the attributes log_level, input, job_dir,
      max_samples, random_state_seed, n_estimators and max_depth.
    """

    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--log-level',
        help='Logging level.',
        choices=[
            'DEBUG',
            'ERROR',
            'FATAL',
            'INFO',
            'WARN',
        ],
        default='INFO',
    )

    parser.add_argument(
        '--input',
        help='''Dataset to use for training and evaluation.
              Can be BigQuery table or a file (CSV).
              If BigQuery table, specify as as PROJECT_ID.DATASET.TABLE_NAME.
            ''',
        required=True,
    )

    parser.add_argument(
        '--job-dir',
        help='Output directory for exporting model and other metadata.',
        required=True,
    )

    parser.add_argument(
        '--max-samples',
        type=int,
        default=100,
        help='maximum number of random samples to generate, default=100')

    parser.add_argument(
        '--random-state-seed',
        type=int,
        default=42,
        help='random seed used to initialize the pseudo-random number generator, default=42')

    # NOTE(review): --n-estimators and --max-depth are parsed but nothing in
    # this module reads them; confirm whether the model code is meant to
    # consume them.
    parser.add_argument(
        '--n-estimators',
        help='Number of trees in the forest.',
        default=10,
        type=int,
    )

    parser.add_argument(
        '--max-depth',
        help='The maximum depth of the tree.',
        type=int,
        default=3,
    )

    return parser.parse_args(argv)

if __name__ == '__main__':
    # Entry point: parse the arguments, configure logging, run the
    # experiment and log how long it took.
    import time  # local import: only needed when run as a script

    arguments = parse_args()
    logging.basicConfig(level=arguments.log_level)
    # Use a monotonic performance counter for the elapsed time instead of
    # the deprecated datetime.utcnow(); it is unaffected by wall-clock
    # adjustments.
    time_start = time.perf_counter()
    run_experiment(arguments)
    time_elapsed = time.perf_counter() - time_start
    logging.info('Experiment elapsed time: %s seconds', time_elapsed)

Writing custom-py-pkg/trainer/task.py


### Configure for Vertex AI Training
Create the configuration file for Cloud AI Platform training.

In [9]:
# Create the config directory (including missing parents); the next cell
# writes config.yaml into it.
!mkdir -p $TASK_NAME/config

In [10]:
%%writefile $TASK_NAME/config/config.yaml
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#trainingInput:
#  scaleTier: CUSTOM
#  masterType: n1-highmem-8
#  masterConfig:
#    acceleratorConfig:
#      count: 1
#      type: NVIDIA_TESLA_T4

# Scale tier for the training job. The enum value is spelled with an
# underscore (STANDARD_1); the hyphenated form is not a valid ScaleTier.
trainingInput:
  scaleTier: STANDARD_1


Writing custom-py-pkg/config/config.yaml


### Next, open the **sklearn-pb-ctr.ipynb** notebook