# Installing Libraries & Dependencies

In [None]:
#!pip3 install -r requirements.txt

# Importing Libraries

In [None]:
import os
import shutil
from datetime import datetime
import pkg_resources
import tensorflow as tf
import tensorflow_transform as tft

# Log the library versions this notebook runs against
tf_version = tf.__version__
print('INFO: TF version -- {}'.format(tf_version))
tft_distribution = pkg_resources.get_distribution("tensorflow_transform")
print('INFO: TFT version -- {}'.format(tft_distribution.version))


# Input Arguments

Example of input arguments for the model training component

In [None]:
# Run configuration
PROJECT = 'irn-70656-dev-1307100302'         # GCP project id
REGION = 'europe-west1'                      # GCP compute region
BUCKET = 'bike-sharing-pipeline-metadata'    # bucket name, without the gs:// scheme
PIPELINE_VERSION = 'v0_1'
DATA_VERSION = '200909_154702'               # run folder holding the input data
# Timestamp (yymmdd_HHMMSS) taken when this cell runs; names the model output folder
MODEL_VERSION = format(datetime.now(), '%y%m%d_%H%M%S')

In [None]:
# Model inputs grouped by type, plus the record key and the label column
NUMERIC_FEATURE_KEYS = [
    "temp",
    "atemp",
    "humidity",
    "windspeed",
]
CATEGORICAL_FEATURE_KEYS = [
    "season",
    "weather",
    "daytype",
]
KEY_COLUMN = "datetime"
LABEL_COLUMN = "count"

def transformed_name(key):
    """Map a feature key to the name used for its feature column.

    The mapping is the identity: the key is returned unchanged.
    """
    name = key
    return name
    

# Setting Paths 

Set up the global path variables for the GCS files

In [None]:
# Locations of the run's input data and of this model's output
HANDLER = 'gs://' # ../ for local data, gs:// for cloud data

BASE_DIR = f'{HANDLER}{BUCKET}/{PIPELINE_VERSION}'
RUN_DIR = f'{BASE_DIR}/run/{DATA_VERSION}'
DATA_DIR = f'{RUN_DIR}/data_transform'
OUTPUT_DIR = f'{RUN_DIR}/model_training/{MODEL_VERSION}'

In [None]:
import os
# Export the configuration as environment variables so shell cells can read it
os.environ.update({
    'BUCKET': BUCKET,
    'PROJECT': PROJECT,
    'REGION': REGION,
    'PIPELINE_VERSION': PIPELINE_VERSION,
    'DATA_DIR': DATA_DIR,
    'OUTPUT_DIR': OUTPUT_DIR,
})

Set up GCP project

In [None]:
%%bash
# Point gcloud at the project and region exported by the previous cell.
# The expansions are quoted so a value is always passed as exactly one argument.
gcloud config set project "$PROJECT"
gcloud config set compute/region "$REGION"

# Train and Evaluate input functions

In [None]:
def input_fn(data_path, label_column, tf_transform_output, batch_size, mode = tf.estimator.ModeKeys.TRAIN):
    """Build a zero-argument function that reads GZIP TFRecord files as batches.

    Args:
        data_path: glob pattern of the TFRecord files to read
        label_column: name of the parsed feature that is split off as the label
        tf_transform_output: object whose transformed_feature_spec() provides
            the spec used to parse each serialized example
        batch_size: number of observations in a batch
        mode: tf estimator mode key; TRAIN shuffles and repeats indefinitely,
            any other mode reads the data exactly once

    Returns:
        input_fn: function returning a dataset of (features, label) batches
    """
    parsing_spec = tf_transform_output.transformed_feature_spec()

    def _split_features_and_label(serialized):
        # Parse one serialized example, then detach the label from the features
        features = tf.io.parse_single_example(serialized, parsing_spec)
        target = features.pop(label_column)
        return features, target

    def _input_fn():
        # Resolve the pattern to concrete files and read them in parallel
        records = tf.data.TFRecordDataset(
            filenames=tf.io.gfile.glob(data_path),
            compression_type = "GZIP",
            num_parallel_reads=5)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # Shuffle the serialized records; repeat without end, the caller
            # bounds training through its step count
            records = records.shuffle(buffer_size=10*batch_size)
            repeat_count = None
        else:
            repeat_count = 1 # end-of-input after one epoch

        return (
            records
            .repeat(repeat_count)
            .map(_split_features_and_label, num_parallel_calls=5)
            .batch(batch_size)
            .prefetch(buffer_size=1)
        )

    return _input_fn

# Feature Engineering

In [None]:
# Feature columns for the model's input layer
def create_feature_columns(tf_transform_output):
    """Build the feature columns for the numeric and categorical features.

    Each key in NUMERIC_FEATURE_KEYS becomes a numeric column. Each key in
    CATEGORICAL_FEATURE_KEYS becomes an indicator column over a string
    vocabulary file looked up by the feature's key, with no out-of-vocabulary
    buckets.

    Args:
        tf_transform_output: object exposing vocabulary_file_by_name()

    Returns:
        list of feature columns: the numeric columns followed by the
        indicator columns
    """
    def _indicator_for(key):
        vocabulary_column = tf.feature_column.categorical_column_with_vocabulary_file(
            transformed_name(key),
            vocabulary_file=tf_transform_output.vocabulary_file_by_name(vocab_filename=key),
            dtype=tf.dtypes.string,
            default_value=None,
            num_oov_buckets=0)
        return tf.feature_column.indicator_column(vocabulary_column)

    numeric = [
        tf.feature_column.numeric_column(transformed_name(key))
        for key in NUMERIC_FEATURE_KEYS
    ]
    indicators = [_indicator_for(key) for key in CATEGORICAL_FEATURE_KEYS]

    return numeric + indicators

# Build Custom Keras Model

In [None]:
# Evaluation Metrics
def rmse(labels, predictions): # Root Mean Squared Error
    """Return the square root of the mean squared difference labels - predictions."""
    squared_errors = tf.square(x = labels - predictions)
    return tf.sqrt(x = tf.reduce_mean(input_tensor = squared_errors))

def mae(labels, predictions): # Mean Absolute Error
    """Return the mean of the absolute difference labels - predictions."""
    absolute_errors = tf.abs(x = labels - predictions)
    return tf.reduce_mean(input_tensor = absolute_errors)

# Build Custom Keras Model
def create_keras_model(features_columns, hidden_units_1, hidden_units_2, hidden_units_3, learning_rate):
    """Build and compile a feed-forward network with three hidden layers.

    Args:
        features_columns: feature columns fed to the DenseFeatures input layer
        hidden_units_1: number of units of the first hidden layer
        hidden_units_2: number of units of the second hidden layer
        hidden_units_3: number of units of the third hidden layer
        learning_rate: learning rate given to the Adam optimizer

    Returns:
        compiled Sequential model with one linear output unit, a mean squared
        error loss, and the rmse and mae metrics
    """
    hidden_layers = [
        tf.keras.layers.Dense(units = units, activation = "relu", name = layer_name)
        for layer_name, units in (
            ("dense1", hidden_units_1),
            ("dense2", hidden_units_2),
            ("dense3", hidden_units_3),
        )
    ]

    model = tf.keras.Sequential(
        [tf.keras.layers.DenseFeatures(feature_columns=features_columns, name="input")]
        + hidden_layers
        + [tf.keras.layers.Dense(units = 1, activation = None, name = "output")]
    )

    # NOTE(review): this is the TF1-compat Adam optimizer; a native Keras
    # optimizer may be preferable - confirm how the exported model is loaded
    # downstream before switching.
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
    model.compile(
        optimizer = optimizer,
        loss = "mean_squared_error",
        metrics = [rmse, mae])

    return model

# Serving input function

# Train and Evaluate

To train our model, we call `train_and_evaluate`. It builds the compiled Keras model, trains it directly with `model.fit` on the `tf.data` datasets returned by `input_fn` (logging to TensorBoard under the output directory), and finally exports the trained model in the SavedModel format.

In [None]:
def get_dataset_size(file_path):
    """Count the records in the GZIP-compressed TFRecord files matching a pattern.

    Args:
        file_path: glob pattern of the TFRecord files to count

    Returns:
        total number of records over all matching files; 0 when no file
        matches or the files hold no record
    """
    options = tf.io.TFRecordOptions(compression_type='GZIP')
    # The count starts at zero so the result is the exact number of records
    return sum(
        1
        for path in tf.io.gfile.glob(file_path)
        for _ in tf.compat.v1.io.tf_record_iterator(path, options=options)
    )

In [None]:
def train_and_evaluate(params):
    """Train the Keras model on the transformed data, validate it, and export it.

    Args:
        params: dict with the keys
            "data_dir": directory holding the train*/val* TFRecord files and
                the tft_output folder
            "output_dir": directory receiving the TensorBoard logs and the
                exported model
            "hidden_units_1", "hidden_units_2", "hidden_units_3": number of
                units of the three hidden layers
            "batch_size": number of observations in a batch
            "num_epochs": number of passes over the training set
            "learning_rate": learning rate of the optimizer

    Returns:
        the trained Keras model

    Raises:
        KeyError: if a key listed above is missing from params
        ValueError: if no training record is found under data_dir
    """
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) # so loss is printed during training

    # Unpack the run parameters
    data_dir = params["data_dir"]
    output_dir = params["output_dir"]
    hidden_units_1 = params["hidden_units_1"]
    hidden_units_2 = params["hidden_units_2"]
    hidden_units_3 = params["hidden_units_3"]
    batch_size = params["batch_size"]
    num_epochs = params["num_epochs"]
    learning_rate = params["learning_rate"]

    # Setting up paths
    train_path = data_dir+'/train*'
    val_path = data_dir+'/val*'

    # Column split off from the parsed examples as the label
    label_column = 'count'

    # Training set size
    train_size = get_dataset_size(train_path)
    if train_size <= 0:
        raise ValueError('No training records found for pattern: {}'.format(train_path))

    # The training dataset repeats indefinitely, so fit() needs the number of
    # batches that make up ONE pass over the data. It must be an integer and
    # must not be multiplied by the epoch count, which fit() applies itself.
    steps_per_epoch = (train_size + batch_size - 1) // batch_size
    validation_steps = 5 # number of validation batches evaluated after each epoch

    tf_transform_output = tft.TFTransformOutput(os.path.join(data_dir, 'tft_output'))

    features_columns = create_feature_columns(tf_transform_output)

    keras_model = create_keras_model(features_columns, hidden_units_1, hidden_units_2,
                                     hidden_units_3, learning_rate)

    # Setup TensorBoard callback.
    log_dir = os.path.join(output_dir, 'logs')
    tensorboard_cb = tf.keras.callbacks.TensorBoard(
      log_dir=log_dir, histogram_freq=1)

    # Setup Metric callback. tf.summary.scalar only records a value when a
    # default summary writer is active, so the callback writes through its own
    # writer.
    metric_writer = tf.summary.create_file_writer(os.path.join(log_dir, 'metrics'))

    class metric_cb(tf.keras.callbacks.Callback):
        """Record the training rmse of each epoch under the 'rmse' summary tag."""

        def on_epoch_end(self, epoch, logs=None):
            with metric_writer.as_default():
                tf.summary.scalar('rmse', logs['rmse'], step=epoch)
            metric_writer.flush()

    train_data = input_fn(train_path, label_column, tf_transform_output, batch_size, tf.estimator.ModeKeys.TRAIN)()
    val_data = input_fn(val_path, label_column, tf_transform_output, batch_size, tf.estimator.ModeKeys.EVAL)()

    # Train keras model
    try:
        keras_model.fit(
            train_data,
            steps_per_epoch=steps_per_epoch,
            epochs=num_epochs,
            validation_data=val_data,
            validation_steps=validation_steps,
            verbose=1,
            callbacks=[tensorboard_cb, metric_cb()])
    finally:
        # Release the summary writer even when training fails
        metric_writer.close()

    export_path = os.path.join(output_dir, 'export')
    tf.keras.models.save_model(keras_model, export_path, overwrite = True, save_format="tf")
    print('Model exported to: {}'.format(export_path))

    return keras_model

In [None]:
# Locations and hyperparameters of this training run
params = dict(
    data_dir=DATA_DIR,
    output_dir=OUTPUT_DIR,
    hidden_units_1=16,
    hidden_units_2=32,
    hidden_units_3=64,
    batch_size=64,
    num_epochs=5,
    learning_rate=0.0001,
)
model = train_and_evaluate(params)

In [None]:
%load_ext tensorboard

In [None]:
#%tensorboard --logdir $OUTPUT_DIR