In [1]:
import datetime
import logging
import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from tensorflow import feature_column as fc
from tensorflow.keras import layers
from tensorflow.keras import models

import pandas as pd
import numpy as np


# Raise the threshold of the Python "tensorflow" logger to ERROR so that
# lower-severity records sent through that logger are dropped.
logging.getLogger("tensorflow").setLevel(logging.ERROR)

# Record the TensorFlow version this notebook was executed with.
print(tf.version.VERSION)

2.3.4


In [2]:
def generate_col_lists(train):
    """Sorts the columns of ``train`` into feature, label and exclusion groups.

    Membership is decided by substring match on the column name. The groups
    are computed independently, so one column can appear in several of them,
    and every group keeps the order of ``train.columns``.

    Args:
        train: Object exposing a ``columns`` iterable of column-name strings
            (e.g. a pandas DataFrame).
    Returns:
        Tuple ``(numeric_cols, string_cols, holiday_cols, exclusion_cols,
        label_cols)``, each a list of column names.
    """
    numeric_patterns = (
        'min_temp_T',
        'max_temp_T',
        'total_consumption_T_minus_',
        'day_of_year_sin_T0',
        'day_of_year_cos_T0',
    )
    string_patterns = ('user_id',)
    holiday_patterns = ('holiday_T',)
    label_patterns = ('total_consumption_T',)
    exclusion_patterns = ('time',)

    def _matches(name, patterns):
        # True as soon as one pattern occurs inside the column name.
        for pattern in patterns:
            if pattern in name:
                return True
        return False

    names = list(train.columns)

    numeric_cols = [n for n in names if _matches(n, numeric_patterns)]
    string_cols = [n for n in names if _matches(n, string_patterns)]
    holiday_cols = [n for n in names if _matches(n, holiday_patterns)]
    # Lagged consumption columns contain 'minus'; they are inputs, not labels.
    label_cols = [
        n for n in names
        if _matches(n, label_patterns) and 'minus' not in n
    ]
    exclusion_cols = [n for n in names if _matches(n, exclusion_patterns)]

    return numeric_cols, string_cols, holiday_cols, exclusion_cols, label_cols

def features_and_labels(row_data):
    """Separates a parsed CSV row dictionary into model features and labels.

    Reads the module-level ``exclusion_cols`` and ``label_cols`` lists.
    ``row_data`` is modified in place: excluded and label entries are removed.

    Args:
        row_data: Dictionary of CSV column names and tensor values.
    Returns:
        Tuple of (remaining feature dictionary, label tensor obtained by
        stacking the label columns along axis 1).
    """
    # Discard columns that must not reach the model.
    for excluded in exclusion_cols:
        row_data.pop(excluded)

    # Move every label column out of the features, keeping label_cols order.
    label_tensors = [row_data.pop(name) for name in label_cols]

    return row_data, tf.stack(label_tensors, axis=1)  # features, label


def load_dataset(pattern, columns, batch_size=1, mode=tf.estimator.ModeKeys.EVAL):
    """Loads dataset using the tf.data API from CSV files.

    Args:
        pattern: str, file pattern to glob into list of files.
        columns: iterable of str (or None), column names of the CSV files in
            file order. Converted to a plain list before being handed to
            TensorFlow, so a pandas Index is accepted.
        batch_size: int, the number of examples per batch.
        mode: tf.estimator.ModeKeys to determine if training or evaluating.
            Records are shuffled only for TRAIN, so evaluation reads the
            files in a stable order.
    Returns:
        `Dataset` object yielding ``(features, label)`` batches as produced
        by ``features_and_labels``. `make_csv_dataset` repeats indefinitely
        by default, so callers must bound iteration themselves
        (e.g. ``steps_per_epoch`` or ``.take``).
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Make a CSV dataset. make_csv_dataset already handles batching,
    # shuffling and repetition; shuffling is restricted to training so that
    # validation metrics are computed on the same records every time.
    dataset = tf.data.experimental.make_csv_dataset(
        file_pattern=pattern,
        batch_size=batch_size,
        column_names=None if columns is None else list(columns),
        shuffle=is_training)

    # Map dataset to features and label
    dataset = dataset.map(map_func=features_and_labels)  # features, label

    # Overlap input preprocessing with model execution; let tf.data choose
    # the buffer size (a literal 1 is a fixed one-batch buffer, not AUTOTUNE).
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset

def create_input_layers():
    """Builds one scalar Keras ``Input`` per feature column.

    Reads the module-level ``numeric_cols`` (float32 inputs) and
    ``holiday_cols`` / ``string_cols`` (string inputs).

    Returns:
        Dictionary mapping each column name to its ``layers.Input``.
    """
    # (column group, dtype of the Input created for each of its columns)
    dtype_by_group = (
        (numeric_cols, 'float32'),
        (holiday_cols, 'string'),
        (string_cols, 'string'),
    )

    inputs = {}
    for group, dtype in dtype_by_group:
        for colname in group:
            inputs[colname] = layers.Input(name=colname, shape=(), dtype=dtype)
    return inputs

def create_feature_columns(user_ids):
    """Builds the feature column describing every model input.

    Reads the module-level ``numeric_cols``, ``holiday_cols`` and
    ``string_cols`` lists.

    Args:
        user_ids: Vocabulary list used for the columns in ``string_cols``.
    Returns:
        Dictionary mapping column name to its feature column: a numeric
        column for ``numeric_cols``, and an indicator column over a
        vocabulary-list categorical column for the other two groups.
    """
    def _indicator(colname, vocabulary):
        # Categorical column over a fixed vocabulary, wrapped as an indicator.
        categorical = fc.categorical_column_with_vocabulary_list(
            key=colname, vocabulary_list=vocabulary)
        return fc.indicator_column(categorical)

    feature_columns = {}

    # numeric cols
    for colname in numeric_cols:
        feature_columns[colname] = fc.numeric_column(key=colname)

    # holiday cols: fixed three-value vocabulary
    for colname in holiday_cols:
        feature_columns[colname] = _indicator(
            colname, ['no holiday', 'minor', 'major'])

    # string cols: vocabulary supplied by the caller
    for colname in string_cols:
        feature_columns[colname] = _indicator(colname, user_ids)

    return feature_columns

In [3]:
def get_model_outputs(inputs):
    """Stacks the hidden layers and the regression head on top of ``inputs``.

    Args:
        inputs: Dense tensor used as inputs to model.
    Returns:
        Dense tensor output from the model (28 linear units).
    """
    # Four ReLU hidden layers, narrowing 300 -> 150 -> 84 -> 42.
    # NOTE(review): the last hidden layer is named "4h" while the others
    # follow "h<N>"; likely a typo, kept as-is so layer names do not change.
    hidden_specs = ((300, "h1"), (150, "h2"), (84, "h3"), (42, "4h"))

    activations = inputs
    for units, layer_name in hidden_specs:
        hidden = tf.keras.layers.Dense(
            units, activation="relu", name=layer_name)
        activations = hidden(activations)

    # Final output is a linear activation because this is regression
    head = tf.keras.layers.Dense(
        units=28, activation="linear", name="weight")
    return head(activations)

In [4]:
def rmse(y_true, y_pred):
    """Root-mean-squared error between predictions and targets.

    Args:
        y_true: tensor, true labels.
        y_pred: tensor, predicted labels.
    Returns:
        Tensor holding the square root of the mean of the squared
        differences, reduced over all elements.
    """
    residual = y_pred - y_true
    mean_squared_error = tf.reduce_mean(residual ** 2)
    return tf.sqrt(mean_squared_error)

In [5]:
def build_dnn_model(user_ids):
    """Assembles and compiles the DNN regressor with the Keras Functional API.

    Args:
        user_ids: Vocabulary forwarded to ``create_feature_columns``.
    Returns:
        `tf.keras.models.Model` object, compiled with the "adam" optimizer,
        "mse" loss and the ``rmse`` / "mse" metrics.
    """
    # One Keras Input per feature, keyed by column name.
    input_layers = create_input_layers()

    # Matching feature columns, keyed the same way.
    columns_by_name = create_feature_columns(user_ids)

    # DenseFeatures turns the dict of inputs into a single dense tensor.
    # The Functional API in Keras requires: LayerConstructor()(inputs)
    dense_features = tf.keras.layers.DenseFeatures(
        feature_columns=columns_by_name.values())
    dnn_inputs = dense_features(input_layers)

    # Hidden layers plus regression head.
    predictions = get_model_outputs(dnn_inputs)

    # Build model and compile it all together
    dnn_model = tf.keras.models.Model(
        inputs=input_layers, outputs=predictions)
    dnn_model.compile(optimizer="adam", loss="mse", metrics=[rmse, "mse"])

    return dnn_model



In [6]:
# Read the training CSV into a DataFrame; the following cell uses its column
# names and its user_id values to configure the input pipeline and the model.
train = pd.read_csv('./data/train_energy.csv')

In [None]:
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 1000
NUM_TRAIN_EXAMPLES = 10000 * 5  # training dataset repeats, it'll wrap around
NUM_EVALS = 5  # how many times to evaluate
# Enough to get a reasonable sample, but not so much that it slows down
NUM_EVAL_EXAMPLES = 10000


# Column groups are module-level globals: features_and_labels,
# create_input_layers and create_feature_columns read them directly.
numeric_cols, string_cols, holiday_cols, exclusion_cols, label_cols = generate_col_lists(train)

trainds = load_dataset(
    pattern="./data/train*",
    columns=train.columns,
    batch_size=TRAIN_BATCH_SIZE,
    mode=tf.estimator.ModeKeys.TRAIN)

# Cap evaluation at NUM_EVAL_EXAMPLES rows. The batch count is derived from
# TEST_BATCH_SIZE (not a literal) so the two constants cannot drift apart.
evalds = load_dataset(
    pattern="./data/test*",
    columns=train.columns,
    batch_size=TEST_BATCH_SIZE,
    mode=tf.estimator.ModeKeys.EVAL).take(count=NUM_EVAL_EXAMPLES // TEST_BATCH_SIZE)

# Split the training budget into NUM_EVALS "epochs" so that validation runs
# NUM_EVALS times over the course of training.
steps_per_epoch = NUM_TRAIN_EXAMPLES // (TRAIN_BATCH_SIZE * NUM_EVALS)

# One timestamped TensorBoard log directory per run.
logdir = os.path.join(
    "logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=logdir, histogram_freq=1)


# The distinct user ids of the training frame form the user_id vocabulary.
model = build_dnn_model(train.user_id.unique())

history = model.fit(
    trainds,
    validation_data=evalds,
    epochs=NUM_EVALS,
    steps_per_epoch=steps_per_epoch,
    callbacks=[tensorboard_callback])


2021-10-29 17:07:17.174363: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2299995000 Hz
2021-10-29 17:07:17.174739: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x557203e8ab60 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-10-29 17:07:17.174763: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2021-10-29 17:07:17.176946: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2021-10-29 17:07:19.681966: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session started.


Epoch 1/5


  [n for n in tensors.keys() if n not in ref_input_names])


  4/312 [..............................] - ETA: 13s - loss: 1773114112.0000 - rmse: 30854.0137 - mse: 1773114112.0000

2021-10-29 17:07:32.226533: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session started.
2021-10-29 17:07:32.283303: I tensorflow/core/profiler/rpc/client/save_profile.cc:176] Creating directory: logs/20211029-170719/train/plugins/profile/2021_10_29_17_07_32
2021-10-29 17:07:32.287189: I tensorflow/core/profiler/rpc/client/save_profile.cc:182] Dumped gzipped tool data for trace.json.gz to logs/20211029-170719/train/plugins/profile/2021_10_29_17_07_32/daniel-dlvm.trace.json.gz
2021-10-29 17:07:32.305653: I tensorflow/core/profiler/rpc/client/save_profile.cc:176] Creating directory: logs/20211029-170719/train/plugins/profile/2021_10_29_17_07_32
2021-10-29 17:07:32.306161: I tensorflow/core/profiler/rpc/client/save_profile.cc:182] Dumped gzipped tool data for memory_profile.json.gz to logs/20211029-170719/train/plugins/profile/2021_10_29_17_07_32/daniel-dlvm.memory_profile.json.gz
2021-10-29 17:07:32.306848: I tensorflow/python/profiler/internal/profiler_wrapper.cc:11

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5