In [1]:
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
0,application_1613683661894_0002,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7ff87cd27f10>

# Prepare feature groups for training sales_model

In [2]:
import hsfs
# Create a connection
connection = hsfs.connection()
# Get the feature store handle for the project's feature store
fs = connection.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

## Data Engineering

We are going to use a dataset containing information related to a chain of deparment stores. The dataset is taken from [Kaggke](https://www.kaggle.com/manjeetsingh/retaildataset?select=Features+data+set.csv).

We are going to create 3 feature groups:
- `stores_fg`: it's going to contain features related to the store itself. Mainly the category, the number of deparmetns and the size.
- `sales_fg`: it's going to contain sales features for each store/deparment over the weeks. 
- `exogenous_fg`: it's going to contain features which are not related to the stores themselves, but they have an effect on sales. These features are, for instance, the gas price, the unemployment rate, temperature in the area and so on.

In [3]:
from hops import hdfs
from pyspark.sql import functions as F

stores_csv = spark.read\
             .option("inferSchema", "true")\
             .option("header", "true")\
             .format("csv")\
             .load("hdfs:///Projects/{}/Jupyter/hsfs/archive/stores data-set.csv".format(hdfs.project_name()))

exogenous_csv = spark.read\
             .option("inferSchema", "true")\
             .option("header", "true")\
             .format("csv")\
             .load("hdfs:///Projects/{}/Jupyter/hsfs/archive/Features data set.csv".format(hdfs.project_name()))

sales_csv = spark.read\
             .option("inferSchema", "true")\
             .option("header", "true")\
             .format("csv")\
             .load("hdfs:///Projects/{}/Jupyter/hsfs/archive/sales data-set.csv".format(hdfs.project_name()))

In [4]:
stores_depts_count = stores_csv\
                    .join(sales_csv, "store")\
                    .groupBy("store")\
                    .agg(F.countDistinct("dept"))\
                    .withColumnRenamed("count(DISTINCT dept)", "num_depts")

stores_fg = stores_csv\
            .join(stores_depts_count, "store")

#### Create `store_fg` feature group

Create a feature group named `store_fg`. The store is the primary key uniquely identifying all the remaining features in this feature group. Note that online feature store must be enabled by `online_enabled=True` to be able to retrieve featues of online serving. 

In [5]:
store_fg_meta = fs.create_feature_group(name="store_fg",
                                       version=1,
                                       primary_key=['store'],
                                       description="Store related features",
                                       time_travel_format=None,
                                       online_enabled=True, 
                                       statistics_config={"enabled": True, "histograms": True, "correlations": True})

In [6]:
store_fg_meta.save(stores_fg)

<hsfs.feature_group.FeatureGroup object at 0x7ff82f78c050>

#### Feature Engineering Sales

In [7]:
from pyspark.sql import Window
days = lambda i: i * 86400 

sales_df = sales_csv.withColumn('date', F.to_date("date", 'dd/MM/yyy'))\
                    .withColumn('timestamp', F.unix_timestamp("date"))

# Define aggregation window to compute sales performances over the past period of time
last_month_window_store_dep = Window.partitionBy(['store', 'dept']).orderBy(F.col("timestamp").cast("long")).rangeBetween(days(-30), days(-1))
last_quarter_window_store_dep = Window.partitionBy(['store', 'dept']).orderBy(F.col("timestamp").cast("long")).rangeBetween(days(-90), days(-1))
last_six_month_window_store_dep = Window.partitionBy(['store', 'dept']).orderBy(F.col("timestamp").cast("long")).rangeBetween(days(-180), days(-1))
last_year_window_store_dep = Window.partitionBy(['store', 'dept']).orderBy(F.col("timestamp").cast("long")).rangeBetween(days(-365), days(-1))

last_month_window_store = Window.partitionBy('store').orderBy(F.col("timestamp").cast("long")).rangeBetween(days(-30), days(-1))
last_quarter_window_store = Window.partitionBy('store').orderBy(F.col("timestamp").cast("long")).rangeBetween(days(-90), days(-1))
last_six_month_window_store = Window.partitionBy('store').orderBy(F.col("timestamp").cast("long")).rangeBetween(days(-180), days(-1))
last_year_window_store = Window.partitionBy('store').orderBy(F.col("timestamp").cast("long")).rangeBetween(days(-365), days(-1))

# Build feature group dataframe
sales_fg = sales_df.withColumn("sales_last_month_store_dep", F.sum("weekly_sales").over(last_month_window_store_dep))\
        .withColumn("sales_last_quarter_store_dep", F.sum("weekly_sales").over(last_quarter_window_store_dep))\
        .withColumn("sales_last_six_month_store_dep", F.sum("weekly_sales").over(last_six_month_window_store_dep))\
        .withColumn("sales_last_year_store_dep", F.sum("weekly_sales").over(last_year_window_store_dep))\
        .withColumn("sales_last_month_store", F.sum("weekly_sales").over(last_month_window_store))\
        .withColumn("sales_last_quarter_store", F.sum("weekly_sales").over(last_quarter_window_store))\
        .withColumn("sales_last_six_month_store", F.sum("weekly_sales").over(last_six_month_window_store))\
        .withColumn("sales_last_year_store", F.sum("weekly_sales").over(last_year_window_store))\
        .drop("timestamp")\
        .fillna(0)

#### Create `sales_fg` feature group

Differently from the `store_fg`, for the `sales_fg` we are going to define a composite primary key. This means that each entry in the `sales_fg` is going to be uniquely identified by the store, the department and the week. In this case we are going to specify also a partition key. Partitioning is a tool available at your disposal to improve the performances of querying a feature group. Note that online feature store must be enabled by `online_enabled=True` to be able to retrieve featues of online serving. 

In [8]:
sales_fg_meta = fs.create_feature_group(name="sales_fg",
                                        version=1,
                                        primary_key=['store', 'dept', 'date'],
                                        description="Sales related features",
                                        time_travel_format=None,  
                                        online_enabled=True,                                         
                                        statistics_config=False)

In [9]:
sales_fg_meta.save(sales_fg)

<hsfs.feature_group.FeatureGroup object at 0x7ff82fa11ad0>

#### Feature Engineering Exogenous features

This feature group will contain exogenous features that can influence sales, but are not under the control of the distribution chain. These are the unemployment, the consumer price index (cpi) and so on.
We are going to write these features as they are in the feature store. Note that online feature store must be enabled by `online_enabled=True` to be able to retrieve featues of online serving. 

In [10]:
exogenous_fg = exogenous_csv.withColumn('date', F.to_date("date", 'dd/MM/yyy'))

exogenous_fg_meta = fs.create_feature_group(name="exogenous_fg",
                                            version=1,
                                            primary_key=['store', 'date'],
                                            description="External features that influence sales, but are not under the control of the distribution chain",
                                            time_travel_format=None,   
                                            online_enabled=True,                                                                                     
                                            statistics_config={"enabled": True, "histograms": True, "correlations": True})
exogenous_fg_meta.save(exogenous_fg)

<hsfs.feature_group.FeatureGroup object at 0x7ff82fa11e90>

### Create training dataset using query object


In [11]:
sales_fg_meta = fs.get_feature_group("sales_fg",1)
store_fg_meta = fs.get_feature_group("store_fg",1)
exogenous_fg_meta = fs.get_feature_group("exogenous_fg",1)


In [12]:
query = sales_fg_meta.select(["weekly_sales", "sales_last_month_store", "sales_last_quarter_store", 
                         "sales_last_year_store_dep", "sales_last_month_store_dep", "sales_last_quarter_store_dep", 
                         "sales_last_six_month_store_dep", "sales_last_six_month_store", "sales_last_year_store"])\
                .join(store_fg_meta.select(["num_depts", "size"]))\
                .join(exogenous_fg_meta.select(['fuel_price']))

In [14]:
from hsfs.storage_connector import StorageConnector
td = fs.create_training_dataset(name="sales_model",
                               description="Dataset to train the sales model",
                               data_format="tfrecord",
                               label = ["weekly_sales"],                                
                               version=1)

td.save(query)

<hsfs.training_dataset.TrainingDataset object at 0x7ff82f86a150>

# Train Tensorflow Keras model and save with SavedModel format
---

<font color='red'> <h3>Tested with TensorFlow 2.4.0</h3></font>

## Retrieve traing dataset from feature store and prepare tf data input
### Input the training dataset to a model training loop
If you are training a model, HSFS provides `tf_data` method that returns `TFDataEngine` object with utility methods to read training dataset as `tf.data.Dataset` object to read the training dataset and feed it to a model training loop efficiently. 
* Currently `TFDataEngine` provides 2 utility methods `tf_record_dataset` and `tf_csv_dataset` for reading `.tfrecord` and `.csv` files, respectivelly.
* Both methods support only following feature types `string`, `short`, `int`, `long`, `float` and `double`.
* In both methods you can set `process` argument to `True` and they will return `PrefetchDataset` ready to input to model training loop.
* If you would like to apply your own logic to feature transformation using `tf.data.Dataset` then set `process` argument to `False`.

In [15]:
td = fs.get_training_dataset("sales_model", 1)

In [16]:
train_input = td.tf_data(target_name=td.label[0], is_training=True)

<p>
<h1>Machine Learning on <a href="https://github.com/logicalclocks/hopsworks">Hopsworks
</a></h1> 
</p>

## The `hops` python module

`hops` is a helper library for Hops that facilitates development by hiding the complexity of running applications and iteracting with services.

Have a feature request or encountered an issue? Please let us know on <a href="https://github.com/logicalclocks/hops-util-py">github</a>.

### Using the `experiment` module

To be able to run your Machine Learning code in Hopsworks, the code for the whole program needs to be provided and put inside a wrapper function. Everything, from importing libraries to reading data and defining the model and running the program needs to be put inside a wrapper function.

The `experiment` module provides an api to Python programs such as TensorFlow, Keras and PyTorch on a Hopsworks on any number of machines and GPUs.

An Experiment could be a single Python program, which we refer to as an **Experiment**. 

Grid search or genetic hyperparameter optimization such as differential evolution which runs several Experiments in parallel, which we refer to as **Parallel Experiment**. 

ParameterServerStrategy, CollectiveAllReduceStrategy and MultiworkerMirroredStrategy making multi-machine/multi-gpu training as simple as invoking a function for orchestration. This mode is referred to as **Distributed Training**.

### Using the `tensorboard` module
The `tensorboard` module allow us to get the log directory for summaries and checkpoints to be written to the TensorBoard we will see in a bit. The only function that we currently need to call is `tensorboard.logdir()`, which returns the path to the TensorBoard log directory. Furthermore, the content of this directory will be put in as a Dataset in your project's Experiments folder.

The directory could in practice be used to store other data that should be accessible after the experiment is finished.
```python
# Use this module to get the TensorBoard logdir
from hops import tensorboard
tensorboard_logdir = tensorboard.logdir()
```

### Using the `hdfs` module
The `hdfs` module provides a method to get the path in HopsFS where your data is stored, namely by calling `hdfs.project_path()`. The path resolves to the root path for your project, which is the view that you see when you click `Data Sets` in HopsWorks. To point where your actual data resides in the project you to append the full path from there to your Dataset. For example if you create a salse folder in your Resources Dataset, the path to the salse data would be `hdfs.project_path() + 'Resources/sales'`

```python
# Use this module to get the path to your project in HopsFS, then append the path to your Dataset in your project
from hops import hdfs
project_path = hdfs.project_path()
```

```python
# Downloading the sales dataset to the current working directory
from hops import hdfs
sales_hdfs_path = hdfs.project_path() + "Resources/sales"
local_sales_path = hdfs.copy_to_local(sales_hdfs_path)
```

### Documentation
See the following links to learn more about running experiments in Hopsworks

- <a href="https://hopsworks.readthedocs.io/en/latest/hopsml/experiment.html">Learn more about experiments</a>
<br>
- <a href="https://hopsworks.readthedocs.io/en/latest/hopsml/hopsML.html">Building End-To-End pipelines</a>
<br>
- Give us a star, create an issue or a feature request on  <a href="https://github.com/logicalclocks/hopsworks">Hopsworks github</a>

### Managing experiments
Experiments service provides a unified view of all the experiments run using the `experiment` module.
<br>
As demonstrated in the gif it provides general information about the experiment and the resulting metric. Experiments can be visualized meanwhile or after training in a TensorBoard.
<br>
<br>

In [17]:
def keras_sales_model():
    
    import os
    import sys
    import uuid
    import random
    
    import numpy as np
    
    from tensorflow import keras
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Dropout
    from tensorflow.keras.callbacks import TensorBoard
    from tensorflow.keras import backend as K

    import math
    from hops import tensorboard

    from hops import model as hops_model
    from hops import hdfs

    import pydoop.hdfs as pydoop

    train_input_processed = train_input.tf_record_dataset(process=True, batch_size =32, num_epochs=1)
    
    # Define a Keras Model.
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(11, activation='relu', input_shape=(11,)))
    model.add(tf.keras.layers.Dense(1))

    # Compile the model.
    model.compile(loss=tf.keras.losses.categorical_crossentropy,
                  optimizer= tf.keras.optimizers.Adam(0.001))
        
    callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir=tensorboard.logdir()),
        tf.keras.callbacks.ModelCheckpoint(filepath=tensorboard.logdir()),
    ]
    model.fit(train_input_processed, 
        verbose=0,
        epochs=5, 
        steps_per_epoch=5,
        validation_data=train_input_processed,
        validation_steps=1,                    
        callbacks=callbacks
    )
    
    score = model.evaluate(train_input_processed, steps=1)

    # Export model
    # WARNING(break-tutorial-inline-code): The following code snippet is
    # in-lined in tutorials, please update tutorial documents accordingly
    # whenever code changes.

    export_path = os.getcwd() + '/model-' + str(uuid.uuid4())
    print('Exporting trained model to: {}'.format(export_path))
    
    tf.saved_model.save(model, export_path)

    print('Done exporting!')
    
    metrics = {'loss': score}
    
    hops_model.export(export_path, "sales", metrics=metrics)    
    
    return metrics

In [None]:
from hops import experiment
from hops import hdfs

experiment.launch(keras_sales_model, name='sales', local_logdir=True, metric_key='loss')

# Query Model Repository for best sales Model

In [None]:
from hops import model
from hops.model import Metric
MODEL_NAME="sales"
EVALUATION_METRIC="loss"

In [None]:
best_model = model.get_best_model(MODEL_NAME, EVALUATION_METRIC, Metric.MIN)

In [None]:
print('Model name: ' + best_model['name'])
print('Model version: ' + str(best_model['version']))
print(best_model['metrics'])

# Create Model Serving of Exported Model

In [None]:
from hops import serving

In [None]:
# Create serving
model_path="/Models/" + best_model['name']
response = serving.create_or_update(model_path, MODEL_NAME, serving_type="TENSORFLOW", 
                                 model_version=best_model['version'])

In [None]:
# List all available servings in the project
for s in serving.get_all():
    print(s.name)

In [None]:
# Get serving status
serving.get_status(MODEL_NAME)

# Check Model Serving for active servings

# Start Model Serving Server

In [24]:
if serving.get_status(MODEL_NAME) == 'Stopped':
    serving.start(MODEL_NAME)

In [25]:
import time
while serving.get_status(MODEL_NAME) != "Running":
    time.sleep(5) # Let the serving startup correctly
time.sleep(5)

#### hsfs TrainingDataset object provides utility method `get_serving_vector` to retrieve serving vector from online feature store. This method assumes that all feature groups used to creates this training dataset are online enabled. 
##### `get_serving_vector` method expect dict object where keys are feature primary key names. To identify with primary key names are used for this training dataset query use `serving_keys` method

In [15]:
td = fs.get_training_dataset("sales_model",1)

In [27]:
td.init_prepared_statement() #this is need to get serving_keys, however not necessary for `get_serving_vector` method
td.serving_keys

{'store', 'date', 'dept'}

### For demo purposes lets prepare list of primary key values that we are interest to get model score

In [28]:
incoming_data = [(31,"2010-02-05",47),
                 (2,"2010-02-12",92),
                 (20,"2010-03-05",11),
                 (4,"2010-04-02",52),
                 (12,"2010-05-07",27)
                ]


# Send Prediction Requests to the Served Model using Hopsworks REST API

In [29]:
TOPIC_NAME = serving.get_kafka_topic(MODEL_NAME)

#### iterate over incoming_data and use `td.get_serving_vector` to retrieve serving vector for each primary key combination

In [30]:
import json
for i in incoming_data:
    serving_vector = td.get_serving_vector({'store': i[0],'date': i[1], 'dept': i[2]})
    data = {
                "signature_name": "serving_default", "instances": [serving_vector]
            }
    response = serving.make_inference_request(MODEL_NAME, data)
    print(response)

{'predictions': [[-42484.6484]]}
{'predictions': [[-649627.875]]}
{'predictions': [[-2381425.5]]}
{'predictions': [[-7573677.0]]}
{'predictions': [[-6883642.0]]}