## TFMA Notebook example

This notebook describes how to export your model for TFMA and demonstrates the analysis tooling it offers.

Note: Please make sure to follow the instructions in [README.md](https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi/README.md) when running this notebook.

## Setup

Import necessary packages.

In [1]:
from  hops import hdfs
import tensorflow_data_validation as tfdv
import tensorflow_model_analysis as tfma
import tensorflow_metadata as tfm
from tensorflow.python.lib.io import file_io
import os

W0820 17:16:27.998271 139825583544128 deprecation_wrapper.py:119] From /srv/hops/anaconda/anaconda/envs/python27/lib/python2.7/site-packages/tensorflow_transform/beam/common.py:51: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0820 17:16:28.007344 139825583544128 deprecation_wrapper.py:119] From /srv/hops/anaconda/anaconda/envs/python27/lib/python2.7/site-packages/tensorflow_transform/beam/impl.py:283: The name tf.SparseTensorValue is deprecated. Please use tf.compat.v1.SparseTensorValue instead.



Helper functions and some constants for running the notebook locally.

In [2]:
# Root of the project, as reported by the hops `hdfs` helper.
BASE_DIR = hdfs.project_path(exclude_nn_addr=True)

# Top-level working directories under the project root.
DATA_DIR = os.path.join(BASE_DIR, 'Resources/data')
TMP_DIR = os.path.join(BASE_DIR, 'Resources/taxi_tmp')
OUTPUT_DIR = os.path.join(BASE_DIR, 'Resources/taxi_out')

# Input data, split into training and evaluation sets.
TRAIN_DATA_DIR = os.path.join(DATA_DIR, 'train')
EVAL_DATA_DIR = os.path.join(DATA_DIR, 'eval')

# Per-stage output roots; each run gets its own `run_<id>` sub-directory
# (see the get_*_output_dir helpers in this cell).
#   - TFT training data
TFT_TRAIN_OUTPUT_BASE_DIR = os.path.join(OUTPUT_DIR, 'tft_train')
#   - TFT eval data
TFT_EVAL_OUTPUT_BASE_DIR = os.path.join(OUTPUT_DIR, 'tft_eval')
#   - TF outputs (presumably the trained model artifacts -- confirm)
TF_OUTPUT_BASE_DIR = os.path.join(OUTPUT_DIR, 'tf')
#   - TFMA eval data
TFMA_OUTPUT_BASE_DIR = os.path.join(OUTPUT_DIR, 'tfma')

# File-name prefixes for the transformed train / eval data.
TFT_TRAIN_FILE_PREFIX = 'train_transformed'
TFT_EVAL_FILE_PREFIX = 'eval_transformed'

# Directory names for the exported serving and eval models
# (presumably relative to a TF run directory -- confirm against the trainer).
SERVING_MODEL_DIR = 'serving_model_dir'
EVAL_MODEL_DIR = 'eval_model_dir'


def get_tft_train_output_dir(run_id):
    """Return the directory holding the TFT training output of run `run_id`."""
    base_dir = TFT_TRAIN_OUTPUT_BASE_DIR
    return _get_output_dir(base_dir, run_id)


def get_tft_eval_output_dir(run_id):
    """Return the directory holding the TFT eval output of run `run_id`."""
    base_dir = TFT_EVAL_OUTPUT_BASE_DIR
    return _get_output_dir(base_dir, run_id)


def get_tf_output_dir(run_id):
    """Return the TF output directory of run `run_id`."""
    base_dir = TF_OUTPUT_BASE_DIR
    return _get_output_dir(base_dir, run_id)

def get_tfma_output_dir(run_id):
    """Return the TFMA output directory of run `run_id`."""
    base_dir = TFMA_OUTPUT_BASE_DIR
    return _get_output_dir(base_dir, run_id)

def _get_output_dir(base_dir, run_id):
    return os.path.join(base_dir, 'run_' + str(run_id))

def get_schema_file():
    """Return the path of `schema.pbtxt` inside OUTPUT_DIR."""
    schema_filename = 'schema.pbtxt'
    return os.path.join(OUTPUT_DIR, schema_filename)


## Compute and visualize descriptive data statistics

In [4]:
# Load the training-data statistics stored under TMP_DIR and render them.
train_stats_path = os.path.join(TMP_DIR, 'train_stats')
train_stats = tfdv.load_statistics(train_stats_path)
tfdv.visualize_statistics(train_stats)

W0820 16:24:26.321234 139929481271104 deprecation.py:323] From /srv/hops/anaconda/anaconda/envs/python27/lib/python2.7/site-packages/tensorflow_data_validation/utils/stats_gen_lib.py:328: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.
Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


## Infer a schema

In [7]:
# Copy the serialized schema from HDFS into the current working directory.
hdfs.copy_to_local(os.path.join(TMP_DIR, 'schema'), '')

# The file holds a serialized protobuf message, so read it as bytes ('rb').
# Text mode only works by accident on Python 2 / POSIX and fails on Python 3.
with open('schema', 'rb') as f:
    schema_bytes = f.read()

# Parse after the file is closed, and keep the raw bytes under their own name
# so `schema` only ever refers to the Schema proto.
schema = tfm.proto.v0.schema_pb2.Schema.FromString(schema_bytes)

Started copying hdfs:///Projects/flink_tutorial/Resources/taxi_tmp/schema to local disk on path /srv/hops/staging/private_dirs/86e80ce5c9a20c6503c7203feae14c93f9e1f19e6c75063a4fcacd43641cdaf3/

Finished copying



In [8]:
# Render the loaded schema (features and their domains) as tables.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'fare',FLOAT,required,single,-
'trip_start_hour',INT,required,single,-
'pickup_census_tract',BYTES,optional,,-
'dropoff_census_tract',FLOAT,optional,single,-
'company',STRING,optional,single,'company'
'trip_start_timestamp',INT,required,single,-
'pickup_longitude',FLOAT,required,single,-
'trip_start_month',INT,required,single,-
'trip_miles',FLOAT,required,single,-
'dropoff_longitude',FLOAT,optional,single,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'company',"'0118 - 42111 Godfrey S.Awir', '0694 - 59280 Chinesco Trans Inc', '1085 - 72312 N and W Cab Co', '2733 - 74600 Benny Jona', '2809 - 95474 C & D Cab Co Inc.', '3011 - 66308 JBL Cab Inc.', '3152 - 97284 Crystal Abernathy', '3201 - C&D Cab Co Inc', '3201 - CID Cab Co Inc', '3253 - 91138 Gaither Cab Co.', '3385 - 23210 Eman Cab', '3623 - 72222 Arrington Enterprises', '3897 - Ilie Malec', '4053 - Adwar H. Nikola', '4197 - 41842 Royal Star', '4615 - 83503 Tyrone Henderson', '4615 - Tyrone Henderson', '4623 - Jay Kim', '5006 - 39261 Salifu Bawa', '5006 - Salifu Bawa', '5074 - 54002 Ahzmi Inc', '5074 - Ahzmi Inc', '5129 - 87128', '5129 - 98755 Mengisti Taxi', '5129 - Mengisti Taxi', '5724 - KYVI Cab Inc', '585 - Valley Cab Co', '5864 - 73614 Thomas Owusu', '5864 - Thomas Owusu', '5874 - 73628 Sergey Cab Corp.', '5997 - 65283 AW Services Inc.', '5997 - AW Services Inc.', '6488 - 83287 Zuha Taxi', '6743 - Luhak Corp', 'Blue Ribbon Taxi Association Inc.', 'C & D Cab Co Inc', 'Chicago Elite Cab Corp.', 'Chicago Elite Cab Corp. (Chicago Carriag', 'Chicago Medallion Leasing INC', 'Chicago Medallion Management', 'Choice Taxi Association', 'Dispatch Taxi Affiliation', 'KOAM Taxi Association', 'Northwest Management LLC', 'Taxi Affiliation Services', 'Top Cab Affiliation'"
'payment_type',"'Cash', 'Credit Card', 'Dispute', 'No Charge', 'Pcard', 'Unknown'"


## Check evaluation data for errors

In [10]:
# Load the evaluation-data statistics and show them side by side with the
# training-data statistics (eval on the left, train on the right).
eval_stats_path = os.path.join(TMP_DIR, 'eval_stats')
eval_stats = tfdv.load_statistics(eval_stats_path)
tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    rhs_statistics=train_stats,
    lhs_name='EVAL_DATASET',
    rhs_name='TRAIN_DATASET',
)

In [12]:
# Copy the serialized anomalies from HDFS into the current working directory.
hdfs.copy_to_local(os.path.join(TMP_DIR, 'anomalies'), '')

# The file holds a serialized protobuf message, so read it as bytes ('rb').
# Text mode only works by accident on Python 2 / POSIX and fails on Python 3.
with open('anomalies', 'rb') as f:
    anomalies_bytes = f.read()

# Parse after the file is closed, and keep the raw bytes under their own name
# so `anomalies` only ever refers to the Anomalies proto.
anomalies = tfm.proto.v0.anomalies_pb2.Anomalies.FromString(anomalies_bytes)

Started copying hdfs:///Projects/flink_tutorial/Resources/taxi_tmp/anomalies to local disk on path /srv/hops/staging/private_dirs/86e80ce5c9a20c6503c7203feae14c93f9e1f19e6c75063a4fcacd43641cdaf3/

Finished copying



  after removing the cwd from sys.path.


In [13]:
# Render the anomalies that were loaded from HDFS.
tfdv.display_anomalies(anomalies=anomalies)

In [14]:
# Update the schema based on the observed anomalies.
# NOTE: `schema` is modified in place; every step below is written so that
# re-running this cell leaves the schema in the same state.

# Relax the minimum fraction of values that must come from the domain for feature company.
company = tfdv.get_feature(schema, 'company')
company.distribution_constraints.min_domain_mass = 0.9

# Add new value to the domain of feature payment_type.
# Guarded so that re-executing the cell does not append duplicate entries.
payment_type_domain = tfdv.get_domain(schema, 'payment_type')
if 'Prcard' not in payment_type_domain.value:
    payment_type_domain.value.append('Prcard')

# Validate eval stats after updating the schema
updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(updated_anomalies)

## Visualization: Slicing Metrics

To see the slices, either use the name of the column (by setting `slicing_column`) or provide a `tfma.slicer.SingleSliceSpec` (by setting `slicing_spec`). If neither is provided, the overall slice will be displayed.

The default visualization is **slice overview** when the number of slices is small. It shows the value of a metric for each slice, sorted by another metric. It is also possible to set a threshold to filter out slices with smaller weights.

This view also supports **metrics histogram** as an alternative visualization. It is also the default view when the number of slices is large. The results will be divided into buckets and the number of slices / total weights / both can be visualized. Slices with small weights can be filtered out by setting the threshold. Further filtering can be applied by dragging the grey band. To reset the range, double click the band. Filtering can be used to remove outliers in the visualization and the metrics table below.

In [3]:
# Overall slice: an empty spec selects the whole dataset.
OVERALL_SLICE_SPEC = tfma.slicer.SingleSliceSpec()

# Slice along a single feature column (here: trip_start_hour).
FEATURE_COLUMN_SLICE_SPEC = tfma.slicer.SingleSliceSpec(
    columns=['trip_start_hour'],
)

# Slice by a cross of feature columns (here: trip_start_day x trip_start_month).
FEATURE_COLUMN_CROSS_SPEC = tfma.slicer.SingleSliceSpec(
    columns=['trip_start_day', 'trip_start_month'],
)

# Restrict to one feature value: metrics are computed over all data where
# trip_start_hour is 12.
FEATURE_VALUE_SPEC = tfma.slicer.SingleSliceSpec(
    features=[('trip_start_hour', 12)],
)

# Column cross combined with a feature value: data where trip_start_hour is 12,
# sliced by trip_start_day.
COLUMN_CROSS_VALUE_SPEC = tfma.slicer.SingleSliceSpec(
    columns=['trip_start_day'],
    features=[('trip_start_hour', 12)],
)

# Every spec defined above, in definition order.
ALL_SPECS = [
    OVERALL_SLICE_SPEC,
    FEATURE_COLUMN_SLICE_SPEC,
    FEATURE_COLUMN_CROSS_SPEC,
    FEATURE_VALUE_SPEC,
    COLUMN_CROSS_VALUE_SPEC,
]

In [4]:
# Load the TFMA evaluation result stored for run 0.
tfma_output_dir = get_tfma_output_dir(0)
tfma_result_1 = tfma.load_eval_result(output_path=tfma_output_dir)

# Show data sliced along feature column trip_start_hour.
tfma.view.render_slicing_metrics(tfma_result_1, slicing_column='trip_start_hour')

W0820 17:16:32.554158 139825583544128 deprecation.py:323] From /srv/hops/anaconda/anaconda/envs/python27/lib/python2.7/site-packages/tensorflow_model_analysis/evaluators/metrics_and_plots_evaluator.py:83: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.
Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


U2xpY2luZ01ldHJpY3NWaWV3ZXIoY29uZmlnPXsnd2VpZ2h0ZWRFeGFtcGxlc0NvbHVtbic6ICdwb3N0X2V4cG9ydF9tZXRyaWNzL2V4YW1wbGVfY291bnQnfSwgZGF0YT1beydtZXRyaWNzJzrigKY=


In [5]:
# Render the metrics for the slices selected by COLUMN_CROSS_VALUE_SPEC
# (trip_start_hour == 12, sliced by trip_start_day).
tfma.view.render_slicing_metrics(
    tfma_result_1,
    slicing_spec=COLUMN_CROSS_VALUE_SPEC,
)

U2xpY2luZ01ldHJpY3NWaWV3ZXIoY29uZmlnPXsnd2VpZ2h0ZWRFeGFtcGxlc0NvbHVtbic6ICdwb3N0X2V4cG9ydF9tZXRyaWNzL2V4YW1wbGVfY291bnQnfSwgZGF0YT1beydtZXRyaWNzJzrigKY=


In [6]:
# No slicing column or spec is given, so the overall metrics are shown.
tfma.view.render_slicing_metrics(
    tfma_result_1,
)

U2xpY2luZ01ldHJpY3NWaWV3ZXIoY29uZmlnPXsnd2VpZ2h0ZWRFeGFtcGxlc0NvbHVtbic6ICdwb3N0X2V4cG9ydF9tZXRyaWNzL2V4YW1wbGVfY291bnQnfSwgZGF0YT1beydtZXRyaWNzJzrigKY=
