### TensorFlow Data Validation - TFDV

This notebook is a continuation of the 04_TFX notebook. Here we explore the data, compute and evaluate its statistics, and infer a schema for later use in our model-building pipelines.

In [1]:
# List installed packages whose name contains "flow", with their pinned versions.
!pip freeze | grep "flow"

tensorflow==2.0.0b1
tensorflow-data-validation==0.13.1
tensorflow-metadata==0.13.0
tensorflow-model-analysis==0.13.2
tensorflow-transform==0.13.0


In [63]:
from  __future__ import print_function

import sys, os
import tempfile, zipfile, urllib
# Guard: this notebook uses Python 2-only APIs (e.g. `urllib.urlretrieve`).
# Compare by value with `==`; `is` checks object identity, which is not
# guaranteed to hold for two equal integers.
assert sys.version_info.major == 2, 'Oops not running Python2'

In [64]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import tensorflow_transform as tft


# Silence everything below ERROR from TensorFlow.
# TF 2.x no longer exposes `tf.logging`; the module lives under
# `tf.compat.v1` there. Older 1.x releases without `tf.compat.v1` still
# provide `tf.logging`, so fall back to it.
try:
    tf_logging = tf.compat.v1.logging
except AttributeError:
    tf_logging = tf.logging
tf_logging.set_verbosity(tf_logging.ERROR)

print('TFDV version: {}'.format(tfdv.version.__version__))

TFDV version: 0.12.0


We will use the Chicago taxi dataset to evaluate the functionality available in TensorFlow Extended (TFX).

### Load the files


In [12]:
# Set up global variables for file paths

# `mkdtemp` creates a fresh private directory and returns its path. The
# deprecated `mktemp` only returns a name, leaving a window in which another
# process can create that path first.
BASE_DIR = tempfile.mkdtemp()
DATA_DIR = os.path.join(BASE_DIR, 'data')
OUTPUT_DIR = os.path.join(BASE_DIR, 'chicago_taxi_output')

# One CSV per split, laid out as <DATA_DIR>/<split>/data.csv
TRAIN_DATA = os.path.join(DATA_DIR, 'train', 'data.csv')
EVAL_DATA = os.path.join(DATA_DIR, 'eval', 'data.csv')
SERVING_DATA = os.path.join(DATA_DIR, 'serving', 'data.csv')

In [16]:
#Download zip data file from GCP and unzip it
url = 'https://storage.googleapis.com/tfx-colab-datasets/chicago_data.zip'
_zip, headers = urllib.urlretrieve(url)
zipfile.ZipFile(_zip).extractall(BASE_DIR)
zipfile.ZipFile(_zip).close()

print("Here's what we downloaded:")
!ls -lR {os.path.join(BASE_DIR, 'data')}

Here's what we downloaded:
total 0
drwxr-xr-x  3 pankaj  staff  96 May  6 20:54 eval
drwxr-xr-x  3 pankaj  staff  96 May  6 20:54 serving
drwxr-xr-x  3 pankaj  staff  96 May  6 20:54 train

/var/folders/5w/jmdmrnxx2pb6gzbgx_946lw40000gn/T/tmpGQNVqn/data/eval:
total 1280
-rw-r--r--  1 pankaj  staff  641080 May  6 20:54 data.csv

/var/folders/5w/jmdmrnxx2pb6gzbgx_946lw40000gn/T/tmpGQNVqn/data/serving:
total 32
-rw-r--r--  1 pankaj  staff  12727 May  6 20:54 data.csv

/var/folders/5w/jmdmrnxx2pb6gzbgx_946lw40000gn/T/tmpGQNVqn/data/train:
total 4224
-rw-r--r--  1 pankaj  staff  1281866 May  6 20:54 data.csv


In [65]:
# Compute statistics over the training CSV, then render them
train_stats = tfdv.generate_statistics_from_csv(TRAIN_DATA)
tfdv.visualize_statistics(lhs_statistics=train_stats)

In [66]:
# Statistics for the evaluation split
eval_stats = tfdv.generate_statistics_from_csv(EVAL_DATA)

# Show eval (lhs) next to train (rhs) so the two splits can be compared
tfdv.visualize_statistics(
    lhs_statistics=eval_stats,
    lhs_name='EVAL_DATASET',
    rhs_statistics=train_stats,
    rhs_name='TRAIN_DATASET',
)

In [67]:
# Derive a schema from the training statistics and display it
schema = tfdv.infer_schema(train_stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'fare',FLOAT,required,,-
'trip_start_hour',INT,required,,-
'pickup_census_tract',BYTES,optional,,-
'dropoff_census_tract',FLOAT,optional,single,-
'company',STRING,optional,single,'company'
'trip_start_timestamp',INT,required,,-
'pickup_longitude',FLOAT,required,,-
'trip_start_month',INT,required,,-
'trip_miles',FLOAT,required,,-
'dropoff_longitude',FLOAT,optional,single,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'company',"'0118 - 42111 Godfrey S.Awir', '0694 - 59280 Chinesco Trans Inc', '1085 - 72312 N and W Cab Co', '2733 - 74600 Benny Jona', '2809 - 95474 C & D Cab Co Inc.', '3011 - 66308 JBL Cab Inc.', '3152 - 97284 Crystal Abernathy', '3201 - C&D Cab Co Inc', '3201 - CID Cab Co Inc', '3253 - 91138 Gaither Cab Co.', '3385 - 23210 Eman Cab', '3623 - 72222 Arrington Enterprises', '3897 - Ilie Malec', '4053 - Adwar H. Nikola', '4197 - 41842 Royal Star', '4615 - 83503 Tyrone Henderson', '4615 - Tyrone Henderson', '4623 - Jay Kim', '5006 - 39261 Salifu Bawa', '5006 - Salifu Bawa', '5074 - 54002 Ahzmi Inc', '5074 - Ahzmi Inc', '5129 - 87128', '5129 - 98755 Mengisti Taxi', '5129 - Mengisti Taxi', '5724 - KYVI Cab Inc', '585 - Valley Cab Co', '5864 - 73614 Thomas Owusu', '5864 - Thomas Owusu', '5874 - 73628 Sergey Cab Corp.', '5997 - 65283 AW Services Inc.', '5997 - AW Services Inc.', '6488 - 83287 Zuha Taxi', '6743 - Luhak Corp', 'Blue Ribbon Taxi Association Inc.', 'C & D Cab Co Inc', 'Chicago Elite Cab Corp.', 'Chicago Elite Cab Corp. (Chicago Carriag', 'Chicago Medallion Leasing INC', 'Chicago Medallion Management', 'Choice Taxi Association', 'Dispatch Taxi Affiliation', 'KOAM Taxi Association', 'Northwest Management LLC', 'Taxi Affiliation Services', 'Top Cab Affiliation'"
'payment_type',"'Cash', 'Credit Card', 'Dispute', 'No Charge', 'Pcard', 'Unknown'"


In [39]:
# Check the eval statistics against the schema inferred from training
# and list whatever does not conform.
anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'payment_type',Unexpected string values,Examples contain values missing from the schema: Prcard (<1%).
'company',Unexpected string values,"Examples contain values missing from the schema: 2092 - 61288 Sbeih company (<1%), 2192 - 73487 Zeymane Corp (<1%), 2192 - Zeymane Corp (<1%), 2823 - 73307 Seung Lee (<1%), 3094 - 24059 G.L.B. Cab Co (<1%), 3319 - CD Cab Co (<1%), 3385 - Eman Cab (<1%), 3897 - 57856 Ilie Malec (<1%), 4053 - 40193 Adwar H. Nikola (<1%), 4197 - Royal Star (<1%), 585 - 88805 Valley Cab Co (<1%), 5874 - Sergey Cab Corp. (<1%), 6057 - 24657 Richard Addo (<1%), 6574 - Babylon Express Inc. (<1%), 6742 - 83735 Tasha ride inc (<1%)."


In [47]:
# Relax the minimum fraction of values that must come from the domain
# for feature 'company'.
company = tfdv.get_feature(schema, 'company')
company.distribution_constraints.min_domain_mass = 0.9

# Add the new value to the domain of feature payment_type.
# Guarded so that re-running this cell does not append a duplicate entry.
payment_type_domain = tfdv.get_domain(schema, 'payment_type')
if 'Prcard' not in payment_type_domain.value:
    payment_type_domain.value.append('Prcard')

# Validate eval stats after updating schema
updated_anomalies = tfdv.validate_statistics(eval_stats, schema)
tfdv.display_anomalies(updated_anomalies)

### Schema environments

In [48]:
# Compute statistics for the serving split and validate them against the schema
serving_stats = tfdv.generate_statistics_from_csv(data_location=SERVING_DATA)
serving_anomalies = tfdv.validate_statistics(statistics=serving_stats, schema=schema)

tfdv.display_anomalies(serving_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'tips',Column dropped,Column is completely missing
'trip_seconds',Expected data of type: FLOAT but got INT,


As we can see, `tips` is the target column and is missing from the serving dataset, and there is a type mismatch for the `trip_seconds` column. Let's address the type mismatch by telling TFDV to take the feature types from the training schema when it computes the serving statistics.

In [49]:
# Recompute the serving statistics, taking feature types from the schema
# rather than inferring them from the serving CSV.
options = tfdv.StatsOptions(infer_type_from_schema=True, schema=schema)
serving_stats = tfdv.generate_statistics_from_csv(
    data_location=SERVING_DATA,
    stats_options=options,
)
serving_anomalies = tfdv.validate_statistics(statistics=serving_stats, schema=schema)

In [50]:
# Show the anomalies found in the serving statistics.
tfdv.display_anomalies(serving_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'tips',Column dropped,Column is completely missing


Now only the `tips` feature (which is our label) shows up as an anomaly, which is expected: of course we don't want labels in our serving data.

In [52]:
# All features are by default in both envs.
# Each append is guarded so that re-running this cell does not add the
# same environment name to the schema twice.
for env_name in ('TRAINING', 'SERVING'):
    if env_name not in schema.default_environment:
        schema.default_environment.append(env_name)

# Specify that 'tips' feature is not in SERVING env
tips_feature = tfdv.get_feature(schema, 'tips')
if 'SERVING' not in tips_feature.not_in_environment:
    tips_feature.not_in_environment.append('SERVING')

# Validate the serving statistics again, this time within the SERVING environment
serving_anomalies_with_env = tfdv.validate_statistics(serving_stats, schema, environment='SERVING')
tfdv.display_anomalies(serving_anomalies_with_env)

### Check for drift and skew

In addition to checking whether a dataset conforms to the expectations set in schema, TFDV also provides functionality to detect drift and skew. TFDV performs this check by comparing the statistics of the different datasets based on the drift/skew comparators specified in the schema.

### Drift
Drift detection is supported for categorical features and between consecutive spans of data (i.e. between span N and span N+1), such as between different days of training data. Drift is expressed in terms of L-infinity distance, and you can set a threshold to receive warnings when the drift is higher than acceptable. Choosing the right threshold requires domain knowledge and experimentation.

### Skew
TFDV can detect three different kinds of skew: schema skew, feature skew, and distribution skew.

#### Schema skew
It occurs when training and serving data do not conform to the same schema (as we saw above for the `tips` feature). Any expected deviations between the two should be specified through the environment field in the schema.

#### Feature skew
It occurs when feature values that a model trains on are different from feature values that it sees at serving time. It can happen when:

- A data source that provides some feature values is modified between training and serving time
- There is different logic for generating features between training and serving, e.g. a transformation is applied in only one of the two code paths

#### Distribution skew
It occurs when the distribution of the training dataset is significantly different from the distribution of the serving dataset. This can be caused by a faulty sampling mechanism.

In [53]:
# Drift comparator on 'company': L-infinity threshold of 0.001
company = tfdv.get_feature(schema, 'company')
company.drift_comparator.infinity_norm.threshold = 0.001

# Skew comparator on 'payment_type': L-infinity threshold of 0.01
payment_type = tfdv.get_feature(schema, 'payment_type')
payment_type.skew_comparator.infinity_norm.threshold = 0.01

# Validate the training statistics, passing the eval statistics as the
# previous statistics and the serving statistics as the serving statistics.
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=serving_stats,
    previous_statistics=eval_stats,
)

tfdv.display_anomalies(skew_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'company',High Linfty distance between current and previous,"The Linfty distance between current and previous is 0.00820891 (up to six significant digits), above the threshold 0.001. The feature value with maximum difference is: Blue Ribbon Taxi Association Inc."
'payment_type',High Linfty distance between serving and training,"The Linfty distance between serving and training is 0.0225 (up to six significant digits), above the threshold 0.01. The feature value with maximum difference is: Credit Card"


### Freeze schema 

In [61]:
from tensorflow.python.lib.io import file_io
from google.protobuf import text_format

In [62]:
file_io.recursive_create_dir(OUTPUT_DIR)
schema_file = os.path.join(OUTPUT_DIR, 'schema.pbtxt')
tfdv.write_schema_text(schema, schema_file)

!cat {schema_file}

feature {
  name: "fare"
  type: FLOAT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "trip_start_hour"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "pickup_census_tract"
  type: BYTES
  presence {
    min_count: 0
  }
}
feature {
  name: "dropoff_census_tract"
  value_count {
    min: 1
    max: 1
  }
  type: FLOAT
  presence {
    min_count: 1
  }
}
feature {
  name: "company"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "company"
  presence {
    min_count: 1
  }
  distribution_constraints {
    min_domain_mass: 0.9
  }
  drift_comparator {
    infinity_norm {
      threshold: 0.001
    }
  }
}
feature {
  name: "trip_start_timestamp"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 