# Data Validation

## Setup

### Install the package

In [None]:
!pip install tensorflow-data-validation==0.27.0 requests==2.23.0

### Import packages

In [None]:
import logging

import pandas as pd
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils import slicing_util
from tensorflow_metadata.proto.v0 import statistics_pb2

## The dataset

### Download the dataset

In [None]:
!mkdir data

In [None]:
# Initial dataset source
DATASET_URL = "http://bit.ly/building-ml-pipelines-dataset"
# Initial local dataset location
LOCAL_FILE_NAME = "data/consumer_complaints_with_narrative.csv"


def download_dataset(url=DATASET_URL, local_path=LOCAL_FILE_NAME):
    """download_dataset downloads the remote dataset to a local path

    The CSV at ``url`` is read with its first column as the index and is
    then written back out as CSV at ``local_path``.

    Keyword Arguments:
        url {string} --
            complete url path to the csv data source (default: {DATASET_URL})
        local_path {string} --
            local file location to write to (default: {LOCAL_FILE_NAME})
    Returns:
        None
    """
    from pathlib import Path

    # Create the target directory up front so the write does not depend on a
    # separate mkdir step having been run beforehand.
    Path(local_path).parent.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(url, index_col=0)
    df.to_csv(local_path)
    logging.info("Download completed.")


download_dataset()

### Load the dataset

In [None]:
# Load the locally stored CSV into a DataFrame and preview its first rows.
df = pd.read_csv(LOCAL_FILE_NAME)
df.head()

## Experiments

### Generate statistics

In [None]:

# Generate statistics over the full local dataset. The path comes from
# LOCAL_FILE_NAME so this cell reads the same file the load step uses.
stats = tfdv.generate_statistics_from_csv(
    data_location=LOCAL_FILE_NAME,
    delimiter=','
)

In [None]:
stats

In [None]:
tfdv.visualize_statistics(stats)

### Generate schema

In [None]:
# Infer a schema from the full-dataset statistics; the validate_statistics
# calls further down validate against this schema.
schema = tfdv.infer_schema(stats)

In [None]:
schema

In [None]:
tfdv.display_schema(schema)

### Compare datasets

In [None]:
!mkdir -p chapters/data_validation

In [None]:
# Split the loaded DataFrame by row position: dataset_1 receives every row
# from position 5000 onward, dataset_2 receives the first 5000 rows.
# index=False keeps the DataFrame index out of the written CSV files.
df.iloc[5000:].to_csv('chapters/data_validation/dataset_1.csv', index=False)
df.iloc[:5000].to_csv('chapters/data_validation/dataset_2.csv', index=False)

In [None]:
!wc -l data/consumer_complaints_with_narrative.csv
!wc -l chapters/data_validation/dataset_1.csv
!wc -l chapters/data_validation/dataset_2.csv

In [None]:
# Statistics for the two splits: dataset_1 is treated as the training set,
# dataset_2 as the validation set.
train_stats = tfdv.generate_statistics_from_csv(
    data_location='chapters/data_validation/dataset_1.csv',
    delimiter=','
)
val_stats = tfdv.generate_statistics_from_csv(
    data_location='chapters/data_validation/dataset_2.csv',
    delimiter=','
)

# Visualize both sets of statistics together: validation on the left-hand
# side, training on the right-hand side, labelled accordingly.
tfdv.visualize_statistics(
    lhs_statistics=val_stats,
    rhs_statistics=train_stats,
    lhs_name='VAL_DATASET',
    rhs_name='TRAIN_DATASET'
)

In [None]:
# Validate the validation-split statistics against the schema that was
# inferred from the full-dataset statistics.
anomalies = tfdv.validate_statistics(statistics=val_stats, schema=schema)

In [None]:
tfdv.display_anomalies(anomalies)

In [None]:
anomalies

### Skew and Drift

In [None]:
# Set the infinity-norm threshold of the skew comparator on the 'company'
# feature to 0.01, then validate train_stats against the schema with
# val_stats supplied as the serving statistics.
tfdv.get_feature(schema, 'company').skew_comparator.infinity_norm.threshold = 0.01
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=val_stats
)

In [None]:
tfdv.display_anomalies(skew_anomalies)

In [None]:
# Change the threshold to show anomaly: the skew threshold for 'company' is
# lowered from 0.01 to 0.005 and the same validation is run again. The schema
# object is modified in place, so the new threshold stays set afterwards.
tfdv.get_feature(schema, 'company').skew_comparator.infinity_norm.threshold = 0.005
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=val_stats
)

In [None]:
tfdv.display_anomalies(skew_anomalies)

In [None]:
# Set the infinity-norm threshold of the drift comparator on the 'company'
# feature to 0.01, then validate train_stats against the schema with
# val_stats supplied as the previous statistics.
tfdv.get_feature(schema, 'company').drift_comparator.infinity_norm.threshold = 0.01
drift_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=val_stats
)

In [None]:
tfdv.display_anomalies(drift_anomalies)

In [None]:
# Change the threshold to show anomaly: the drift threshold for 'company' is
# lowered from 0.01 to 0.005 and the same validation is run again. The schema
# object is modified in place, so the new threshold stays set afterwards.
tfdv.get_feature(schema, 'company').drift_comparator.infinity_norm.threshold = 0.005
drift_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=val_stats
)

In [None]:
tfdv.display_anomalies(drift_anomalies)

### Slice the dataset

In [None]:
# Slice function that selects the examples whose 'state' feature is b'CA'.
slice_fn1 = slicing_util.get_feature_value_slicer(
    features={'state': [b'CA']}
)
slice_options = tfdv.StatsOptions(slice_functions=[slice_fn1])
# Generate statistics over the full local dataset with the slice function
# applied. The path comes from LOCAL_FILE_NAME rather than a repeated string
# literal, so it stays in sync with the load step.
slice_stats = tfdv.generate_statistics_from_csv(
    data_location=LOCAL_FILE_NAME,
    stats_options=slice_options
)

In [None]:
def display_slice_keys(stats):
    """display_slice_keys prints the names of all slices contained in stats

    Arguments:
        stats --
            statistics object exposing a `datasets` iterable whose items
            each have a `name` attribute
    Returns:
        None
    """
    # Read from the argument rather than the notebook-global slice_stats, so
    # the function reports on whatever statistics object it is handed.
    print([dataset.name for dataset in stats.datasets])


def get_sliced_stats(stats, slice_key):
    """get_sliced_stats extracts the statistics of a single slice

    Arguments:
        stats --
            statistics object whose `datasets` entries each have a `name`
        slice_key {string} --
            name of the slice to extract
    Returns:
        a new DatasetFeatureStatisticsList holding a copy of the first
        dataset whose name equals slice_key; when no dataset matches,
        'Invalid Slice key' is printed and None is returned
    """
    matching = next(
        (dataset for dataset in stats.datasets if dataset.name == slice_key),
        None,
    )
    if matching is None:
        print('Invalid Slice key')
        return None

    # Wrap the matching dataset in its own single-entry statistics list.
    single_slice = statistics_pb2.DatasetFeatureStatisticsList()
    single_slice.datasets.add().CopyFrom(matching)
    return single_slice


def compare_slices(stats, slice_key1, slice_key2):
    """compare_slices visualizes two slices of stats next to each other

    Arguments:
        stats --
            statistics object handed to get_sliced_stats
        slice_key1 {string} --
            name of the slice passed as the first (left-hand) statistics
        slice_key2 {string} --
            name of the slice passed as the second (right-hand) statistics
    Returns:
        None
    """
    tfdv.visualize_statistics(
        get_sliced_stats(stats, slice_key1),
        get_sliced_stats(stats, slice_key2),
    )

In [None]:
tfdv.visualize_statistics(get_sliced_stats(slice_stats, 'state_CA'))

In [None]:
compare_slices(slice_stats, 'state_CA', 'All Examples')