In [39]:
from google.protobuf.json_format import MessageToDict
import tensorflow_data_validation as tfdv
# Compute descriptive statistics for each CSV file; both results are reused
# by the schema-inference and validation cells below.
training_stats = tfdv.generate_statistics_from_csv(data_location="training_data.csv")
serving_stats = tfdv.generate_statistics_from_csv(data_location="serving_data.csv")

# Render an overview of the training statistics.
tfdv.visualize_statistics(lhs_statistics=training_stats)

##### Infer the schema using the DatasetFeatureStatisticsList

In [46]:
# Infer a baseline schema from the training statistics. max_string_domain_size
# raises the cap on distinct values under which a string feature is still
# given an enumerated (categorical) domain.
training_schema = tfdv.infer_schema(training_stats, max_string_domain_size=1000000)

# 'city' must be present (non-null) in at least 99% of examples.
city_feature = tfdv.get_feature(training_schema, 'city')
city_feature.presence.min_fraction = 0.99

# Flag drift on 'product' when the L-infinity distance between the current and
# the previous statistics exceeds 0.01.
product_feature = tfdv.get_feature(training_schema, 'product')
product_feature.drift_comparator.infinity_norm.threshold = 0.01

# Constrain how many values each 'amount' example may carry (between 1 and 10).
# NOTE: value_count bounds the NUMBER of values per example, not their
# magnitude. A numeric range of allowed values would instead be expressed
# through the feature's domain (e.g. int_domain.min / int_domain.max), as is
# done for 'id' below.
amount_feature = tfdv.get_feature(training_schema, 'amount')
amount_feature.value_count.min = 1
amount_feature.value_count.max = 10

# Cap 'id' at 10 so that larger values are reported as out-of-range anomalies.
id_feature = tfdv.get_feature(training_schema, 'id')
id_feature.int_domain.max = 10

# Relax the minimum fraction of values that must come from the domain to 0
# for 'period_key', 'product' and 'city'.
period_key_feature = tfdv.get_feature(training_schema, 'period_key')
period_key_feature.distribution_constraints.min_domain_mass = 0
product_feature.distribution_constraints.min_domain_mass = 0
city_feature.distribution_constraints.min_domain_mass = 0

# Save the schema for future consumption. The in-memory training_schema object
# already carries all of the changes made above.
tfdv.write_schema_text(training_schema, 'training_schema_fixed')


In [47]:
# Validate the serving statistics against the schema. The training statistics
# are supplied as the previous statistics so that the drift comparator
# configured on 'product' is evaluated.
drift_anomalies = tfdv.validate_statistics(
    statistics=serving_stats,
    schema=training_schema,
    previous_statistics=training_stats,
)
tfdv.display_anomalies(drift_anomalies)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'city',Column dropped,"The feature was present in fewer examples than expected: minimum fraction = 0.990000, actual = 0.970000"
'product',High Linfty distance between current and previous,"The Linfty distance between current and previous is 0.12381 (up to six significant digits), above the threshold 0.01. The feature value with maximum difference is: Acetaminophen"
'supplier',Unexpected string values,"Examples contain values missing from the schema: ALK-Abello, Inc. (~1%), AMI Cosmetic Co.,Ltd. (~1%), Actavis Inc. (~1%), Advanced Generic Corporation (~1%), Ajou Medics Co., Ltd (~1%), Allergy Laboratories, Inc. (~1%), American Regent, Inc. (~2%), Antigen Laboratories, Inc. (~1%), Arbonne International, LLC (~1%), AuroMedics Pharma LLC (~1%), Aurobindo Pharma Limited (~1%), B. Braun Medical Inc. (~1%), Camber Pharmaceuticals, Inc. (~1%), Cardinal Health (Leader) (~1%), Claris Lifesciences Inc. (~1%), Clinical Solutions Wholesale (~1%), Core Brands, Inc. (~1%), Cresson (~1%), DZA Brands LLC (~1%), Dispensing Solutions, Inc. (~1%), Dr. Fresh, Inc. (~1%), Ferring Pharmaceuticals Inc. (~1%), G.D. Searle LLC Division of Pfizer Inc (~1%), GOJO Industries, Inc. (~1%), Have and Be Co., Ltd. (~1%), Inel Cosmetics Co., Ltd. (~1%), JAFRA COSMETICS INTERNATIONAL (~1%), Janssen Pharmaceuticals, Inc. (~1%), Jubilant HollisterStier LLC (~2%), KAISER FOUNDATION HOSPITALS (~1%), L\'Oreal USA Products Inc (~1%), Laboratoires Boiron (~1%), Liddell Laboratories, Inc. (~1%), Marathon Pharmaceuticals, LLC (~1%), MedVantx, Inc. (~1%), Merck Sharp & Dohme Corp. (~1%), MineralHouse Corporation (~1%), Modern Welding Company, Inc. (~1%), Native Remedies, LLC (~1%), OMP, INC. (~1%), PETNET Solutions, Inc. (~1%), Paddock Laboratories, LLC (~1%), Parfums Christian Dior (~1%), Performance Health LLC. (~1%), Pfizer Laboratories Div Pfizer Inc (~1%), Preferred Pharmaceuticals, Inc (~1%), Preferred Pharmaceuticals, Inc. (~1%), Procter & Gamble Manufacturing Company (~1%), Procter and Gamble Manufacturing Company (~1%), Proficient Rx LP (~1%), Qualitest Pharmaceuticals (~1%), Ranbaxy Pharmaceuticals Inc. (~1%), SDA Laboratories, Inc. (~1%), SMART SENSE (Kmart) (~1%), Safeway Inc. (~1%), Safeway, Inc. (~1%), Sage Products LLC (~1%), Shanghai Yinjing Medical Supplies Co., Ltd. 
(~1%), Shopko Stores Operating Co., LLC (~1%), Speer Laboratories, LLC (~1%), Supervalu Inc (~2%), Supervalu Inc. (~1%), Target Corporation (~1%), Terumo Corporation (~1%), The Harvard Drug Group, LLC (~1%), The Procter & Gamble Manufacturing Company (~1%), Topco Associates LLC (~1%), Unit Dose Services (~1%), Uriel Pharmacy Inc. (~1%), Ventura Corporation LTD (~1%), Western Family Foods Inc (~1%), Wockhardt Limited (~2%), sanofi-aventis U.S. LLC (~1%)."
'id',Out-of-range values,Unexpectedly large value: 100.
'country',Column dropped,Column is completely missing


In [48]:
# Per-example validation: the serving CSV is checked against the schema, and
# the result holds summary statistics for the examples exhibiting each anomaly
# (one dataset per anomaly; see the dataset names in the printed output).
options = tfdv.StatsOptions(schema=training_schema)
anomalous_example_stats = tfdv.validate_examples_in_csv(
    data_location="serving_data.csv",
    stats_options=options,
)
print(anomalous_example_stats)

datasets {
  name: "supplier_ENUM_TYPE_UNEXPECTED_STRING_VALUES"
  num_examples: 77
  features {
    num_stats {
      common_stats {
        num_non_missing: 77
        min_num_values: 1
        max_num_values: 1
        avg_num_values: 1.0
        num_values_histogram {
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 7.7
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 7.7
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 7.7
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 7.7
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 7.7
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 7.7
          }
          buckets {
     

In [8]:
# Reload the schema that was saved to disk and validate the training
# statistics against it.
training_schema_fixed = tfdv.load_schema_text('training_schema_fixed')
anomalies = tfdv.validate_statistics(
    statistics=training_stats,
    schema=training_schema_fixed,
)
tfdv.display_anomalies(anomalies)
# For programmatic inspection, the anomalies proto can be converted with
# MessageToDict(anomalies) (then e.g. .keys()); print(training_stats) dumps
# the raw statistics.


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'city',Column dropped,"The feature was present in fewer examples than expected: minimum fraction = 0.990000, actual = 0.970000"
'product',Invalid values,"String values that were not ints were found, such as ""Absolut Citron""."
'id',Out-of-range values,Unexpectedly large value: 1000.


### Update the features
