# Data Validation

- パッケージインストール後、ランタイムを再起動すること（Colab）

## Setup

### Install the package

In [3]:
!pip install tensorflow-data-validation==0.27.0 requests==2.24.0

Collecting tensorflow-data-validation==0.27.0
  Using cached tensorflow_data_validation-0.27.0-cp37-cp37m-manylinux2010_x86_64.whl (1.3 MB)
Collecting requests==2.24.0
  Downloading requests-2.24.0-py2.py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 282 kB/s 
Collecting tensorflow-metadata<0.28,>=0.27
  Using cached tensorflow_metadata-0.27.0-py3-none-any.whl (47 kB)
Collecting absl-py<0.11,>=0.9
  Using cached absl_py-0.10.0-py3-none-any.whl (127 kB)
Collecting joblib<0.15,>=0.12
  Using cached joblib-0.14.1-py2.py3-none-any.whl (294 kB)
Collecting apache-beam[gcp]<3,>=2.27
  Using cached apache_beam-2.31.0-cp37-cp37m-manylinux2010_x86_64.whl (9.7 MB)
Collecting pyarrow<3,>=1
  Using cached pyarrow-2.0.0-cp37-cp37m-manylinux2014_x86_64.whl (17.7 MB)
Collecting tfx-bsl<0.28,>=0.27
  Using cached tfx_bsl-0.27.1-cp37-cp37m-manylinux2010_x86_64.whl (2.2 MB)
Collecting future<1.0.0,>=0.18.2
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |█████████████████████

### Import packages

In [1]:
import logging
import os

import pandas as pd
import tensorflow_data_validation as tfdv
from tensorflow_data_validation.utils import slicing_util
from tensorflow_metadata.proto.v0 import statistics_pb2

## The dataset

### Download the dataset

In [2]:
!mkdir data

In [3]:
# Initial dataset source
DATASET_URL = "http://bit.ly/building-ml-pipelines-dataset"
# Initial local dataset location
LOCAL_FILE_NAME = "data/consumer_complaints_with_narrative.csv"


def download_dataset(url=DATASET_URL, local_path=LOCAL_FILE_NAME):
    """download_dataset downloads the remote dataset to a local path

    The CSV is read with its first column as the index and written back
    with that index, so the column layout of the source is preserved.

    Keyword Arguments:
        url {string} --
            complete url path to the csv data source (default: {DATASET_URL})
        local_path {string} --
            initial local file location (default: {LOCAL_FILE_NAME})
    Returns:
        None
    """
    # Make sure the destination directory exists so the write cannot fail on a
    # fresh checkout; a bare file name has no directory component to create.
    target_dir = os.path.dirname(local_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    df = pd.read_csv(url, index_col=0)
    df.to_csv(local_path)
    logging.info("Download completed.")


download_dataset()

### Load the dataset

In [4]:
# Read the locally saved CSV back into a DataFrame and preview the first rows.
df = pd.read_csv(LOCAL_FILE_NAME)
df.head()

Unnamed: 0,product,sub_product,issue,sub_issue,consumer_complaint_narrative,company,state,zip_code,company_response,timely_response,consumer_disputed
0,Debt collection,I do not know,Disclosure verification of debt,Right to dispute notice not received,I was denied employment because of a judgment ...,Encore Capital Group,NY,113XX,Closed with explanation,Yes,0
1,Credit reporting,,Improper use of my credit report,Report improperly shared by CRC,I have a credit card through XXXX XXXX and XXX...,Experian,IL,606XX,Closed with non-monetary relief,Yes,0
2,Debt collection,I do not know,Cont'd attempts collect debt not owed,Debt is not mine,Almost daily phone calls from Stellar Recovery...,Stellar Recovery Inc.,MI,480XX,Closed with explanation,Yes,1
3,Mortgage,Conventional fixed mortgage,"Loan servicing, payments, escrow account",,I submitted my monthly mortgage payment to Pri...,Primary Residential Mortgage,CT,066XX,Closed with monetary relief,Yes,0
4,Student loan,Non-federal student loan,Dealing with my lender or servicer,Received bad information about my loan,I contacted America Education Services in XX/X...,AES/PHEAA,FL,321XX,Closed with explanation,Yes,1


## Experiments

### Generate statistics

In [5]:

# Generate statistics over the full dataset. LOCAL_FILE_NAME is reused (instead of
# repeating the path literal) so this cell always reads the file the download wrote.
stats = tfdv.generate_statistics_from_csv(
    data_location=LOCAL_FILE_NAME,
    delimiter=','
)





Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


In [6]:
stats

datasets {
  num_examples: 66799
  features {
    type: STRING
    string_stats {
      common_stats {
        num_non_missing: 66799
        min_num_values: 1
        max_num_values: 1
        avg_num_values: 1.0
        num_values_histogram {
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6679.9
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6679.9
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6679.9
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6679.9
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6679.9
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 6679.9
          }
          buckets {
            low

In [7]:
tfdv.visualize_statistics(stats)

### Generate schema

In [8]:
# Infer a schema from the statistics of the full dataset.
schema = tfdv.infer_schema(stats)

In [9]:
schema

feature {
  name: "product"
  type: BYTES
  domain: "product"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "sub_product"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "sub_product"
  presence {
    min_count: 1
  }
}
feature {
  name: "issue"
  type: BYTES
  domain: "issue"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "sub_issue"
  value_count {
    min: 1
    max: 1
  }
  type: BYTES
  domain: "sub_issue"
  presence {
    min_count: 1
  }
}
feature {
  name: "consumer_complaint_narrative"
  type: BYTES
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "company"
  type: BYTES
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "state"
  value_count {
    min: 1
    max: 1
  }
  type:

In [10]:
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'product',STRING,required,,'product'
'sub_product',STRING,optional,single,'sub_product'
'issue',STRING,required,,'issue'
'sub_issue',STRING,optional,single,'sub_issue'
'consumer_complaint_narrative',BYTES,required,,-
'company',BYTES,required,,-
'state',STRING,optional,single,'state'
'zip_code',BYTES,optional,single,-
'company_response',STRING,required,,'company_response'
'timely_response',STRING,required,,'timely_response'


  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'product',"'Bank account or service', 'Consumer Loan', 'Credit card', 'Credit reporting', 'Debt collection', 'Money transfers', 'Mortgage', 'Other financial service', 'Payday loan', 'Prepaid card', 'Student loan'"
'sub_product',"'(CD) Certificate of deposit', 'Auto', 'Cashing a check without an account', 'Check cashing', 'Checking account', 'Conventional adjustable mortgage (ARM)', 'Conventional fixed mortgage', 'Credit card', 'Credit repair', 'Debt settlement', 'Domestic (US) money transfer', 'Electronic Benefit Transfer / EBT card', 'FHA mortgage', 'Federal student loan', 'Foreign currency exchange', 'General purpose card', 'Gift or merchant card', 'Government benefit payment card', 'Home equity loan or line of credit', 'I do not know', 'ID prepaid card', 'Installment loan', 'International money transfer', 'Medical', 'Mobile wallet', 'Money order', 'Mortgage', 'Non-federal student loan', 'Other (i.e. phone, health club, etc.)', 'Other bank product/service', 'Other mortgage', 'Other special purpose card', 'Pawn loan', 'Payday loan', 'Payroll card', 'Personal line of credit', 'Refund anticipation check', 'Reverse mortgage', 'Savings account', 'Title loan', 'Transit card', 'Travelerâs/Cashierâs checks', 'VA mortgage', 'Vehicle lease', 'Vehicle loan'"
'issue',"'APR or interest rate', 'Account opening, closing, or management', 'Account terms and changes', 'Adding money', 'Advertising and marketing', 'Advertising, marketing or disclosures', 'Application processing delay', 'Application, originator, mortgage broker', 'Applied for loan/did not receive money', 'Arbitration', 'Balance transfer', 'Balance transfer fee', 'Bankruptcy', 'Billing disputes', 'Billing statement', 'Can\'t contact lender', 'Can\'t repay my loan', 'Can\'t stop charges to bank account', 'Cash advance', 'Cash advance fee', 'Charged bank acct wrong day or amt', 'Charged fees or interest I didn\'t expect', 'Closing/Cancelling account', 'Communication tactics', 'Cont\'d attempts collect debt not owed', 'Convenience checks', 'Credit card protection / Debt protection', 'Credit decision / Underwriting', 'Credit determination', 'Credit line increase/decrease', 'Credit monitoring or identity protection', 'Credit reporting company\'s investigation', 'Customer service / Customer relations', 'Customer service/Customer relations', 'Dealing with my lender or servicer', 'Delinquent account', 'Deposits and withdrawals', 'Disclosure verification of debt', 'Disclosures', 'Excessive fees', 'False statements or representation', 'Fees', 'Forbearance / Workout plans', 'Fraud or scam', 'Getting a loan', 'Identity theft / Fraud / Embezzlement', 'Improper contact or sharing of info', 'Improper use of my credit report', 'Incorrect exchange rate', 'Incorrect information on credit report', 'Incorrect/missing disclosures or info', 'Late fee', 'Lender damaged or destroyed vehicle', 'Lender repossessed or sold the vehicle', 'Lender sold the property', 'Loan modification,collection,foreclosure', 'Loan servicing, payments, escrow account', 'Lost or stolen check', 'Lost or stolen money order', 'Making/receiving payments, sending money', 'Managing the line of credit', 'Managing the loan or lease', 'Managing, opening, or closing account', 'Money was not available when promised', 
'Other', 'Other fee', 'Other service issues', 'Other transaction issues', 'Overdraft, savings or rewards features', 'Overlimit fee', 'Payment to acct not credited', 'Payoff process', 'Privacy', 'Problems caused by my funds being low', 'Problems when you are unable to pay', 'Received a loan I didn\'t apply for', 'Rewards', 'Sale of account', 'Settlement process and costs', 'Shopping for a line of credit', 'Shopping for a loan or lease', 'Taking out the loan or lease', 'Taking/threatening an illegal action', 'Transaction issue', 'Unable to get credit report/credit score', 'Unauthorized transactions/trans. issues', 'Unexpected/Other fees', 'Unsolicited issuance of credit card', 'Using a debit or ATM card', 'Wrong amount charged or received'"
'sub_issue',"'Account status', 'Account terms', 'Account terms and changes', 'Applied for loan/did not receive money', 'Attempted to collect wrong amount', 'Attempted to/Collected exempt funds', 'Billing dispute', 'Called after sent written cease of comm', 'Called outside of 8am-9pm', 'Can\'t contact lender', 'Can\'t decrease my monthly payments', 'Can\'t get flexible payment options', 'Can\'t qualify for a loan', 'Can\'t stop charges to bank account', 'Can\'t temporarily postpone payments', 'Charged bank acct wrong day or amt', 'Charged fees or interest I didn\'t expect', 'Contacted employer after asked not to', 'Contacted me after I asked not to', 'Contacted me instead of my attorney', 'Debt is not mine', 'Debt resulted from identity theft', 'Debt was discharged in bankruptcy', 'Debt was paid', 'Don\'t agree with fees charged', 'Frequent or repeated calls', 'Having problems with customer service', 'Impersonated an attorney or official', 'Inadequate help over the phone', 'Indicated committed crime not paying', 'Indicated shouldn\'t respond to lawsuit', 'Information is not mine', 'Investigation took too long', 'Keep getting calls about my loan', 'Need information about my balance/terms', 'No notice of investigation status/result', 'Not disclosed as an attempt to collect', 'Not given enough info to verify debt', 'Payment to acct not credited', 'Personal information', 'Problem cancelling or closing account', 'Problem getting my free annual report', 'Problem getting report or credit score', 'Problem with fraud alerts', 'Problem with statement of dispute', 'Public record', 'Qualify for a better loan than offered', 'Received a loan I didn\'t apply for', 'Received bad information about my loan', 'Received marketing offer after opted out', 'Receiving unwanted marketing/advertising', 'Reinserted previously deleted info', 'Report improperly shared by CRC', 'Report shared with employer w/o consent', 'Right to dispute notice not received', 'Seized/Attempted to seize 
property', 'Sued w/o proper notification of suit', 'Sued where didn\'t live/sign for debt', 'Talked to a third party about my debt', 'Threatened arrest/jail if do not pay', 'Threatened to sue on too old debt', 'Threatened to take legal action', 'Trouble with how payments are handled', 'Used obscene/profane/abusive language'"
'state',"'AA', 'AE', 'AK', 'AL', 'AP', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'FM', 'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MP', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VI', 'VT', 'WA', 'WI', 'WV', 'WY'"
'company_response',"'Closed', 'Closed with explanation', 'Closed with monetary relief', 'Closed with non-monetary relief', 'Untimely response'"
'timely_response',"'No', 'Yes'"


### Compare datasets

In [11]:
!mkdir -p chapters/data_validation

In [12]:
# Split by row position: the first 5000 rows go to dataset_2.csv and all
# remaining rows to dataset_1.csv. The DataFrame index is not written out.
df.iloc[5000:].to_csv('chapters/data_validation/dataset_1.csv', index=False)
df.iloc[:5000].to_csv('chapters/data_validation/dataset_2.csv', index=False)

In [13]:
!wc -l data/consumer_complaints_with_narrative.csv
!wc -l chapters/data_validation/dataset_1.csv
!wc -l chapters/data_validation/dataset_2.csv

66800 data/consumer_complaints_with_narrative.csv
61800 chapters/data_validation/dataset_1.csv
5001 chapters/data_validation/dataset_2.csv


In [14]:
# Generate statistics for both splits in order:
# dataset_1.csv -> train_stats, dataset_2.csv -> val_stats.
train_stats, val_stats = [
    tfdv.generate_statistics_from_csv(data_location=csv_path, delimiter=',')
    for csv_path in (
        'chapters/data_validation/dataset_1.csv',
        'chapters/data_validation/dataset_2.csv',
    )
]

# Compare the two splits, with validation on the left-hand side
# and training on the right-hand side.
tfdv.visualize_statistics(
    lhs_statistics=val_stats,
    lhs_name='VAL_DATASET',
    rhs_statistics=train_stats,
    rhs_name='TRAIN_DATASET',
)



In [15]:
# Validate the validation-split statistics against the schema that was
# inferred from the statistics of the full dataset.
anomalies = tfdv.validate_statistics(statistics=val_stats, schema=schema)

In [16]:
tfdv.display_anomalies(anomalies)

In [17]:
anomalies

baseline {
  feature {
    name: "product"
    type: BYTES
    domain: "product"
    presence {
      min_fraction: 1.0
      min_count: 1
    }
    shape {
      dim {
        size: 1
      }
    }
  }
  feature {
    name: "sub_product"
    value_count {
      min: 1
      max: 1
    }
    type: BYTES
    domain: "sub_product"
    presence {
      min_count: 1
    }
  }
  feature {
    name: "issue"
    type: BYTES
    domain: "issue"
    presence {
      min_fraction: 1.0
      min_count: 1
    }
    shape {
      dim {
        size: 1
      }
    }
  }
  feature {
    name: "sub_issue"
    value_count {
      min: 1
      max: 1
    }
    type: BYTES
    domain: "sub_issue"
    presence {
      min_count: 1
    }
  }
  feature {
    name: "consumer_complaint_narrative"
    type: BYTES
    presence {
      min_fraction: 1.0
      min_count: 1
    }
    shape {
      dim {
        size: 1
      }
    }
  }
  feature {
    name: "company"
    type: BYTES
    presence {
      min_fract

### Skew and Drift

In [18]:
# Configure the skew comparator for 'company' with an L-infinity threshold of 0.01.
# The setting is stored on the feature inside `schema`, so it persists for later cells.
tfdv.get_feature(schema, 'company').skew_comparator.infinity_norm.threshold = 0.01
# val_stats is passed as the serving statistics to compare against train_stats.
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=val_stats
)

In [19]:
tfdv.display_anomalies(skew_anomalies)

In [20]:
# Change the threshold to show anomaly
tfdv.get_feature(schema, 'company').skew_comparator.infinity_norm.threshold = 0.005
skew_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    serving_statistics=val_stats
)

In [21]:
tfdv.display_anomalies(skew_anomalies)

  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'company',High Linfty distance between training and serving,"The Linfty distance between training and serving is 0.00571643 (up to six significant digits), above the threshold 0.005. The feature value with maximum difference is: Equifax"


In [22]:
# Configure the drift comparator for 'company' with an L-infinity threshold of 0.01.
# The setting is stored on the feature inside `schema`, so it persists for later cells.
tfdv.get_feature(schema, 'company').drift_comparator.infinity_norm.threshold = 0.01
# val_stats is passed as the previous statistics to compare against train_stats.
drift_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=val_stats
)

In [23]:
tfdv.display_anomalies(drift_anomalies)

In [24]:
# Change the threshold to show anomaly
tfdv.get_feature(schema, 'company').drift_comparator.infinity_norm.threshold = 0.005
drift_anomalies = tfdv.validate_statistics(
    statistics=train_stats,
    schema=schema,
    previous_statistics=val_stats
)

In [25]:
tfdv.display_anomalies(drift_anomalies)

  pd.set_option('max_colwidth', -1)


Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'company',High Linfty distance between current and previous,"The Linfty distance between current and previous is 0.00571643 (up to six significant digits), above the threshold 0.005. The feature value with maximum difference is: Equifax"


### Slice the dataset

In [26]:
# Build a slicer for rows whose 'state' value is CA (feature values are given as bytes).
slice_fn1 = slicing_util.get_feature_value_slicer(
    features={'state': [b'CA']}
)
slice_options = tfdv.StatsOptions(slice_functions=[slice_fn1])
# LOCAL_FILE_NAME is reused (instead of repeating the path literal) so the sliced
# statistics are computed on the same file as the unsliced ones.
slice_stats = tfdv.generate_statistics_from_csv(
    data_location=LOCAL_FILE_NAME,
    stats_options=slice_options
)



In [27]:
def display_slice_keys(stats):
    """display_slice_keys prints the names of all datasets (slices) in stats

    Arguments:
        stats {object} --
            statistics object whose `datasets` entries each have a `name`
    Returns:
        None
    """
    # Read from the `stats` argument; the previous version ignored it and
    # always printed the keys of the notebook-level `slice_stats` variable.
    print([dataset.name for dataset in stats.datasets])


def get_sliced_stats(stats, slice_key):
    """get_sliced_stats extracts the statistics of a single slice

    Arguments:
        stats {object} --
            statistics object whose `datasets` entries each have a `name`
        slice_key {string} --
            name of the dataset (slice) to extract
    Returns:
        A new DatasetFeatureStatisticsList holding a copy of the first dataset
        whose name equals slice_key, or None (after printing
        'Invalid Slice key') when no dataset matches.
    """
    # Only the first dataset with a matching name is used.
    matching = next(
        (dataset for dataset in stats.datasets if dataset.name == slice_key),
        None,
    )
    if matching is None:
        print('Invalid Slice key')
        return None

    single_slice = statistics_pb2.DatasetFeatureStatisticsList()
    single_slice.datasets.add().CopyFrom(matching)
    return single_slice


def compare_slices(stats, slice_key1, slice_key2):
    """compare_slices visualizes the statistics of two slices together

    Arguments:
        stats {object} --
            statistics object containing both slices
        slice_key1 {string} --
            name of the slice passed as the first (left-hand) statistics
        slice_key2 {string} --
            name of the slice passed as the second (right-hand) statistics
    Returns:
        None
    """
    # Each slice is looked up with get_sliced_stats, first slice_key1 then slice_key2.
    tfdv.visualize_statistics(
        get_sliced_stats(stats, slice_key1),
        get_sliced_stats(stats, slice_key2),
    )

In [28]:
tfdv.visualize_statistics(get_sliced_stats(slice_stats, 'state_CA'))

In [29]:
compare_slices(slice_stats, 'state_CA', 'All Examples')