In [41]:
!pip install tensorflow-data-validation



In [42]:
import tensorflow as tf
import tensorflow_data_validation as tfdv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow_metadata.proto.v0 import schema_pb2


print(f"TFDV Version : {tfdv.__version__}")
print(f"Tensorflow version : {tf.__version__}")

TFDV Version : 1.14.0
Tensorflow version : 2.13.0


In [43]:
import zipfile

local = "/content/adult.csv.zip"
zipref = zipfile.ZipFile(local)
zipref.extractall()

In [44]:
df = pd.read_csv("/content/adult.csv")
print(f"Total number of features in the dataset : {len(df.columns)}")

Total number of features in the dataset : 15


In [45]:
train_set,test_set = train_test_split(df,test_size = 0.2,shuffle = False)

In [46]:
train_set.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,<=50K
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,<=50K
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States,>50K
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States,<=50K
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,?,>50K


In [47]:
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
26048,45,Private,194698,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K
26049,18,?,67793,HS-grad,9,Never-married,?,Own-child,White,Female,0,0,60,United-States,<=50K
26050,27,Private,289147,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Female,0,0,40,United-States,<=50K
26051,21,Private,229826,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,20,United-States,<=50K
26052,49,Self-emp-inc,246739,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,45,United-States,>50K


In [48]:
# Adding extra row to detect and analyze annomalies to test set

new_row = {
    "annomalous_detection" : "annomali_detection"
}
test_set = test_set.append(new_row,ignore_index = True)

test_set.head()

  test_set = test_set.append(new_row,ignore_index = True)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,annomalous_detection
0,45.0,Private,194698.0,Some-college,10.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,40.0,United-States,>50K,
1,18.0,?,67793.0,HS-grad,9.0,Never-married,?,Own-child,White,Female,0.0,0.0,60.0,United-States,<=50K,
2,27.0,Private,289147.0,Bachelors,13.0,Never-married,Prof-specialty,Own-child,White,Female,0.0,0.0,40.0,United-States,<=50K,
3,21.0,Private,229826.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,20.0,United-States,<=50K,
4,49.0,Self-emp-inc,246739.0,Some-college,10.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,45.0,United-States,>50K,


In [86]:
# Generate and Visualize training dataset statistics
# Performs variety of statistics for each feature in the dataset
train_stats = tfdv.generate_statistics_from_dataframe(train_set)

In [87]:
# visualize the training dataset statistics
tfdv.visualize_statistics(train_stats)

In [57]:
# Infer the data schema
# Describe the trainset schema describes standard characteristics of your data

# Infer schema from the computed statistics
schema = tfdv.infer_schema(statistics = train_stats)

# Display the infered schema
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'age',INT,required,,-
'workclass',STRING,required,,'workclass'
'fnlwgt',INT,required,,-
'education',STRING,required,,'education'
'education.num',INT,required,,-
'marital.status',STRING,required,,'\'marital.status\''
'occupation',STRING,required,,'occupation'
'relationship',STRING,required,,'relationship'
'race',STRING,required,,'race'
'sex',STRING,required,,'sex'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'workclass',"'?', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay'"
'education',"'10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college'"
'\'marital.status\'',"'Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'"
'occupation',"'?', 'Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'"
'relationship',"'Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried', 'Wife'"
'race',"'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'"
'sex',"'Female', 'Male'"
'\'native.country\'',"'?', 'Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece', 'Guatemala', 'Haiti', 'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia'"
'income',"'<=50K', '>50K'"


In [58]:
# Generate and visualize evaluation dataset statistics

eval_stats = tfdv.generate_statistics_from_dataframe(test_set)

# Compare training with evaluation
# The features range should be same for the training and evaluation set,otherwise we might have distribution skew.

tfdv.visualize_statistics(
    lhs_statistics = eval_stats,
    rhs_statistics = train_stats,
    lhs_name = "Eval_Dataset",
    rhs_name = "Train_Dataset"
)

In [59]:
# So there is detection of missing value and let's drop these rows to make the data more clean.

# Filter the age range
# Because it shows min values has 0 and max value has 1000.

test_set = test_set[test_set['age'] > 16]
test_set = test_set[test_set['age'] < 91]

# Drop missing values
test_set.dropna(inplace = True)

In [60]:
test_stats = tfdv.generate_statistics_from_dataframe(test_set)

tfdv.visualize_statistics(
    lhs_statistics = test_stats,
    rhs_statistics = train_stats,
    lhs_name = "Eval_Dataset",
    rhs_name = "Train_Dataset"
)

In [73]:
# Calculate and display evaluation anomalies
# Check evaluation data for errors by validating the evaluation dataset

anomalies = tfdv.validate_statistics(statistics = test_stats,schema = schema)

# Display anomalies
tfdv.display_anomalies(anomalies)

In [66]:
# Revise the schema

# I

# Relax the minimum fraction  of values that must come from the domain  for the feature 'native-country'
country_feature = tfdv.get_feature(schema,'native.country')
country_feature.distribution_constraints.min_domain_mass = 0.9

# Relax the minimum fraction of values that must come from the domain for the features 'occupation'
occupation_feature = tfdv.get_feature(schema,'occupation')
occupation_feature.distribution_constraints.min_domain_mass = 0.9

In [69]:
# If u want to add valid values to the domain
# Add a new value to the domain of the feature race

race_domain =tfdv.get_domain(schema,"race")
race_domain.value.append("Asian")

In [71]:
# You can restrict the range of a numerical feature

tfdv.set_domain(schema,'age',schema_pb2.IntDomain(name = 'age',min=17,max=90))
tfdv.display_schema(schema)



Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'age',INT,required,,min: 17; max: 90
'workclass',STRING,required,,'workclass'
'fnlwgt',INT,required,,-
'education',STRING,required,,'education'
'education.num',INT,required,,-
'marital.status',STRING,required,,'\'marital.status\''
'occupation',STRING,required,,'occupation'
'relationship',STRING,required,,'relationship'
'race',STRING,required,,'race'
'sex',STRING,required,,'sex'


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'workclass',"'?', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay'"
'education',"'10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th', 'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad', 'Masters', 'Preschool', 'Prof-school', 'Some-college'"
'\'marital.status\'',"'Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'"
'occupation',"'?', 'Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct', 'Other-service', 'Priv-house-serv', 'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'"
'relationship',"'Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried', 'Wife'"
'race',"'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White', 'Asian'"
'sex',"'Female', 'Male'"
'\'native.country\'',"'?', 'Cambodia', 'Canada', 'China', 'Columbia', 'Cuba', 'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England', 'France', 'Germany', 'Greece', 'Guatemala', 'Haiti', 'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India', 'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico', 'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan', 'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam', 'Yugoslavia'"
'income',"'<=50K', '>50K'"


No charts were generated by quickchart


In [72]:
# Validation with no anomalies

update_anomalies = tfdv.validate_statistics(eval_stats,schema)
tfdv.display_anomalies(update_anomalies)

In [78]:
# Examining dataset slices

from tensorflow_data_validation.utils import slicing_util


slice_fn = slicing_util.get_feature_value_slicer(features={'sex' : None})

# Declare the stats options
slice_stats_options = tfdv.StatsOptions(schema = schema ,
                                        slice_functions = [slice_fn],
                                        infer_type_from_schema = True)

# sliced statistics works only for csv,so convert the pandas dataframe to a csv file
csv_path = "slice_sample.csv"
train_set.to_csv(csv_path)

# Calculate statistics for the sliced dataset
sliced_stats = tfdv.generate_statistics_from_csv(csv_path,stats_options= slice_stats_options)



Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


In [80]:
print(f"dataset generated {[sliced.name for sliced in sliced_stats.datasets]}")

print(f"type of sliced_stats elements : {type(sliced_stats.datasets[0])}")

dataset generated ['All Examples', 'sex_Female', 'sex_Male']
type of sliced_stats elements : <class 'tensorflow_metadata.proto.v0.statistics_pb2.DatasetFeatureStatistics'>


In [85]:
from tensorflow_metadata.proto.v0.statistics_pb2 import DatasetFeatureStatisticsList

# Convert 'Male' statics (index =1 ) to the correct type and get the dataset name
male_stats_list = DatasetFeatureStatisticsList()
male_stats_list.datasets.extend([sliced_stats.datasets[1]])
male_stats_name = sliced_stats.datasets[1].name

# Convert 'Female' statics (index = 2) to the correct type and get the dataset name
female_stats_list = DatasetFeatureStatisticsList()
female_stats_list.datasets.extend([sliced_stats.datasets[2]])
female_stats_name = sliced_stats.datasets[2].name

# Visualize the two slice side by side
tfdv.visualize_statistics(
    lhs_statistics = male_stats_list,
    rhs_statistics = female_stats_list,
    lhs_name = male_stats_name,
    rhs_name = female_stats_name
)