# Use TDDA to validate data


In [1]:
from tdda.constraints import discover_df, verify_df
import pandas as pd
import os

# Step1: Read data

In [2]:
file_path = "../../data/adult_with_duplicates.csv"
columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship",
           "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
df_base = pd.read_csv(file_path, names=columns, header=None, sep=',', na_values=["null"])
df_base.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
1,139,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
2,fourty five,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
3,-12,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
4,,emp-by-pengfei,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K


In [3]:
root_dir = './tdda_refs'
my_rule_path = 'my_constraints.tdda'

# Step 2. Load validation rule and validate data

# Validation rules

Table level validation rule:
1. Table must have 32603 rows and 15 columns
2. Table can't have duplicate rows

Column level validation rule:
1. Column Age must be a number
2. Column Age can't have null
3. Column Age must have value between 0 and 120

TDDA cant specify table level rules. So we can only apply column level rules

In [10]:
# load the custom validation rule
validation_rule_path = f'{root_dir}/{my_rule_path}'

In [11]:
# apply validation rules on data.
result = verify_df(df_base, validation_rule_path, type_checking='strict', epsilon=0)
print(str(result))

FIELDS:

age: 3 failures  1 pass  type ✗  min ✗  max ✗  max_nulls ✓

SUMMARY:

Constraints passing: 1
Constraints failing: 3


Now try to change validation rules (e.g. max_nulls=1), and rerun above code.

# Step 3. Auto generate rules

In [12]:
# generate the constraint
constraints = discover_df(df_base)

# Show the generated constraints
print(str(constraints))


FIELDS:

Field age:
           type: TypeConstraint(value='string')
     min_length: MinLengthConstraint(value=2)
     max_length: MaxLengthConstraint(value=11)
      max_nulls: MaxNullsConstraint(value=1)

Field workclass:
           type: TypeConstraint(value='string')
     min_length: MinLengthConstraint(value=7)
     max_length: MaxLengthConstraint(value=16)
  allowed_values: AllowedValuesConstraint(value=['Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay', 'emp-by-pengfei', 'workclass'])

Field fnlwgt:
           type: TypeConstraint(value='string')
     min_length: MinLengthConstraint(value=5)
     max_length: MaxLengthConstraint(value=7)
      max_nulls: MaxNullsConstraint(value=0)

Field education:
           type: TypeConstraint(value='string')
     min_length: MinLengthConstraint(value=3)
     max_length: MaxLengthConstraint(value=12)
      max_nulls: MaxNullsConstraint(value=0)
  allowed_values: AllowedValues

## Step 4 save the generated validation rules


In [13]:
def write_constrain(constrains, constrain_root_dir: str, constrain_file_name: str):
    """
    This function takes a tdda constrain object, root directory and file name to store the tdda constrains in json format

    :param constrains: tdda constrain object
    :param constrain_root_dir: root directory to store the tdda constrains
    :param constrain_file_name: file name to store the constrain
    :return:
    """
    if not os.path.exists(constrain_root_dir):
        os.mkdir(constrain_root_dir)
    constraints_path = f'{constrain_root_dir}/{constrain_file_name}'
    print(f"write constrain to path {constraints_path}")
    with open(constraints_path, 'w') as f:
        f.write(constraints.to_json())

In [16]:
# write constrain to local file system
generated_rule_file_name="generated_constraints.tdda"
write_constrain(constraints,root_dir,generated_rule_file_name)

write constrain to path ./tdda_refs/generated_constraints.tdda


## Step 5 Validate data with generated rules

Now we can use the generated constraints to validate some data.

In [17]:

generated_constraints_path = f'{root_dir}/{generated_rule_file_name}'
valid_result = verify_df(df_base, generated_constraints_path, type_checking='strict', epsilon=0)
print(str(valid_result))

FIELDS:

age: 0 failures  4 passes  type ✓  min_length ✓  max_length ✓  max_nulls ✓

workclass: 0 failures  4 passes  type ✓  min_length ✓  max_length ✓  allowed_values ✓

fnlwgt: 0 failures  4 passes  type ✓  min_length ✓  max_length ✓  max_nulls ✓

education: 0 failures  5 passes  type ✓  min_length ✓  max_length ✓  max_nulls ✓  allowed_values ✓

education-num: 0 failures  5 passes  type ✓  min_length ✓  max_length ✓  max_nulls ✓  allowed_values ✓

marital-status: 0 failures  5 passes  type ✓  min_length ✓  max_length ✓  max_nulls ✓  allowed_values ✓

occupation: 0 failures  4 passes  type ✓  min_length ✓  max_length ✓  allowed_values ✓

relationship: 0 failures  5 passes  type ✓  min_length ✓  max_length ✓  max_nulls ✓  allowed_values ✓

race: 0 failures  5 passes  type ✓  min_length ✓  max_length ✓  max_nulls ✓  allowed_values ✓

sex: 0 failures  5 passes  type ✓  min_length ✓  max_length ✓  max_nulls ✓  allowed_values ✓

capital-gain: 0 failures  4 passes  type ✓  min_length ✓  ma