In [1]:
import pandas as pd

from features.data_quality_validation.schema_definition import WINE_QUALITY_TYPE_SCHEMA
from features.data_quality_validation.schema_validation import SchemaValidation
from features.etl.etl_validation import ETLValidation

# Location of the wine-quality CSV, relative to the notebook's working directory.
# NOTE(review): the file name suggests a deliberately corrupted variant of the data — confirm.
wine_path = "./../datasets/wine_quality_corrupted.csv"

# Source frame; every later cell treats `df` as the untouched "before ETL" data.
df = pd.read_csv(filepath_or_buffer=wine_path)

In [2]:
# aka ETL with simple transformation
def _alcohol_category(alcohol):
    """Label one alcohol reading: 'High' when it exceeds 10, otherwise 'Low'.

    Any value for which ``alcohol > 10`` is False lands in 'Low'; that includes
    exactly 10, negative readings, and NaN (NaN compares False against 10).
    """
    if alcohol > 10:
        return 'High'
    return 'Low'


# `assign` returns a new frame, so the source `df` is left without the extra column.
df_transformed = df.assign(alcohol_category=df['alcohol'].apply(_alcohol_category))

In [3]:
# Validator comparing the source frame with its transformed counterpart.
etl_validator = ETLValidation(df, df_transformed)

# Perform ETL validation checks
# Each entry maps the printed label to a zero-argument callable, so every check
# is only executed right before its own line is printed (same order as listed).
# NOTE(review): the recorded completeness output lists `nan` entries as missing
# keys; NaN never compares equal to itself, so this may be an artifact of how
# validate_completeness matches keys rather than truly dropped rows — confirm.
etl_checks = {
    "Row Count Match:": lambda: etl_validator.validate_row_count(),
    "Column Validation:": lambda: etl_validator.validate_columns(),
    "Alcohol Aggregation Check:": lambda: etl_validator.validate_aggregates("alcohol"),
    "Completeness Check:": lambda: etl_validator.validate_completeness("quality"),
}
for check_label, run_check in etl_checks.items():
    print(check_label, run_check())

Row Count Match: (1158, 1158, True)
Column Validation: {'missing': [], 'extra': ['alcohol_category']}
Alcohol Aggregation Check: {'sum_difference': 0.0, 'avg_difference': 0.0}
Completeness Check: {'missing_keys': [nan, nan, nan, nan, nan]}


In [4]:
# Schema and business-rule checks run against the transformed frame.
schema_validator = SchemaValidation(df_transformed)

# Column dtypes compared with the expected wine-quality type schema.
print("Data Type Validation:", schema_validator.validate_data_types(WINE_QUALITY_TYPE_SCHEMA))

# Business rule on the raw measurement: alcohol must be strictly positive.
print("Alcohol > 0 Check:", schema_validator.validate_business_rules("alcohol", lambda x: x > 0))

# The category column only holds 'High' or 'Low', so these two checks are
# complementary: rows meeting one rule are the rows violating the other.
# Each line carries its own label so the two results can be told apart
# (the 'Low' check was previously also printed as "High Alcohol Check").
print("High Alcohol Check:", schema_validator.validate_business_rules("alcohol_category", lambda x: x == 'High'))
print("Low Alcohol Check:", schema_validator.validate_business_rules("alcohol_category", lambda x: x == 'Low'))

Data Type Validation: {}
Alcohol > 0 Check: 52 rows violate alcohol condition, 1106 rows meet the condition.
High Alcohol Check: 574 rows violate alcohol_category condition, 584 rows meet the condition.
High Alcohol Check: 584 rows violate alcohol_category condition, 574 rows meet the condition.
