In [1]:
import pandas as pd
from features.utiils import logger
from features.data_quality_validation.schema_definition import WINE_QUALITY_TYPE_SCHEMA
from features.data_quality_validation.schema_validation import SchemaValidation
from features.etl.etl_validation import ETLValidation

# Path to the wine-quality CSV, given relative to the notebook's working directory.
# NOTE(review): the file name suggests the data was corrupted on purpose,
# presumably to give the validation cells something to catch — confirm.
wine_path = "./../datasets/wine_quality_corrupted.csv"

# Raw, untransformed frame; later cells use it as the "source" side of the checks.
df = pd.read_csv(filepath_or_buffer=wine_path)

In [2]:
# Stand-in for an ETL step: copy the raw frame and derive a single new column.
df_transformed = df.copy()

# Label each row by alcohol content: strictly above 10 is 'High', everything
# else is 'Low'. A missing alcohol value compares False against 10, so it is
# labelled 'Low' as well.
df_transformed['alcohol_category'] = (
    df_transformed['alcohol'].gt(10).map({True: 'High', False: 'Low'})
)

In [3]:
# Validator is given the raw frame first and the transformed frame second.
etl_validator = ETLValidation(df, df_transformed)

# ETL validation checks as (log label, bound check, positional arguments).
# Each check is invoked right before its own log line, in the order listed.
# NOTE(review): the recorded output lists nan entries under 'missing_keys' for
# 'quality' even though df_transformed starts as a copy of df — presumably NaN
# values never compare equal inside validate_completeness; confirm.
_etl_checks = (
    ("Row Count Match", etl_validator.validate_row_count, ()),
    ("Column Validation", etl_validator.validate_columns, ()),
    ("Alcohol Aggregation Check", etl_validator.validate_aggregates, ('alcohol',)),
    ("Completeness Check", etl_validator.validate_completeness, ('quality',)),
)
for _label, _check, _args in _etl_checks:
    logger.info(f"{_label}: {_check(*_args)}")

15:03:34 INFO: Row Count Match: (1158, 1158, True)
15:03:34 INFO: Column Validation: {'missing': [], 'extra': ['alcohol_category']}
15:03:34 INFO: Alcohol Aggregation Check: {'sum_difference': 0.0, 'avg_difference': 0.0}
15:03:34 INFO: Completeness Check: {'missing_keys': [nan, nan, nan, nan, nan]}


In [4]:
# Schema and business-rule checks are run against the transformed frame.
schema_validator = SchemaValidation(df_transformed)

# Each entry pairs a log label with a zero-argument callable, so every check is
# evaluated immediately before its own log line and in the order listed here.
_schema_checks = (
    ("Data Type Validation",
     lambda: schema_validator.validate_data_types(WINE_QUALITY_TYPE_SCHEMA)),
    ("Alcohol > 0 Check",
     lambda: schema_validator.validate_business_rules('alcohol', lambda x: x > 0)),
    ("High Alcohol Check (High)",
     lambda: schema_validator.validate_business_rules('alcohol_category', lambda x: x == 'High')),
    ("High Alcohol Check (Low)",
     lambda: schema_validator.validate_business_rules('alcohol_category', lambda x: x == 'Low')),
)
for _label, _run_check in _schema_checks:
    logger.info(f"{_label}: {_run_check()}")


15:03:34 INFO: Data Type Validation: {}
15:03:34 INFO: Alcohol > 0 Check: (52, 1106)
15:03:34 INFO: High Alcohol Check (High): (574, 584)
15:03:34 INFO: High Alcohol Check (Low): (584, 574)
