In [1]:
import pandas as pd
import sqlite3

from features.sql_data_validation.sql_integrity import SQLDataIntegrityChecks
from features.utils.logger_config import logger

# File paths (relative to the directory the notebook kernel is started in)
# Input CSV holding the wine quality records that get loaded into SQLite.
csv_path = "./../datasets/wine_quality_corrupted.csv"
# SQLite database file that the notebook writes to and later validates.
db_path = "./../datasets/wine_quality.db"

In [2]:
# Load and Save Wine Dataset into SQLite
# Load wine dataset
wine_data = pd.read_csv(csv_path)

# Name of the main table written to SQLite.
table_name = "wine_data"

# Build the reference table of quality levels before touching the database.
# Missing values are dropped first: NaN has no defined sort order, so calling
# sorted() on values that include it is unreliable, and a NULL row in a lookup
# table never matches anything under SQL equality anyway.
quality_levels = pd.DataFrame(
    {"quality": sorted(wine_data["quality"].dropna().unique())}
)

# Create SQLite connection; try/finally guarantees the handle is released
# even if one of the writes below raises.
conn = sqlite3.connect(db_path)
try:
    # Save wine_data table into SQLite (replaces any previous copy)
    wine_data.to_sql(table_name, conn, if_exists="replace", index=False)

    # Save the reference table for quality levels
    quality_levels.to_sql("quality_levels", conn, if_exists="replace", index=False)

    # Commit changes
    conn.commit()
    logger.info(f"Data successfully saved into {db_path}")
finally:
    # Close connection (will reopen for validation)
    conn.close()

14:50:58-INFO: Data successfully saved into ./../datasets/wine_quality.db


In [3]:
# Create the integrity checker for the database written above; the captured
# log output shows it connects to db_path on construction.
checker = SQLDataIntegrityChecks(db_path)

14:50:58-INFO: Connected to database: ./../datasets/wine_quality.db


In [4]:
# --- Referential integrity ---
# Compare wine_data.quality against the quality_levels.quality lookup column.
child_ref = ("wine_data", "quality")
parent_ref = ("quality_levels", "quality")
missing_referential = checker.check_referential_integrity(*child_ref, *parent_ref)
logger.info(f"Referential Integrity Violations: {missing_referential} rows")

# --- Business rule ---
# Rule expression handed to the checker for the alcohol column.
alcohol_rule = "alcohol > 0"
invalid_business = checker.check_business_rule("wine_data", "alcohol", alcohol_rule)
logger.info(f"Business Rule Violations (Alcohol > 0): {invalid_business} rows")

14:50:58-INFO: Referential Integrity Violations: 5
14:50:58-INFO: Violating Values: [(None,), (None,), (None,), (None,), (None,)]
14:50:58-INFO: Referential Integrity Violations: 5 rows
14:50:58-INFO: Business Rule Violations (Alcohol > 0): 0 rows


In [5]:
# Close the checker once all validations are done; the captured log output
# shows this closes its database connection.
checker.close()

14:50:58-INFO: Database connection closed.
