In [6]:
import pandas as pd
import sqlite3

from features.sql_data_validation.sql_integrity import SQLDataIntegrityChecks

# Input/output locations, relative to the notebook's working directory:
# the SQLite database that will be written, and the source CSV that feeds it.
db_path = "./../datasets/wine_quality.db"
csv_path = "./../datasets/wine_quality_corrupted.csv"

In [7]:
# Load and Save Wine Dataset into SQLite
# Load wine dataset from the CSV at csv_path.
wine_data = pd.read_csv(csv_path)

# Name of the SQLite table that receives the full dataset.
table_name = "wine_data"

# Build the reference table of valid quality levels.
# Missing values are dropped first: NaN is not a legitimate quality level
# (it would be stored as a NULL row in the reference table), and sorted()
# cannot order a sequence containing NaN reliably because every comparison
# against NaN is False.
quality_levels = pd.DataFrame(
    {"quality": sorted(wine_data["quality"].dropna().unique())}
)

# Create SQLite connection. The try/finally guarantees the connection is
# released even if one of the writes below raises. (No cursor is created:
# DataFrame.to_sql drives the connection itself.)
conn = sqlite3.connect(db_path)
try:
    # Save wine_data table into SQLite, replacing any previous copy so the
    # cell can be re-run safely.
    wine_data.to_sql(table_name, conn, if_exists="replace", index=False)

    # Save the reference table for quality levels.
    quality_levels.to_sql("quality_levels", conn, if_exists="replace", index=False)

    # Commit changes
    conn.commit()
    print(f"Data successfully saved into {db_path}")
finally:
    # Close connection (will reopen for validation)
    conn.close()

Data successfully saved into ./../datasets/wine_quality.db


In [8]:
# Create the integrity checker for the SQLite database at db_path.
# Per the recorded output, construction prints a "Connected to database: ..." line.
checker = SQLDataIntegrityChecks(db_path)

Connected to database: ./../datasets/wine_quality.db


In [9]:
# Referential Integrity Check
# Compares wine_data.quality against quality_levels.quality.
# NOTE(review): per the recorded output, the checker prints its own summary and the
# violating values before returning, so the count appears twice in the cell output.
missing_referential = checker.check_referential_integrity("wine_data", "quality", "quality_levels", "quality")
print(f"Referential Integrity Violations: {missing_referential} rows")

# Business Rule Check
# Passes the rule "alcohol > 0" for the alcohol column of wine_data.
# Presumably the return value counts rows that break the rule — confirm against
# SQLDataIntegrityChecks.check_business_rule.
invalid_business = checker.check_business_rule("wine_data", "alcohol", "alcohol > 0")
print(f"Business Rule Violations (Alcohol > 0): {invalid_business} rows")

Referential Integrity Violations: 5
Violating Values: [(None,), (None,), (None,), (None,), (None,)]
Referential Integrity Violations: 5 rows
Business Rule Violations (Alcohol > 0): 0 rows


In [10]:
# Close the checker now that all validation checks have run.
# Per the recorded output, this prints "Database connection closed."
checker.close()

Database connection closed.
