# Data Quality Checks

This notebook runs data quality checks (null counts, duplicate detection, data freshness, and summary statistics) on a processed Delta table.

## Learning Objectives
- Implement data quality checks
- Monitor data freshness
- Validate data completeness
- Generate quality reports


In [None]:
# Read the Delta table that the quality checks will run against.
table_path = "/delta/airlines_gold"  # This will be passed as parameter
delta_reader = spark.read.format("delta")
df = delta_reader.load(table_path)

# Print a short overview of what was loaded: source path, row count,
# column count, and the schema.
print(f"Data Quality Check for: {table_path}")
record_total = df.count()
print(f"Total records: {record_total}")
column_total = len(df.columns)
print(f"Columns: {column_total}")
df.printSchema()


In [None]:
# Data Quality Checks
#
# Prints a four-part report for `df`: per-column null counts, the number of
# fully duplicated rows, the latest `processing_timestamp` (when that column
# exists), and the output of `describe()`.
from pyspark.sql.functions import col, count, isnan, isnull, sum as spark_sum
from pyspark.sql.functions import max as spark_max

print("=== DATA QUALITY REPORT ===")

# The row count is identical for every check below, so compute it once
# instead of launching a separate Spark job per column.
total_count = df.count()

# 1. Check for null values
print("\n1. NULL VALUE ANALYSIS:")
if df.columns:
    # A single aggregation pass counts the nulls of every column at once,
    # replacing one filter().count() job per column. Results are read
    # positionally, in the same order as df.columns.
    null_row = df.agg(
        *[spark_sum(col(column).isNull().cast("int")) for column in df.columns]
    ).first()
    for column, null_count in zip(df.columns, null_row):
        # sum() over zero rows yields NULL; report that as 0 nulls.
        null_count = null_count or 0
        null_percentage = (null_count / total_count) * 100 if total_count > 0 else 0
        print(f"  {column}: {null_count} nulls ({null_percentage:.2f}%)")

# 2. Check for duplicate records
print("\n2. DUPLICATE ANALYSIS:")
# Rows that are exact copies of another row across all columns.
duplicate_count = total_count - df.dropDuplicates().count()
print(f"  Duplicate records: {duplicate_count}")

# 3. Data freshness check
print("\n3. DATA FRESHNESS:")
if "processing_timestamp" in df.columns:
    # max() avoids a global sort and, unlike orderBy(...).first()[0], does not
    # fail on an empty table: the aggregate always returns one row (None here).
    latest_timestamp = df.agg(spark_max(col("processing_timestamp"))).first()[0]
    print(f"  Latest processing time: {latest_timestamp}")
else:
    print("  No timestamp column found")

# 4. Data distribution
print("\n4. DATA DISTRIBUTION:")
df.describe().show()
