# A3 Archive Validation

Validates completeness and quality of data extracted from the legacy A3 Firebase system.
Checks:
- Row counts per collection vs expected minimums
- Data quality: null rates, invalid values, date ranges
- Referential integrity: commissions -> brokers, commissions -> carriers, contacts -> brokers
- Duplicate detection

**Run after:** firebase-extract function completes
**Output:** Validation report written to Files/validation-reports/

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, lit, count, when, isnan, isnull, countDistinct,
    sum as spark_sum, avg, min as spark_min, max as spark_max,
    current_timestamp, length, regexp_extract
)
from datetime import datetime
import json

# Spark session used by every cell in this notebook.
spark = SparkSession.builder.getOrCreate()

# Name of the lakehouse and the abfss:// root built from it; the Tables/ and
# Files/ locations used later are addressed relative to BASE_PATH.
LAKEHOUSE = "lh-a3-extract"
BASE_PATH = f"abfss://{LAKEHOUSE}@onelake.dfs.fabric.microsoft.com"

# Lowest acceptable row count per collection (from legacy A3 system inventory).
EXPECTED_MINIMUMS = dict(
    brokers=500,
    commissions=10000,
    carriers=200,
    contacts=2000,
    activities=5000,
)

# The collections to validate are exactly the ones that have a minimum defined.
COLLECTIONS = [*EXPECTED_MINIMUMS]

# Filled in section by section by the cells below.
validation_results = {}

started_at = datetime.utcnow().isoformat()
print(f"A3 Archive Validation started at {started_at}")
print(f"Collections to validate: {COLLECTIONS}")

In [None]:
# ── Cell 2: Row Counts and Basic Statistics ────────────────────────────────
#
# Loads each collection's Delta table, counts its rows and compares the count
# with EXPECTED_MINIMUMS. Produces:
#   dataframes    - collection name -> DataFrame, only for collections that
#                   were loaded AND counted successfully
#   count_results - collection name -> row-count check result, stored in
#                   validation_results["row_counts"]
#   total_rows    - sum of all row counts (failed collections count as 0)

dataframes = {}
count_results = {}

for collection in COLLECTIONS:
    expected_min = EXPECTED_MINIMUMS[collection]

    try:
        df = spark.read.format("delta").load(f"{BASE_PATH}/Tables/{collection}")
        # Counting inside the try block: a read problem that only shows up
        # when the data is scanned is then reported as an error as well.
        actual_count = df.count()
    except Exception as e:
        # Deliberately broad: any failure to read a collection is recorded in
        # the report so that the remaining collections are still validated.
        # The entry carries the same keys as a successful one, plus "error".
        count_results[collection] = {
            "actual_count": 0,
            "expected_minimum": expected_min,
            "is_complete": False,
            "completeness_pct": 0,
            "column_count": 0,
            "columns": [],
            "error": str(e)
        }
        print(f"  [ERROR] {collection}: {e}")
        continue

    # Register the DataFrame only after the count succeeded, so later cells
    # never operate on a collection that was reported as an error.
    dataframes[collection] = df

    is_complete = actual_count >= expected_min
    completeness_pct = round(actual_count / expected_min * 100, 2) if expected_min > 0 else 0

    count_results[collection] = {
        "actual_count": actual_count,
        "expected_minimum": expected_min,
        "is_complete": is_complete,
        "completeness_pct": completeness_pct,
        "column_count": len(df.columns),
        "columns": df.columns
    }

    status = "PASS" if is_complete else "FAIL"
    print(f"  [{status}] {collection}: {actual_count:,} rows (expected >= {expected_min:,}, {completeness_pct}%)")
    # Show at most the first eight column names to keep the line readable.
    print(f"         Columns ({len(df.columns)}): {', '.join(df.columns[:8])}{'...' if len(df.columns) > 8 else ''}")

validation_results["row_counts"] = count_results

total_rows = sum(r["actual_count"] for r in count_results.values())
total_pass = sum(1 for r in count_results.values() if r["is_complete"])
print(f"\nTotal rows extracted: {total_rows:,}")
print(f"Collections passing minimum: {total_pass}/{len(COLLECTIONS)}")

In [None]:
# ── Cell 3: Data Quality Checks ──────────────────────────────────────────
#
# For every DataFrame in `dataframes` this cell records:
#   - the percentage of missing values per column (null, or "" for string
#     columns)
#   - the number of duplicate _id values
#   - the min/max of _extractedAt, when that column exists
# and derives an overall status from the number of issues found:
# 0 -> PASS, 1-2 -> WARN, 3+ -> FAIL.
# Results are stored in validation_results["data_quality"].

quality_results = {}

# Define required fields per collection
REQUIRED_FIELDS = {
    "brokers": ["_id", "name"],
    "commissions": ["_id", "amount"],
    "carriers": ["_id", "name"],
    "contacts": ["_id"],
    "activities": ["_id"]
}


def _missing_value_counts(df):
    """Return {column name: number of rows where the column is missing}.

    A value is missing when it is null; for string columns an empty string
    counts as missing too. The empty-string comparison is restricted to string
    columns because comparing a struct/array/map column with "" is rejected by
    Spark. All columns are evaluated in a single aggregation instead of one
    Spark job per column.
    """
    aggregates = []
    for col_name, dtype in df.dtypes:
        # Backtick-quote the name so a dot inside it is not read as a nested
        # field access; a literal backtick is escaped by doubling it.
        column = col("`" + col_name.replace("`", "``") + "`")
        is_missing = isnull(column)
        if dtype.startswith(("string", "varchar", "char")):
            is_missing = is_missing | (column == "")
        aggregates.append(spark_sum(when(is_missing, 1).otherwise(0)))

    if not aggregates:
        return {}

    # Read the result by position: it does not depend on how Spark names the
    # aggregate columns.
    counts = df.agg(*aggregates).collect()[0]
    return {col_name: counts[i] for i, col_name in enumerate(df.columns)}


for collection, df in dataframes.items():
    quality = {"null_rates": {}, "issues": []}
    total = df.count()

    if total == 0:
        # Nothing to measure; the issue still flows through the status and
        # reporting logic below so the collection shows up in the summary.
        quality["issues"].append("Collection is empty")
    else:
        required_fields = REQUIRED_FIELDS.get(collection, [])

        # Check null rates for all columns
        for col_name, null_count in _missing_value_counts(df).items():
            null_rate = round(null_count / total * 100, 2)
            quality["null_rates"][col_name] = null_rate

            # Flag high null rates on required fields
            if col_name in required_fields and null_rate > 1:
                quality["issues"].append(f"Required field '{col_name}' has {null_rate}% null rate")

        # Check for duplicate _id values
        if "_id" in df.columns:
            distinct_ids = df.select("_id").distinct().count()
            duplicates = total - distinct_ids
            quality["duplicate_ids"] = duplicates
            if duplicates > 0:
                dup_rate = round(duplicates / total * 100, 2)
                quality["issues"].append(f"{duplicates} duplicate _id values ({dup_rate}%)")

        # Record the _extractedAt range (min and max in one job)
        if "_extractedAt" in df.columns:
            min_ts, max_ts = df.agg(spark_min("_extractedAt"), spark_max("_extractedAt")).collect()[0]
            quality["extraction_range"] = {"min": str(min_ts), "max": str(max_ts)}

    issue_count = len(quality["issues"])
    if issue_count == 0:
        quality["overall_status"] = "PASS"
    elif issue_count <= 2:
        quality["overall_status"] = "WARN"
    else:
        quality["overall_status"] = "FAIL"
    quality_results[collection] = quality

    status = quality["overall_status"]
    print(f"  [{status}] {collection}: {issue_count} issue(s)")
    for issue in quality["issues"]:
        print(f"         - {issue}")

validation_results["data_quality"] = quality_results

In [None]:
# ── Cell 4: Referential Integrity Checks ─────────────────────────────────
#
# For each child -> parent relationship listed in REFERENCE_CHECKS, counts the
# child rows whose reference value has no matching parent _id ("orphans") and
# stores the result in validation_results["referential_integrity"] under the
# key "<child>_to_<parent>". A check is skipped when either collection was not
# loaded or no reference column can be identified.

integrity_results = {}

# (child collection, parent collection, keyword identifying the reference
#  column, whether "...id" columns take priority over "...ref" columns)
REFERENCE_CHECKS = [
    ("commissions", "brokers", "broker", True),
    ("commissions", "carriers", "carrier", False),
    ("contacts", "brokers", "broker", False),
]


def _find_reference_column(columns, keyword, prefer_id):
    """Return the first column whose name looks like a reference to `keyword`.

    A candidate contains `keyword` and either "id" or "ref" (case-insensitive
    substring match). With prefer_id=True, "ref" columns are considered only
    when no "id" column exists; otherwise the first column matching either wins.
    Returns None when there is no candidate.
    """
    named = [(c, c.lower()) for c in columns if keyword in c.lower()]
    if prefer_id:
        matches = [c for c, low in named if "id" in low] or [c for c, low in named if "ref" in low]
    else:
        matches = [c for c, low in named if "id" in low or "ref" in low]
    return matches[0] if matches else None


def _orphan_status(orphan_rate):
    """Map an orphan percentage to PASS (< 5), WARN (< 15) or FAIL."""
    if orphan_rate < 5:
        return "PASS"
    if orphan_rate < 15:
        return "WARN"
    return "FAIL"


def _check_references(child_df, parent_df, ref_col):
    """Count child rows whose `ref_col` value matches no parent `_id`.

    Rows with a null reference are not counted as orphans. The lookup is a
    left-anti join, so the parent ids stay distributed instead of being
    collected to the driver, and a null parent _id cannot turn the whole
    "not in" test into NULL (which would report zero orphans).
    """
    total = child_df.count()
    child_refs = child_df.select(col(ref_col).alias("_ref")).filter(col("_ref").isNotNull())
    parent_ids = parent_df.select(col("_id").alias("_ref")).distinct()
    orphaned = child_refs.join(parent_ids, on="_ref", how="left_anti").count()
    orphan_rate = round(orphaned / total * 100, 2) if total > 0 else 0

    return {
        "reference_column": ref_col,
        "total_records": total,
        "orphaned_records": orphaned,
        "orphan_rate_pct": orphan_rate,
        "status": _orphan_status(orphan_rate)
    }


for child, parent, keyword, prefer_id in REFERENCE_CHECKS:
    # Every check looks up its own DataFrames, so no check depends on a
    # variable assigned by another one.
    if child not in dataframes or parent not in dataframes:
        continue

    child_df = dataframes[child]
    parent_df = dataframes[parent]

    if "_id" not in parent_df.columns:
        print(f"  [SKIP] No _id column found in {parent}")
        continue

    ref_col = _find_reference_column(child_df.columns, keyword, prefer_id)
    if ref_col is None:
        print(f"  [SKIP] No {keyword} reference column found in {child}")
        continue

    result = _check_references(child_df, parent_df, ref_col)
    integrity_results[f"{child}_to_{parent}"] = result
    print(f"  [{result['status']}] {child} -> {parent}: {result['orphaned_records']} orphaned ({result['orphan_rate_pct']}%)")

validation_results["referential_integrity"] = integrity_results

In [None]:
# ── Cell 5: Validation Report Summary ────────────────────────────────────
#
# Collapses the per-check results into one overall status, writes the full
# validation_results structure as JSON below Files/validation-reports/ and
# prints a short summary.


def _status_of(entry):
    """Return the status label of one check result, or None if it has none.

    An explicit boolean "is_complete" decides the status (PASS / FAIL);
    otherwise "overall_status" is used, falling back to "status".
    """
    completeness = entry.get("is_complete")
    if completeness is True:
        return "PASS"
    if completeness is False:
        return "FAIL"
    return entry.get("overall_status") or entry.get("status")


# Gather one status per check across all sections
all_statuses = []
for section_name in ("row_counts", "data_quality", "referential_integrity"):
    for entry in validation_results.get(section_name, {}).values():
        if not isinstance(entry, dict):
            continue
        entry_status = _status_of(entry)
        if entry_status:
            all_statuses.append(entry_status)

pass_count, warn_count, fail_count = (
    all_statuses.count(label) for label in ("PASS", "WARN", "FAIL")
)

# The worst status wins: any FAIL beats any WARN, which beats PASS.
overall_status = "FAIL" if fail_count > 0 else "WARN" if warn_count > 0 else "PASS"

validation_results["summary"] = {
    "overall_status": overall_status,
    "checks_passed": pass_count,
    "checks_warned": warn_count,
    "checks_failed": fail_count,
    "total_rows_extracted": total_rows,
    "collections_validated": len(COLLECTIONS),
    "validated_at": datetime.utcnow().isoformat()
}

# Destination of the report; the name carries the UTC time of the write.
report_timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
report_path = f"{BASE_PATH}/Files/validation-reports/validation_{report_timestamp}.json"

# Serialize the results (values json cannot encode are stringified) and write
# them through Spark as a one-row, one-partition text output.
report_json = json.dumps(validation_results, indent=2, default=str)
report_df = spark.createDataFrame([(report_json,)], ["report"])
report_df.coalesce(1).write.mode("overwrite").text(report_path)

separator = "=" * 60
summary_lines = [
    separator,
    "A3 ARCHIVE VALIDATION REPORT",
    separator,
    f"Overall Status: [{overall_status}]",
    f"Validated At:   {validation_results['summary']['validated_at']}",
    f"Total Rows:     {total_rows:,}",
    "",
    "Check Results:",
    f"  PASS: {pass_count}",
    f"  WARN: {warn_count}",
    f"  FAIL: {fail_count}",
    "",
    f"Report saved to: {report_path}",
    separator,
]
print("\n".join(summary_lines))

# Failures are only announced here; the notebook itself still succeeds.
if overall_status == "FAIL":
    print("\nWARNING: Validation found FAILURES. Review report for details.")
    # Uncomment to fail pipeline on validation errors:
    # raise Exception(f"A3 Archive validation failed: {fail_count} check(s) failed")