# Data Validation

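This notebook runs a basic data-quality check on the `silver/` and `gold/` tables: for each table, rows whose first column (treated as the primary key) is null are appended to `silver/errors/<table>_validation/`, tagged with an error message and an ingest timestamp.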

## Inputs
- `silver/<table>/`
- `gold/<table>/`

## Outputs
- `silver/errors/<table>_validation/` (validation issues, one directory per validated table)

## Dependencies
- `pyspark`
- `python-dotenv`

## Environment
- Uses `.env` for Blob Storage credentials (`AZURE_STORAGE_CONNECTION_STRING`, `CONTAINER_NAME`, `AZURE_STORAGE_ACCOUNT_NAME`)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, current_timestamp
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()
AZURE_CONN_STR = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
CONTAINER_NAME = os.getenv("CONTAINER_NAME")
AZURE_STORAGE_ACCOUNT_NAME = os.getenv("AZURE_STORAGE_ACCOUNT_NAME")

if not all([AZURE_CONN_STR, CONTAINER_NAME, AZURE_STORAGE_ACCOUNT_NAME]):
    raise ValueError("Missing environment variables. Check .env file.")

# Work out which credential setting to hand to Spark.
#
# Every path below uses the wasbs:// scheme. Per the Hadoop Azure (WASB)
# documentation, that driver looks up credentials under per-account /
# per-container keys, so the secret has to be extracted from the connection
# string rather than passed through whole as if it were a SAS token.
_blob_host = f"{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net"

# A connection string is a ';'-separated list of Name=Value fields. Split on
# the first '=' only: account keys and SAS tokens contain '=' themselves.
_conn_fields = {}
for _segment in AZURE_CONN_STR.split(";"):
    _name, _sep, _value = _segment.partition("=")
    if _sep:
        _conn_fields[_name.strip()] = _value.strip()

if "AccountKey" in _conn_fields:
    # Account-key connection string -> per-account shared key setting.
    _credential_key = f"spark.hadoop.fs.azure.account.key.{_blob_host}"
    _credential_value = _conn_fields["AccountKey"]
else:
    # SAS connection string, or (fallback) a bare SAS token stored in the
    # variable -> per-container SAS setting. A leading '?' is not part of the
    # token itself, so drop it.
    _credential_key = f"spark.hadoop.fs.azure.sas.{CONTAINER_NAME}.{_blob_host}"
    _credential_value = _conn_fields.get(
        "SharedAccessSignature", AZURE_CONN_STR
    ).lstrip("?")

# Initialize Spark session
spark = (
    SparkSession.builder
    .appName("ValidateData")
    .config(_credential_key, _credential_value)
    .getOrCreate()
)

# Define blob storage paths
blob_base_path = f"wasbs://{CONTAINER_NAME}@{_blob_host}"
silver_path = f"{blob_base_path}/silver"
gold_path = f"{blob_base_path}/gold"
error_path = f"{silver_path}/errors"

# Validate silver tables
#
# For each table, rows whose first column is null are treated as missing a
# primary key and are appended to "<error_path>/<table>_validation", tagged
# with an error message and the time of the check.
silver_tables = [
    "donations",
    "projects",
    "campaigns",
    "volunteers",
    "volunteer_shifts",
    "beneficiaries",
    "transactions",
    "crm_data",
]
for table in silver_tables:
    # Handle failures per table, the same way the gold validation does, so a
    # single missing or unreadable table is reported instead of aborting the
    # run before the remaining tables are checked and the session is stopped.
    try:
        df = spark.read.parquet(f"{silver_path}/{table}")
        error_df = (
            df.filter(col(df.columns[0]).isNull())
            .withColumn("error", lit(f"Missing primary key in {table}"))
            .withColumn("ingest_timestamp", current_timestamp())
        )
        error_df.write.mode("append").parquet(f"{error_path}/{table}_validation")
    except Exception as e:
        print(f"Validation error for {table}: {str(e)}")

# Validate gold tables
#
# Same check as for silver: rows with a null in the table's first column are
# appended to "<error_path>/<table>_validation". A failure on one table is
# printed and the loop moves on to the next.
gold_tables = [
    "donations_per_project",
    "volunteer_hours_per_project",
    "donations_by_region",
    "campaign_performance",
    "donor_activity",
    "volunteer_engagement",
    "beneficiary_demographics",
    "transaction_success_rate",
    "donation_trends",
    "active_campaigns",
    "donor_engagement",
    "predicted_donations",
]
for table in gold_tables:
    try:
        df = spark.read.parquet(f"{gold_path}/{table}")

        # The first column is the one checked for missing values.
        key_column = df.columns[0]
        rows_without_key = df.filter(col(key_column).isNull())

        error_df = rows_without_key.withColumn(
            "error", lit(f"Missing primary key in {table}")
        )
        error_df = error_df.withColumn("ingest_timestamp", current_timestamp())

        target_dir = f"{error_path}/{table}_validation"
        error_df.write.mode("append").parquet(target_dir)
    except Exception as e:
        print(f"Validation error for {table}: {str(e)}")

# Release the Spark session once every table has been processed.
spark.stop()
print("Data validation completed.")