In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Unity Catalog location of the source file: catalog / schema / volume
catalog, schema, volume = "workspace", "crime", "datastore"

# Name of the CSV file stored inside that volume
file_name = "crime_data.csv"

# Full path to the CSV file under /Volumes
volume_path = f"/Volumes/{catalog}/{schema}/{volume}/{file_name}"

# ============================================================================
# BRONZE LAYER - Raw data from volume
# ============================================================================

@dlt.table(
    name="crime_bronze",
    comment="Raw crime data loaded from volume CSV file",
    table_properties={
        "quality": "bronze",
        "pipelines.autoOptimize.managed": "true"
    }
)
def crime_bronze():
    """
    Bronze layer: load the crime CSV from the Unity Catalog volume unchanged.

    The file at ``volume_path`` is read with the first row used as the
    header and with column types inferred by Spark; no filtering or
    transformation is applied here.
    """
    csv_options = {
        "header": "true",
        "inferSchema": "true",
    }
    reader = spark.read.format("csv").options(**csv_options)
    return reader.load(volume_path)



In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

@dlt.table(
    name="crime_silver",
    comment="Cleaned LA crime data - ALL 1,004,991 records preserved with area centroid imputation"
)
@dlt.expect_all_or_drop({
    "valid_dr_no": "dr_no IS NOT NULL",
    "valid_date_rptd": "date_rptd IS NOT NULL",
    "valid_date_occ": "date_occ IS NOT NULL",
    "valid_time_occ": "time_occ IS NOT NULL",
    "valid_area": "area IS NOT NULL AND area_name IS NOT NULL",
    "valid_crime": "crm_cd IS NOT NULL AND crm_cd_desc IS NOT NULL"
})
def crime_silver():
    """
    Silver layer: cleaned crime records derived from ``crime_bronze``.

    Row retention:
    - Rows are dropped ONLY by the ``expect_all_or_drop`` rules above, i.e.
      when one of the critical fields (dr_no, date_rptd, date_occ, time_occ,
      area, area_name, crm_cd, crm_cd_desc) is NULL.
    - The original author's profiling reportedly found no such rows
      (1,004,991 records, 100% retention); that figure is not enforced here.

    Cleaning rules (original columns are kept, cleaned values are added as
    ``*_clean`` columns):
    - vict_age: values outside 1..120 (including NULL, 0, negatives) -> -1
    - vict_sex: anything other than 'M' / 'F' -> 'X'
    - vict_descent: NULL / '' -> 'X'
    - lat / lon: NULL or 0 -> mean of the non-missing values for the same
      ``area_name`` (area centroid); each coordinate is imputed on its own
    - weapon_desc, premis_desc: NULL / '' -> placeholder text, else trimmed
    - weapon_used_cd, premis_cd: NULL -> 0
    - status: NULL / '' -> 'IC'

    ``is_imputed_location`` is True when EITHER coordinate was imputed, so
    ``has_original_location`` is True only when both lat and lon are the
    values found in the source.

    Returns:
        DataFrame with the 28 source columns, 11 cleaned columns, 9 quality
        flags and a ``processing_timestamp`` column, in that order.
    """

    def is_missing_coord(name):
        # A coordinate counts as missing when it is NULL or the value 0.
        return col(name).isNull() | (col(name) == 0)

    def is_blank(name):
        # NULL or empty string (whitespace-only values are NOT treated as blank).
        return col(name).isNull() | (col(name) == '')

    def as_flag(condition):
        # Force a strict True/False result: a NULL condition becomes False.
        return when(condition, True).otherwise(False)

    lat_missing = is_missing_coord('lat')
    lon_missing = is_missing_coord('lon')

    # Read bronze data
    bronze = dlt.read("crime_bronze")

    # Per-area centroid, averaged over usable coordinates only. ``when``
    # without ``otherwise`` yields NULL for missing values, which avg() skips.
    area_window = Window.partitionBy("area_name")
    with_centroids = (
        bronze
        .withColumn(
            'area_avg_lat',
            avg(when(~lat_missing, col('lat'))).over(area_window)
        )
        .withColumn(
            'area_avg_lon',
            avg(when(~lon_missing, col('lon'))).over(area_window)
        )
    )

    # 1 for the primary code (crm_cd, guaranteed non-NULL by the expectations)
    # plus 1 for every additional crime code that is present.
    # NOTE: built with a loop because the star import from
    # pyspark.sql.functions shadows the builtin sum().
    total_crime_codes = lit(1)
    for extra_code in ('crm_cd_1', 'crm_cd_2', 'crm_cd_3', 'crm_cd_4'):
        total_crime_codes = (
            total_crime_codes
            + when(col(extra_code).isNotNull(), 1).otherwise(0)
        )

    source_columns = [
        'dr_no',
        'date_rptd',
        'date_occ',
        'time_occ',
        'area',
        'area_name',
        'rpt_dist_no',
        'part_1_2',
        'crm_cd',
        'crm_cd_desc',
        'mocodes',
        'vict_age',
        'vict_sex',
        'vict_descent',
        'premis_cd',
        'premis_desc',
        'weapon_used_cd',
        'weapon_desc',
        'status',
        'status_desc',
        'crm_cd_1',           # Keep as-is (NULL = no secondary crime)
        'crm_cd_2',           # Keep as-is (NULL = no tertiary crime)
        'crm_cd_3',           # Keep as-is (NULL = no 4th crime)
        'crm_cd_4',           # Keep as-is (NULL = no 5th crime)
        'location',
        'cross_street',       # Keep as-is (NULL = not recorded)
        'lat',
        'lon',
    ]

    # ===== CLEANED COLUMNS (11) =====
    cleaned_columns = [
        'vict_age_clean',
        'vict_sex_clean',
        'vict_descent_clean',
        'lat_clean',
        'lon_clean',
        'weapon_desc_clean',
        'weapon_used_cd_clean',
        'status_clean',
        'premis_desc_clean',
        'premis_cd_clean',
        'is_imputed_location',
    ]

    # ===== QUALITY FLAGS (9) =====
    flag_columns = [
        'has_valid_location',
        'has_original_location',
        'has_valid_age',
        'has_known_sex',
        'has_known_descent',
        'has_weapon',
        'has_secondary_crime',
        'total_crime_codes',
        'has_cross_street',
    ]

    return (
        with_centroids

        # STEP 1: CLEAN VICTIM AGE
        # Only 1..120 is accepted; NULL, 0, negatives and >120 become -1.
        .withColumn('vict_age_clean',
            when(col('vict_age').between(1, 120), col('vict_age'))
            .otherwise(-1)
        )

        # STEP 2: CLEAN VICTIM SEX
        # Anything that is not exactly 'M' or 'F' (incl. NULL / '') is 'X'.
        .withColumn('vict_sex_clean',
            when(col('vict_sex').isin('M', 'F'), col('vict_sex'))
            .otherwise('X')
        )

        # STEP 3: CLEAN VICTIM DESCENT
        .withColumn('vict_descent_clean',
            when(is_blank('vict_descent'), 'X')
            .otherwise(col('vict_descent'))
        )

        # STEP 4: CLEAN COORDINATES WITH AREA CENTROID
        # Each coordinate falls back to its own area average. The result is
        # still NULL if the area has no usable value for that coordinate.
        .withColumn('lat_clean',
            when(lat_missing, col('area_avg_lat')).otherwise(col('lat'))
        )
        .withColumn('lon_clean',
            when(lon_missing, col('area_avg_lon')).otherwise(col('lon'))
        )

        # Flag imputed locations: either coordinate being replaced makes the
        # location imputed (lat and lon are imputed independently above).
        .withColumn('is_imputed_location', as_flag(lat_missing | lon_missing))

        # STEP 5: CLEAN WEAPON
        .withColumn('weapon_desc_clean',
            when(is_blank('weapon_desc'), 'NO WEAPON/UNKNOWN')
            .otherwise(trim(col('weapon_desc')))
        )
        .withColumn('weapon_used_cd_clean',
            when(col('weapon_used_cd').isNull(), 0)
            .otherwise(col('weapon_used_cd'))
        )

        # STEP 6: CLEAN STATUS
        # NOTE(review): 'IC' is used as the default status code; presumably
        # it matches an existing code in the dataset - confirm.
        .withColumn('status_clean',
            when(is_blank('status'), 'IC')
            .otherwise(col('status'))
        )

        # STEP 7: CLEAN PREMISE
        .withColumn('premis_desc_clean',
            when(is_blank('premis_desc'), 'UNKNOWN')
            .otherwise(trim(col('premis_desc')))
        )
        .withColumn('premis_cd_clean',
            when(col('premis_cd').isNull(), 0)
            .otherwise(col('premis_cd'))
        )

        # STEP 8: ADD MULTI-CRIME FLAGS
        # For optional crime code fields (crm_cd_1, 2, 3, 4)
        .withColumn('has_secondary_crime', as_flag(col('crm_cd_1').isNotNull()))
        .withColumn('total_crime_codes', total_crime_codes)

        # STEP 9: ADD CROSS STREET FLAG
        .withColumn('has_cross_street', as_flag(col('cross_street').isNotNull()))

        # STEP 10: ADD DATA QUALITY FLAGS
        # Bounding box check on the cleaned coordinates.
        # NOTE(review): bounds presumably approximate the LA area - confirm.
        .withColumn('has_valid_location',
            as_flag(
                col('lat_clean').isNotNull() &
                col('lon_clean').isNotNull() &
                (col('lat_clean') >= 33.7) & (col('lat_clean') <= 34.8) &
                (col('lon_clean') >= -118.7) & (col('lon_clean') <= -118.0)
            )
        )
        .withColumn('has_original_location',
            as_flag(~col('is_imputed_location'))
        )
        .withColumn('has_valid_age',
            as_flag(col('vict_age_clean').between(1, 120))
        )
        .withColumn('has_known_sex', as_flag(col('vict_sex_clean') != 'X'))
        .withColumn('has_known_descent',
            as_flag(col('vict_descent_clean') != 'X')
        )
        .withColumn('has_weapon',
            as_flag(col('weapon_desc_clean') != 'NO WEAPON/UNKNOWN')
        )

        # STEP 11: ADD METADATA
        .withColumn('processing_timestamp', current_timestamp())

        # STEP 12: SELECT ALL COLUMNS
        # Helper columns area_avg_lat / area_avg_lon are intentionally
        # left out of the output.
        .select(
            *source_columns,
            *cleaned_columns,
            *flag_columns,

            # ===== METADATA (1) =====
            'processing_timestamp'
        )
    )