In [0]:
import dlt
from pyspark.sql.functions import *

In [0]:
# Exploratory check: list the contents of the directory that yellow_taxi_raw
# loads from (same path as used there).
dbutils.fs.ls('/tmp/chp_03/taxi_data')

In [0]:
@dlt.table(comment="The randomly generated taxi trip dataset")
def yellow_taxi_raw():
    """Incrementally ingest the raw taxi trip JSON files with Auto Loader.

    Returns:
        A streaming DataFrame reading JSON files from ``/tmp/chp_03/taxi_data``
        through the ``cloudFiles`` source.
    """
    path = "/tmp/chp_03/taxi_data"
    # Directory passed to Auto Loader as its schema location.
    schema_location = "/tmp/chp_03/taxi_data_chkpnt"
    # Expected column types. Previously this string was declared but never
    # handed to the reader, so the declared types had no effect. Supplying it
    # as schema hints applies the types while leaving schema inference on.
    # NOTE(review): if a schema was already persisted at schema_location by an
    # earlier run, confirm the hints take effect as expected (a full refresh
    # may be needed).
    schema_hints = (
        "trip_id INT, taxi_number INT, passenger_count INT, "
        "trip_amount FLOAT, trip_distance FLOAT, trip_date DATE"
    )
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .option("cloudFiles.schemaLocation", schema_location)
        .option("cloudFiles.schemaHints", schema_hints)
        .load(path)
    )

In [0]:
@dlt.table(
    name="trip_data_financials",
    comment="Financial information from completed taxi trips.",
)
@dlt.expect_or_fail("valid_total_amount", "trip_amount > 0.0")
def trip_data_financials():
    """Split each trip's amount into its financial components.

    Streams from ``yellow_taxi_raw`` and appends four columns, each a fixed
    share of ``trip_amount``: driver payment (0.40), vehicle maintenance fee
    (0.05), administrative fee (0.1) and potential profits (0.45).

    Returns:
        The streaming DataFrame with the four derived columns added.
    """
    # Output column -> SQL expression, applied in this order.
    # "adminstrative_fee" keeps its existing spelling: it is the published
    # column name of this table.
    fee_expressions = {
        "driver_payment": "trip_amount*0.40",
        "vehicle_maintenance_fee": "trip_amount*0.05",
        "adminstrative_fee": "trip_amount*0.1",
        "potential_profits": "trip_amount*0.45",
    }

    trips = dlt.readStream("yellow_taxi_raw")
    for column_name, formula in fee_expressions.items():
        trips = trips.withColumn(column_name, expr(formula))
    return trips

In [0]:
# Expectation name -> SQL boolean constraint; consumed by the
# expect_all_or_drop decorator on yellow_taxi_validated.
assertions = {
    # Trips must have a strictly positive amount.
    "total_amount_constraint": "trip_amount > 0.0",
    # Trips must carry at least one passenger.
    "passenger_count": "passenger_count >= 1",
}

@dlt.table(
    name="yellow_taxi_validated",
    comment="A dataset containing trip data that has been validated.",
)
@dlt.expect_all_or_drop(assertions)
def yellow_taxi_validated():
    """Produce the validated trip stream with a congestion tax column.

    Streams from ``yellow_taxi_raw`` and adds ``nyc_congestion_tax``, computed
    as ``trip_amount * 0.05``. The constraints in ``assertions`` are attached
    through the ``expect_all_or_drop`` decorator.

    Returns:
        The streaming DataFrame with the ``nyc_congestion_tax`` column added.
    """
    raw_trips = dlt.readStream("yellow_taxi_raw")
    congestion_tax = expr("trip_amount * 0.05")
    return raw_trips.withColumn("nyc_congestion_tax", congestion_tax)