import

In [0]:
# ——————————————————————————————
#  LIBRARIES
# ——————————————————————————————    
import dlt # For live delta tables
from pyspark.sql.functions import current_timestamp, col
from pyspark.sql.types import StringType, IntegerType

In [0]:
# ——————————————————————————————
#  ENVIRONMENT CONFIGURATION
# ——————————————————————————————

# Resolve the deployment environment name from the pipeline configuration key
# "pipeline.env". If the value cannot be read, fall back to "dev".
try:
    env = spark.conf.get("pipeline.env")
except Exception:
    # Best-effort fallback. Catch Exception instead of using a bare `except:`
    # so that KeyboardInterrupt / SystemExit still propagate.
    env = "dev"

# Catalog that holds the landing volume read by the bronze tables.
catalog = "book_rec_catalog"
# Environment-prefixed schema name used to qualify the bronze table names.
bronze_schema = f"{env}_bronze"

Ingestion from volume to bronze

In [0]:
# ——————————————————————————————
#  Table for books
# ——————————————————————————————
@dlt.table(
    name=f"{bronze_schema}.books_bronze",
    comment="Bronze table for raw Books data",
)
def books_bronze():
    """Stream raw Books CSV data from the landing volume into the bronze layer.

    Reads with the "cloudFiles" (Auto Loader) streaming source so newly added
    files are picked up, restricted to files matching ``Books.csv``. Every
    input column is kept and two columns are appended:

    * ``source_file``  - the value of ``_metadata.file_path`` for the row.
    * ``ingestion_ts`` - ``current_timestamp()`` at ingestion.

    Returns:
        The streaming DataFrame backing the table.
    """
    landing_dir = f"/Volumes/{catalog}/landing/book_files"

    # Reader options, applied in this order.
    reader_options = (
        ("cloudFiles.format", "csv"),
        ("header", "true"),
        ("multiLine", True),
        ("quote", '"'),
        ("escape", '"'),
        ("pathGlobFilter", "Books.csv"),  # only files named Books.csv
    )

    stream_reader = spark.readStream.format("cloudFiles")
    for option_name, option_value in reader_options:
        stream_reader = stream_reader.option(option_name, option_value)

    raw_rows = stream_reader.load(landing_dir)

    # Keep every source column and record which file each row came from.
    with_source = raw_rows.select(
        "*", col("_metadata.file_path").alias("source_file")
    )

    # Stamp each row with the ingestion time.
    return with_source.withColumn("ingestion_ts", current_timestamp())

# ——————————————————————————————
#  Table for ratings
# ——————————————————————————————

# Comments are all above
@dlt.table(
    name=f"{bronze_schema}.ratings_bronze",
    comment="Bronze table for raw Ratings data",
)
def ratings_bronze():
    """Stream raw Ratings CSV data from the landing volume into the bronze layer.

    Reads with the "cloudFiles" (Auto Loader) streaming source, restricted to
    files matching ``Ratings.csv``. Every input column is kept and two columns
    are appended:

    * ``source_file``  - the value of ``_metadata.file_path`` for the row.
    * ``ingestion_ts`` - ``current_timestamp()`` at ingestion.

    Returns:
        The streaming DataFrame backing the table.
    """
    landing_dir = f"/Volumes/{catalog}/landing/book_files"

    # Reader options, applied in this order.
    reader_options = (
        ("cloudFiles.format", "csv"),
        ("header", "true"),
        ("multiLine", True),
        ("quote", '"'),
        ("escape", '"'),
        ("pathGlobFilter", "Ratings.csv"),  # only files named Ratings.csv
    )

    stream_reader = spark.readStream.format("cloudFiles")
    for option_name, option_value in reader_options:
        stream_reader = stream_reader.option(option_name, option_value)

    raw_rows = stream_reader.load(landing_dir)

    # Keep every source column and record which file each row came from.
    with_source = raw_rows.select(
        "*", col("_metadata.file_path").alias("source_file")
    )

    # Stamp each row with the ingestion time.
    return with_source.withColumn("ingestion_ts", current_timestamp())

# ——————————————————————————————
#  Table for users
# ——————————————————————————————

# Comments are all above
@dlt.table(
    name=f"{bronze_schema}.users_bronze",
    comment="Bronze table for raw Users data",
)
def users_bronze():
    """Stream raw Users CSV data from the landing volume into the bronze layer.

    Reads with the "cloudFiles" (Auto Loader) streaming source, restricted to
    files matching ``Users.csv``. Every input column is kept and two columns
    are appended:

    * ``source_file``  - the value of ``_metadata.file_path`` for the row.
    * ``ingestion_ts`` - ``current_timestamp()`` at ingestion.

    Returns:
        The streaming DataFrame backing the table.
    """
    landing_dir = f"/Volumes/{catalog}/landing/book_files"

    # Reader options, applied in this order.
    reader_options = (
        ("cloudFiles.format", "csv"),
        ("header", "true"),
        ("multiLine", True),
        ("quote", '"'),
        ("escape", '"'),
        ("pathGlobFilter", "Users.csv"),  # only files named Users.csv
    )

    stream_reader = spark.readStream.format("cloudFiles")
    for option_name, option_value in reader_options:
        stream_reader = stream_reader.option(option_name, option_value)

    raw_rows = stream_reader.load(landing_dir)

    # Keep every source column and record which file each row came from.
    with_source = raw_rows.select(
        "*", col("_metadata.file_path").alias("source_file")
    )

    # Stamp each row with the ingestion time.
    return with_source.withColumn("ingestion_ts", current_timestamp())
