## **Use Auto Loader to automate file loading.**

In [0]:

dbutils.fs.ls('/FileStore/tables')


Out[5]: [FileInfo(path='dbfs:/FileStore/tables/employees-1.csv', name='employees-1.csv', size=3778, modificationTime=1749218486000),
 FileInfo(path='dbfs:/FileStore/tables/employees-2.csv', name='employees-2.csv', size=3778, modificationTime=1749218504000),
 FileInfo(path='dbfs:/FileStore/tables/employees-3.csv', name='employees-3.csv', size=3778, modificationTime=1749229958000),
 FileInfo(path='dbfs:/FileStore/tables/employees.csv', name='employees.csv', size=3778, modificationTime=1749218423000)]

In [0]:
 # Code for Auto Loader in Databricks
from pyspark.sql.types import StructType, StringType, IntegerType

# Step 1: Define the schema of your employee data
schema = StructType() \
    .add("id", IntegerType()) \
    .add("name", StringType()) \
    .add("age", IntegerType())

# Step 2: Set up Auto Loader to read new files
df = (spark.readStream
      .format("cloudFiles")
      .option("cloudFiles.format", "csv")  # Input format
      .schema(schema)
      .load("/FileStore/tables/"))  # Your uploaded path

# Step 3: Stream output to Delta table folder
query = (df.writeStream
         .format("delta")
         .outputMode("append")
         .option("checkpointLocation", "/FileStore/employee_checkpoint/")  # Required for streaming
         .start("/FileStore/employee_output/"))  # Output path


In [0]:

spark.read.format("delta").load("/FileStore/employee_output/").show()


+----+----------+----+
|  id|      name| age|
+----+----------+----+
|null|FIRST_NAME|null|
| 198|    Donald|null|
| 199|   Douglas|null|
| 200|  Jennifer|null|
| 201|   Michael|null|
| 202|       Pat|null|
| 203|     Susan|null|
| 204|   Hermann|null|
| 205|   Shelley|null|
| 206|   William|null|
| 100|    Steven|null|
| 101|     Neena|null|
| 102|       Lex|null|
| 103| Alexander|null|
| 104|     Bruce|null|
| 105|     David|null|
| 106|     Valli|null|
| 107|     Diana|null|
| 108|     Nancy|null|
| 109|    Daniel|null|
+----+----------+----+
only showing top 20 rows



Out[8]: {'message': 'Waiting for data to arrive',
 'isDataAvailable': False,
 'isTriggerActive': False}

## **Set up a simple Medallion pipeline to move data from raw (bronze) to cleansed (silver) to curated (gold).**

In [0]:
dbutils.fs.ls('/FileStore/tables')


Out[38]: [FileInfo(path='dbfs:/FileStore/tables/employees-1.csv', name='employees-1.csv', size=3778, modificationTime=1749218486000),
 FileInfo(path='dbfs:/FileStore/tables/employees-2.csv', name='employees-2.csv', size=3778, modificationTime=1749218504000),
 FileInfo(path='dbfs:/FileStore/tables/employees-3.csv', name='employees-3.csv', size=3778, modificationTime=1749229958000),
 FileInfo(path='dbfs:/FileStore/tables/employees-fresh.csv', name='employees-fresh.csv', size=42, modificationTime=1749822020000),
 FileInfo(path='dbfs:/FileStore/tables/employees.csv', name='employees.csv', size=3778, modificationTime=1749218423000)]

**Bronze Layer – Load Raw Data**

In [0]:
# STEP 1: Define Schema
from pyspark.sql.types import StructType, StringType, IntegerType

schema = StructType() \
    .add("id", IntegerType()) \
    .add("name", StringType()) \
    .add("age", IntegerType())


In [0]:
# STEP 2: Bronze - Ingest raw CSVs with Auto Loader
bronze_df = (spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .schema(schema)
    .load("/FileStore/tables/"))

In [0]:
bronze_query = (bronze_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/FileStore/bronze_checkpoint_v1/")
    .start("/FileStore/bronze_table_v1/"))

**Silver Layer – Clean or Filter Data**

In [0]:
#STEP 3: Silver - Clean and filter data
silver_df = spark.readStream.format("delta").load("/FileStore/bronze_table_v1/")

cleansed_df = silver_df.filter("age IS NOT NULL AND age >= 18")

In [0]:
silver_query = (cleansed_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/FileStore/silver_checkpoint_v1/")
    .start("/FileStore/silver_table_v1/"))

**Gold Layer – Aggregate Business Data**

In [0]:
#STEP 4: Gold - Aggregate by age
from pyspark.sql.functions import count

gold_df = spark.readStream.format("delta").load("/FileStore/silver_table_v1/")
agg_df = gold_df.groupBy("age").agg(count("id").alias("employee_count"))


In [0]:
gold_query = (agg_df.writeStream
    .format("delta")
    .outputMode("complete")  # needed for aggregation
    .option("checkpointLocation", "/FileStore/gold_checkpoint_v1/")
    .start("/FileStore/gold_table_v1/"))

 View Each Layer Output

In [0]:
# Bronze
spark.read.format("delta").load("/FileStore/bronze_table_v1/").show()


+----+----------+----+
|  id|      name| age|
+----+----------+----+
|null|FIRST_NAME|null|
| 198|    Donald|null|
| 199|   Douglas|null|
| 200|  Jennifer|null|
| 201|   Michael|null|
| 202|       Pat|null|
| 203|     Susan|null|
| 204|   Hermann|null|
| 205|   Shelley|null|
| 206|   William|null|
| 100|    Steven|null|
| 101|     Neena|null|
| 102|       Lex|null|
| 103| Alexander|null|
| 104|     Bruce|null|
| 105|     David|null|
| 106|     Valli|null|
| 107|     Diana|null|
| 108|     Nancy|null|
| 109|    Daniel|null|
+----+----------+----+
only showing top 20 rows



In [0]:
# Silver
spark.read.format("delta").load("/FileStore/silver_table_v1/").show()

+---+----+---+
| id|name|age|
+---+----+---+
|  1|Aman| 22|
|  2|Neha| 27|
|  3|Ravi| 33|
+---+----+---+



In [0]:
# Gold
spark.read.format("delta").load("/FileStore/gold_table_v1/").show()

+---+--------------+
|age|employee_count|
+---+--------------+
| 27|             1|
| 22|             1|
| 33|             1|
+---+--------------+

