In [0]:
# Databricks notebook: 02_bronze_ingestion

from pyspark.sql import functions as F

# 1) Create / use database
# Create the database if it is missing, then select it as the current
# database for the SQL statements issued through this Spark session.
spark.sql("CREATE DATABASE IF NOT EXISTS retail_lakehouse")
spark.sql("USE retail_lakehouse")

# 2) Define landing path in DBFS (where you uploaded CSV files)
# Single assignment: the original line assigned the same name twice
# (`landing_path = landing_path = ...`), which was a redundant typo.
landing_path = "dbfs:/tmp/retail/landing/"

print("✅ Using landing path:", landing_path)



✅ Using landing path: dbfs:/tmp/retail/landing/


In [0]:
from pyspark.sql.types import (
    StructType, StructField,
    StringType, IntegerType, DoubleType, TimestampType
)

# Explicit Bronze schema: (column name, Spark type) pairs listed in the
# order the fields are declared; every field is declared nullable (True).
bronze_schema = StructType([
    StructField(column_name, spark_type, True)
    for column_name, spark_type in (
        ("transaction_id", StringType()),
        ("transaction_ts", StringType()),   # keep as string in Bronze
        ("store_id", StringType()),
        ("customer_id", StringType()),
        ("product_id", StringType()),
        ("qty", IntegerType()),
        ("unit_price", DoubleType()),
        ("currency", StringType()),
        ("channel", StringType()),
        ("payment_method", StringType()),
    )
])


In [0]:
# Read the landing path as CSV with the explicit Bronze schema; the
# "header" option marks the first line of the input as column names.
raw_df = (
    spark.read
    .format("csv")
    .options(header="true")
    .schema(bronze_schema)
    .load(landing_path)
)

# Report the number of rows read, then preview at most 10 of them.
print("✅ Raw rows read:", raw_df.count())
display(raw_df.limit(10))


✅ Raw rows read: 500000


transaction_id,transaction_ts,store_id,customer_id,product_id,qty,unit_price,currency,channel,payment_method
TXN_000000230001,2025-12-28T05:54:29.451173+00:00,S_0037,C_0033011,P_000534,3,175.32,EUR,online,wallet
TXN_000000230002,2026-01-02T14:07:44.451173+00:00,S_0034,C_0014998,P_001587,3,193.95,EUR,store,cash
TXN_000000230003,2026-01-11T00:02:30.451173+00:00,S_0001,C_0094549,P_001688,4,161.54,EUR,store,cash
TXN_000000230004,2026-01-06T20:49:56.451173+00:00,S_0015,C_0022889,P_000341,1,157.78,EUR,online,cash
TXN_000000230005,2025-12-18T22:20:29.452174+00:00,S_0016,C_0016786,P_000430,3,20.62,EUR,online,wallet
TXN_000000230006,2026-01-08T23:50:55.452174+00:00,S_0007,C_0005414,P_001512,1,191.85,EUR,online,wallet
TXN_000000230007,2025-12-28T23:41:53.452174+00:00,S_0026,C_0059128,P_000832,5,228.1,EUR,store,card
TXN_000000230008,2025-12-30T23:52:47.452174+00:00,S_0008,C_0026607,P_000031,4,124.48,EUR,store,card
TXN_000000230009,2026-01-06T20:30:19.452174+00:00,S_0027,C_0058075,P_001230,5,243.18,EUR,store,card
TXN_000000230010,2025-12-23T12:26:00.452174+00:00,S_0044,C_0045725,P_001865,5,74.97,EUR,online,cash


In [0]:
# Add ingestion metadata after the raw columns:
#   ingest_time - value of current_timestamp() for this query
#   source_file - value of input_file_name() for each row
bronze_df = raw_df.select(
    "*",
    F.current_timestamp().alias("ingest_time"),
    F.input_file_name().alias("source_file"),
)

# Preview at most 10 rows, including the two new metadata columns.
display(bronze_df.limit(10))


transaction_id,transaction_ts,store_id,customer_id,product_id,qty,unit_price,currency,channel,payment_method,ingest_time,source_file
TXN_000000230001,2025-12-28T05:54:29.451173+00:00,S_0037,C_0033011,P_000534,3,175.32,EUR,online,wallet,2026-01-12T04:27:28.037755Z,dbfs:/tmp/retail/landing/transactions_20260112_014912_part024.csv
TXN_000000230002,2026-01-02T14:07:44.451173+00:00,S_0034,C_0014998,P_001587,3,193.95,EUR,store,cash,2026-01-12T04:27:28.037755Z,dbfs:/tmp/retail/landing/transactions_20260112_014912_part024.csv
TXN_000000230003,2026-01-11T00:02:30.451173+00:00,S_0001,C_0094549,P_001688,4,161.54,EUR,store,cash,2026-01-12T04:27:28.037755Z,dbfs:/tmp/retail/landing/transactions_20260112_014912_part024.csv
TXN_000000230004,2026-01-06T20:49:56.451173+00:00,S_0015,C_0022889,P_000341,1,157.78,EUR,online,cash,2026-01-12T04:27:28.037755Z,dbfs:/tmp/retail/landing/transactions_20260112_014912_part024.csv
TXN_000000230005,2025-12-18T22:20:29.452174+00:00,S_0016,C_0016786,P_000430,3,20.62,EUR,online,wallet,2026-01-12T04:27:28.037755Z,dbfs:/tmp/retail/landing/transactions_20260112_014912_part024.csv
TXN_000000230006,2026-01-08T23:50:55.452174+00:00,S_0007,C_0005414,P_001512,1,191.85,EUR,online,wallet,2026-01-12T04:27:28.037755Z,dbfs:/tmp/retail/landing/transactions_20260112_014912_part024.csv
TXN_000000230007,2025-12-28T23:41:53.452174+00:00,S_0026,C_0059128,P_000832,5,228.1,EUR,store,card,2026-01-12T04:27:28.037755Z,dbfs:/tmp/retail/landing/transactions_20260112_014912_part024.csv
TXN_000000230008,2025-12-30T23:52:47.452174+00:00,S_0008,C_0026607,P_000031,4,124.48,EUR,store,card,2026-01-12T04:27:28.037755Z,dbfs:/tmp/retail/landing/transactions_20260112_014912_part024.csv
TXN_000000230009,2026-01-06T20:30:19.452174+00:00,S_0027,C_0058075,P_001230,5,243.18,EUR,store,card,2026-01-12T04:27:28.037755Z,dbfs:/tmp/retail/landing/transactions_20260112_014912_part024.csv
TXN_000000230010,2025-12-23T12:26:00.452174+00:00,S_0044,C_0045725,P_001865,5,74.97,EUR,online,cash,2026-01-12T04:27:28.037755Z,dbfs:/tmp/retail/landing/transactions_20260112_014912_part024.csv


In [0]:
# Save the Bronze DataFrame as a Delta table, replacing any existing contents.
bronze_df.write.saveAsTable(
    "retail_lakehouse.bronze_transactions",
    format="delta",
    mode="overwrite",  # first run: overwrite. Later we will use append.
)

print("✅ Bronze Delta table created: retail_lakehouse.bronze_transactions")


✅ Bronze Delta table created: retail_lakehouse.bronze_transactions


In [0]:
# Read the table back to verify what was written.
bronze = spark.read.table("retail_lakehouse.bronze_transactions")

# Row count first, then a preview of at most 10 rows.
print("✅ Bronze row count:", bronze.count())
display(bronze.limit(10))

# Check columns (schema)
bronze.printSchema()


✅ Bronze row count: 500000


transaction_id,transaction_ts,store_id,customer_id,product_id,qty,unit_price,currency,channel,payment_method,ingest_time,source_file
TXN_000000000001,2026-01-06T16:06:32.481383+00:00,S_0002,C_0097197,P_000564,2,57.36,EUR,online,wallet,2026-01-12T04:27:34.594566Z,dbfs:/tmp/retail/landing/transactions_20260112_014908_part001.csv
TXN_000000000002,2025-12-14T09:50:43.481383+00:00,S_0028,C_0004166,P_000062,1,56.22,EUR,online,wallet,2026-01-12T04:27:34.594566Z,dbfs:/tmp/retail/landing/transactions_20260112_014908_part001.csv
TXN_000000000003,2025-12-16T14:54:58.481383+00:00,S_0027,C_0028894,P_000920,5,-78.3,EUR,online,card,2026-01-12T04:27:34.594566Z,dbfs:/tmp/retail/landing/transactions_20260112_014908_part001.csv
TXN_000000000004,2025-12-26T13:23:54.481383+00:00,S_0018,C_0020380,P_000441,3,27.35,EUR,store,card,2026-01-12T04:27:34.594566Z,dbfs:/tmp/retail/landing/transactions_20260112_014908_part001.csv
TXN_000000000005,2025-12-26T09:04:54.481383+00:00,S_0039,C_0034672,P_001653,1,182.97,EUR,online,cash,2026-01-12T04:27:34.594566Z,dbfs:/tmp/retail/landing/transactions_20260112_014908_part001.csv
TXN_000000000006,2025-12-16T06:38:42.481383+00:00,S_0019,C_0082398,P_001267,3,145.18,EUR,online,card,2026-01-12T04:27:34.594566Z,dbfs:/tmp/retail/landing/transactions_20260112_014908_part001.csv
TXN_000000000007,2025-12-29T00:39:27.481383+00:00,S_0006,C_0030513,P_001775,1,96.27,EUR,store,wallet,2026-01-12T04:27:34.594566Z,dbfs:/tmp/retail/landing/transactions_20260112_014908_part001.csv
TXN_000000000008,2026-01-04T04:18:48.481383+00:00,S_0024,C_0046567,P_000430,3,176.05,EUR,online,wallet,2026-01-12T04:27:34.594566Z,dbfs:/tmp/retail/landing/transactions_20260112_014908_part001.csv
TXN_000000000009,2025-12-17T03:30:03.481383+00:00,S_0047,C_0032088,P_000335,4,96.1,EUR,online,wallet,2026-01-12T04:27:34.594566Z,dbfs:/tmp/retail/landing/transactions_20260112_014908_part001.csv
TXN_000000000010,2026-01-09T08:38:48.481383+00:00,S_0015,C_0004208,P_001649,3,101.49,EUR,online,card,2026-01-12T04:27:34.594566Z,dbfs:/tmp/retail/landing/transactions_20260112_014908_part001.csv


root
 |-- transaction_id: string (nullable = true)
 |-- transaction_ts: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- qty: integer (nullable = true)
 |-- unit_price: double (nullable = true)
 |-- currency: string (nullable = true)
 |-- channel: string (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- ingest_time: timestamp (nullable = true)
 |-- source_file: string (nullable = true)

