In [0]:
# Location of the raw customer CSV file in the bronze layer.
filePath = "dbfs:/FileStore/GlobalRetail/bronze_layer/customer_data/customer.csv"

# Use the first line as column names and let Spark infer the column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(filePath)
)

In [0]:
# Print the first 20 rows as a text table, truncating long cell values.
df.show(n=20, truncate=True)

+-----------+-----------+--------------------+---------+-------------+-----------------+---+------+---------------+
|customer_id|       name|               email|  country|customer_type|registration_date|age|gender|total_purchases|
+-----------+-----------+--------------------+---------+-------------+-----------------+---+------+---------------+
|          1| Customer 1|customer1@example...|Australia|      Regular|       2011-05-15| 22|  Male|            191|
|          2| Customer 2|customer2@example...|   France|      Premium|       2018-11-27| 52| Other|            145|
|          3| Customer 3|customer3@example...|   Canada|      Premium|       2015-10-01| 32| Other|            691|
|          4| Customer 4|customer4@example...|      USA|      Premium|       2011-01-19| 70| Other|            644|
|          5| Customer 5|customer5@example...|  Germany|      Regular|       2021-08-26| 66| Other|            508|
|          6| Customer 6|customer6@example...|   France|      Premium|  

In [0]:
from pyspark.sql.functions import current_timestamp

# Tag each row with the time at which this load was executed.
ingested_at = current_timestamp()
df_new = df.withColumn("ingestion_timestamp", ingested_at)

# Preview only the first ten rows of the enriched DataFrame.
display(df_new.limit(10))

customer_id,name,email,country,customer_type,registration_date,age,gender,total_purchases,ingestion_timestamp
1,Customer 1,customer1@example.com,Australia,Regular,2011-05-15,22,Male,191,2025-04-29T11:49:34.366+0000
2,Customer 2,customer2@example.com,France,Premium,2018-11-27,52,Other,145,2025-04-29T11:49:34.366+0000
3,Customer 3,customer3@example.com,Canada,Premium,2015-10-01,32,Other,691,2025-04-29T11:49:34.366+0000
4,Customer 4,customer4@example.com,USA,Premium,2011-01-19,70,Other,644,2025-04-29T11:49:34.366+0000
5,Customer 5,customer5@example.com,Germany,Regular,2021-08-26,66,Other,508,2025-04-29T11:49:34.366+0000
6,Customer 6,customer6@example.com,France,Premium,2015-03-02,20,Male,704,2025-04-29T11:49:34.366+0000
7,Customer 7,customer7@example.com,China,Premium,2018-05-24,24,Female,892,2025-04-29T11:49:34.366+0000
8,Customer 8,customer8@example.com,China,Regular,2023-10-02,26,Male,488,2025-04-29T11:49:34.366+0000
9,Customer 9,customer9@example.com,Japan,Premium,2014-10-05,36,Other,30,2025-04-29T11:49:34.366+0000
10,Customer 10,customer10@example.com,Brazil,Premium,2017-08-30,30,Male,959,2025-04-29T11:49:34.366+0000


In [0]:
# Inspect the files currently stored under the bronze_customer table directory.
bronze_customer_dir = "dbfs:/user/hive/warehouse/global_retail_bronze.db/bronze_customer"
dbutils.fs.ls(bronze_customer_dir)

Out[4]: [FileInfo(path='dbfs:/user/hive/warehouse/global_retail_bronze.db/bronze_customer/_delta_log/', name='_delta_log/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/user/hive/warehouse/global_retail_bronze.db/bronze_customer/part-00000-0b1d1d63-df36-4bad-b1ab-9137ffb2cd3a-c000.snappy.parquet', name='part-00000-0b1d1d63-df36-4bad-b1ab-9137ffb2cd3a-c000.snappy.parquet', size=27122, modificationTime=1745833633000)]

In [0]:
# Recursively delete the bronze_customer table directory (data files and _delta_log).
# NOTE(review): this removes the files directly from storage; if the table is also
# registered in the metastore, dropping it via SQL may be the safer reset - confirm.
bronze_customer_dir = "dbfs:/user/hive/warehouse/global_retail_bronze.db/bronze_customer"
dbutils.fs.rm(bronze_customer_dir, True)

Out[5]: True

In [0]:
# Persist the bronze data as a Delta Lake table: Delta Lake is the table format underlying
# the lakehouse architecture and provides CRUD operations with ACID transactions.
# Switch the session's current database so the unqualified table name resolves there.
spark.sql("use global_retail_bronze")
try:
    # "append" adds these rows to the table, creating it if it does not exist yet.
    df_new.write.format("delta").mode("append").saveAsTable("bronze_customer")
except Exception as e:
    # Report the failure, then re-raise so the cell (and any job running this notebook)
    # fails visibly instead of continuing as though the table had been written.
    print(f"Error occurred: {e}")
    raise

In [0]:
%sql
-- List the tables registered in the session's current database.
SHOW TABLES

database,tableName,isTemporary
global_retail_bronze,bronze_customer,False
global_retail_bronze,bronze_products,False
global_retail_bronze,bronze_transactions,False


In [0]:
# Disabled (kept for reference): moves the source CSV into the archive folder
# under a timestamped name.
# import datetime

# archive_folder = 'dbfs:/FileStore/GlobalRetail/bronze_layer/customer_data/archive/'
# archive_file_path = archive_folder+'_'+datetime.datetime.now().strftime("%Y%m%d%H%M%S")
# dbutils.fs.mv(filePath, archive_file_path)
# print(archive_file_path)