<!-- TODO: the upload of image_1769357446296.png did not complete; re-attach the image here. -->

In [0]:
# Configure a CSV reader that treats the first line of each file as the header
# and lets Spark infer column types from the data.
csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)

# The glob picks up every CSV file in the ecommerce_data volume.
df = csv_reader.load("/Volumes/workspace/ecommerce/ecommerce_data/*.csv")

df.display()


**1️⃣ Create a NEW Volume for Delta Tables**

In [0]:
%sql
-- Create the delta_tables volume in workspace.ecommerce; a no-op when it already exists.
CREATE VOLUME IF NOT EXISTS
  workspace.ecommerce.delta_tables;

**2️⃣ Verify the Volume Exists**

In [0]:
%sql
-- List the volumes in the workspace.ecommerce schema to confirm delta_tables is present.
SHOW VOLUMES
  IN workspace.ecommerce;

**Task 1: Converting CSV to Delta format (PySpark)**

In [0]:
# Persist the CSV-derived DataFrame as Delta files in the delta_tables volume.
# "overwrite" replaces whatever an earlier run stored at this path.
(
    df.write.format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/ecommerce/delta_tables/ecommerce_delta")
)


**Task 2(a): Creating Delta Table using SQL**

In [0]:
%sql
-- Create the Delta table ecommerce_sql from the Delta files stored in the volume.
-- Because of IF NOT EXISTS, re-running this cell leaves an existing table untouched
-- (it is not refreshed from the path).
CREATE TABLE IF NOT EXISTS workspace.ecommerce.ecommerce_sql
  USING DELTA
  AS
    SELECT *
    FROM delta.`/Volumes/workspace/ecommerce/delta_tables/ecommerce_delta`;


**Task 2(b): Creating Delta Table using PySpark**

In [0]:
# Same CREATE TABLE ... AS SELECT pattern as the SQL cell, issued through
# spark.sql(): ecommerce_pyspark is created as a copy of ecommerce_sql unless
# a table with that name already exists.
create_pyspark_table_sql = """
CREATE TABLE IF NOT EXISTS workspace.ecommerce.ecommerce_pyspark
USING DELTA
AS
SELECT *
FROM workspace.ecommerce.ecommerce_sql
"""

spark.sql(create_pyspark_table_sql)


In [0]:
%sql
-- Show the column names and data types of the table created from PySpark.
DESCRIBE TABLE
  workspace.ecommerce.ecommerce_pyspark;

**3️⃣ Detect duplicate rows**

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import col, count


# Open the Delta data written to the volume and expose it as a DataFrame.
delta_path = "/Volumes/workspace/ecommerce/delta_tables/ecommerce_delta"
delta_table = DeltaTable.forPath(spark, delta_path)
df = delta_table.toDF()

# Group on every column so that each group is one distinct row; any group
# with a count above 1 is a row that occurs more than once.
duplicate_groups = df.groupBy(df.columns).count().filter("count > 1")
duplicate_groups.show()


**MERGE to prevent future duplicates**

In [0]:
%sql
-- Insert into ecommerce_pyspark only those rows of ecommerce_sql that have no
-- identical row in the target; all nine columns take part in the comparison.
--
-- The comparison uses the null-safe equality operator `<=>` instead of `=`.
-- With `=`, a NULL on either side makes the predicate NULL rather than TRUE,
-- so a row holding NULL in any compared column could never match and would be
-- inserted again on every run -- creating the very duplicates this MERGE is
-- meant to prevent. `<=>` treats NULL <=> NULL as TRUE.
--
-- NOTE(review): source rows that are duplicated *within* ecommerce_sql and
-- have no match in the target are still each inserted; deduplicate the source
-- first if that matters.
MERGE INTO workspace.ecommerce.ecommerce_pyspark AS target
USING workspace.ecommerce.ecommerce_sql AS source
ON target.event_time <=> source.event_time
AND target.event_type <=> source.event_type
AND target.product_id <=> source.product_id
AND target.category_id <=> source.category_id
AND target.category_code <=> source.category_code
AND target.brand <=> source.brand
AND target.price <=> source.price
AND target.user_id <=> source.user_id
AND target.user_session <=> source.user_session
WHEN NOT MATCHED THEN
INSERT *;


**Re-read & validate**

In [0]:
# Validate the table the MERGE cell actually writes to. The Delta files under
# /Volumes/.../ecommerce_delta are not modified by that MERGE, so re-reading
# them would not reveal whether the merge added rows or duplicates.
df_check = spark.table("workspace.ecommerce.ecommerce_pyspark")

print("Total rows:", df_check.count())

# Group on every column; any group with count > 1 is a fully duplicated row.
(
    df_check.groupBy(df_check.columns)
    .count()
    .filter("count > 1")
    .show()
)
