In [0]:
from pyspark.sql.functions import from_json, col, lit, coalesce
from pyspark.sql.types import StructType, StructField, StringType

In [0]:
# Preview read of sales.csv: the first line is the header, the double quote
# serves as both the quote and the escape character, and multiLine is enabled.
preview_reader = (
    spark.read
    .option("multiLine", True)
    .option("quote", '"')
    .option("escape", '"')
    .option("header", True)
)
df = preview_reader.csv("dbfs:/FileStore/shared_uploads/parveen.r@live.com/sales.csv")
df.show(truncate=False)

In [0]:
# Load sales.csv into sales_df with an inferred schema.
# The quote/escape/multiLine options mirror the preview read of the same file.
# Spark's default CSV escape character is a backslash, so without
# escape='"' the doubled quotes inside a quoted field (such as the JSON held in
# ProductMetadata) are not unescaped, and the field may be cut short at an
# embedded comma or line break.
sales_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("quote", '"')
    .option("escape", '"')
    .option("multiLine", "true")
    .load("dbfs:/FileStore/shared_uploads/parveen.r@live.com/sales.csv")
)

print("Loaded sales.csv file:")
sales_df.show(5, truncate=False)

In [0]:
# Peek at the ProductMetadata string of the first row.
first_metadata_row = sales_df.select("ProductMetadata").first()
print(first_metadata_row["ProductMetadata"])

In [0]:
# Show the ProductMetadata column on its own, without truncating long values.
metadata_column = sales_df.select("ProductMetadata")
metadata_column.show(truncate=False)

In [0]:
# Save sales_df as the Delta table sales_delta_demo, using overwrite mode.
sales_delta_writer = sales_df.write.format("delta").mode("overwrite")
sales_delta_writer.saveAsTable("sales_delta_demo")

print("sales_delta_demo Delta table created.")

In [0]:
# Register (or replace) the SQL UDF to_upper_case, which returns UPPER(str).
create_upper_udf_sql = """
CREATE OR REPLACE FUNCTION to_upper_case(str STRING) RETURNS STRING
RETURN UPPER(str)
"""
spark.sql(create_upper_udf_sql)
print("SQL UDF 'to_upper_case' created.")




In [0]:
# Register (or replace) the SQL UDF to_reverse, which returns Reverse(str).
create_reverse_udf_sql = """
CREATE OR REPLACE FUNCTION to_reverse(str STRING) RETURNS STRING
RETURN Reverse(str)
"""
spark.sql(create_reverse_udf_sql)
print("SQL UDF 'to_reverse' created.")


In [0]:
# Apply both string UDFs to CustomerName and show the first 10 rows.
udf_preview_query = (
    "SELECT CustomerName, "
    "to_upper_case(CustomerName) AS CustomerName_Upper, "
    "to_reverse(CustomerName) AS CustomerName_Reverse "
    "FROM sales_delta_demo LIMIT 10"
)
spark.sql(udf_preview_query).show(truncate=False)

In [0]:
# Register (or replace) the SQL UDF demand, which buckets a quantity:
#   qty < 2        -> 'Low'
#   2 <= qty < 10  -> 'Medium'
#   qty >= 10      -> 'High'
# The ranges are contiguous so that every non-NULL quantity gets a label; a gap
# between the 'Medium' and 'High' ranges (e.g. 7-9) would match no WHEN branch
# and make the CASE return NULL.
# NOTE(review): confirm that 7-9 belong in 'Medium' rather than 'High'.
# A NULL qty still yields NULL because no comparison against NULL is true.
spark.sql("""
CREATE OR REPLACE FUNCTION demand(qty INT) RETURNS STRING
RETURN CASE
    WHEN qty < 2 THEN 'Low'
    WHEN qty < 10 THEN 'Medium'
    WHEN qty >= 10 THEN 'High'
END
""")
print("SQL UDF 'demand' created.")

In [0]:
%sql
-- Label each row's Quantity with the result of the demand() SQL UDF.
SELECT
  Quantity,
  demand(Quantity) AS Demand
FROM sales_delta_demo

In [0]:
# Schema used when parsing the ProductMetadata JSON: two nullable string fields.
json_schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in ("color", "warranty")]
)


In [0]:
# Clean ProductMetadata strings and parse them as JSON.
# Values that still carry CSV quoting (they start with a double quote) have the
# leading/trailing quotes stripped and doubled quotes collapsed to single ones.
# Values that do not start with a quote are passed through unchanged, because
# collapsing "" inside already-valid JSON would corrupt empty-string values
# such as {"color": ""}.
from pyspark.sql.functions import regexp_replace, when

raw_metadata = col("ProductMetadata")
unwrapped_metadata = regexp_replace(raw_metadata, '^"+|"+$', '')
unescaped_metadata = regexp_replace(unwrapped_metadata, '""', '"')

sales_clean = sales_df.withColumn(
    "ProductMetadata_clean",
    # A NULL value fails the startswith test and falls through to otherwise(),
    # so it stays NULL.
    when(raw_metadata.startswith('"'), unescaped_metadata).otherwise(raw_metadata),
)

# Parse the cleaned string with json_schema into the ProductDetails column.
df_parsed = sales_clean.withColumn(
    "ProductDetails", from_json(col("ProductMetadata_clean"), json_schema)
)

df_parsed.select(
    "SalesOrderNumber", "CustomerName", "ProductDetails.color", "ProductDetails.warranty"
).show(10, truncate=False)


In [0]:
# Schema evolution demo: add a constant Discount column (0.1) to the parsed
# DataFrame and overwrite sales_delta_demo with the mergeSchema option set.
df_with_discount = df_parsed.withColumn("Discount", lit(0.1))

discount_writer = (
    df_with_discount.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
)
discount_writer.saveAsTable("sales_delta_demo")

print("Added 'Discount' column using schema evolution.")

# Print the table's schema after the write.
evolved_table = spark.table("sales_delta_demo")
evolved_table.printSchema()


In [0]:
# Time travel: query sales_delta_demo as of version 0 and show up to 10 rows.
print("Time travel query to version 0:")
version_zero_query = "SELECT * FROM sales_delta_demo VERSION AS OF 0 LIMIT 10"
spark.sql(version_zero_query).show(truncate=False)


In [0]:
# Run OPTIMIZE on sales_delta_demo, Z-ordering by CustomerName.
optimize_statement = "OPTIMIZE sales_delta_demo ZORDER BY (CustomerName)"
spark.sql(optimize_statement)

print("OPTIMIZE with ZORDER completed.")