In [0]:
%sql
-- Select the catalog and schema that unqualified object names resolve
-- against for the rest of this notebook session.
USE CATALOG newpavancatalog;
USE SCHEMA bronze;


In [0]:
# %fs ls 'abfss://landing@parkaru15sa.dfs.core.windows.net/orders/'

In [0]:
# Location of the raw order files in the ADLS "landing" container.
adls_path = "abfss://landing@parkaru15sa.dfs.core.windows.net/orders/"

# No explicit schema is supplied, so Spark infers one from the JSON files.
df = spark.read.json(adls_path)

# Preview the raw load.
display(df)

In [0]:
from pyspark.sql.functions import explode

# Spark's default name for the column that captures malformed JSON rows.
# The column is only present when at least one malformed row was read, and its
# position is not guaranteed, so it is excluded by name rather than by slicing
# off "the first column" (which would silently drop a real column on clean data).
CORRUPT_RECORD_COLUMN = "_corrupt_record"
cols = [name for name in df.columns if name != CORRUPT_RECORD_COLUMN]

# Produce one output row per element of the `items` array; the exploded element
# replaces the original array in the `items` column.
exploded_df = df.select(*cols).withColumn("items", explode("items"))
display(exploded_df)

In [0]:
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Flatten each exploded order item into top-level columns next to customer_id.
# Input: `exploded_df` (one row per item, produced by the explode step).
# Output: `df_flattened` with columns
#   customer_id, category, item_id, name, price, quantity, brand, color.

# Expected layout of a single `items` element.
# NOTE(review): this schema is documentation only — nothing below applies it
# (no from_json / cast), so the column types in `df_flattened` are whatever the
# JSON reader inferred. Confirm whether these types should be enforced.
schema = StructType([
    StructField("category", StringType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("price", DoubleType(), True),
    StructField("quantity", IntegerType(), True),
    StructField("details", StructType([
        StructField("brand", StringType(), True),
        StructField("color", StringType(), True)
    ]), True)
])

# `items` is read with dotted field paths below, so it is used as-is; no JSON
# string parsing happens here. `items_parsed` is simply a copy of `items`.
df_parsed = exploded_df.withColumn("items_parsed", col("items"))

# Promote the nested fields to top-level columns. Only the listed columns are
# projected, so `items` / `items_parsed` are not carried into the result.
df_flattened = df_parsed.select(
    "customer_id",
    col("items_parsed.category").alias("category"),
    col("items_parsed.item_id").alias("item_id"),
    col("items_parsed.name").alias("name"),
    col("items_parsed.price").alias("price"),
    col("items_parsed.quantity").alias("quantity"),
    col("items_parsed.details.brand").alias("brand"),
    col("items_parsed.details.color").alias("color")
)

# Show the flattened rows, then the resulting column types.
print("Flattened DataFrame:")
display(df_flattened)

print("\nSchema:")
df_flattened.printSchema()

In [0]:
# Persist the flattened order items as a managed Delta table, replacing any
# existing contents. The table name is fully qualified (catalog.schema.table)
# so the write does not depend on the session's current catalog and targets the
# same table that the row-count query on newpavancatalog.gold.final_orders reads.
df_flattened.write.format("delta").mode("overwrite").saveAsTable("newpavancatalog.gold.final_orders")


In [0]:
%sql
-- Sanity check: number of rows currently stored in the final orders table.
SELECT COUNT(*)
FROM newpavancatalog.gold.final_orders