In [None]:
# /home/labuser/Documents/Level2_Day1_Data/walmart_inventory.json

 - Explode Arrays: Each store contains a list of products, which needs to be exploded.
 - Flatten Structs: Product details are stored in nested structures that need flattening.
 - Handle Maps: Different warehouse locations hold stock quantities in a MapType column.
 - Deal with Complex JSON: The input JSON is deeply nested and needs transformation.
 - Performance Considerations: Use optimized transformations, avoid unnecessary shuffling, and leverage broadcast joins where applicable.

In [1]:
# Importing Lib
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, struct

In [2]:
# Create the notebook's SparkSession under the application name
# "WalmartInventory". getOrCreate() returns an existing session if one is
# already running in this kernel, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName("WalmartInventory")
    .getOrCreate()
)

In [6]:
# Load the raw Walmart inventory JSON into a DataFrame.
#
# The path is kept in a named constant so it can be changed in one place.
# The "multiline" reader option is enabled so that a JSON record may span
# several physical lines of the file.
# NOTE(review): presumably the input is a pretty-printed / multi-line JSON
# document, which is why a plain spark.read.json(path) was not sufficient —
# confirm against the data file.
INVENTORY_JSON_PATH = "/home/labuser/Documents/Level2_Day1_Data/walmart_inventory.json"

df = (
    spark.read
    .option("multiline", "true")
    .json(INVENTORY_JSON_PATH)
)

In [7]:
# Inspect the loaded data: first the inferred nested schema, then the first
# 20 rows (the default row count) with truncation switched off so the full
# nested values are printed.
df.printSchema()
df.show(n=20, truncate=False)

root
 |-- inventory: struct (nullable = true)
 |    |-- product: struct (nullable = true)
 |    |    |-- details: struct (nullable = true)
 |    |    |    |-- category: string (nullable = true)
 |    |    |    |-- price: long (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |-- stock: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- quantity: long (nullable = true)
 |    |    |    |-- warehouse: string (nullable = true)
 |    |-- supplier_info: struct (nullable = true)
 |    |    |-- contact: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- store_id: string (nullable = true)

+-------------------------------------------------------------------------------------------+---------+
|inventory                                                                                  |store_id |
+----------------------------------------------------------------------

In [9]:
# Explode the stock array: produce one row per element of inventory.stock.
# Each element is exposed as a new top-level struct column named "stock"
# (fields: quantity, warehouse); the original array stays inside "inventory".
# NOTE(review): explode() emits no row when the array is null or empty, so a
# store/product without stock entries would disappear here — consider
# explode_outer() if such rows must be kept.
df_exploded = df.withColumn(
    "stock",
    explode(col("inventory.stock")),
)

# Show the resulting schema and a sample of the exploded rows.
df_exploded.printSchema()
df_exploded.show()

root
 |-- inventory: struct (nullable = true)
 |    |-- product: struct (nullable = true)
 |    |    |-- details: struct (nullable = true)
 |    |    |    |-- category: string (nullable = true)
 |    |    |    |-- price: long (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |-- stock: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- quantity: long (nullable = true)
 |    |    |    |-- warehouse: string (nullable = true)
 |    |-- supplier_info: struct (nullable = true)
 |    |    |-- contact: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- stock: struct (nullable = true)
 |    |-- quantity: long (nullable = true)
 |    |-- warehouse: string (nullable = true)

+--------------------+---------+---------+
|           inventory| store_id|    stock|
+--------------------+---------+---------+
|{{{Electronics, 5...|St

Before:
+-----------+----------------+
|product_id |stock           |
+-----------+----------------+
|P001       |[10, 20, 30]    |
|P002       |[15, 25]        |
+-----------+----------------+


After:
+-----------+------+
|product_id |stock |
+-----------+------+
|P001       |10    |
|P001       |20    |
|P001       |30    |
|P002       |15    |
|P002       |25    |
+-----------+------+



In [10]:
# Flatten the nested product / stock / supplier structs into plain columns.
# store_id is already top-level and is passed through unchanged; every other
# output column is a (dotted source path -> flat column name) pair, listed in
# the order the columns appear in the result.
df_flattened = df_exploded.select(
    col("store_id"),
    *[
        col(source_path).alias(flat_name)
        for source_path, flat_name in [
            ("inventory.product.id", "product_id"),
            ("inventory.product.name", "product_name"),
            ("inventory.product.details.category", "category"),
            ("inventory.product.details.price", "price"),
            ("stock.warehouse", "warehouse"),
            ("stock.quantity", "stock_quantity"),
            ("inventory.supplier_info.name", "supplier_name"),
            ("inventory.supplier_info.contact", "supplier_contact"),
        ]
    ],
)

# Print the flat table without truncating cell values.
df_flattened.show(truncate=False)

+---------+----------+------------+-----------+-----+---------+--------------+-------------+----------------+
|store_id |product_id|product_name|category   |price|warehouse|stock_quantity|supplier_name|supplier_contact|
+---------+----------+------------+-----------+-----+---------+--------------+-------------+----------------+
|Store_001|102       |Smartphone  |Electronics|500  |WH1      |95            |Supplier_5   |contact_5191    |
|Store_001|102       |Smartphone  |Electronics|500  |WH2      |88            |Supplier_5   |contact_5191    |
|Store_002|201       |Apple       |Groceries  |2    |WH1      |59            |Supplier_1   |contact_1560    |
|Store_002|201       |Apple       |Groceries  |2    |WH2      |96            |Supplier_1   |contact_1560    |
|Store_002|101       |Laptop      |Electronics|800  |WH1      |87            |Supplier_2   |contact_1923    |
|Store_002|101       |Laptop      |Electronics|800  |WH2      |74            |Supplier_2   |contact_1923    |
|Store_003

In [12]:
# Mark the flattened DataFrame for caching. The call returns the DataFrame
# itself, which is why this cell echoes its column/type summary.
# NOTE(review): cache() is lazy in Spark — nothing is stored until an action
# runs on df_flattened. The next visible cell queries df_exploded rather than
# df_flattened, so confirm that a later cell actually reuses this DataFrame.
df_flattened.cache()

DataFrame[store_id: string, product_id: bigint, product_name: string, category: string, price: bigint, warehouse: string, stock_quantity: bigint, supplier_name: string, supplier_contact: string]

In [11]:
# Per-store warehouse stock view: store_id plus the warehouse name and the
# quantity held there. After the explode step "stock" is a struct column, so
# its fields are read with dotted paths (stock.warehouse, stock.quantity).
df_map = df_exploded.select(
    col("store_id"),
    col("stock.warehouse").alias("Warehouse"),
    col("stock.quantity").alias("quantity"),
)
df_map.show()

+---------+---------+--------+
| store_id|Warehouse|quantity|
+---------+---------+--------+
|Store_001|      WH1|      95|
|Store_001|      WH2|      88|
|Store_002|      WH1|      59|
|Store_002|      WH2|      96|
|Store_002|      WH1|      87|
|Store_002|      WH2|      74|
|Store_003|      WH1|      55|
|Store_003|      WH2|      91|
|Store_003|      WH1|      15|
|Store_003|      WH2|      25|
|Store_001|      WH1|      51|
|Store_001|      WH2|      83|
|Store_002|      WH1|      94|
|Store_002|      WH2|      49|
|Store_003|      WH1|      95|
|Store_003|      WH2|      41|
|Store_002|      WH1|      24|
|Store_002|      WH2|      37|
|Store_003|      WH1|      12|
|Store_003|      WH2|      17|
+---------+---------+--------+

