In [None]:
"""
Author: Matt Martin
Date: 2/26/24
Desc: Demonstrates how to nest rows of a table in Spark using array_agg/struct, and how to unnest them again with lateral view explode
"""

In [None]:
import os
from pyspark.sql import SparkSession

# Warehouse directory for saved tables, located under the current user's home folder.
dw_path = f'{os.path.expanduser("~")}/test_dummy_data/spark/test_dw'

# Create (or reuse) the Spark session; all settings are supplied through the builder.
spark = (
    SparkSession.builder
    .appName("test")
    .config("spark.sql.warehouse.dir", dw_path)
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.instances", 10)
    .getOrCreate()
)

In [5]:
#1: generate some data
# Each tuple below is one employee, laid out as (dept_id, name, title).
employee_fields = ('dept_id', 'name', 'title')
employee_rows = [
    (1, 'Bill', 'manager'),
    (1, 'Fred', 'Director'),
    (2, 'Ted', 'Analyst'),
    (2, 'Amy', 'Snr Analyst'),
]
# Turn every tuple into a dict keyed by field name; no explicit schema is passed to Spark.
data = [dict(zip(employee_fields, row)) for row in employee_rows]
df = spark.createDataFrame(data)

In [10]:
#2: Nest the groupable columns together
# One output row per dept_id; that department's (name, title) pairs are
# collected into a single array-of-structs column named emp_dtl.
nest_sql = """
select dept_id
    , array_agg(struct(name, title)) as emp_dtl
from {df}
group by 1
"""
# The {df} placeholder in the query text is bound to the DataFrame passed as the `df` keyword.
dfn = spark.sql(nest_sql, df=df)

In [11]:
# Persist the nested result as a table, replacing any existing table of the same name.
dfn.write.saveAsTable('employee_dept_hier', mode='overwrite')

In [18]:
#3: using "lateral view explode", unnest the data so each employee shows on its own row next to the header (dept) info
# explode() emits one row per element of the emp_dtl array. Each element is a
# struct exposed as column `dtl`, so its fields are read as dtl.name / dtl.title.
# The generated view gets an explicit table alias (`emp`) ahead of `as dtl`, so the
# clause no longer depends on how the parser treats a bare `as` with no alias.
sql = """
select
    hdr.dept_id
    ,dtl.name
    ,dtl.title
from employee_dept_hier as hdr
    lateral view explode(emp_dtl) emp as dtl
"""
spark.sql(sql).show()

+-------+----+-----------+
|dept_id|name|      title|
+-------+----+-----------+
|      1|Bill|    manager|
|      1|Fred|   Director|
|      2| Ted|    Analyst|
|      2| Amy|Snr Analyst|
+-------+----+-----------+

