In [6]:

// Sample student records: name, age and the list of courses each student is enrolled in.
// Rohit and Jay have empty course lists, which makes the difference between the
// explode variants visible.
val df = Seq(
    ("Jaya",  20, List("SQL", "Data", "Science")),
    ("Milan", 21, List("ML", "AI")),
    ("Rohit", 19, List.empty[String]),
    ("Maria", 20, List("DBMS", "Networking")),
    ("Jay",   22, List.empty[String])
).toDF("Name", "Age", "Courses_enrolled")

// truncate = false prints the full contents of the array column.
df.show(truncate = false)

+-----+---+--------------------+
|Name |Age|Courses_enrolled    |
+-----+---+--------------------+
|Jaya |20 |[SQL, Data, Science]|
|Milan|21 |[ML, AI]            |
|Rohit|19 |[]                  |
|Maria|20 |[DBMS, Networking]  |
|Jay  |22 |[]                  |
+-----+---+--------------------+



df: org.apache.spark.sql.DataFrame = [Name: string, Age: int ... 1 more field]


In [13]:
/*
    explode emits one output row per element of the array column.
    A record whose array is empty yields no rows at all, so such records
    (Rohit and Jay here) are dropped from the result by Spark itself.
*/

// Add the exploded element as "course", then discard the original array column.
val explode_df =
    df.withColumn("course", explode($"Courses_enrolled")).drop("Courses_enrolled")

explode_df.show(truncate = false)
explode_df.explain()


+-----+---+----------+
|Name |Age|course    |
+-----+---+----------+
|Jaya |20 |SQL       |
|Jaya |20 |Data      |
|Jaya |20 |Science   |
|Milan|21 |ML        |
|Milan|21 |AI        |
|Maria|20 |DBMS      |
|Maria|20 |Networking|
+-----+---+----------+

== Physical Plan ==
*(1) Generate explode(Courses_enrolled#28), [Name#26, Age#27], false, [course#152]
+- *(1) LocalTableScan [Name#26, Age#27, Courses_enrolled#28]




explode_df: org.apache.spark.sql.DataFrame = [Name: string, Age: int ... 1 more field]


In [17]:
/*
    NOTE that posexplode has to be used in a select expression, because it
    outputs two columns per array element:
        pos - the zero-based index of the element within the array
        col - the value of the element

    Records with an empty list are dropped by Spark itself.

    The columns are referenced here as "name"/"age" although the DataFrame
    declares them as "Name"/"Age"; the result columns carry the lowercase
    spelling used in this select.
*/

val posexplode_df = df.select(
    $"name",
    $"age",
    posexplode($"Courses_enrolled")
)

posexplode_df.show(truncate = false)
posexplode_df.explain()


+-----+---+---+----------+
|name |age|pos|col       |
+-----+---+---+----------+
|Jaya |20 |0  |SQL       |
|Jaya |20 |1  |Data      |
|Jaya |20 |2  |Science   |
|Milan|21 |0  |ML        |
|Milan|21 |1  |AI        |
|Maria|20 |0  |DBMS      |
|Maria|20 |1  |Networking|
+-----+---+---+----------+

== Physical Plan ==
*(1) Project [name#26, age#27, pos#179, col#180]
+- *(1) Generate posexplode(Courses_enrolled#28), [Name#26, Age#27], false, [pos#179, col#180]
   +- *(1) LocalTableScan [Name#26, Age#27, Courses_enrolled#28]




posexplode_df: org.apache.spark.sql.DataFrame = [name: string, age: int ... 2 more fields]


In [19]:
/*
    NOTE that posexplode_outer has to be used in a select expression, because it
    outputs two columns per array element:
        pos - the zero-based index of the element within the array
        col - the value of the element

    Unlike posexplode, records with an empty list are KEPT IN THE OUTPUT:
    each such record produces a single row whose pos and col are both null.

    The columns are referenced here as "name"/"age" although the DataFrame
    declares them as "Name"/"Age"; the result columns carry the lowercase
    spelling used in this select.
*/

val posexplode_outer_df = df.select(
    $"name",
    $"age",
    posexplode_outer($"Courses_enrolled")
)

posexplode_outer_df.show(truncate = false)
posexplode_outer_df.explain()


+-----+---+----+----------+
|name |age|pos |col       |
+-----+---+----+----------+
|Jaya |20 |0   |SQL       |
|Jaya |20 |1   |Data      |
|Jaya |20 |2   |Science   |
|Milan|21 |0   |ML        |
|Milan|21 |1   |AI        |
|Rohit|19 |null|null      |
|Maria|20 |0   |DBMS      |
|Maria|20 |1   |Networking|
|Jay  |22 |null|null      |
+-----+---+----+----------+

== Physical Plan ==
*(1) Project [name#26, age#27, pos#225, col#226]
+- *(1) Generate posexplode(Courses_enrolled#28), [Name#26, Age#27], true, [pos#225, col#226]
   +- *(1) LocalTableScan [Name#26, Age#27, Courses_enrolled#28]




posexplode_outer_df: org.apache.spark.sql.DataFrame = [name: string, age: int ... 2 more fields]
