# Explode
Returns a new row for each element in the given array or map.
Uses the default column name `col` for elements in the array, and
`key` and `value` for elements in the map, unless specified otherwise.

In [1]:
import pyspark

In [2]:
import sys
#reload(sys)
#sys.setdefaultencoding("utf-8")

In [3]:
# Create SparkSession from builder
from pyspark.sql import SparkSession
# Local-mode session used by every example in this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Explode Pyspark")
    .getOrCreate()
)


In [4]:
from pyspark.sql import Row

In [51]:
from pyspark.sql.functions import explode,explode_outer

In [6]:
# Single-row DataFrame with a scalar column, an array column and a map column.
eDF = spark.createDataFrame(
    [Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})]
)

In [7]:
# Display the one-row DataFrame before any explode is applied.
eDF.show()

+---+---------+--------+
|  a|  intlist|mapfield|
+---+---------+--------+
|  1|[1, 2, 3]|{a -> b}|
+---+---------+--------+



In [8]:
# Explode the array column into one row per element, naming the result column "anInt".
eDF.select(explode("intlist").alias("anInt")).collect()

[Row(anInt=1), Row(anInt=2), Row(anInt=3)]

In [9]:
# Exploding a map yields two columns per entry; alias() names both of them.
eDF.select(explode("mapfield").alias("key", "value")).show()

+---+-----+
|key|value|
+---+-----+
|  a|    b|
+---+-----+



## Working of Explode in PySpark with Example - 1

In [10]:
# Each record is (name, [list of strings, list of strings]) - an array of arrays.
data1 = [
    ("Jhon", [["USA", "MX", "USW", "UK"], ["23", "34", "56"]]),
    ("Joe", [["IND", "AF", "YR", "QW"], ["22", "35", "76"]]),
    ("Juhi", [["USA", "MX", "USW", "UK"], ["13", "64", "59"]]),
    ("Jhony", [["USSR", "MXR", "USA", "UK"], ["22", "44", "76"]]),
]

In [11]:
# Column names are supplied as a list; column types are inferred from the data.
data_frame = spark.createDataFrame(data1, ["name", "subjectandID"])

In [12]:
# Print the inferred schema: subjectandID is an array whose elements are arrays of strings.
data_frame.printSchema()

root
 |-- name: string (nullable = true)
 |-- subjectandID: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)



In [13]:
# truncate=False prints the full nested arrays instead of shortening long cells.
data_frame.show(truncate=False)

+-----+------------------------------------+
|name |subjectandID                        |
+-----+------------------------------------+
|Jhon |[[USA, MX, USW, UK], [23, 34, 56]]  |
|Joe  |[[IND, AF, YR, QW], [22, 35, 76]]   |
|Juhi |[[USA, MX, USW, UK], [13, 64, 59]]  |
|Jhony|[[USSR, MXR, USA, UK], [22, 44, 76]]|
+-----+------------------------------------+



In [14]:
# Explode only the outer array: each inner array becomes its own row next to the name.
df2 = data_frame.select("name", explode("subjectandID"))

In [15]:
# The exploded column gets the default name `col` and is one nesting level shallower.
df2.printSchema()

root
 |-- name: string (nullable = true)
 |-- col: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [16]:
# Show the exploded rows: two rows per name, one for each inner array.
df2.show()

+-----+--------------------+
| name|                 col|
+-----+--------------------+
| Jhon|  [USA, MX, USW, UK]|
| Jhon|        [23, 34, 56]|
|  Joe|   [IND, AF, YR, QW]|
|  Joe|        [22, 35, 76]|
| Juhi|  [USA, MX, USW, UK]|
| Juhi|        [13, 64, 59]|
|Jhony|[USSR, MXR, USA, UK]|
|Jhony|        [22, 44, 76]|
+-----+--------------------+



## Working of Explode in PySpark with Example - 2

In [17]:
# One more example: each record is (name, list of strings, dict of string -> string).
data1 = [
    ("Jhon", ["USA", "MX", "USW", "UK"], {"23": "USA", "34": "IND", "56": "RSA"}),
    ("Joe", ["IND", "AF", "YR", "QW"], {"23": "USA", "34": "IND", "56": "RSA"}),
    ("Juhi", ["USA", "MX", "USW", "UK"], {"23": "USA", "34": "IND", "56": "RSA"}),
    ("Jhony", ["USSR", "MXR", "USA", "UK"], {"23": "USA", "34": "IND", "56": "RSA"}),
]

In [18]:
# NOTE: only two column names are given for three-field records, so the third
# (map) column ends up with the auto-generated name `_3`, as the schema output shows.
data_frame2 = spark.createDataFrame(data1, ["name", "subjectandID"])

In [19]:
# Print the inferred schema: a string column, an array column and a map column.
data_frame2.printSchema()

root
 |-- name: string (nullable = true)
 |-- subjectandID: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- _3: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [20]:
# truncate=False prints the full array and map contents instead of shortening long cells.
data_frame2.show(truncate=False)

+-----+--------------------+---------------------------------+
|name |subjectandID        |_3                               |
+-----+--------------------+---------------------------------+
|Jhon |[USA, MX, USW, UK]  |{56 -> RSA, 34 -> IND, 23 -> USA}|
|Joe  |[IND, AF, YR, QW]   |{56 -> RSA, 34 -> IND, 23 -> USA}|
|Juhi |[USA, MX, USW, UK]  |{56 -> RSA, 34 -> IND, 23 -> USA}|
|Jhony|[USSR, MXR, USA, UK]|{56 -> RSA, 34 -> IND, 23 -> USA}|
+-----+--------------------+---------------------------------+



In [21]:
# Explode the array column: one output row per array element, kept next to the name.
df3 = data_frame2.select("name", explode("subjectandID"))
df3.printSchema()

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)



In [22]:
# Show the exploded rows: one row per array element for each name.
df3.show()

+-----+----+
| name| col|
+-----+----+
| Jhon| USA|
| Jhon|  MX|
| Jhon| USW|
| Jhon|  UK|
|  Joe| IND|
|  Joe|  AF|
|  Joe|  YR|
|  Joe|  QW|
| Juhi| USA|
| Juhi|  MX|
| Juhi| USW|
| Juhi|  UK|
|Jhony|USSR|
|Jhony| MXR|
|Jhony| USA|
|Jhony|  UK|
+-----+----+



## Working of Explode in PySpark with Example - 3

### The following code snippet explodes an array column.

In [23]:
# Two records, each a dict whose single value is a list of integers.
data_kontext = [
    {"values": [1, 2, 3, 4, 5]},
    {"values": [6, 7, 8]},
]

In [24]:
# The schema list names the single column "value1" (the dict key "values" is
# not what appears in the resulting header).
kontext_df = spark.createDataFrame(data_kontext, ["value1"])

In [25]:
# Display the two array-valued rows before exploding.
kontext_df.show()

+---------------+
|         value1|
+---------------+
|[1, 2, 3, 4, 5]|
|      [6, 7, 8]|
+---------------+



In [26]:
# Keep the original array next to each exploded element for comparison.
kontext_df2 = kontext_df.select("value1", explode("value1"))

In [27]:
# One output row per array element; the element is in the default `col` column.
kontext_df2.show()

+---------------+---+
|         value1|col|
+---------------+---+
|[1, 2, 3, 4, 5]|  1|
|[1, 2, 3, 4, 5]|  2|
|[1, 2, 3, 4, 5]|  3|
|[1, 2, 3, 4, 5]|  4|
|[1, 2, 3, 4, 5]|  5|
|      [6, 7, 8]|  6|
|      [6, 7, 8]|  7|
|      [6, 7, 8]|  8|
+---------------+---+



### The following code snippet explodes a map column.

In [28]:
# Two records, each a dict whose single value is itself a string -> string dict.
data_kontext2 = [
    {"values": {"a": "100", "b": "200"}},
    {"values": {"a": "1000", "b": "2000"}},
]

In [29]:
# Rebinds kontext_df2 to a new DataFrame whose single column "value1" is a map.
kontext_df2 = spark.createDataFrame(data_kontext2, ["value1"])

In [30]:
# Display the two map-valued rows in full before exploding.
kontext_df2.show(truncate=False)

+----------------------+
|value1                |
+----------------------+
|{a -> 100, b -> 200}  |
|{a -> 1000, b -> 2000}|
+----------------------+



In [31]:
# Keep the original map next to each exploded entry; without an alias the
# entry columns get the default names `key` and `value`.
kontext_df3 = kontext_df2.select("value1", explode("value1"))

In [32]:
# One output row per map entry, shown alongside the source map.
kontext_df3.show(truncate=False)

+----------------------+---+-----+
|value1                |key|value|
+----------------------+---+-----+
|{a -> 100, b -> 200}  |a  |100  |
|{a -> 100, b -> 200}  |b  |200  |
|{a -> 1000, b -> 2000}|a  |1000 |
|{a -> 1000, b -> 2000}|b  |2000 |
+----------------------+---+-----+



In [33]:
# Same explode, selecting only the entry columns and naming them explicitly.
kontext_df2.select(explode("value1").alias("key", "value")).show()

+---+-----+
|key|value|
+---+-----+
|  a|  100|
|  b|  200|
|  a| 1000|
|  b| 2000|
+---+-----+



## Working of Explode in PySpark with Example - 4

In [34]:
# (category id, list of letters) pairs; category 3 appears twice.
my_array_data = [
    (1, ["A"]),
    (2, ["B", "L", "B"]),
    (3, ["K", "A", "K"]),
    (4, ["K"]),
    (3, ["B", "P"]),
]

In [37]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType,FloatType,ArrayType

In [38]:
from pyspark.sql.functions import array_contains

In [39]:
# Explicit schema: an integer category and an array-of-strings name column.
schema = StructType(
    [
        StructField("Student_category", IntegerType()),
        StructField("Student_full_name", ArrayType(StringType())),
    ]
)

In [41]:
# Build the DataFrame using the explicit schema rather than type inference.
df = spark.createDataFrame(data=my_array_data, schema=schema)

In [42]:
# Display the five rows before exploding the array column.
df.show()

+----------------+-----------------+
|Student_category|Student_full_name|
+----------------+-----------------+
|               1|              [A]|
|               2|        [B, L, B]|
|               3|        [K, A, K]|
|               4|              [K]|
|               3|           [B, P]|
+----------------+-----------------+



In [43]:
# Show each source array next to every element it explodes into.
df.select(df.Student_full_name, explode(df.Student_full_name)).show()

+-----------------+---+
|Student_full_name|col|
+-----------------+---+
|              [A]|  A|
|        [B, L, B]|  B|
|        [B, L, B]|  L|
|        [B, L, B]|  B|
|        [K, A, K]|  K|
|        [K, A, K]|  A|
|        [K, A, K]|  K|
|              [K]|  K|
|           [B, P]|  B|
|           [B, P]|  P|
+-----------------+---+



## Working of Explode in PySpark with Example - 5

In [44]:
# Same category ids as before, but every array is empty.
my_array_data = [(category, []) for category in (1, 2, 3, 4, 3)]

In [45]:
# Same two-field schema as in the previous example, built field by field.
schema = (
    StructType()
    .add("Student_category", IntegerType())
    .add("Student_full_name", ArrayType(StringType()))
)

In [46]:
# Rebuild df from the empty-array records with the explicit schema.
df = spark.createDataFrame(data=my_array_data, schema=schema)

In [47]:
# explode() emits no row for an empty array; every array in Student_full_name
# is empty here, so the result has the two column headers and no rows.
df.select(df.Student_full_name, explode(df.Student_full_name)).show()

+-----------------+---+
|Student_full_name|col|
+-----------------+---+
+-----------------+---+



In [None]:
#explode_outer() returns a row for each individual value in an array, just like explode().
#Unlike explode(), if the array is empty or null it still returns one row, with null in place of the element.

## Working of Explode in PySpark with Example - 6

In [48]:
# Five (category id, list of letters) records; every list is non-empty.
my_array_data = [
    (1, ["A"]),
    (2, ["B", "L", "B"]),
    (3, ["K", "A", "K"]),
    (4, ["K"]),
    (3, ["B", "P"]),
]

In [49]:
# Define the StructType and StructFields for the data above:
# an integer category and an array-of-strings name column.
schema = StructType(
    [
        StructField("Student_category", IntegerType()),
        StructField("Student_full_name", ArrayType(StringType())),
    ]
)

In [53]:
# Create the DataFrame with the explicit schema, then display it.
df = spark.createDataFrame(data=my_array_data, schema=schema)
df.show()

+----------------+-----------------+
|Student_category|Student_full_name|
+----------------+-----------------+
|               1|              [A]|
|               2|        [B, L, B]|
|               3|        [K, A, K]|
|               4|              [K]|
|               3|           [B, P]|
+----------------+-----------------+



In [54]:
# Apply explode_outer to the Student_full_name column. Every array here is
# non-empty, so the rows match what explode() printed for the same data earlier.
df.select(df.Student_full_name, explode_outer(df.Student_full_name)).show()

+-----------------+---+
|Student_full_name|col|
+-----------------+---+
|              [A]|  A|
|        [B, L, B]|  B|
|        [B, L, B]|  L|
|        [B, L, B]|  B|
|        [K, A, K]|  K|
|        [K, A, K]|  A|
|        [K, A, K]|  K|
|              [K]|  K|
|           [B, P]|  B|
|           [B, P]|  P|
+-----------------+---+

