### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) Función 'explode'

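`explode` es una función de PySpark que devuelve una nueva fila por cada elemento del array o map dado. Por defecto, la columna generada se llama `col` para los elementos de un array, y `key` y `value` para los elementos de un map. Las filas cuyo array o map es nulo o está vacío no generan ninguna fila de salida; para conservarlas se utiliza `explode_outer`.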

#### Explode el campo array

##### Ejemplo 1

In [0]:
from pyspark.sql.functions import explode, explode_outer

# Each tuple is (person name, appliances owned). An appliance list may contain
# None elements, and the last person has no list at all (None).
array_appliance = [
    ('Raja', ['TV', 'Refrigerator', 'Oven', 'AC']),
    ('Raghav', ['AC', 'Washing machine', None]),
    ('Ram', ['Grinder', 'TV']),
    ('Ramesh', ['Refrigerator', 'TV', None]),
    ('Rajesh', None),
]

# Build the DataFrame with explicit column names, then inspect schema and rows.
df = spark.createDataFrame(
    data=array_appliance,
    schema=['name', 'appliances'],
)
df.printSchema()
display(df)

root
 |-- name: string (nullable = true)
 |-- appliances: array (nullable = true)
 |    |-- element: string (containsNull = true)



name,appliances
Raja,"List(TV, Refrigerator, Oven, AC)"
Raghav,"List(AC, Washing machine, null)"
Ram,"List(Grinder, TV)"
Ramesh,"List(Refrigerator, TV, null)"
Rajesh,


In [0]:
# explode() emits one output row per array element; the generated column
# receives the default name `col`.
df_explode = df.select(
    df.name,
    explode(df.appliances),
)

df_explode.printSchema()
display(df_explode)

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)



name,col
Raja,TV
Raja,Refrigerator
Raja,Oven
Raja,AC
Raghav,AC
Raghav,Washing machine
Raghav,
Ram,Grinder
Ram,TV
Ramesh,Refrigerator


##### Ejemplo 2

In [0]:
from pyspark.sql.functions import col, concat, lit, size, explode
import datetime

# One tuple per user, in the same order as the schema passed below:
# (id, first name, last name, email, course ids, is customer, amount paid,
#  customer since, last update). Users 4 and 5 have no course list (None).
usuario = [
    (1, 'Corrie', 'Van den Oord', 'cvandenoord@etsy.com', [1, 2, 3], True, 1000.55,
     datetime.date(2021, 1, 15), datetime.datetime(2021, 2, 10, 1, 15)),
    (2, 'Nikolas', 'Brewitt', 'nkbrewitt@gmail.com', [3, 5], True, 900.0,
     datetime.date(2021, 2, 14), datetime.datetime(2021, 2, 18, 3, 33)),
    (3, 'Oriel', 'Penney', 'openney@gmail.com', [2, 4], True, 850.55,
     datetime.date(2021, 1, 21), datetime.datetime(2021, 3, 15, 15, 16, 55)),
    (4, 'Ashby', 'Maddocks', 'amaddocks@gmail.com', None, False, None,
     None, datetime.datetime(2021, 4, 10, 17, 45, 30)),
    (5, 'Kurt', 'Rome', 'krome@etsy.com', None, False, None,
     None, datetime.datetime(2021, 4, 2, 0, 55, 18)),
]

# Column types are inferred from the Python values; only the names are given.
df = spark.createDataFrame(
    usuario,
    schema=[
        'id',
        'nombre',
        'apellido',
        'email',
        'cursos',
        'es_cliente',
        'importe_abonado',
        'cliente_desde',
        'ultima_actualizacion',
    ],
)

df.printSchema()
df.show(truncate=False)

root
 |-- id: long (nullable = true)
 |-- nombre: string (nullable = true)
 |-- apellido: string (nullable = true)
 |-- email: string (nullable = true)
 |-- cursos: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- es_cliente: boolean (nullable = true)
 |-- importe_abonado: double (nullable = true)
 |-- cliente_desde: date (nullable = true)
 |-- ultima_actualizacion: timestamp (nullable = true)

+---+-------+------------+--------------------+---------+----------+---------------+-------------+--------------------+
|id |nombre |apellido    |email               |cursos   |es_cliente|importe_abonado|cliente_desde|ultima_actualizacion|
+---+-------+------------+--------------------+---------+----------+---------------+-------------+--------------------+
|1  |Corrie |Van den Oord|cvandenoord@etsy.com|[1, 2, 3]|true      |1000.55        |2021-01-15   |2021-02-10 01:15:00 |
|2  |Nikolas|Brewitt     |nkbrewitt@gmail.com |[3, 5]   |true      |900.0          |2021-02-14  

##### Método 1

In [0]:
# Method 1: generate one row per course id in a new `curso` column, then
# discard the original array column.
df_explode = (
    df
    .withColumn('curso', explode('cursos'))
    .drop('cursos')
)

df_explode.show()

+---+-------+------------+--------------------+----------+---------------+-------------+--------------------+-----+
| id| nombre|    apellido|               email|es_cliente|importe_abonado|cliente_desde|ultima_actualizacion|curso|
+---+-------+------------+--------------------+----------+---------------+-------------+--------------------+-----+
|  1| Corrie|Van den Oord|cvandenoord@etsy.com|      true|        1000.55|   2021-01-15| 2021-02-10 01:15:00|    1|
|  1| Corrie|Van den Oord|cvandenoord@etsy.com|      true|        1000.55|   2021-01-15| 2021-02-10 01:15:00|    2|
|  1| Corrie|Van den Oord|cvandenoord@etsy.com|      true|        1000.55|   2021-01-15| 2021-02-10 01:15:00|    3|
|  2|Nikolas|     Brewitt| nkbrewitt@gmail.com|      true|          900.0|   2021-02-14| 2021-02-18 03:33:00|    3|
|  2|Nikolas|     Brewitt| nkbrewitt@gmail.com|      true|          900.0|   2021-02-14| 2021-02-18 03:33:00|    5|
|  3|  Oriel|      Penney|   openney@gmail.com|      true|         850.5

##### Método 2

In [0]:
# Method 2: instead of adding rows, spread the first three array positions
# into their own columns; a position past the end of the array yields null.
df_explode = df.select(
    '*',
    col('cursos')[0].alias('curso_1'),
    col('cursos')[1].alias('curso_2'),
    col('cursos')[2].alias('curso_3'),
).drop('cursos')

df_explode.show()

+---+-------+------------+--------------------+----------+---------------+-------------+--------------------+-------+-------+-------+
| id| nombre|    apellido|               email|es_cliente|importe_abonado|cliente_desde|ultima_actualizacion|curso_1|curso_2|curso_3|
+---+-------+------------+--------------------+----------+---------------+-------------+--------------------+-------+-------+-------+
|  1| Corrie|Van den Oord|cvandenoord@etsy.com|      true|        1000.55|   2021-01-15| 2021-02-10 01:15:00|      1|      2|      3|
|  2|Nikolas|     Brewitt| nkbrewitt@gmail.com|      true|          900.0|   2021-02-14| 2021-02-18 03:33:00|      3|      5|   null|
|  3|  Oriel|      Penney|   openney@gmail.com|      true|         850.55|   2021-01-21| 2021-03-15 15:16:55|      2|      4|   null|
|  4|  Ashby|    Maddocks| amaddocks@gmail.com|     false|           null|         null| 2021-04-10 17:45:30|   null|   null|   null|
|  5|   Kurt|        Rome|      krome@etsy.com|     false|    

#### Explode_outer el campo array considerando Nulls

In [0]:
# `df` was reassigned to the users/courses DataFrame in "Ejemplo 2", which has
# no `name` or `appliances` columns. Rebuild the appliances DataFrame here so
# this cell works when the notebook is run from top to bottom.
df = spark.createDataFrame(data=array_appliance, schema=['name', 'appliances'])

# explode_outer() behaves like explode() but also keeps rows whose array is
# null or empty, emitting a single row with a null `col` for them.
df_explode = df.select(df.name, explode_outer(df.appliances))

df_explode.printSchema()
display(df_explode)

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)



name,col
Raja,TV
Raja,Refrigerator
Raja,Oven
Raja,AC
Raghav,AC
Raghav,Washing machine
Raghav,
Ram,Grinder
Ram,TV
Ramesh,Refrigerator


#### Explode el campo map

##### Ejemplo 1

In [0]:
# Each tuple is (person name, {appliance: brand}). One brand is an empty
# string and the last person has no map at all (None).
map_brand = [
    ('Raja', {'TV': 'LG', 'Refrigerator': 'Samsung', 'Oven': 'Phillips', 'AC': 'Voltas'}),
    ('Raghav', {'AC': 'Samsung', 'Washing machine': 'LG'}),
    ('Ram', {'Grinder': 'Preethi', 'TV': ''}),
    ('Ramesh', {'Refrigerator': 'LG', 'TV': 'Croma'}),
    ('Rajesh', None),
]

# Python dicts are inferred as a map column.
df = spark.createDataFrame(
    data=map_brand,
    schema=['name', 'brand'],
)
df.printSchema()
display(df)

root
 |-- name: string (nullable = true)
 |-- brand: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



name,brand
Raja,"Map(Refrigerator -> Samsung, AC -> Voltas, TV -> LG, Oven -> Phillips)"
Raghav,"Map(AC -> Samsung, Washing machine -> LG)"
Ram,"Map(TV -> , Grinder -> Preethi)"
Ramesh,"Map(Refrigerator -> LG, TV -> Croma)"
Rajesh,


In [0]:
# Exploding a map produces one row per entry, split into the default `key`
# and `value` columns; the row with a null map produces no output.
df_explode = df.select(
    df.name,
    explode(df.brand),
)

df_explode.printSchema()
display(df_explode)

root
 |-- name: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)



name,key,value
Raja,Refrigerator,Samsung
Raja,AC,Voltas
Raja,TV,LG
Raja,Oven,Phillips
Raghav,AC,Samsung
Raghav,Washing machine,LG
Ram,TV,
Ram,Grinder,Preethi
Ramesh,Refrigerator,LG
Ramesh,TV,Croma


##### Positional explode

In [0]:
from pyspark.sql.functions import posexplode

# posexplode() works like explode() but additionally returns each entry's
# zero-based position in a `pos` column.
df_explode = df.select(
    df.name,
    posexplode(df.brand),
)

df_explode.printSchema()
display(df_explode)

root
 |-- name: string (nullable = true)
 |-- pos: integer (nullable = false)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)



name,pos,key,value
Raja,0,Refrigerator,Samsung
Raja,1,AC,Voltas
Raja,2,TV,LG
Raja,3,Oven,Phillips
Raghav,0,AC,Samsung
Raghav,1,Washing machine,LG
Ram,0,TV,
Ram,1,Grinder,Preethi
Ramesh,0,Refrigerator,LG
Ramesh,1,TV,Croma


##### Ejemplo 2

In [0]:
from pyspark.sql.functions import col, concat, lit, size, explode
import datetime

# Same users as in the array example, but the courses are now a map of
# course name -> course id (both strings). Users 4 and 5 have no map (None).
usuario = [
    (1, 'Corrie', 'Van den Oord', 'cvandenoord@etsy.com',
     {'curso_1': '1', 'curso_2': '2', 'curso_3': '3'}, True, 1000.55,
     datetime.date(2021, 1, 15), datetime.datetime(2021, 2, 10, 1, 15)),
    (2, 'Nikolas', 'Brewitt', 'nkbrewitt@gmail.com',
     {'curso_1': '3', 'curso_2': '5'}, True, 900.0,
     datetime.date(2021, 2, 14), datetime.datetime(2021, 2, 18, 3, 33)),
    (3, 'Oriel', 'Penney', 'openney@gmail.com',
     {'curso_1': '2', 'curso_2': '4'}, True, 850.55,
     datetime.date(2021, 1, 21), datetime.datetime(2021, 3, 15, 15, 16, 55)),
    (4, 'Ashby', 'Maddocks', 'amaddocks@gmail.com',
     None, False, None,
     None, datetime.datetime(2021, 4, 10, 17, 45, 30)),
    (5, 'Kurt', 'Rome', 'krome@etsy.com',
     None, False, None,
     None, datetime.datetime(2021, 4, 2, 0, 55, 18)),
]

# Column types are inferred from the Python values; only the names are given.
df = spark.createDataFrame(
    usuario,
    schema=[
        'id',
        'nombre',
        'apellido',
        'email',
        'cursos',
        'es_cliente',
        'importe_abonado',
        'cliente_desde',
        'ultima_actualizacion',
    ],
)

df.printSchema()
df.show(truncate=False)

root
 |-- id: long (nullable = true)
 |-- nombre: string (nullable = true)
 |-- apellido: string (nullable = true)
 |-- email: string (nullable = true)
 |-- cursos: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- es_cliente: boolean (nullable = true)
 |-- importe_abonado: double (nullable = true)
 |-- cliente_desde: date (nullable = true)
 |-- ultima_actualizacion: timestamp (nullable = true)

+---+-------+------------+--------------------+------------------------------------------+----------+---------------+-------------+--------------------+
|id |nombre |apellido    |email               |cursos                                    |es_cliente|importe_abonado|cliente_desde|ultima_actualizacion|
+---+-------+------------+--------------------+------------------------------------------+----------+---------------+-------------+--------------------+
|1  |Corrie |Van den Oord|cvandenoord@etsy.com|{curso_1 -> 1, curso_2 -> 2, curso_3 -> 3}|tr

In [0]:
# Keep every original column, append the `key`/`value` pair produced for each
# map entry, and drop the source map column.
df_explode = (
    df
    .select('*', explode('cursos'))
    .drop('cursos')
)

df_explode.show()

+---+-------+------------+--------------------+----------+---------------+-------------+--------------------+-------+-----+
| id| nombre|    apellido|               email|es_cliente|importe_abonado|cliente_desde|ultima_actualizacion|    key|value|
+---+-------+------------+--------------------+----------+---------------+-------------+--------------------+-------+-----+
|  1| Corrie|Van den Oord|cvandenoord@etsy.com|      true|        1000.55|   2021-01-15| 2021-02-10 01:15:00|curso_1|    1|
|  1| Corrie|Van den Oord|cvandenoord@etsy.com|      true|        1000.55|   2021-01-15| 2021-02-10 01:15:00|curso_2|    2|
|  1| Corrie|Van den Oord|cvandenoord@etsy.com|      true|        1000.55|   2021-01-15| 2021-02-10 01:15:00|curso_3|    3|
|  2|Nikolas|     Brewitt| nkbrewitt@gmail.com|      true|          900.0|   2021-02-14| 2021-02-18 03:33:00|curso_1|    3|
|  2|Nikolas|     Brewitt| nkbrewitt@gmail.com|      true|          900.0|   2021-02-14| 2021-02-18 03:33:00|curso_2|    5|
|  3|  O

In [0]:
# Same explode as above, but rename the generated `key`/`value` columns to
# meaningful names before dropping the source map column.
df_explode = (
    df
    .select('*', explode('cursos'))
    .withColumnRenamed('key', 'nombre_curso')
    .withColumnRenamed('value', 'curso')
    .drop('cursos')
)

df_explode.show()

+---+-------+------------+--------------------+----------+---------------+-------------+--------------------+------------+-----+
| id| nombre|    apellido|               email|es_cliente|importe_abonado|cliente_desde|ultima_actualizacion|nombre_curso|curso|
+---+-------+------------+--------------------+----------+---------------+-------------+--------------------+------------+-----+
|  1| Corrie|Van den Oord|cvandenoord@etsy.com|      true|        1000.55|   2021-01-15| 2021-02-10 01:15:00|     curso_1|    1|
|  1| Corrie|Van den Oord|cvandenoord@etsy.com|      true|        1000.55|   2021-01-15| 2021-02-10 01:15:00|     curso_2|    2|
|  1| Corrie|Van den Oord|cvandenoord@etsy.com|      true|        1000.55|   2021-01-15| 2021-02-10 01:15:00|     curso_3|    3|
|  2|Nikolas|     Brewitt| nkbrewitt@gmail.com|      true|          900.0|   2021-02-14| 2021-02-18 03:33:00|     curso_1|    3|
|  2|Nikolas|     Brewitt| nkbrewitt@gmail.com|      true|          900.0|   2021-02-14| 2021-02-

In [0]:
# Instead of adding rows, look up three known map keys and expose each as its
# own column; a key that is absent from the map (or a null map) yields null.
df_explode = df.select(
    '*',
    col('cursos')['curso_1'].alias('curso_1'),
    col('cursos')['curso_2'].alias('curso_2'),
    col('cursos')['curso_3'].alias('curso_3'),
).drop('cursos')

df_explode.show()

+---+-------+------------+--------------------+----------+---------------+-------------+--------------------+-------+-------+-------+
| id| nombre|    apellido|               email|es_cliente|importe_abonado|cliente_desde|ultima_actualizacion|curso_1|curso_2|curso_3|
+---+-------+------------+--------------------+----------+---------------+-------------+--------------------+-------+-------+-------+
|  1| Corrie|Van den Oord|cvandenoord@etsy.com|      true|        1000.55|   2021-01-15| 2021-02-10 01:15:00|      1|      2|      3|
|  2|Nikolas|     Brewitt| nkbrewitt@gmail.com|      true|          900.0|   2021-02-14| 2021-02-18 03:33:00|      3|      5|   null|
|  3|  Oriel|      Penney|   openney@gmail.com|      true|         850.55|   2021-01-21| 2021-03-15 15:16:55|      2|      4|   null|
|  4|  Ashby|    Maddocks| amaddocks@gmail.com|     false|           null|         null| 2021-04-10 17:45:30|   null|   null|   null|
|  5|   Kurt|        Rome|      krome@etsy.com|     false|    

In [0]:
# Look up a single map key per user; users whose map is null get null.
df.select(
    'id',
    'nombre',
    col('cursos')['curso_1'].alias('curso 1'),
).show()

+---+-------+-------+
| id| nombre|curso 1|
+---+-------+-------+
|  1| Corrie|      1|
|  2|Nikolas|      3|
|  3|  Oriel|      2|
|  4|  Ashby|   null|
|  5|   Kurt|   null|
+---+-------+-------+



#### Explode_outer el campo map considerando Nulls

In [0]:
# Re-create the (person name, {appliance: brand}) sample data, because `df`
# was reused for the users DataFrame in the cells before this one. The last
# person has no map at all (None).
map_brand = [
    ('Raja', {'TV': 'LG', 'Refrigerator': 'Samsung', 'Oven': 'Phillips', 'AC': 'Voltas'}),
    ('Raghav', {'AC': 'Samsung', 'Washing machine': 'LG'}),
    ('Ram', {'Grinder': 'Preethi', 'TV': ''}),
    ('Ramesh', {'Refrigerator': 'LG', 'TV': 'Croma'}),
    ('Rajesh', None),
]

# Python dicts are inferred as a map column.
df = spark.createDataFrame(
    data=map_brand,
    schema=['name', 'brand'],
)
df.printSchema()
display(df)

root
 |-- name: string (nullable = true)
 |-- brand: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



name,brand
Raja,"Map(Refrigerator -> Samsung, AC -> Voltas, TV -> LG, Oven -> Phillips)"
Raghav,"Map(AC -> Samsung, Washing machine -> LG)"
Ram,"Map(TV -> , Grinder -> Preethi)"
Ramesh,"Map(Refrigerator -> LG, TV -> Croma)"
Rajesh,


In [0]:
# explode_outer() keeps the row whose map is null, emitting it once with a
# null `key` and `value`.
df_explode = df.select(
    df.name,
    explode_outer(df.brand),
)

df_explode.show()

+------+---------------+--------+
|  name|            key|   value|
+------+---------------+--------+
|  Raja|   Refrigerator| Samsung|
|  Raja|             AC|  Voltas|
|  Raja|             TV|      LG|
|  Raja|           Oven|Phillips|
|Raghav|             AC| Samsung|
|Raghav|Washing machine|      LG|
|   Ram|             TV|        |
|   Ram|        Grinder| Preethi|
|Ramesh|   Refrigerator|      LG|
|Ramesh|             TV|   Croma|
|Rajesh|           null|    null|
+------+---------------+--------+



##### Positional explode_outer

In [0]:
from pyspark.sql.functions import posexplode_outer

# posexplode_outer() combines posexplode() and explode_outer(): it adds the
# entry position in `pos` and still emits a row (with null pos/key/value) for
# a null map, which is why all generated columns are nullable.
df_explode = df.select(
    df.name,
    posexplode_outer(df.brand),
)

df_explode.printSchema()
display(df_explode)

root
 |-- name: string (nullable = true)
 |-- pos: integer (nullable = true)
 |-- key: string (nullable = true)
 |-- value: string (nullable = true)



name,pos,key,value
Raja,0.0,Refrigerator,Samsung
Raja,1.0,AC,Voltas
Raja,2.0,TV,LG
Raja,3.0,Oven,Phillips
Raghav,0.0,AC,Samsung
Raghav,1.0,Washing machine,LG
Ram,0.0,TV,
Ram,1.0,Grinder,Preethi
Ramesh,0.0,Refrigerator,LG
Ramesh,1.0,TV,Croma
