##  Explode Function

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark  =  SparkSession.builder.appName('yettodecide').master('local').getOrCreate()

In [24]:
explodeData = [('Saif', ['Java', 'Scala'], {'hair': 'black', 'eye': 'brown'}),
 ('Mitali', ['Spark', 'Java', None], {'hair': 'brown', 'eye': None}),
 ('Ram', ['CSharp', ''], {'hair': 'red', 'eye': ''}),
 ('Wilma', None, None)
#  ,('Jatin', ['1', '2'], {})
              ]
array_df = spark.createDataFrame(data=explodeData,
schema=['name', 'knownLanguages', 'properties']) 

In [25]:
array_df.show(truncate=False)

+------+--------------+-----------------------------+
|name  |knownLanguages|properties                   |
+------+--------------+-----------------------------+
|Saif  |[Java, Scala] |[eye -> brown, hair -> black]|
|Mitali|[Spark, Java,]|[eye ->, hair -> brown]      |
|Ram   |[CSharp, ]    |[eye -> , hair -> red]       |
|Wilma |null          |null                         |
+------+--------------+-----------------------------+



In [26]:
from pyspark.sql.functions import explode,col
# explode
df2 = array_df.select(col("*"), explode("knownLanguages").alias('name'))
df2.printSchema()
df2.show() 

root
 |-- name: string (nullable = true)
 |-- knownLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- name: string (nullable = true)

+------+--------------+--------------------+------+
|  name|knownLanguages|          properties|  name|
+------+--------------+--------------------+------+
|  Saif| [Java, Scala]|[eye -> brown, ha...|  Java|
|  Saif| [Java, Scala]|[eye -> brown, ha...| Scala|
|Mitali|[Spark, Java,]|[eye ->, hair -> ...| Spark|
|Mitali|[Spark, Java,]|[eye ->, hair -> ...|  Java|
|Mitali|[Spark, Java,]|[eye ->, hair -> ...|  null|
|   Ram|    [CSharp, ]|[eye -> , hair ->...|CSharp|
|   Ram|    [CSharp, ]|[eye -> , hair ->...|      |
+------+--------------+--------------------+------+



In [33]:
# explode map
df3 = array_df.select(array_df.name, explode(array_df.properties))
df3.printSchema()
df3.show()

root
 |-- name: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)

+------+----+-----+
|  name| key|value|
+------+----+-----+
|  Saif| eye|brown|
|  Saif|hair|black|
|Mitali| eye| null|
|Mitali|hair|brown|
|   Ram| eye|     |
|   Ram|hair|  red|
+------+----+-----+



https://sparkbyexamples.com/pyspark/pyspark-explode-array-and-map-columns-to-rows/

## Joins

In [47]:
emp = [(1,"Smith",-1,"2018","10","M",3000), \
    (2,"Rose",1,"2010","20","M",4000), \
    (3,"Williams",1,"2010","10","M",1000), \
    (4,"Jones",2,"2005","10","F",2000), \
    (5,"Brown",2,"2010","40","",-1), \
      (6,"Brown",2,"2010","50","",-1), \
       (50,"Bob",2,"2010","50","",-1)
  ]
empColumns = ["emp_id","name","superior_emp_id","year_joined", \
       "emp_dept_id","gender","salary"]

empDF = spark.createDataFrame(data=emp, schema = empColumns)
empDF.printSchema()
empDF.show()

dept = [("Finance",10), \
    ("Marketing",20), \
    ("Sales",30), \
    ("IT",40), \
    ("R&D",101)    
  ]
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame(data=dept, schema = deptColumns)
deptDF.printSchema()
deptDF.show()

root
 |-- emp_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- superior_emp_id: long (nullable = true)
 |-- year_joined: string (nullable = true)
 |-- emp_dept_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|    name|superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|     1|   Smith|             -1|       2018|         10|     M|  3000|
|     2|    Rose|              1|       2010|         20|     M|  4000|
|     3|Williams|              1|       2010|         10|     M|  1000|
|     4|   Jones|              2|       2005|         10|     F|  2000|
|     5|   Brown|              2|       2010|         40|      |    -1|
|     6|   Brown|              2|       2010|         50|      |    -1|
|    50|     Bob|              2|       2010|         50|      |    -

### Inner Join
Inner join is the default join in PySpark

In [48]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, "inner").show(truncate=False) 

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



### Full Outer Join
fullouter join returns all rows from both datasets where join
expression doesn’t match it returns null on respective record columns. 

In [49]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, "outer").show(truncate=False) 
# OR
# empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, "full").show(truncate=False) 
# OR 
# empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, "fullouter").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |
|50    |Bob     |2              |2010       |50         |      |-1    |null     |null   |
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|null  |null    |null           |null       |null       |null  |null  |R&D      |101    |
|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Br

### Left Outer Join
Leftouter join returns all rows from the left dataset regardless of match
found on the right dataset.

In [50]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, "left").show(truncate=False)
# OR
# empDF.join(deptDF, empDF("emp_dept_id") == deptDF("dept_id"), "leftouter").show(False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |
|50    |Bob     |2              |2010       |50         |      |-1    |null     |null   |
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



###  Right Outer Join
a Rightouter join is opposite of left join, here it returns all rows from the
right dataset regardless of match found on the left dataset

In [51]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, "right").show(truncate=False) 
# OR
# empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, "rightouter").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|null  |null    |null           |null       |null       |null  |null  |R&D      |101    |
|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



###  Left Semi Join
<ul>
<li>leftsemi join is similar to inner join difference being leftsemi join returns all columns from the left dataset and ignores all columns from the right dataset </li>
<li>In other words, this join returns columns from the only left dataset for the records
match in the right dataset on join expression records not matched on join
expression are ignored from both left and right datasets.</li>
</ul>

In [52]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, "leftsemi").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
+------+--------+---------------+-----------+-----------+------+------+



### Left Anti Join
<ul>
<li>leftanti join does the exact opposite of the leftsemi.</li>
<li>leftanti join returns only columns from the left dataset for non-matched records. </li>    
</ul>

In [53]:
empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, "leftanti").show(truncate=False)

+------+-----+---------------+-----------+-----------+------+------+
|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+-----+---------------+-----------+-----------+------+------+
|6     |Brown|2              |2010       |50         |      |-1    |
|50    |Bob  |2              |2010       |50         |      |-1    |
+------+-----+---------------+-----------+-----------+------+------+



### Self Join

In [54]:
empDF.alias("emp1").join(empDF.alias("emp2"),
 col("emp1.superior_emp_id") == col("emp2.emp_id"), "inner") \
 .select(col("emp1.emp_id"), col("emp1.name"), \
 col("emp2.emp_id").alias("superior_emp_id"), \
 col("emp2.name").alias("superior_emp_name")) \
 .show(truncate=False)

+------+--------+---------------+-----------------+
|emp_id|name    |superior_emp_id|superior_emp_name|
+------+--------+---------------+-----------------+
|2     |Rose    |1              |Smith            |
|3     |Williams|1              |Smith            |
|4     |Jones   |2              |Rose             |
|5     |Brown   |2              |Rose             |
|6     |Brown   |2              |Rose             |
|50    |Bob     |2              |Rose             |
+------+--------+---------------+-----------------+

