In [1]:
# Imports used throughout the notebook.
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the Spark session for the join examples (getOrCreate returns an
# already-running session if one exists, otherwise it starts a new one).
spark = (
    SparkSession.builder
    .appName('PySparkSQLJOINS')
    .getOrCreate()
)

In [2]:
# Column helpers from pyspark.sql.functions. `col` is already imported in the
# first cell. NOTE: the `max` imported here replaces Python's built-in `max`
# in the notebook namespace from this cell onwards.
from pyspark.sql.functions import col,max,lit,count

In [3]:
### Joins examples

In [4]:
# Two small (name, id) tables that share some names; they are the inputs
# for the join examples.
valuesA = [
    ('Luke', 1),
    ('Han', 2),
    ('Leia', 3),
    ('Darth Vader', 4),
]
valuesB = [
    ('Yoda', 1),
    ('Lando', 2),
    ('Luke', 3),
    ('Darth Vader', 4),
]

TableA = spark.createDataFrame(valuesA, schema=['name', 'id'])
TableB = spark.createDataFrame(valuesB, schema=['name', 'id'])

# Print both tables for reference.
TableA.show()
TableB.show()

+-----------+---+
|       name| id|
+-----------+---+
|       Luke|  1|
|        Han|  2|
|       Leia|  3|
|Darth Vader|  4|
+-----------+---+

+-----------+---+
|       name| id|
+-----------+---+
|       Yoda|  1|
|      Lando|  2|
|       Luke|  3|
|Darth Vader|  4|
+-----------+---+



In [6]:
# Aliased handles ('ta' / 'tb') on the two tables, used by the joins.
ta, tb = TableA.alias('ta'), TableB.alias('tb')

In [7]:
# Inner join on name: only rows whose name appears in both tables are kept.
inner_join = ta.join(tb, on=ta['name'] == tb['name'], how='inner')
inner_join.show()

+-----------+---+-----------+---+
|       name| id|       name| id|
+-----------+---+-----------+---+
|       Luke|  1|       Luke|  3|
|Darth Vader|  4|Darth Vader|  4|
+-----------+---+-----------+---+



In [8]:
# Left join ('left_outer' is an equivalent spelling): every row of ta is kept,
# with nulls in the tb columns where no name matches.
left_join = ta.join(tb, on=ta['name'] == tb['name'], how='left')
left_join.show()

+-----------+---+-----------+----+
|       name| id|       name|  id|
+-----------+---+-----------+----+
|       Leia|  3|       null|null|
|       Luke|  1|       Luke|   3|
|        Han|  2|       null|null|
|Darth Vader|  4|Darth Vader|   4|
+-----------+---+-----------+----+



In [9]:
# Right join ('right_outer' is an equivalent spelling): every row of tb is
# kept, with nulls in the ta columns where no name matches.
right_join = ta.join(tb, on=ta['name'] == tb['name'], how='right')
right_join.show()

+-----------+----+-----------+---+
|       name|  id|       name| id|
+-----------+----+-----------+---+
|       Luke|   1|       Luke|  3|
|       null|null|      Lando|  2|
|       null|null|       Yoda|  1|
|Darth Vader|   4|Darth Vader|  4|
+-----------+----+-----------+---+



In [10]:
# Full join ('full_outer' is an equivalent spelling): rows from both tables
# are kept, with nulls on whichever side has no matching name.
full_outer_join = ta.join(tb, on=ta['name'] == tb['name'], how='full')
full_outer_join.show()

+-----------+----+-----------+----+
|       name|  id|       name|  id|
+-----------+----+-----------+----+
|       Leia|   3|       null|null|
|       Luke|   1|       Luke|   3|
|       null|null|      Lando|   2|
|       null|null|       Yoda|   1|
|        Han|   2|       null|null|
|Darth Vader|   4|Darth Vader|   4|
+-----------+----+-----------+----+



In [11]:
# Left semi join: rows of ta whose name also occurs in tb; only ta's columns
# appear in the result.
left_semi_join = ta.join(tb, on=ta['name'] == tb['name'], how='leftsemi')
left_semi_join.show()

+-----------+---+
|       name| id|
+-----------+---+
|       Luke|  1|
|Darth Vader|  4|
+-----------+---+



In [12]:
# Two (name, id, leavedate) tables in which names repeat; inputs for the
# left semi join example.
valuesC = [
    ('R2D2', 4, 18),
    ('Obi Wan', 4, 17),
    ('Darth Vader', 4, 15),
    ('Darth Vader', 4, 12),
    ('C3PO', 4, 11),
]
valuesD = [
    ('Darth Vader', 4, 11),
    ('Darth Maul', 4, 10),
    ('Darth Sidious', 4, 25),
    ('Darth Vader', 4, 22),
    ('Darth Vader', 4, 21),
]

TableC = spark.createDataFrame(valuesC, schema=['name', 'id', 'leavedate'])

# TableD's columns get a 'B' suffix so they are distinct from TableC's.
TableD = (
    spark.createDataFrame(valuesD, schema=['name', 'id', 'leavedate'])
    .selectExpr("name as nameB", "id as idB", "leavedate as leavedateB")
)

TableC.show()
TableD.show()

+-----------+---+---------+
|       name| id|leavedate|
+-----------+---+---------+
|       R2D2|  4|       18|
|    Obi Wan|  4|       17|
|Darth Vader|  4|       15|
|Darth Vader|  4|       12|
|       C3PO|  4|       11|
+-----------+---+---------+

+-------------+---+----------+
|        nameB|idB|leavedateB|
+-------------+---+----------+
|  Darth Vader|  4|        11|
|   Darth Maul|  4|        10|
|Darth Sidious|  4|        25|
|  Darth Vader|  4|        22|
|  Darth Vader|  4|        21|
+-------------+---+----------+



In [13]:
# Left semi join of TableC against TableD on the name columns: keeps TableC's
# columns for the rows whose name also occurs in TableD.
# The join is built once and reused for both the plan and the result, instead
# of repeating the same join expression twice.
joinedl = TableC.join(TableD, TableC.name == TableD.nameB, how='leftsemi')
joinedl.explain()  # print the physical execution plan Spark will run
joinedl.show()

== Physical Plan ==
SortMergeJoin [name#146], [nameB#169], LeftSemi
:- *(2) Sort [name#146 ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(name#146, 200)
:     +- *(1) Filter isnotnull(name#146)
:        +- Scan ExistingRDD[name#146,id#147L,leavedate#148L]
+- *(4) Sort [nameB#169 ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(nameB#169, 200)
      +- *(3) Project [name#152 AS nameB#169]
         +- Scan ExistingRDD[name#152,id#153L,leavedate#154L]
+-----------+---+---------+
|       name| id|leavedate|
+-----------+---+---------+
|Darth Vader|  4|       15|
|Darth Vader|  4|       12|
+-----------+---+---------+

