In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Join Types").getOrCreate()

In [3]:
emp = spark.createDataFrame([
    (0,"John C", "2000-01-01", "D101"),
    (1,"Tom D", "2002-02-01", "D102"),
    (2,"Max", "2003-04-01", "D104"),
    (3,"Peter J", "2005-06-01", "D104"),
    (4,"Mark P", "2007-01-01", "D1022")
]).toDF("id","name","joining_date","dept_id")

In [4]:
dept = spark.createDataFrame([
    ("D101", "Support"),
    ("D102", "HR"),
    ("D103", "Marketing"),
    ("D104", "Sells")
]).toDF("id", "dept_name")

In [5]:
emp.show()

+---+-------+------------+-------+
| id|   name|joining_date|dept_id|
+---+-------+------------+-------+
|  0| John C|  2000-01-01|   D101|
|  1|  Tom D|  2002-02-01|   D102|
|  2|    Max|  2003-04-01|   D104|
|  3|Peter J|  2005-06-01|   D104|
|  4| Mark P|  2007-01-01|  D1022|
+---+-------+------------+-------+



In [6]:
dept.show()

+----+---------+
|  id|dept_name|
+----+---------+
|D101|  Support|
|D102|       HR|
|D103|Marketing|
|D104|    Sells|
+----+---------+



In [7]:
emp.join(dept, emp["dept_id"] == dept["id"], "inner").show()

+---+-------+------------+-------+----+---------+
| id|   name|joining_date|dept_id|  id|dept_name|
+---+-------+------------+-------+----+---------+
|  2|    Max|  2003-04-01|   D104|D104|    Sells|
|  3|Peter J|  2005-06-01|   D104|D104|    Sells|
|  0| John C|  2000-01-01|   D101|D101|  Support|
|  1|  Tom D|  2002-02-01|   D102|D102|       HR|
+---+-------+------------+-------+----+---------+



In [8]:
joinType = "inner"
joinCondition = emp["dept_id"] == dept["id"]
emp.join(dept, joinCondition, joinType).show()

+---+-------+------------+-------+----+---------+
| id|   name|joining_date|dept_id|  id|dept_name|
+---+-------+------------+-------+----+---------+
|  2|    Max|  2003-04-01|   D104|D104|    Sells|
|  3|Peter J|  2005-06-01|   D104|D104|    Sells|
|  0| John C|  2000-01-01|   D101|D101|  Support|
|  1|  Tom D|  2002-02-01|   D102|D102|       HR|
+---+-------+------------+-------+----+---------+



In [9]:
emp.join(dept, joinCondition).show()

+---+-------+------------+-------+----+---------+
| id|   name|joining_date|dept_id|  id|dept_name|
+---+-------+------------+-------+----+---------+
|  2|    Max|  2003-04-01|   D104|D104|    Sells|
|  3|Peter J|  2005-06-01|   D104|D104|    Sells|
|  0| John C|  2000-01-01|   D101|D101|  Support|
|  1|  Tom D|  2002-02-01|   D102|D102|       HR|
+---+-------+------------+-------+----+---------+



In [10]:
joinType = "outer"
joinCondition = emp["dept_id"] == dept["id"]
emp.join(dept, joinCondition, joinType).show()

+----+-------+------------+-------+----+---------+
|  id|   name|joining_date|dept_id|  id|dept_name|
+----+-------+------------+-------+----+---------+
|   2|    Max|  2003-04-01|   D104|D104|    Sells|
|   3|Peter J|  2005-06-01|   D104|D104|    Sells|
|   4| Mark P|  2007-01-01|  D1022|null|     null|
|null|   null|        null|   null|D103|Marketing|
|   0| John C|  2000-01-01|   D101|D101|  Support|
|   1|  Tom D|  2002-02-01|   D102|D102|       HR|
+----+-------+------------+-------+----+---------+



In [11]:
joinType = "left_outer"
joinCondition = emp["dept_id"] == dept["id"]
emp.join(dept, joinCondition, joinType).show()

+---+-------+------------+-------+----+---------+
| id|   name|joining_date|dept_id|  id|dept_name|
+---+-------+------------+-------+----+---------+
|  2|    Max|  2003-04-01|   D104|D104|    Sells|
|  3|Peter J|  2005-06-01|   D104|D104|    Sells|
|  4| Mark P|  2007-01-01|  D1022|null|     null|
|  0| John C|  2000-01-01|   D101|D101|  Support|
|  1|  Tom D|  2002-02-01|   D102|D102|       HR|
+---+-------+------------+-------+----+---------+



In [12]:
joinType = "right_outer"
joinCondition = emp["dept_id"] == dept["id"]
emp.join(dept, joinCondition, joinType).show()

+----+-------+------------+-------+----+---------+
|  id|   name|joining_date|dept_id|  id|dept_name|
+----+-------+------------+-------+----+---------+
|   2|    Max|  2003-04-01|   D104|D104|    Sells|
|   3|Peter J|  2005-06-01|   D104|D104|    Sells|
|null|   null|        null|   null|D103|Marketing|
|   0| John C|  2000-01-01|   D101|D101|  Support|
|   1|  Tom D|  2002-02-01|   D102|D102|       HR|
+----+-------+------------+-------+----+---------+



In [13]:
joinType = "left_semi"
joinCondition = emp["dept_id"] == dept["id"]
emp.join(dept, joinCondition, joinType).show()

+---+-------+------------+-------+
| id|   name|joining_date|dept_id|
+---+-------+------------+-------+
|  2|    Max|  2003-04-01|   D104|
|  3|Peter J|  2005-06-01|   D104|
|  0| John C|  2000-01-01|   D101|
|  1|  Tom D|  2002-02-01|   D102|
+---+-------+------------+-------+



In [14]:
joinType = "left_anti"
joinCondition = emp["dept_id"] == dept["id"]
emp.join(dept, joinCondition, joinType).show()

+---+------+------------+-------+
| id|  name|joining_date|dept_id|
+---+------+------------+-------+
|  4|Mark P|  2007-01-01|  D1022|
+---+------+------------+-------+



In [22]:
emp.createOrReplaceTempView("emp")
dept.createOrReplaceTempView("dept")
spark.sql("select * from emp NATURAL JOIN dept").show()

+---+----+------------+-------+---------+
| id|name|joining_date|dept_id|dept_name|
+---+----+------------+-------+---------+
+---+----+------------+-------+---------+



In [23]:
spark.sql("select * from emp CROSS JOIN dept").show()

+---+-------+------------+-------+----+---------+
| id|   name|joining_date|dept_id|  id|dept_name|
+---+-------+------------+-------+----+---------+
|  0| John C|  2000-01-01|   D101|D101|  Support|
|  0| John C|  2000-01-01|   D101|D102|       HR|
|  0| John C|  2000-01-01|   D101|D103|Marketing|
|  0| John C|  2000-01-01|   D101|D104|    Sells|
|  1|  Tom D|  2002-02-01|   D102|D101|  Support|
|  1|  Tom D|  2002-02-01|   D102|D102|       HR|
|  1|  Tom D|  2002-02-01|   D102|D103|Marketing|
|  1|  Tom D|  2002-02-01|   D102|D104|    Sells|
|  2|    Max|  2003-04-01|   D104|D101|  Support|
|  2|    Max|  2003-04-01|   D104|D102|       HR|
|  2|    Max|  2003-04-01|   D104|D103|Marketing|
|  2|    Max|  2003-04-01|   D104|D104|    Sells|
|  3|Peter J|  2005-06-01|   D104|D101|  Support|
|  3|Peter J|  2005-06-01|   D104|D102|       HR|
|  3|Peter J|  2005-06-01|   D104|D103|Marketing|
|  3|Peter J|  2005-06-01|   D104|D104|    Sells|
|  4| Mark P|  2007-01-01|  D1022|D101|  Support|
