In [2]:
from pyspark.sql import(
    functions as f,
    SparkSession,
    types as t
)

In [3]:
# Get the SparkSession for this notebook (an existing one is reused, otherwise
# one is created), registered under the application name "df_join".
spark = (
    SparkSession.builder
    .appName("df_join")
    .getOrCreate()
)

In [4]:
# Sample user rows, one list per user. Every field is a string, including
# the numeric-looking id in the first position.
# NOTE(review): "Neflix" looks like a misspelling of "Netflix"; it is left
# untouched here so the recorded cell outputs keep matching the data.
user_data = [
    ["1000", "Neville Hardy", "Apple"],
    ["2000", "Dacia Cohen", "Alphabet"],
    ["3000", "Elois Cox", "Neflix"],
    ["4000", "Junita Meyer", "Meta"],
    ["5000", "Cleora Banks", "Amazon"],
]

In [5]:
# Column names for the user rows, in the same order as the fields of each row.
user_col = ["id", "name", "company"]

In [6]:
# Turn the in-memory user rows into a Spark DataFrame, with `user_col`
# supplying the column names.
df_user = spark.createDataFrame(user_data, user_col)

In [7]:
# Print the user DataFrame as a text table (row limit and truncation are the
# `show` defaults, spelled out).
df_user.show(n=20, truncate=True)

+----+-------------+--------+
|  id|         name| company|
+----+-------------+--------+
|1000|Neville Hardy|   Apple|
|2000|  Dacia Cohen|Alphabet|
|3000|    Elois Cox|  Neflix|
|4000| Junita Meyer|    Meta|
|5000| Cleora Banks|  Amazon|
+----+-------------+--------+



In [8]:
# Sample salary rows, one list per employee id. Every field is a string,
# including the id and the salary amount.
# Id "6000" has no counterpart in the user rows, and user ids "4000" and
# "5000" have no row here, so the join types give different results.
salary_data = [
    ["1000", "150000", "engineer"],
    ["2000", "240000", "manager"],
    ["3000", "120000", "human resource"],
    ["6000", "100000", "sales"],
]

In [9]:
# Column names for the salary rows, in the same order as the fields of each row.
salary_col = ["id", "salary", "department"]

In [10]:
# Turn the in-memory salary rows into a Spark DataFrame, with `salary_col`
# supplying the column names, then print it as a text table.
df_salary = spark.createDataFrame(salary_data, salary_col)
df_salary.show()

+----+------+--------------+
|  id|salary|    department|
+----+------+--------------+
|1000|150000|      engineer|
|2000|240000|       manager|
|3000|120000|human resource|
|6000|100000|         sales|
+----+------+--------------+



In [13]:
# Inner join: only ids present in both frames survive.
# The join condition is an explicit equality expression, and the result
# carries the `id` column from each side.
df_user.join(
    other=df_salary,
    on=df_user["id"] == df_salary["id"],
    how="inner",
).show()

+----+-------------+--------+----+------+--------------+
|  id|         name| company|  id|salary|    department|
+----+-------------+--------+----+------+--------------+
|1000|Neville Hardy|   Apple|1000|150000|      engineer|
|2000|  Dacia Cohen|Alphabet|2000|240000|       manager|
|3000|    Elois Cox|  Neflix|3000|120000|human resource|
+----+-------------+--------+----+------+--------------+



In [14]:
# Inner join, then keep only the row for user 1000 using `filter`.
# `id` holds strings (the rows are built from "1000", not 1000), so the
# literal is a string as well: the comparison is string-to-string instead
# of depending on Spark's implicit string-to-number cast.
df_user.join(
    df_salary,
    df_user.id == df_salary.id,
    "inner",
).filter(df_user.id == "1000").show()

+----+-------------+-------+----+------+----------+
|  id|         name|company|  id|salary|department|
+----+-------------+-------+----+------+----------+
|1000|Neville Hardy|  Apple|1000|150000|  engineer|
+----+-------------+-------+----+------+----------+



In [15]:
# Inner join, then keep only the row for user 1000 using `where`
# (documented by PySpark as an alias of `filter`).
# `id` holds strings (the rows are built from "1000", not 1000), so the
# literal is a string as well: the comparison is string-to-string instead
# of depending on Spark's implicit string-to-number cast.
df_user.join(
    df_salary,
    df_user.id == df_salary.id,
    "inner",
).where(df_user.id == "1000").show()

+----+-------------+-------+----+------+----------+
|  id|         name|company|  id|salary|department|
+----+-------------+-------+----+------+----------+
|1000|Neville Hardy|  Apple|1000|150000|  engineer|
+----+-------------+-------+----+------+----------+



In [17]:
# Same lookup expressed purely as a join condition: the id equality and the
# user-1000 restriction are combined with `&` (each side parenthesised
# because `&` binds tighter than `==`).
# The join type is spelled out as "inner" - the default when omitted - so
# this cell reads like the other join cells. The id literal is a string
# because `id` holds strings, avoiding an implicit string-to-number cast.
df_user.join(
    df_salary,
    (df_user.id == df_salary.id) & (df_user.id == "1000"),
    "inner",
).show()

+----+-------------+-------+----+------+----------+
|  id|         name|company|  id|salary|department|
+----+-------------+-------+----+------+----------+
|1000|Neville Hardy|  Apple|1000|150000|  engineer|
+----+-------------+-------+----+------+----------+



In [18]:
# Full outer join: rows from both frames are kept; where an id exists on
# only one side, the other side's columns come back as null.
print("== full outer join ==")
df_user.join(
    other=df_salary,
    on=df_user["id"] == df_salary["id"],
    how="fullouter",
).show()

== full outer join ==
+----+-------------+--------+----+------+--------------+
|  id|         name| company|  id|salary|    department|
+----+-------------+--------+----+------+--------------+
|1000|Neville Hardy|   Apple|1000|150000|      engineer|
|2000|  Dacia Cohen|Alphabet|2000|240000|       manager|
|3000|    Elois Cox|  Neflix|3000|120000|human resource|
|4000| Junita Meyer|    Meta|null|  null|          null|
|5000| Cleora Banks|  Amazon|null|  null|          null|
|null|         null|    null|6000|100000|         sales|
+----+-------------+--------+----+------+--------------+



In [19]:
# Left join: every user row is kept; users without a salary row get null
# in the salary-side columns.
print("== left join ==")
df_user.join(
    other=df_salary,
    on=df_user["id"] == df_salary["id"],
    how="left",
).show()

== left join ==
+----+-------------+--------+----+------+--------------+
|  id|         name| company|  id|salary|    department|
+----+-------------+--------+----+------+--------------+
|1000|Neville Hardy|   Apple|1000|150000|      engineer|
|2000|  Dacia Cohen|Alphabet|2000|240000|       manager|
|3000|    Elois Cox|  Neflix|3000|120000|human resource|
|4000| Junita Meyer|    Meta|null|  null|          null|
|5000| Cleora Banks|  Amazon|null|  null|          null|
+----+-------------+--------+----+------+--------------+



In [20]:
# Right join: every salary row is kept; salary rows without a matching user
# get null in the user-side columns.
print("== right join ==")
df_user.join(
    other=df_salary,
    on=df_user["id"] == df_salary["id"],
    how="right",
).show()

== right join ==
+----+-------------+--------+----+------+--------------+
|  id|         name| company|  id|salary|    department|
+----+-------------+--------+----+------+--------------+
|1000|Neville Hardy|   Apple|1000|150000|      engineer|
|2000|  Dacia Cohen|Alphabet|2000|240000|       manager|
|3000|    Elois Cox|  Neflix|3000|120000|human resource|
|null|         null|    null|6000|100000|         sales|
+----+-------------+--------+----+------+--------------+



In [21]:
# Left semi join: keeps the user rows that have a matching salary id and
# returns only the user-side columns.
print("== left semi join ==")
df_user.join(
    other=df_salary,
    on=df_user["id"] == df_salary["id"],
    how="leftsemi",
).show()

== left semi join ==
+----+-------------+--------+
|  id|         name| company|
+----+-------------+--------+
|1000|Neville Hardy|   Apple|
|2000|  Dacia Cohen|Alphabet|
|3000|    Elois Cox|  Neflix|
+----+-------------+--------+



In [22]:
# Left anti join: keeps the user rows that have NO matching salary id and
# returns only the user-side columns.
print("== left anti join ==")
df_user.join(
    other=df_salary,
    on=df_user["id"] == df_salary["id"],
    how="leftanti",
).show()


== left anti join ==
+----+------------+-------+
|  id|        name|company|
+----+------------+-------+
|4000|Junita Meyer|   Meta|
|5000|Cleora Banks| Amazon|
+----+------------+-------+



In [23]:
# Register both DataFrames as temporary views so they can be queried by name
# ("user" and "salary") through spark.sql.
df_salary.createOrReplaceTempView(name="salary")
df_user.createOrReplaceTempView(name="user")

In [24]:
# SQL version of the inner join, written as a comma-separated FROM list with
# the id match in the WHERE clause.
(
    spark
    .sql("SELECT * FROM user, salary WHERE user.id == salary.id")
    .show()
)


+----+-------------+--------+----+------+--------------+
|  id|         name| company|  id|salary|    department|
+----+-------------+--------+----+------+--------------+
|1000|Neville Hardy|   Apple|1000|150000|      engineer|
|2000|  Dacia Cohen|Alphabet|2000|240000|       manager|
|3000|    Elois Cox|  Neflix|3000|120000|human resource|
+----+-------------+--------+----+------+--------------+



In [25]:
# SQL version of the inner join using explicit INNER JOIN ... ON syntax.
(
    spark
    .sql("SELECT * FROM user INNER JOIN salary ON user.id == salary.id")
    .show()
)

+----+-------------+--------+----+------+--------------+
|  id|         name| company|  id|salary|    department|
+----+-------------+--------+----+------+--------------+
|1000|Neville Hardy|   Apple|1000|150000|      engineer|
|2000|  Dacia Cohen|Alphabet|2000|240000|       manager|
|3000|    Elois Cox|  Neflix|3000|120000|human resource|
+----+-------------+--------+----+------+--------------+

