In [70]:
""" Notebook Information """
# References: https://www.geeksforgeeks.org/pyspark-join-types-join-two-dataframes/
# Workbook Title: PySpark Joins Tutorial - PySpark and SQL syntax

' Notebook Information '

### PySpark Setup
--------------------------------------------
#### Importing relevant libraries; Instantiating a SparkContext; Creating a SparkSession

In [1]:
""" Importing libraries """
import pandas as pd 
import numpy as np 
# NOTE: findspark.init() is deliberately called before the pyspark imports below;
# it is expected to locate the local Spark installation and make pyspark importable.
# Do not reorder these lines (e.g. with an import sorter).
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext 
from pyspark.sql import SparkSession

In [2]:
""" Instantiate a SparkContext """
# getOrCreate() returns the already-running SparkContext if there is one,
# so re-running this cell does not try to start a second context.
sc = SparkContext.getOrCreate()

## Print the Spark version
print(sc.version)

3.0.3


In [3]:
""" Creating a SparkSession """ 
# `spark` is the entry point used by the cells below, both for building
# dataframes (spark.createDataFrame) and for running SQL (spark.sql).
spark = SparkSession.builder.appName('JoinsTutorial').getOrCreate()

### PySpark Dataframes
--------------------------------------------
#### Creating tables as PySpark dataframes

In [4]:
""" Creating employees table """
# Employee records: one [ID, Name, Company] row per employee (all values are strings)
data1 = [
    ["1", "sravan", "company 1"],
    ["2", "ojaswi", "company 1"],
    ["3", "rohith", "company 2"],
    ["4", "sridevi", "company 1"],
    ["5", "bobby", "company 1"],
]

# Column names, listed in the same order as the values inside each row
columns = ['ID', 'Name', 'Company']

# Build the employees Spark dataframe (df1) from the rows and the column names
df1 = spark.createDataFrame(data=data1, schema=columns)

In [5]:
""" Creating department table """
# Department records: one [ID, Salary, Department] row each.
# ID stays a string so that it has the same type as the employees' ID in the joins.
# Salary is stored as an integer so that MAX / MIN / ORDER BY compare salaries
# numerically; as strings they would compare lexicographically ("45000" > "145000").
data2 = [["1", 45000, "IT"],
         ["2", 145000, "Manager"],
         ["6", 45000, "HR"],
         ["5", 34000, "Sales"]]

# specify column names
columns = ['ID', 'Salary', 'Department']

# creating df2 (the department dataframe) from the list of data
df2 = spark.createDataFrame(data2, columns)

In [6]:
""" Displaying the Employees dataframe (df1) """
# truncate=False prints the full content of every cell instead of shortening long values
df1.show(truncate=False)

+---+-------+---------+
|ID |Name   |Company  |
+---+-------+---------+
|1  |sravan |company 1|
|2  |ojaswi |company 1|
|3  |rohith |company 2|
|4  |sridevi|company 1|
|5  |bobby  |company 1|
+---+-------+---------+



In [7]:
""" Displaying the Department dataframe (df2) """
# truncate=False prints the full content of every cell instead of shortening long values
df2.show(truncate=False)

+---+------+----------+
|ID |Salary|Department|
+---+------+----------+
|1  |45000 |IT        |
|2  |145000|Manager   |
|6  |45000 |HR        |
|5  |34000 |Sales     |
+---+------+----------+



### PySpark Joins
--------------------------------------------
#### Creating various joins (Full, Left, Right and Inner) using PySpark syntax

In [78]:
""" Example 1 - Inner Join example """
# Inner join of df1 with df2 on ID: only IDs present in both dataframes are kept.
# Joining on an expression keeps the ID column of both sides in the result.
df1.join(other=df2, on=df1.ID == df2.ID, how="inner").show(truncate=False)

+---+------+---------+---+------+----------+
|ID |Name  |Company  |ID |Salary|Department|
+---+------+---------+---+------+----------+
|5  |bobby |company 1|5  |34000 |Sales     |
|1  |sravan|company 1|1  |45000 |IT        |
|2  |ojaswi|company 1|2  |145000|Manager   |
+---+------+---------+---+------+----------+



In [79]:
""" Example 2 - Full Join example """
# Full (outer) join of df1 with df2 on ID: rows from both dataframes are kept,
# with nulls filling the columns of the side that has no matching ID.
df1.join(other=df2, on=df1.ID == df2.ID, how="full").show(truncate=False)

+----+-------+---------+----+------+----------+
|ID  |Name   |Company  |ID  |Salary|Department|
+----+-------+---------+----+------+----------+
|3   |rohith |company 2|null|null  |null      |
|5   |bobby  |company 1|5   |34000 |Sales     |
|null|null   |null     |6   |45000 |HR        |
|1   |sravan |company 1|1   |45000 |IT        |
|4   |sridevi|company 1|null|null  |null      |
|2   |ojaswi |company 1|2   |145000|Manager   |
+----+-------+---------+----+------+----------+



In [80]:
""" Example 3 - Left Join example """
# Left join of df1 with df2 on ID: every df1 row is kept,
# and the df2 columns are null where df2 has no matching ID.
df1.join(other=df2, on=df1.ID == df2.ID, how="left").show(truncate=False)

+---+-------+---------+----+------+----------+
|ID |Name   |Company  |ID  |Salary|Department|
+---+-------+---------+----+------+----------+
|3  |rohith |company 2|null|null  |null      |
|5  |bobby  |company 1|5   |34000 |Sales     |
|1  |sravan |company 1|1   |45000 |IT        |
|4  |sridevi|company 1|null|null  |null      |
|2  |ojaswi |company 1|2   |145000|Manager   |
+---+-------+---------+----+------+----------+



In [81]:
""" Example 4 - Right join example """
# Right join of df1 with df2 on ID: every df2 row is kept,
# and the df1 columns are null where df1 has no matching ID.
df1.join(other=df2, on=df1.ID == df2.ID, how="right").show(truncate=False)

+----+------+---------+---+------+----------+
|ID  |Name  |Company  |ID |Salary|Department|
+----+------+---------+---+------+----------+
|5   |bobby |company 1|5  |34000 |Sales     |
|null|null  |null     |6  |45000 |HR        |
|1   |sravan|company 1|1  |45000 |IT        |
|2   |ojaswi|company 1|2  |145000|Manager   |
+----+------+---------+---+------+----------+



### SQL Tables Setup in PySpark
--------------------------------------------
#### Creating Temporary Tables for spark.sql - these tables will be used for SQL querying

In [82]:
""" Creating Temporary Tables to be used in spark.sql """
# Register each dataframe as a temporary view so it can be referenced by name in spark.sql:
#   df1 -> "Employees", df2 -> "Department"
for view_name, frame in (("Employees", df1), ("Department", df2)):
    # createOrReplaceTempView overwrites an existing view of the same name,
    # so this cell can be re-run safely
    frame.createOrReplaceTempView(view_name)

### SQL Queries in PySpark
--------------------------------------------
#### Queries: SELECT; FROM; WHERE; GROUP BY; HAVING; IN; ORDER BY; Aggregate functions: AVG, MAX, MIN; Joins: Full, Left, Right, Inner

In [83]:
""" Example 5 - Left Join based on ID, selecting specific columns """
# NOTE(review): this cell used to be titled "Inner Join", but the query performs a LEFT JOIN:
# every Employees row is returned, with null Department/Salary where no Department row matches.
# Replace LEFT JOIN with INNER JOIN in the query to get an actual inner join.
query = "SELECT E.ID, E.NAME, D.Department, D.Salary FROM Employees AS E LEFT JOIN Department AS D on E.ID == D.ID" # Selecting columns from both tables, combined with a left join

spark.sql(query).show(truncate=False)

+---+-------+----------+------+
|ID |NAME   |Department|Salary|
+---+-------+----------+------+
|3  |rohith |null      |null  |
|5  |bobby  |Sales     |34000 |
|1  |sravan |IT        |45000 |
|4  |sridevi|null      |null  |
|2  |ojaswi |Manager   |145000|
+---+-------+----------+------+



In [84]:
""" Example 6 - Left Join based on ID """
# Keeps every Employees row; the Department columns are null where no Department row has the same ID.
# SELECT * returns the columns of both tables, so ID appears twice in the result.
query = "SELECT * FROM Employees AS E LEFT JOIN Department as D on E.ID == D.ID"

spark.sql(query).show(truncate=False)

+---+-------+---------+----+------+----------+
|ID |Name   |Company  |ID  |Salary|Department|
+---+-------+---------+----+------+----------+
|3  |rohith |company 2|null|null  |null      |
|5  |bobby  |company 1|5   |34000 |Sales     |
|1  |sravan |company 1|1   |45000 |IT        |
|4  |sridevi|company 1|null|null  |null      |
|2  |ojaswi |company 1|2   |145000|Manager   |
+---+-------+---------+----+------+----------+



In [85]:
""" Example 7 - Right Join based on ID """
# Keeps every Department row; the Employees columns are null where no Employees row has the same ID.
# SELECT * returns the columns of both tables, so ID appears twice in the result.
query = "SELECT * FROM Employees AS E RIGHT JOIN Department AS D ON E.ID == D.ID"

spark.sql(query).show(truncate=False) 

+----+------+---------+---+------+----------+
|ID  |Name  |Company  |ID |Salary|Department|
+----+------+---------+---+------+----------+
|5   |bobby |company 1|5  |34000 |Sales     |
|null|null  |null     |6  |45000 |HR        |
|1   |sravan|company 1|1  |45000 |IT        |
|2   |ojaswi|company 1|2  |145000|Manager   |
+----+------+---------+---+------+----------+



In [86]:
""" Example 8 - Full Join based on ID """
# Keeps the rows of both tables; columns of the side without a matching ID are null.
# SELECT * returns the columns of both tables, so ID appears twice in the result.
query = "SELECT * FROM Employees AS E FULL JOIN Department AS D ON E.ID == D.ID"

spark.sql(query).show(truncate=False)

+----+-------+---------+----+------+----------+
|ID  |Name   |Company  |ID  |Salary|Department|
+----+-------+---------+----+------+----------+
|3   |rohith |company 2|null|null  |null      |
|5   |bobby  |company 1|5   |34000 |Sales     |
|null|null   |null     |6   |45000 |HR        |
|1   |sravan |company 1|1   |45000 |IT        |
|4   |sridevi|company 1|null|null  |null      |
|2   |ojaswi |company 1|2   |145000|Manager   |
+----+-------+---------+----+------+----------+



In [87]:
""" Example - SQL Query """ # SELECT FROM WHERE IN ORDER BY
# ID values are created as strings in this notebook, so:
#   - the IN list uses string literals instead of relying on an implicit string/int coercion;
#   - ORDER BY casts ID to INT so rows are sorted numerically
#     (sorted as strings, '6' would rank above '10' in descending order).
query = "SELECT * FROM Department WHERE ID IN ('1', '5', '6') ORDER BY CAST(ID AS INT) DESC"

spark.sql(query).show(truncate=False)

+---+------+----------+
|ID |Salary|Department|
+---+------+----------+
|6  |45000 |HR        |
|5  |34000 |Sales     |
|1  |45000 |IT        |
+---+------+----------+



In [88]:
""" Example - SQL Query """ # SELECT FROM WHERE 
# Return the Employees rows whose Name is "bobby".
# NOTE(review): the double-quoted "bobby" is read as a string literal by Spark SQL here; the
# 'Manager' filter further down uses single quotes and `=`, the more portable SQL spelling.
query = """
        SELECT * 
        FROM Employees 
        WHERE NAME == "bobby"     
        """
spark.sql(query).show(truncate=False)

+---+-----+---------+
|ID |Name |Company  |
+---+-----+---------+
|5  |bobby|company 1|
+---+-----+---------+



In [89]:
""" Example - SQL Query """ # GROUP BY

# Find the average salary per department.
# Note: `Department` is both the name of the temporary view (FROM Department)
# and the name of one of its columns (SELECT / GROUP BY Department).
query = """
        SELECT Department, AVG(Salary) AS Avg_Salary
        FROM Department
        GROUP BY Department 
        """

spark.sql(query).show(truncate=False)

+----------+----------+
|Department|Avg_Salary|
+----------+----------+
|Sales     |34000.0   |
|HR        |45000.0   |
|IT        |45000.0   |
|Manager   |145000.0  |
+----------+----------+



In [90]:
""" Example - SQL Query using GROUP BY & ORDER BY """ 

# Find the average salary per department and return the 3 highest-paying departments.
# The query has no tie-breaker, so departments with equal averages may appear in any order.
query = """
        SELECT Department, AVG(Salary) AS Avg_Salary
        FROM Department 
        GROUP BY Department
        ORDER BY Avg_Salary DESC
        LIMIT 3
        """
spark.sql(query).show(truncate=False)

+----------+----------+
|Department|Avg_Salary|
+----------+----------+
|Manager   |145000.0  |
|HR        |45000.0   |
|IT        |45000.0   |
+----------+----------+



In [91]:
""" Example - SQL Query using AVG MAX MIN and GROUP BY """

# Find the average, maximum and minimum salary per department.
# MAX / MIN are computed on CAST(Salary AS INT): if Salary is a string column, MAX / MIN
# would compare text lexicographically ('45000' > '145000'); the cast makes the comparison
# numeric and is a no-op if Salary is already an integer.
query = """
        SELECT Department,
               AVG(Salary) AS Avg_Salary,
               MAX(CAST(Salary AS INT)) AS Max_Salary,
               MIN(CAST(Salary AS INT)) AS Min_Salary
        FROM Department
        GROUP BY Department
        """
spark.sql(query).show(truncate=False)

+----------+----------+----------+----------+
|Department|Avg_Salary|Max_Salary|Min_Salary|
+----------+----------+----------+----------+
|Sales     |34000.0   |34000     |34000     |
|HR        |45000.0   |45000     |45000     |
|IT        |45000.0   |45000     |45000     |
|Manager   |145000.0  |145000    |145000    |
+----------+----------+----------+----------+



In [92]:
""" Example - SQL Query """

# Print the average salary earned as a manager:
# filter the Department view to the rows whose Department column is 'Manager', then average Salary.

query = """
        SELECT AVG(Salary) AS Avg_Salary
        FROM Department 
        WHERE Department = 'Manager'
        """

spark.sql(query).show(truncate=False)

+----------+
|Avg_Salary|
+----------+
|145000.0  |
+----------+



In [95]:
""" Example - SQL Query """ 

# Print all employees that have a salary.
# The LEFT JOIN keeps every employee; the WHERE clause then drops the employees for which
# no salary was found (no matching Department row, or a null Salary).
# Salary is qualified as D.Salary so the filter does not depend on the column name
# being unique across the two tables.
# NOTE: `query` is reused by the following cells, so its name must not change.
query = (
    "SELECT E.ID, E.NAME, D.Department, D.Salary "
    "FROM Employees AS E "
    "LEFT JOIN Department AS D ON E.ID == D.ID "
    "WHERE D.Salary IS NOT NULL"
)

spark.sql(query).show(truncate=False)

+---+------+----------+------+
|ID |NAME  |Department|Salary|
+---+------+----------+------+
|5  |bobby |Sales     |34000 |
|1  |sravan|IT        |45000 |
|2  |ojaswi|Manager   |145000|
+---+------+----------+------+



In [96]:
""" Storing the result of the SQL query in a variable """
# spark.sql(query) returns a PySpark dataframe, which is kept in `df`; it is NOT a pandas dataframe.
# Do not append .show(truncate=False) here: show() only prints the rows and returns None,
# so `df` would end up holding None. Converting to pandas is a separate step (.toPandas()).
df = spark.sql(query)

+---+------+----------+------+
|ID |NAME  |Department|Salary|
+---+------+----------+------+
|5  |bobby |Sales     |34000 |
|1  |sravan|IT        |45000 |
|2  |ojaswi|Manager   |145000|
+---+------+----------+------+



### From PySpark Dataframe to Pandas Dataframe
--------------------------------------------
#### Converting PySpark Dataframe into Pandas Dataframe

In [101]:
""" Converting Spark dataframe to Pandas dataframe """
# Runs whatever SQL is currently stored in `query` (set by an earlier cell) and converts the
# resulting PySpark dataframe to a pandas dataframe. toPandas() loads all rows into local
# memory, so it should only be used on results that are small enough to fit.
pandasDF = spark.sql(query).toPandas()

In [102]:
""" Checking the Pandas Dataframe """
# head() shows at most the first 5 rows of the pandas dataframe
pandasDF.head()

Unnamed: 0,ID,NAME,Department,Salary
0,5,bobby,Sales,34000
1,1,sravan,IT,45000
2,2,ojaswi,Manager,145000
