In [1]:
""" Notebook Information """
# References: https://www.youtube.com/watch?v=m1KcNV-Zhmc&ab_channel=AlexTheAnalyst ; https://github.com/AlexTheAnalyst/SQL-Code
# Workbook Title: PySpark Joins Tutorial - PySpark and SQL syntax
# NOTE(review): the query cells in this notebook demonstrate SQL subqueries and the
# Spark app is named 'AdvancedSQL_Tutorial' -- confirm whether the "Joins" title is stale.

' Notebook Information '

### PySpark Setup
--------------------------------------------
#### Importing relevant libraries; Instantiating a SparkContext; Creating a SparkSession

In [2]:
""" Importing libraries """
# NOTE(review): pandas and numpy are not referenced by any cell visible in this notebook.
import pandas as pd 
import numpy as np 
# findspark.init() is deliberately called before the pyspark imports below;
# presumably it makes the local Spark installation importable -- keep this order.
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext 
from pyspark.sql import SparkSession

In [3]:
""" Instantiate a SparkContext """
# getOrCreate() hands back the SparkContext that is already running, or starts a
# new one if none exists, so this cell can be re-run safely.
sc = SparkContext.getOrCreate()

## Print the Spark version
# (sanity check that the context is up; the recorded run printed 3.0.3)
print(sc.version)

3.0.3


In [4]:
""" Creating a SparkSession """ 
# `spark` is the session object every later cell uses (createDataFrame, spark.sql).
# The application is registered under the name 'AdvancedSQL_Tutorial'.
spark = SparkSession.builder.appName('AdvancedSQL_Tutorial').getOrCreate()

### PySpark Dataframes
--------------------------------------------
#### Creating tables as PySpark dataframes

In [5]:
""" Creating EmployeeDemographics table """
# Column names, in the same order as the values inside each row below
columns = ['EmployeeID', 'FirstName', 'LastName', 'Age', 'Gender']

# EmployeeDemographics rows: one inner list per employee
data1 = [
    [1001, 'Jim', 'Halpert', 30, 'Male'],
    [1002, 'Pam', 'Beasley', 30, 'Female'],
    [1003, 'Dwight', 'Schrute', 29, 'Male'],
    [1004, 'Angela', 'Martin', 31, 'Female'],
    [1005, 'Toby', 'Flenderson', 32, 'Male'],
    [1006, 'Michael', 'Scott', 35, 'Male'],
    [1007, 'Meredith', 'Palmer', 32, 'Female'],
    [1008, 'Stanley', 'Hudson', 38, 'Male'],
    [1009, 'Kevin', 'Malone', 31, 'Male'],
]

# Build df1 (EmployeeDemographics) from the rows, naming the columns via `columns`
df1 = spark.createDataFrame(data=data1, schema=columns)

In [6]:
""" Creating EmployeeSalary table """
# Column names, in the same order as the values inside each row below
# (this rebinds the notebook-level name `columns`)
columns = ['EmployeeID', 'JobTitle', 'Salary']

# EmployeeSalary rows: one inner list per employee
data2 = [
    [1001, 'Salesman', 45000],
    [1002, 'Receptionist', 36000],
    [1003, 'Salesman', 63000],
    [1004, 'Accountant', 47000],
    [1005, 'HR', 50000],
    [1006, 'Regional Manager', 65000],
    [1007, 'Supplier Relations', 41000],
    [1008, 'Salesman', 48000],
    [1009, 'Accountant', 42000],
]

# Build df2 (EmployeeSalary) from the rows, naming the columns via `columns`
df2 = spark.createDataFrame(data=data2, schema=columns)

In [7]:
""" Creating Temporary Tables to be used in spark.sql """
## registering the dataframes as temporary views so spark.sql queries can refer to them by name
# creating a view for df1 named EmployeeDemographics
df1.createOrReplaceTempView("EmployeeDemographics")
  
# creating a view for df2 named EmployeeSalary
df2.createOrReplaceTempView("EmployeeSalary")

### Queries 
--------------------------------------------

In [8]:
""" Checking the PySpark dataframe """

# checking the dataframe  - EmployeeDemographics Table 
# truncate=False prints each cell value in full instead of shortening long strings
df1.show(truncate=False)

+----------+---------+----------+---+------+
|EmployeeID|FirstName|LastName  |Age|Gender|
+----------+---------+----------+---+------+
|1001      |Jim      |Halpert   |30 |Male  |
|1002      |Pam      |Beasley   |30 |Female|
|1003      |Dwight   |Schrute   |29 |Male  |
|1004      |Angela   |Martin    |31 |Female|
|1005      |Toby     |Flenderson|32 |Male  |
|1006      |Michael  |Scott     |35 |Male  |
|1007      |Meredith |Palmer    |32 |Female|
|1008      |Stanley  |Hudson    |38 |Male  |
|1009      |Kevin    |Malone    |31 |Male  |
+----------+---------+----------+---+------+



In [9]:
""" Checking the PySpark dataframe """

# checking the dataframe  - EmployeeSalary Table 
# truncate=False prints each cell value in full (e.g. 'Supplier Relations' is not shortened)
df2.show(truncate=False)

+----------+------------------+------+
|EmployeeID|JobTitle          |Salary|
+----------+------------------+------+
|1001      |Salesman          |45000 |
|1002      |Receptionist      |36000 |
|1003      |Salesman          |63000 |
|1004      |Accountant        |47000 |
|1005      |HR                |50000 |
|1006      |Regional Manager  |65000 |
|1007      |Supplier Relations|41000 |
|1008      |Salesman          |48000 |
|1009      |Accountant        |42000 |
+----------+------------------+------+



In [15]:
""" Subqueries - In SELECT FROM and WHERE Statements """

# Get the employee ID, Salary and average salary of all employees
# The scalar subquery (SELECT AVG(Salary) FROM EmployeeSalary) does not reference the
# outer row, so the same overall average is repeated on every output row.
# The outer SELECT has no aggregate, so GROUP BY EmployeeID, Salary only collapses
# duplicate (EmployeeID, Salary) pairs.

query = """
        SELECT EmployeeID, Salary, (SELECT AVG(Salary) FROM EmployeeSalary) AS All_average_salary
        FROM EmployeeSalary
        GROUP BY EmployeeID, Salary
        Order BY 1,2 
       """

spark.sql(query).show(truncate=False)

# Note: Order BY 1,2 sorts by the first and second column which is equivalent to sorting based on EmployeeID and Salary

+----------+------+------------------+
|EmployeeID|Salary|All_average_salary|
+----------+------+------------------+
|1001      |45000 |48555.555555555555|
|1002      |36000 |48555.555555555555|
|1003      |63000 |48555.555555555555|
|1004      |47000 |48555.555555555555|
|1005      |50000 |48555.555555555555|
|1006      |65000 |48555.555555555555|
|1007      |41000 |48555.555555555555|
|1008      |48000 |48555.555555555555|
|1009      |42000 |48555.555555555555|
+----------+------+------------------+



In [19]:
""" Subqueries - In the SELECT statement """

# Get the employee ID and salaries of each employee along with the difference between the employee salary and average salary of all employees
# The same scalar subquery appears twice: once to display the overall average and once
# inside the subtraction that produces Salary_difference (negative = below average).

query = """
        SELECT EmployeeID, Salary, (SELECT AVG(Salary) FROM EmployeeSalary) AS All_average_salary, Salary - (SELECT AVG(Salary) FROM EmployeeSalary) AS Salary_difference
        FROM EmployeeSalary
        GROUP BY EmployeeID, Salary
        Order BY 1,2
       """

spark.sql(query).show(truncate=False)

# Note: Order BY 1,2 means that we are ordering the values based on the EmployeeID and Salary columns

+----------+------+------------------+-------------------+
|EmployeeID|Salary|All_average_salary|Salary_difference  |
+----------+------+------------------+-------------------+
|1001      |45000 |48555.555555555555|-3555.5555555555547|
|1002      |36000 |48555.555555555555|-12555.555555555555|
|1003      |63000 |48555.555555555555|14444.444444444445 |
|1004      |47000 |48555.555555555555|-1555.5555555555547|
|1005      |50000 |48555.555555555555|1444.4444444444453 |
|1006      |65000 |48555.555555555555|16444.444444444445 |
|1007      |41000 |48555.555555555555|-7555.555555555555 |
|1008      |48000 |48555.555555555555|-555.5555555555547 |
|1009      |42000 |48555.555555555555|-6555.555555555555 |
+----------+------+------------------+-------------------+



In [22]:
""" Subqueries - In the FROM statement """

# The inner query is used as a derived table aliased A. AVG(Salary) over () is a window
# aggregate with an empty window specification, so it averages over all rows of
# EmployeeSalary and attaches that value to each row as AllAvgSalary.
# The outer query then keeps only EmployeeID and AllAvgSalary (Salary is not projected).

query = """
        SELECT A.EmployeeID, A.AllAvgSalary
        FROM (SELECT EmployeeID, Salary, AVG(Salary) over () AllAvgSalary
              FROM EmployeeSalary) AS A 
       """

spark.sql(query).show(truncate=False)

+----------+------------------+
|EmployeeID|AllAvgSalary      |
+----------+------------------+
|1001      |48555.555555555555|
|1002      |48555.555555555555|
|1003      |48555.555555555555|
|1004      |48555.555555555555|
|1005      |48555.555555555555|
|1006      |48555.555555555555|
|1007      |48555.555555555555|
|1008      |48555.555555555555|
|1009      |48555.555555555555|
+----------+------------------+



In [25]:
""" Subqueries - In WHERE statement """

# Get the ID, job title and salary of every employee older than 30.
# Age lives in EmployeeDemographics, so the subquery collects the matching EmployeeIDs
# there and the outer query keeps the EmployeeSalary rows whose EmployeeID is in that set.
# The query has no ORDER BY, so the row order of the result is not guaranteed.

query = """
        SELECT EmployeeID, JobTitle, Salary
        FROM EmployeeSalary 
        WHERE EmployeeID IN (SELECT EmployeeID
                             FROM EmployeeDemographics
                             WHERE Age > 30)
       """

spark.sql(query).show(truncate=False)

# Note: You can also perform the above query using a join

+----------+------------------+------+
|EmployeeID|JobTitle          |Salary|
+----------+------------------+------+
|1009      |Accountant        |42000 |
|1007      |Supplier Relations|41000 |
|1005      |HR                |50000 |
|1008      |Salesman          |48000 |
|1004      |Accountant        |47000 |
|1006      |Regional Manager  |65000 |
+----------+------------------+------+



In [27]:
""" Subqueries - In FROM statement """

# NOTE(review): despite the label above, this query contains no subquery -- it simply
# previews the first 5 rows of EmployeeDemographics. Confirm whether the label is a
# copy-paste leftover or the intended subquery example is missing.

query = """
        SELECT *
        FROM EmployeeDemographics
        LIMIT 5
       """

spark.sql(query).show(truncate=False)

+----------+---------+----------+---+------+
|EmployeeID|FirstName|LastName  |Age|Gender|
+----------+---------+----------+---+------+
|1001      |Jim      |Halpert   |30 |Male  |
|1002      |Pam      |Beasley   |30 |Female|
|1003      |Dwight   |Schrute   |29 |Male  |
|1004      |Angela   |Martin    |31 |Female|
|1005      |Toby     |Flenderson|32 |Male  |
+----------+---------+----------+---+------+

