In [1]:
""" Notebook Information """
# References: https://github.com/XD-DENG/SQL-exercise 
# Workbook Title: PySpark SQL Exercises - Set 3

' Notebook Information '

### PySpark Setup
--------------------------------------------
#### Installing relevant libraries; Instantiating a PySpark session; Creating a SparkSession

In [2]:
""" Importing libraries """
import pandas as pd 
import numpy as np 
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext 
from pyspark.sql import SparkSession

In [3]:
""" Instantiate a SparkContext """
sc = SparkContext.getOrCreate()

## Print the Spark version
print(sc.version)

3.0.3


In [4]:
""" Creating a SparkSession """ 
spark = SparkSession.builder.appName('JoinsTutorial').getOrCreate()

### PySpark Dataframes
--------------------------------------------
#### Creating tables as PySpark dataframes

In [5]:
""" Building the schema """
# Table 1 - Warehouses Table 
data1 = [[1,'Chicago',3],
        [2,'Chicago',4], 
        [3,'New York',7],
        [4,'Los Angeles',2],
        [5,'San Francisco',8]]
  
# specify column names
columns = ['Code', 'Location', 'Capacity']
  
# creating a df1 from the lists of data
df1 = spark.createDataFrame(data1, columns)

# Table 2 - Boxes Table 
data2 = [['0MN7','Rocks',180,3],
         ['4H8P','Rocks',250,1],
         ['4RT3','Scissors',190,4],
         ['7G3H','Rocks',200,1],
         ['8JN6','Papers',75,1],
         ['8Y6U','Papers',50,3],
         ['9J6F','Papers',175,2],
         ['LL08','Rocks',140,4],
         ['P0H6','Scissors',125,1],
         ['P2T6','Scissors',150,2],
         ['TU55','Papers',90,5]]
  
# specify column names
columns = ['Code', 'Contents', 'Value', 'Warehouse']
  
# creating a df1 from the lists of data
df2 = spark.createDataFrame(data2, columns)

In [6]:
""" Creating Temporary Tables to be used in spark.sql """
## setting temporary views by creating dataframes
# creating a view for df1 named Manufacturers
df1.createOrReplaceTempView("Warehouses")
  
# creating a view for df2 named Products
df2.createOrReplaceTempView("Boxes")

In [7]:
""" Checking the column types for Table 1 - Warehouses """
df1.printSchema()

root
 |-- Code: long (nullable = true)
 |-- Location: string (nullable = true)
 |-- Capacity: long (nullable = true)



In [8]:
""" Checking the column types for Table 2 - Boxes """
df2.printSchema()

root
 |-- Code: string (nullable = true)
 |-- Contents: string (nullable = true)
 |-- Value: long (nullable = true)
 |-- Warehouse: long (nullable = true)



In [9]:
""" Checking the Warehouses table """ 

query = """
        SELECT *
        FROM Warehouses
        LIMIT 3
       """

spark.sql(query).show(truncate=False, n=5)

+----+--------+--------+
|Code|Location|Capacity|
+----+--------+--------+
|1   |Chicago |3       |
|2   |Chicago |4       |
|3   |New York|7       |
+----+--------+--------+



In [10]:
""" Checking the Boxes table """ 

query = """
        SELECT *
        FROM Boxes
        LIMIT 3
       """

spark.sql(query).show(truncate=False, n=5)

+----+--------+-----+---------+
|Code|Contents|Value|Warehouse|
+----+--------+-----+---------+
|0MN7|Rocks   |180  |3        |
|4H8P|Rocks   |250  |1        |
|4RT3|Scissors|190  |4        |
+----+--------+-----+---------+



### Queries - Set 3
--------------------------------------------

In [11]:
""" Query 3.1 """

# -- Select all warehouses.

query = """
        SELECT *
        FROM Warehouses
       """

spark.sql(query).show(truncate=False)

+----+-------------+--------+
|Code|Location     |Capacity|
+----+-------------+--------+
|1   |Chicago      |3       |
|2   |Chicago      |4       |
|3   |New York     |7       |
|4   |Los Angeles  |2       |
|5   |San Francisco|8       |
+----+-------------+--------+



In [12]:
""" Query 3.2 """

# -- Select all boxes with a value larger than $150.

query = """
        SELECT *
        FROM Boxes
       """

spark.sql(query).show(truncate=False)

+----+--------+-----+---------+
|Code|Contents|Value|Warehouse|
+----+--------+-----+---------+
|0MN7|Rocks   |180  |3        |
|4H8P|Rocks   |250  |1        |
|4RT3|Scissors|190  |4        |
|7G3H|Rocks   |200  |1        |
|8JN6|Papers  |75   |1        |
|8Y6U|Papers  |50   |3        |
|9J6F|Papers  |175  |2        |
|LL08|Rocks   |140  |4        |
|P0H6|Scissors|125  |1        |
|P2T6|Scissors|150  |2        |
|TU55|Papers  |90   |5        |
+----+--------+-----+---------+



In [13]:
""" Query 3.3 """

# -- Select all distinct contents in all the boxes.

query = """
        SELECT DISTINCT Contents
        FROM Boxes
       """

spark.sql(query).show(truncate=False)

+--------+
|Contents|
+--------+
|Rocks   |
|Papers  |
|Scissors|
+--------+



In [14]:
""" Query 3.4 """

# -- Select the average value of all the boxes.

query = """
        SELECT DISTINCT Contents
        FROM Boxes
       """

spark.sql(query).show(truncate=False)

+--------+
|Contents|
+--------+
|Rocks   |
|Papers  |
|Scissors|
+--------+



In [15]:
""" Query 3.5 """

# -- Select the warehouse code and the average value of the boxes in each warehouse.

query = """
        SELECT AVG(value) AS Average
        FROM Boxes
       """

spark.sql(query).show(truncate=False)

+------------------+
|Average           |
+------------------+
|147.72727272727272|
+------------------+



In [16]:
""" Query 3.6 """

# -- Same as previous exercise, but select only those warehouses where the average value of the boxes is greater than 150.

query = """
        SELECT warehouse, AVG(value) AS Average
        FROM Boxes
        GROUP BY warehouse
        HAVING AVG(value) > 150
       """

spark.sql(query).show(truncate=False)

+---------+-------+
|warehouse|Average|
+---------+-------+
|1        |162.5  |
|2        |162.5  |
|4        |165.0  |
+---------+-------+



In [17]:
""" Query 3.7 """

# -- Select the code of each box, along with the name of the city the box is located in.

query = """
        SELECT *
        FROM Boxes AS A INNER JOIN Warehouses AS B ON A.Warehouse = B.Code 
       """

spark.sql(query).show(truncate=False)

+----+--------+-----+---------+----+-------------+--------+
|Code|Contents|Value|Warehouse|Code|Location     |Capacity|
+----+--------+-----+---------+----+-------------+--------+
|TU55|Papers  |90   |5        |5   |San Francisco|8       |
|4H8P|Rocks   |250  |1        |1   |Chicago      |3       |
|7G3H|Rocks   |200  |1        |1   |Chicago      |3       |
|8JN6|Papers  |75   |1        |1   |Chicago      |3       |
|P0H6|Scissors|125  |1        |1   |Chicago      |3       |
|0MN7|Rocks   |180  |3        |3   |New York     |7       |
|8Y6U|Papers  |50   |3        |3   |New York     |7       |
|9J6F|Papers  |175  |2        |2   |Chicago      |4       |
|P2T6|Scissors|150  |2        |2   |Chicago      |4       |
|4RT3|Scissors|190  |4        |4   |Los Angeles  |2       |
|LL08|Rocks   |140  |4        |4   |Los Angeles  |2       |
+----+--------+-----+---------+----+-------------+--------+



In [18]:
""" Query 3.8 """

# -- Select the warehouse codes, along with the number of boxes in each warehouse. 

query = """
        SELECT Warehouse, COUNT(*) AS Boxes_count
        FROM Boxes
        GROUP BY Warehouse
       """

spark.sql(query).show(truncate=False)

+---------+-----------+
|Warehouse|Boxes_count|
+---------+-----------+
|5        |1          |
|1        |4          |
|3        |2          |
|2        |2          |
|4        |2          |
+---------+-----------+



In [44]:
""" Query 3.9 """ # CONTINUE FROM HERE 

# -- Select the codes of all warehouses that are saturated (a warehouse is saturated if the number of boxes in it is larger than the warehouse's capacity).

query = """
        SELECT Code
        FROM Warehouses
        WHERE Capacity < (SELECT COUNT(*)
                          FROM Boxes
                          WHERE Warehouse = Warehouses.Code)
       """

spark.sql(query).show(truncate=False)

+----+
|Code|
+----+
|1   |
+----+



In [36]:
""" Query 3.10 - Join Approach """ 

# -- Select the codes and location of all the boxes located in Chicago with the

query = """
        SELECT A.Code
        FROM Boxes AS A INNER JOIN Warehouses AS B ON A.Warehouse = B.Code
        WHERE B.Location = 'Chicago'
       """

spark.sql(query).show(truncate=False)

+----+
|Code|
+----+
|4H8P|
|7G3H|
|8JN6|
|P0H6|
|9J6F|
|P2T6|
+----+



In [34]:
""" Query 3.10 - Subquery Approach """ # CONTINUE FROM HERE

# -- Select the codes of all the boxes located in Chicago with the

query = """
        SELECT Code
        FROM Boxes
        WHERE Warehouse IN (SELECT Code
                            FROM Warehouses
                            WHERE Location = 'Chicago')
       """

spark.sql(query).show(truncate=False)

+----+
|Code|
+----+
|4H8P|
|7G3H|
|8JN6|
|P0H6|
|9J6F|
|P2T6|
+----+



In [39]:
""" Query 3.11 - Join Approach """ 

# -- Select the average capacity of the boxes that are located in Chicago's warehouse

query = """
        SELECT AVG(Capacity) AS Average_capacity
        FROM Boxes AS A INNER JOIN Warehouses AS B ON A.Warehouse = B.Code
        WHERE B.Location = 'Chicago'
       """

spark.sql(query).show(truncate=False)

+------------------+
|Average_capacity  |
+------------------+
|3.3333333333333335|
+------------------+

