In [None]:
""" Notebook Information """
# References: https://github.com/XD-DENG/SQL-exercise 
# Workbook Title: PySpark SQL Exercises - Set 5

### PySpark Setup
--------------------------------------------
#### Installing relevant libraries; Instantiating a PySpark session; Creating a SparkSession

In [14]:
""" Importing libraries """
import pandas as pd 
import numpy as np 
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext 
from pyspark.sql import SparkSession

In [15]:
""" Instantiate a SparkContext """
sc = SparkContext.getOrCreate()

## Print the Spark version
print(sc.version)

3.0.3


In [16]:
""" Creating a SparkSession """ 
spark = SparkSession.builder.appName('JoinsTutorial').getOrCreate()

### PySpark Dataframes
--------------------------------------------
#### Creating tables as PySpark dataframes

In [17]:
""" Building the schema """
# Table 1 - Pieces Table 
data1 = [['HAL','Clarke Enterprises'],
        ['RBT','Susan Calvin Corp.'], 
        ['TNBC','Skellington Supplies']]
  
# specify column names
columns = ['Code', 'Name']
  
# creating a df1 from the lists of data
df1 = spark.createDataFrame(data1, columns)

# Table 2 - Providers Table 
data2 = [[1,'Sprocket'],
         [2,'Screw'],
         [3,'Nut'],
         [4,'Bolt']]
  
# specify column names
columns = ['Code', 'Name']
  
# creating a df1 from the lists of data
df2 = spark.createDataFrame(data2, columns)

# Table 3 - Provides Table 
data3 = [[1,'HAL',10],
         [1,'RBT',15],
         [2,'HAL',20],
         [2,'RBT',15],
         [2,'TNBC',14],
         [3,'RBT',50],
         [3,'TNBC',45],
         [4,'HAL',5],
         [4,'RBT',7]]
  
# specify column names
columns = ['Piece', 'Provider', 'Price']
  
# creating a df1 from the lists of data
df3 = spark.createDataFrame(data3, columns)

In [18]:
""" Creating Temporary Tables to be used in spark.sql """
## setting temporary views by creating dataframes
# creating a view for df1 named Manufacturers
df1.createOrReplaceTempView("Pieces")
  
# creating a view for df2 named Providers
df2.createOrReplaceTempView("Providers")

# creating a view for df2 named Provides
df3.createOrReplaceTempView("Provides")

### Queries - Set 5
--------------------------------------------

In [19]:
""" Checking the Pieces table """

# -- Checking the first 5 records of the Pieces table 

query = """
        SELECT *
        FROM Pieces
        LIMIT 5
       """

spark.sql(query).show(truncate=False)

+----+--------------------+
|Code|Name                |
+----+--------------------+
|HAL |Clarke Enterprises  |
|RBT |Susan Calvin Corp.  |
|TNBC|Skellington Supplies|
+----+--------------------+



In [20]:
""" Checking the Providers table """

# -- Checking the first 5 records of the Providers table 

query = """
        SELECT *
        FROM Providers
        LIMIT 5
       """

spark.sql(query).show(truncate=False)

+----+--------+
|Code|Name    |
+----+--------+
|1   |Sprocket|
|2   |Screw   |
|3   |Nut     |
|4   |Bolt    |
+----+--------+



In [21]:
""" Checking the Provides table """

# -- Checking the first 5 records of the Provides table 

query = """
        SELECT *
        FROM Provides
        LIMIT 5
       """

spark.sql(query).show(truncate=False)

+-----+--------+-----+
|Piece|Provider|Price|
+-----+--------+-----+
|1    |HAL     |10   |
|1    |RBT     |15   |
|2    |HAL     |20   |
|2    |RBT     |15   |
|2    |TNBC    |14   |
+-----+--------+-----+



In [22]:
""" Query 5.1 """

# -- Get the name of all the pieces 

query = """
        SELECT DISTINCT Name
        FROM Pieces
       """

spark.sql(query).show(truncate=False)


+--------------------+
|Name                |
+--------------------+
|Susan Calvin Corp.  |
|Clarke Enterprises  |
|Skellington Supplies|
+--------------------+



In [24]:
""" Query 5.2 """

# -- Get the name of the providers

query = """
        SELECT DISTINCT Provider
        FROM Provides
       """

spark.sql(query).show(truncate=False)


+--------+
|Provider|
+--------+
|HAL     |
|TNBC    |
|RBT     |
+--------+



In [26]:
""" Query 5.3 """

# -- Get the average price of each piece (show only the piece code and the average price)

query = """
        SELECT Piece, AVG(Price) AS Average_price
        FROM Provides
        Group BY Piece
       """

spark.sql(query).show(truncate=False)


+-----+------------------+
|Piece|Average_price     |
+-----+------------------+
|1    |12.5              |
|3    |47.5              |
|2    |16.333333333333332|
|4    |6.0               |
+-----+------------------+



In [33]:
""" Query 5.4 - Subquery Approach """

# -- Get the name of all providers who will supply piece number 1

# Note: This can be solved using a join approach or a subquery approach 

query = """
        SELECT DISTINCT Name
        FROM Providers 
        WHERE Code IN (SELECT Piece
                       FROM Provides
                       WHERE Piece = 1)

       """

spark.sql(query).show(truncate=False)


+--------+
|Name    |
+--------+
|Sprocket|
+--------+



In [36]:
""" Query 5.4 - Join Approach """

# -- Get the name of all providers who will supply piece number 1

# Note: This can be solved using a join approach or a subquery approach 

query = """
        SELECT DISTINCT Name
        FROM Providers AS A INNER JOIN Provides AS B ON A.Code = B.Piece
        WHERE B.Piece = 1
       """

spark.sql(query).show(truncate=False)


+--------+
|Name    |
+--------+
|Sprocket|
+--------+



In [43]:
""" Query 5.5 - Subquery Approach """

# -- Select the name of pieces provided by provider with code "HAL".

query = """
        SELECT Name
        FROM Providers
        WHERE Code IN (SELECT Piece
                       FROM Provides
                       WHERE Provider = 'HAL')
       """

spark.sql(query).show(truncate=False)


+--------+
|Name    |
+--------+
|Sprocket|
|Screw   |
|Bolt    |
+--------+



In [46]:
""" Query 5.5 - Subquery Approach """

# -- Select the name of pieces provided by provider with code "HAL".
# Note: Two table are needed for this - Providers and Provides 

query = """
        SELECT Name
        FROM Providers AS A INNER JOIN Provides AS B ON A.Code = B.Piece
        WHERE B.Provider = 'HAL'
       """

spark.sql(query).show(truncate=False)


+--------+
|Name    |
+--------+
|Sprocket|
|Screw   |
|Bolt    |
+--------+



In [56]:
""" Query 5.6 """ # Look into this again 

# -- For each piece, find the most expensive offering of that piece and include the piece name, provider name, and price 

# Note: Tables needed Pieces, Provider and Provides

# Step 1: Get the master table that includes the pieces, provider and provides columns

query = """
        SELECT *
        FROM Providers AS A INNER JOIN Provides AS B ON A.Code = B.Piece
                            INNER JOIN Pieces AS C ON B.Provider = C.Code
        WHERE Price = (SELECT MAX(Price)
                       FROM Provides)
       """

spark.sql(query).show(truncate=False)


+----+----+-----+--------+-----+----+------------------+
|Code|Name|Piece|Provider|Price|Code|Name              |
+----+----+-----+--------+-----+----+------------------+
|3   |Nut |3    |RBT     |50   |RBT |Susan Calvin Corp.|
+----+----+-----+--------+-----+----+------------------+

