In [11]:
""" Notebook Information """
# References: https://github.com/XD-DENG/SQL-exercise 
# Workbook Title: PySpark SQL Exercises - Set 4

' Notebook Information '

### PySpark Setup
--------------------------------------------
#### Installing relevant libraries; Instantiating a PySpark session; Creating a SparkSession

In [12]:
""" Importing libraries """
import pandas as pd 
import numpy as np 
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext 
from pyspark.sql import SparkSession

In [13]:
""" Instantiate a SparkContext """
sc = SparkContext.getOrCreate()

## Print the Spark version
print(sc.version)

3.0.3


In [14]:
""" Creating a SparkSession """ 
spark = SparkSession.builder.appName('JoinsTutorial').getOrCreate()

### PySpark Dataframes
--------------------------------------------
#### Creating tables as PySpark dataframes

In [15]:
""" Building the schema """
# Table 1 - Movies Table 
data1 = [[1,'Citizen Kane','PG'],
        [2,'Singin'' in the Rain','G'], 
        [3,'The Wizard of Oz','G'],
        [4,'The Quiet Man','PG-13'],
        [5,'North by Northwest','PG-13'],
        [6,'The Last Tango in Paris','NC-17'],
        [7,'Some Like it Hot','PG-13'],
        [8,'A Night at the Opera','PG-13']]
  
# specify column names
columns = ['Code', 'Title', 'Rating']
  
# creating a df1 from the lists of data
df1 = spark.createDataFrame(data1, columns)

# Table 2 - MoviesTheaters Table 
data2 = [[1,'Odeon',5],
         [2,'Imperial',1],
         [3,'Majestic',4],
         [4,'Royale',6],
         [5,'Paraiso',3],
         [6,'Nickelodeon',7]]
  
# specify column names
columns = ['Code', 'Name', 'Movie']
  
# creating a df1 from the lists of data
df2 = spark.createDataFrame(data2, columns)

In [16]:
""" Creating Temporary Tables to be used in spark.sql """
## setting temporary views by creating dataframes
# creating a view for df1 named Manufacturers
df1.createOrReplaceTempView("Movies")
  
# creating a view for df2 named MoviesTheaters
df2.createOrReplaceTempView("MoviesTheaters")

In [17]:
""" Checking the column types for Table 1 - Movies """
df1.printSchema()

root
 |-- Code: long (nullable = true)
 |-- Title: string (nullable = true)
 |-- Rating: string (nullable = true)



In [18]:
""" Checking the column types for Table 2 - MoviesTheaters """
df2.printSchema()

root
 |-- Code: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Movie: long (nullable = true)



### Queries - Set 4
--------------------------------------------

In [19]:
""" Checking the Movies table """

# -- Select all warehouses.

query = """
        SELECT *
        FROM MoviesTheaters
        LIMIT 5
       """

spark.sql(query).show(truncate=False)

+----+-----------+-----+
|Code|Name       |Movie|
+----+-----------+-----+
|1   |Odeon      |5    |
|2   |Imperial   |1    |
|3   |Majestic   |4    |
|4   |Royale     |6    |
|5   |Paraiso    |3    |
|6   |Nickelodeon|7    |
+----+-----------+-----+



In [20]:
""" Checking the MoviesTheaters table """

# -- Select all MoviesTheaters.

query = """
        SELECT *
        FROM MoviesTheaters
        LIMIT 5
       """

spark.sql(query).show(truncate=False)

+----+--------+-----+
|Code|Name    |Movie|
+----+--------+-----+
|1   |Odeon   |5    |
|2   |Imperial|1    |
|3   |Majestic|4    |
|4   |Royale  |6    |
|5   |Paraiso |3    |
+----+--------+-----+



In [21]:
""" Query 4.1 """

# -- Select the title of all movies.

query = """
        SELECT *
        FROM Movies
       """

spark.sql(query).show(truncate=False)


+----+------------------+------+
|Code|Title             |Rating|
+----+------------------+------+
|1   |Citizen Kane      |PG    |
|2   |Singin in the Rain|G     |
|3   |The Wizard of Oz  |G     |
|4   |The Quiet Man     |PG-13 |
|5   |North by Northwest|PG-13 |
+----+------------------+------+



In [22]:
""" Query 4.2 """

# -- Show all the distinct ratings in the database.

query = """
        SELECT DISTINCT Rating
        FROM Movies
       """

spark.sql(query).show(truncate=False)


+------+
|Rating|
+------+
|PG    |
|NC-17 |
|G     |
|PG-13 |
+------+



In [25]:
""" Query 4.3 """

# -- Show all unrated movies.

query = """
        SELECT COUNT(Code) AS Unrated_Movies
        FROM Movies
        WHERE Rating IS NULL
       """

spark.sql(query).show(truncate=False)



+--------------+
|Unrated_Movies|
+--------------+
|0             |
+--------------+



In [29]:
""" Query 4.4 """

# -- Select all movie theaters that are not currently showing a movie.

query = """
        SELECT *
        FROM MoviesTheaters AS A LEFT JOIN Movies AS B ON A.Movie = B.Code
        WHERE Title IS NULL
       """

spark.sql(query).show(truncate=False)

+----+----+-----+----+-----+------+
|Code|Name|Movie|Code|Title|Rating|
+----+----+-----+----+-----+------+
+----+----+-----+----+-----+------+



In [32]:
""" Query 4.5 """

# -- Select all data from all movie theaters and, additionally, the data from the movie that is being shown in the theater (if one is being shown).

query = """
        SELECT A.Code, A.Name AS Theater_name, A.Movie, B.Title AS Movie_title, B.Rating
        FROM MoviesTheaters AS A LEFT JOIN Movies AS B ON A.Movie = B.Code
       """

spark.sql(query).show(truncate=False)

+----+------------+-----+-----------------------+------+
|Code|Theater_name|Movie|Movie_title            |Rating|
+----+------------+-----+-----------------------+------+
|6   |Nickelodeon |7    |Some Like it Hot       |PG-13 |
|4   |Royale      |6    |The Last Tango in Paris|NC-17 |
|1   |Odeon       |5    |North by Northwest     |PG-13 |
|2   |Imperial    |1    |Citizen Kane           |PG    |
|5   |Paraiso     |3    |The Wizard of Oz       |G     |
|null|null        |null |A Night at the Opera   |PG-13 |
|null|null        |null |Singin in the Rain     |G     |
|3   |Majestic    |4    |The Quiet Man          |PG-13 |
+----+------------+-----+-----------------------+------+



In [38]:
""" Query 4.6 """

# -- Select all data from all movies and, if that movie is being shown in a theater, show the data from the theater.

query = """
        SELECT A.Code, A.Name AS Theater_name, A.Movie, B.Title AS Movie_title, B.Rating
        FROM MoviesTheaters AS A RIGHT JOIN Movies AS B ON A.Movie = B.Code
       """

spark.sql(query).show(truncate=False)

# Note: You can also perform a left join if the Movies table is positioned on the left and MoviesTheaters is positioned on the right 

+----+------------+-----+-----------------------+------+
|Code|Theater_name|Movie|Movie_title            |Rating|
+----+------------+-----+-----------------------+------+
|6   |Nickelodeon |7    |Some Like it Hot       |PG-13 |
|4   |Royale      |6    |The Last Tango in Paris|NC-17 |
|1   |Odeon       |5    |North by Northwest     |PG-13 |
|2   |Imperial    |1    |Citizen Kane           |PG    |
|5   |Paraiso     |3    |The Wizard of Oz       |G     |
|null|null        |null |A Night at the Opera   |PG-13 |
|null|null        |null |Singin in the Rain     |G     |
|3   |Majestic    |4    |The Quiet Man          |PG-13 |
+----+------------+-----+-----------------------+------+



In [37]:
""" Query 4.7 - Join Approach """

# -- Show the titles of movies not currently being shown in any theaters.

query = """
        SELECT A.Title
        FROM Movies AS A LEFT JOIN MoviesTheaters AS B ON A.Code = B.Movie
        WHERE B.Code IS NULL
       """

spark.sql(query).show(truncate=False)


+--------------------+
|Title               |
+--------------------+
|A Night at the Opera|
|Singin in the Rain  |
+--------------------+



In [47]:
""" Query 4.7 - Subquery Approach """

# -- Show the titles of movies not currently being shown in any theaters.

query = """
        SELECT Title
        FROM Movies 
        WHERE Code NOT IN (SELECT Movie
                           FROM MoviesTheaters
                           WHERE Movie IS NOT NULL)
       """

spark.sql(query).show(truncate=False)


+--------------------+
|Title               |
+--------------------+
|Singin in the Rain  |
|A Night at the Opera|
+--------------------+

