In [None]:
""" Notebook Information """
# References: https://github.com/XD-DENG/SQL-exercise ; https://en.wikibooks.org/wiki/SQL_Exercises/Employee_management
# Workbook Title: PySpark SQL Exercises - Set 2

### PySpark Setup
--------------------------------------------
#### Installing relevant libraries; Instantiating a PySpark session; Creating a SparkSession

In [None]:
""" Importing libraries """
import pandas as pd 
import numpy as np 
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext 
from pyspark.sql import SparkSession

In [None]:
""" Instantiate a SparkContext """
# Reuse the SparkContext that is already running in this kernel, or start one if none exists,
# so that re-running this cell does not try to create a second context.
sc = SparkContext.getOrCreate()

## Print the Spark version
print(sc.version)

In [None]:
""" Creating a SparkSession """ 
# Build (or fetch the already existing) SparkSession; `spark` is the handle used by every
# spark.createDataFrame(...) and spark.sql(...) call further down.
# NOTE(review): the app name is 'JoinsTutorial' although this notebook is titled
# "PySpark SQL Exercises - Set 2" — confirm whether the name should be updated.
spark = SparkSession.builder.appName('JoinsTutorial').getOrCreate()

### PySpark Dataframes
--------------------------------------------
#### Creating tables as PySpark dataframes

In [None]:
""" Building the schema """
# Table 1 - Departments table: one row per department -> [Code, Name, Budget]
data1 = [[14,'IT',65000],
        [37,'Accounting',15000], 
        [59,'Human Resources',240000],
        [77,'Research',55000]]
  
# column names for the Departments rows
columns = ['Code', 'Name', 'Budget']
  
# creating df1 (Departments) from the lists of data; only names are given, so Spark infers the column types
df1 = spark.createDataFrame(data1, columns)

# Table 2 - Employees table: one row per employee -> [SSN, Name, LastName, Department]
# The Department field holds a department Code from Table 1 (it is the join key used in the queries below).
data2 = [[123234877,'Michael','Rogers',14],
         [152934485,'Anand','Manikutty',14],
         [222364883,'Carol','Smith',37],
         [326587417,'Joe','Stevens',37],
         [332154719,'Mary-Anne','Foster',14],
         [332569843,'George','ODonnell',77],
         [546523478,'John','Doe',59],
         [631231482,'David','Smith',77],
         [654873219,'Zacary','Efron',59],
         [745685214,'Eric','Goldsmith',59],
         [845657245,'Elizabeth','Doe',14],
         [845657246,'Kumar','Swamy',14]]
  
# column names for the Employees rows (this rebinds `columns`; df1 has already been built from the old value)
columns = ['SSN', 'Name', 'LastName', 'Department']
  
# creating df2 (Employees) from the lists of data
df2 = spark.createDataFrame(data2, columns)

In [None]:
""" Creating Temporary Tables to be used in spark.sql """
## Register both dataframes as temporary views so the SQL strings below can refer to them by name
# creating a view for df1 named Departments
df1.createOrReplaceTempView("Departments")
  
# creating a view for df2 named Employees
df2.createOrReplaceTempView("Employees")

In [None]:
""" Checking the column types for Table 1 - Departments """
# Print the column names and the types Spark inferred for df1 (Departments).
df1.printSchema()

In [None]:
""" Checking the column types for Table 2 - Employees """
# Print the column names and the types Spark inferred for df2 (Employees).
df2.printSchema()

In [None]:
""" Checking the tables """ 

# Preview of the Departments view.
# The SQL itself caps the result at 3 rows (LIMIT 3), so show(n=5) prints at most 3 rows here.
query = """
        SELECT *
        FROM Departments
        LIMIT 3
       """

# truncate=False prints full cell contents instead of shortening long strings
spark.sql(query).show(truncate=False, n=5)

In [None]:
""" Checking the tables """ 

# Preview of the Employees view: the SQL returns 5 rows (LIMIT 5) and show(n=5) prints all of them.
query = """
        SELECT *
        FROM Employees
        LIMIT 5
       """

# truncate=False prints full cell contents instead of shortening long strings
spark.sql(query).show(truncate=False, n=5)

### Queries - Set 2
--------------------------------------------

In [None]:
""" Query 2.1 """

# -- 2.1 Select the last name of all employees.
# Plain projection: one output row per employee, so a last name shared by several employees is listed once per employee.

query = """
        SELECT LastName
        FROM Employees
       """

spark.sql(query).show(truncate=False)

In [None]:
""" Query 2.2 """

# -- 2.2 Select the last name of all unique employees, without duplicates.
# DISTINCT collapses repeated last names into a single output row each.

query = """
        SELECT DISTINCT LastName
        FROM Employees
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.3 """

# -- 2.3 Select all the data of employees whose last name is "Smith".
# The string literal is written with single quotes: in standard SQL double quotes delimit
# identifiers, and Spark applies that rule when spark.sql.ansi.doubleQuotedIdentifiers is on,
# in which case "Smith" would be looked up as a column instead of being compared as text.

query = """
        SELECT *
        FROM Employees
        WHERE LastName = 'Smith'
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.4 """

# -- 2.4 Select all the data of employees whose last name is "Smith" or "Doe".
# The string literals are written with single quotes: in standard SQL double quotes delimit
# identifiers, and Spark applies that rule when spark.sql.ansi.doubleQuotedIdentifiers is on.

query = """
        SELECT *
        FROM Employees
        WHERE LastName IN ('Smith', 'Doe')
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.5 """

# -- 2.5 Select all the data of employees that work in department 14.
# Department is built from Python ints, so it is a numeric column; compare it with an integer
# literal instead of the string '14' so the filter does not depend on an implicit string-to-number cast.

query = """
        SELECT *
        FROM Employees
        WHERE Department = 14
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.6 """

# -- 2.6 Select all the data of employees that work in department 37 or department 77.
# IN with a list of integer literals is shorthand for "Department = 37 OR Department = 77".

query = """
        SELECT *
        FROM Employees
        WHERE Department IN (37,77)
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.7 """

# -- 2.7 Select all the data of employees whose last name begins with an "S".
# In a LIKE pattern '%' matches any run of characters, so 'S%' means "starts with S".

query = """
        SELECT *
        FROM Employees
        WHERE LastName LIKE 'S%'
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.8 """

# -- 2.8 Select the sum of all the departments' budgets.
# A single aggregate over the whole table (no GROUP BY) returns one row holding the grand total;
# grouping by Name would instead return each department's own budget.

query = """
        SELECT SUM(Budget) AS Total_budget
        FROM Departments
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.9 """

# -- 2.9 Select the number of employees in each department (you only need to show the department code and the number of employees).
# Employees.Department already holds the department code, so the count can be taken from the
# Employees view alone: one group per code, COUNT(*) rows in each group.

query = """
        SELECT Department AS Department_code, COUNT(*) AS Number_of_employees
        FROM Employees
        GROUP BY Department
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.10 """

# -- 2.10 Select all the data of employees, including each employee's department's data.
# A.* brings every employee column; the department columns are listed explicitly and aliased so
# that the result has no duplicated or ambiguous column names (both views have a "Name" column).

query = """
        SELECT A.*, B.Code AS Department_code, B.Name AS Department_name, B.Budget AS Department_budget
        FROM Employees AS A INNER JOIN Departments AS B ON A.Department = B.Code
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.11 """

# -- 2.11 Select the name and last name of each employee, along with the name and budget of the employee's department.
# Both views have a "Name" column, so each one is qualified with its table alias and renamed in the output.

query = """
        SELECT A.Name AS FirstName, A.LastName, B.Name AS Department_name, B.Budget AS Department_budget
        FROM Employees AS A INNER JOIN Departments AS B ON A.Department = B.Code
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.12 """

# -- 2.12 Select the name and last name of employees working for departments with a budget greater than $60,000.
# Same join as 2.11 with a filter on the department budget; the department name and budget are
# also selected, which goes beyond what the exercise asks for but makes the filter visible in the output.

query = """
        SELECT A.Name AS FirstName, A.LastName, B.Name AS Department_name, B.Budget AS Department_budget
        FROM Employees AS A INNER JOIN Departments AS B ON A.Department = B.Code
        WHERE B.Budget > 60000
       """

spark.sql(query).show(truncate=False)

In [None]:
""" Query 2.13 """

# -- 2.13 Select the departments with a budget larger than the average budget of all the departments.

# Creating the sub-query
# (shown on its own for illustration only: `sub_query` is never executed, the same SELECT is
# written inline inside `query` below)
sub_query = """
        SELECT AVG(Budget) AS Average_budget
        FROM Departments
       """

# The parenthesised SELECT is a scalar subquery: it yields the single average value that each
# department's Budget is compared against.
query = """
        SELECT *
        FROM Departments
        WHERE Budget > (SELECT AVG(Budget) AS Average_budget
                      FROM Departments)
       """

spark.sql(query).show(truncate=False)

In [None]:
""" Query 2.14 - Subquery Approach """

# -- 2.14 Select the names of departments with more than two employees.

# Creating the sub-query to get the departments that have more than two employees 

# Step 1: Getting the department codes that occur on more than two employee rows
# (shown on its own for illustration only: `sub_query` is never executed, the same SELECT is
# written inline inside `query` below)
sub_query = """
        SELECT Department
        FROM Employees
        GROUP BY Department
        HAVING COUNT(*) > 2
        """

# Step 2: Getting the names of the departments whose Code is in the set returned by Step 1
query = """
        SELECT Name
        FROM Departments
        WHERE Code IN ( SELECT Department
                        FROM Employees
                        GROUP BY Department
                        HAVING COUNT(*) > 2 )
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.14 - Join Approach """ 

# -- 2.14 Select the names of departments with more than two employees.
# After the join there is one row per employee, so COUNT(*) per group is the head count.
# The grouping includes the key B.Code (not just B.Name) so that two different departments that
# happen to share a name are counted separately, matching the subquery approach.

query = """
        SELECT B.Name AS Department_name
        FROM Employees AS A INNER JOIN Departments AS B ON A.Department = B.Code
        GROUP BY B.Code, B.Name
        HAVING COUNT(*) > 2
       """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.15 - Subquery Approach """

# -- 2.15 Select the name and last name of employees working for departments with second lowest budget.
# query1 and query2 only illustrate the intermediate steps and are never executed; the final
# `query` repeats them inline as nested subqueries.
# NOTE(review): LIMIT 2 picks two rows, not two distinct budget values. If two departments tie on
# the lowest budget, the inner result holds only that budget level, and the row order among ties
# is not fixed by ORDER BY Budget alone — confirm whether ties need to be handled.

# Step 1: Getting all the columns in the departments table with the two lowest budgets 
query1 = """
        SELECT *
        FROM Departments
        ORDER BY Budget
        LIMIT 2
        """

# Step 2: Getting the department codes of two lowest departments
query2 = """
        SELECT Code
        FROM (SELECT *
              FROM Departments
              ORDER BY Budget
              LIMIT 2)
        """

# Step 3: Getting the name and last name of the employees from the codes while ordering by the greatest budget of these two and taking the first observation of these two
# (of the two lowest-budget rows, ORDER BY ... DESC LIMIT 1 keeps the one with the higher budget, i.e. the second lowest overall)
query = """
        SELECT E.Name, E.LastName
        FROM Employees AS E
        WHERE Department IN (SELECT Code
                             FROM (SELECT *
                                   FROM Departments
                                   ORDER BY Budget
                                   LIMIT 2) AS D
                             ORDER BY D.Budget DESC LIMIT 1)
        """

spark.sql(query).show(truncate=False)


In [None]:
""" Query 2.16 - CASE """

# -- 2.16 Create a new column which adds the name for each department as per the following IT, Accounting, Research and Human Resources for Departments 14, 37, 77 and 59
# The mapping is hard-coded in the CASE expression rather than read from the Departments view.
# There is no ELSE branch, so a department code other than 14, 37, 59 or 77 gets NULL as Department_name.
query = """
        SELECT  E.SSN, E.Name AS First_name, E.LastName AS Last_name, E.Department, 
        CASE 
                WHEN E.Department = 14 THEN 'IT'
                WHEN E.Department = 37 THEN 'Accounting'
                WHEN E.Department = 59 THEN 'Human Resources'
                WHEN E.Department = 77 THEN 'Research'
                END AS Department_name
        FROM Employees AS E 
        """

spark.sql(query).show(truncate=False)
