### PySpark Setup
--------------------------------------------
#### Importing relevant libraries; Instantiating a SparkContext; Creating a SparkSession

In [16]:
""" Importing libraries """
import pandas as pd 
import numpy as np 
import findspark
findspark.init()
import pyspark
from pyspark import SparkContext 
from pyspark.sql import SparkSession

In [17]:
""" Instantiate a SparkContext """
# Reuse the active SparkContext if one already exists; otherwise start a new one.
sc = SparkContext.getOrCreate()

# Report the Spark version this context is running on.
spark_version = sc.version
print(spark_version)

3.0.3


In [18]:
""" Creating a SparkSession """ 
# Build (or reuse) the SparkSession that every later cell uses for createDataFrame / sql.
# NOTE(review): a SparkContext was already obtained in an earlier cell, so the
# application name set here may not be applied to that existing context — confirm
# if the name matters (e.g. in the Spark UI).
session_builder = SparkSession.builder.appName('JoinsTutorial')
spark = session_builder.getOrCreate()

### Section 0 - Setting Up PySpark DataFrames & Registering Temporary Views
--------------------------------------------

##### Overview of the Database Schema - Each DataFrame (df1 = employees, df2 = departments, df3 = projects, df4 = assignments) contains 30 rows of data. You can join these tables as your analysis requires. For example, you might join the Assignments table with the Employees table on Employee_ID (which matches the employees' ID column) to get employee details for each assignment. Similarly, you might join Assignments with Departments and Projects on Department_ID and Project_ID respectively.

In [19]:
# Column names for the employee table
columns1 = ['ID', 'Name', 'Company', 'Salary']

# One row per employee number 1..30: the number as a string ID, a generated name,
# a company label derived from the number modulo 3, and a salary of 5000 x number
data1 = [
    [str(emp_no), f"Employee_{emp_no}", f"Company_{emp_no % 3}", emp_no * 5000]
    for emp_no in range(1, 31)
]

# Build the employee DataFrame from the rows and the column names
df1 = spark.createDataFrame(data=data1, schema=columns1)


In [20]:
# Column names for the department table
columns2 = ['Department_ID', 'Department_Name', 'Company']

# One row per department number 1..30: the number as a string ID, a generated
# name, and a company label derived from the number modulo 3
data2 = [
    [str(dept_no), f"Department_{dept_no}", f"Company_{dept_no % 3}"]
    for dept_no in range(1, 31)
]

# Build the department DataFrame from the rows and the column names
df2 = spark.createDataFrame(data=data2, schema=columns2)


In [21]:
# Column names for the project table
columns3 = ['Project_ID', 'Project_Name', 'Company']

# One row per project number 1..30: the number as a string ID, a generated
# name, and a company label derived from the number modulo 3
data3 = [
    [str(proj_no), f"Project_{proj_no}", f"Company_{proj_no % 3}"]
    for proj_no in range(1, 31)
]

# Build the project DataFrame from the rows and the column names
df3 = spark.createDataFrame(data=data3, schema=columns3)


In [22]:
# Column names for the assignment table
columns4 = ['Assignment_ID', 'Employee_ID', 'Department_ID', 'Project_ID']

# One row per assignment number 1..30. The assignment and employee IDs are the
# number itself; the department and project IDs are the number shifted by 5 and
# 10 respectively, wrapped with modulo 30 and offset by 1 so they stay in 1..30.
data4 = [
    [
        str(assign_no),
        str(assign_no),
        str((assign_no + 5) % 30 + 1),
        str((assign_no + 10) % 30 + 1),
    ]
    for assign_no in range(1, 31)
]

# Build the assignment DataFrame from the rows and the column names
df4 = spark.createDataFrame(data=data4, schema=columns4)


In [23]:
# Expose each DataFrame to Spark SQL under a temporary view name, so the
# queries below can refer to them as ordinary tables.
for _view_name, _frame in (
    ("employees", df1),
    ("departments", df2),
    ("projects", df3),
    ("assignments", df4),
):
    _frame.createOrReplaceTempView(_view_name)


### Section 1 - Advanced Joins
--------------------------------------------

In [34]:
# Step 1: compute the average salary over all employees once and keep it as a
# Python value. The aggregate query returns a single row, fetched with .first().
avg_salary = spark.sql("SELECT AVG(Salary) as avg_salary FROM employees").first().avg_salary

# Step 2: join every assignment to its employee, department and project, keeping
# only assignments whose employee earns more than the average computed above.
#
# The employee join is an INNER JOIN on purpose: the WHERE clause filters on
# e.Salary, and a row with no matching employee has e.Salary = NULL, which never
# satisfies the comparison. A LEFT JOIN here would therefore only *look* like it
# preserves unmatched assignments; the result set is the same either way.
# Departments and projects stay LEFT JOINs because nothing filters on them.
df5 = spark.sql(f"""
    SELECT
        a.*,
        e.Name AS Employee_Name,
        e.Company AS Employee_Company,
        d.Department_Name,
        d.Company AS Department_Company,
        p.Project_Name,
        p.Company AS Project_Company,
        e.Salary AS Employee_Salary
    FROM
        assignments a
    INNER JOIN
        employees e ON a.Employee_ID = e.ID
    LEFT JOIN
        departments d ON a.Department_ID = d.Department_ID
    LEFT JOIN
        projects p ON a.Project_ID = p.Project_ID
    WHERE e.Salary > {avg_salary}
""")

# Preview the first five rows without truncating column values
df5.show(5, truncate=False)


+-------------+-----------+-------------+----------+-------------+----------------+---------------+------------------+------------+---------------+---------------+
|Assignment_ID|Employee_ID|Department_ID|Project_ID|Employee_Name|Employee_Company|Department_Name|Department_Company|Project_Name|Project_Company|Employee_Salary|
+-------------+-----------+-------------+----------+-------------+----------------+---------------+------------------+------------+---------------+---------------+
|26           |26         |2            |7         |Employee_26  |Company_2       |Department_2   |Company_2         |Project_7   |Company_1      |130000         |
|30           |30         |6            |11        |Employee_30  |Company_0       |Department_6   |Company_0         |Project_11  |Company_2      |150000         |
|18           |18         |24           |29        |Employee_18  |Company_0       |Department_24  |Company_0         |Project_29  |Company_2      |90000          |
|22           |2

In [35]:
## Another approach - without the avg_salary Python variable: the average is
## computed inside the query itself.
#
# `main` is the assignment row enriched with employee, department and project
# details; `sub` is a one-row derived table holding the average salary.
# Pairing every `main` row with that single `sub` row is a cartesian product,
# so it is written as an explicit CROSS JOIN instead of a JOIN with no ON clause
# (which only works when Spark is configured to allow implicit cross joins).
#
# The employee join is an INNER JOIN because the outer WHERE filters on
# Employee_Salary: rows without a matching employee would carry a NULL salary
# and be discarded by that comparison anyway.
df5 = spark.sql("""
    SELECT
        main.*
    FROM
        (
            SELECT
                a.*,
                e.Name AS Employee_Name,
                e.Company AS Employee_Company,
                d.Department_Name,
                d.Company AS Department_Company,
                p.Project_Name,
                p.Company AS Project_Company,
                e.Salary AS Employee_Salary
            FROM
                assignments a
            INNER JOIN
                employees e ON a.Employee_ID = e.ID
            LEFT JOIN
                departments d ON a.Department_ID = d.Department_ID
            LEFT JOIN
                projects p ON a.Project_ID = p.Project_ID
        ) as main
    CROSS JOIN
        (SELECT AVG(Salary) as avg_salary FROM employees) as sub
    WHERE main.Employee_Salary > sub.avg_salary -- Keep only employees whose salary is above the average
""")

# Preview the first five rows without truncating column values
df5.show(5, truncate=False)


+-------------+-----------+-------------+----------+-------------+----------------+---------------+------------------+------------+---------------+---------------+
|Assignment_ID|Employee_ID|Department_ID|Project_ID|Employee_Name|Employee_Company|Department_Name|Department_Company|Project_Name|Project_Company|Employee_Salary|
+-------------+-----------+-------------+----------+-------------+----------------+---------------+------------------+------------+---------------+---------------+
|26           |26         |2            |7         |Employee_26  |Company_2       |Department_2   |Company_2         |Project_7   |Company_1      |130000         |
|30           |30         |6            |11        |Employee_30  |Company_0       |Department_6   |Company_0         |Project_11  |Company_2      |150000         |
|18           |18         |24           |29        |Employee_18  |Company_0       |Department_24  |Company_0         |Project_29  |Company_2      |90000          |
|22           |2

### Section 2 - Window Functions
--------------------------------------------

#### Aggregate Window Functions: These functions perform a calculation over a set of rows (the window) and return a value for each row, rather than collapsing the rows into a single result. They are written like ordinary aggregate functions, followed by an OVER (...) clause that defines the window. Some of the commonly used aggregate window functions are: SUM(), AVG(), MIN(), MAX(), COUNT()

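#### Ranking Window Functions: These functions provide a way to rank rows within each partition. ROW_NUMBER() assigns every row in a partition a unique number, whereas RANK() and DENSE_RANK() give tied rows the same rank. Some of the commonly used ranking window functions are: RANK(), DENSE_RANK(), ROW_NUMBER(), NTILE(), CUME_DIST(), PERCENT_RANK(). Closely related analytic (value) window functions, which return a value taken from another row of the window rather than a rank, include: LEAD(), LAG(), FIRST_VALUE(), LAST_VALUE(), NTH_VALUE()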

In [46]:
# Rank all employees by salary, highest first, and keep the top ten rows.
#
# RANK() gives employees with equal salaries the same rank, so ordering by
# Salary_Rank alone leaves the order of tied rows (and therefore which of them
# survive LIMIT 10) undefined. The numeric ID is added as a tie-breaker so the
# output is reproducible; ID is stored as a string, hence the CAST.
# The window has no PARTITION BY because the ranking is across the whole table.
df_salary_rank = spark.sql("""
    SELECT
        ID,
        Name,
        Salary,
        RANK() OVER (ORDER BY Salary DESC) as Salary_Rank
    FROM employees
    ORDER BY Salary_Rank, CAST(ID AS INT)
    LIMIT 10
""")

# Display the ranked rows without truncating column values
df_salary_rank.show(truncate=False)


+---+-----------+------+-----------+
|ID |Name       |Salary|Salary_Rank|
+---+-----------+------+-----------+
|30 |Employee_30|150000|1          |
|29 |Employee_29|145000|2          |
|28 |Employee_28|140000|3          |
|27 |Employee_27|135000|4          |
|26 |Employee_26|130000|5          |
|25 |Employee_25|125000|6          |
|24 |Employee_24|120000|7          |
|23 |Employee_23|115000|8          |
|22 |Employee_22|110000|9          |
|21 |Employee_21|105000|10         |
+---+-----------+------+-----------+

