In [None]:
# Import Necessary Libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, col, broadcast
import os
from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime
from sqlalchemy.orm import sessionmaker, declarative_base
import dotenv

In [3]:
# Set the JAVA_HOME environment variable for this process before Spark is started
os.environ["JAVA_HOME"] = "C:/java8"

In [4]:
# Build the "PayrollETL" Spark session (or reuse one that is already running)
spark = (
    SparkSession.builder
    .appName("PayrollETL")
    .config("spark.sql.shuffle.partitions", "50")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

#### Data Extraction

In [5]:
# Load Master Data (employee, agency and job-title CSV files).
# Every path uses forward slashes: backslash separators only resolve on Windows,
# while forward slashes work on Windows, Linux and macOS alike.
employee_df = spark.read.csv("dataset/raw/EmpMaster.csv", header=True, inferSchema=True)
agency_df = spark.read.csv("dataset/raw/AgencyMaster.csv", header=True, inferSchema=True)
jobtitle_df = spark.read.csv("dataset/raw/TitleMaster.csv", header=True, inferSchema=True)

In [6]:
# Function to load the multiple payroll datasets and merge them
def load_data(file_paths):
    """Read several payroll CSV files and merge them into one DataFrame.

    Each file is read with a header row and schema inference, the resulting
    DataFrames are stacked with ``DataFrame.union``, and duplicate
    (EmployeeID, FiscalYear) pairs are removed.

    Note: ``union`` matches columns by position, not by name, so every file
    must list its columns in the same order.

    Args:
        file_paths: A single path string, or an iterable of path strings.

    Returns:
        The merged DataFrame with one row kept per (EmployeeID, FiscalYear).

    Raises:
        ValueError: If ``file_paths`` is empty.
    """
    from functools import reduce

    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    file_paths = list(file_paths)

    # Fail with a clear message instead of an IndexError on an empty list.
    if not file_paths:
        raise ValueError("load_data requires at least one payroll file path")

    yearly_frames = [
        spark.read.csv(path, header=True, inferSchema=True) for path in file_paths
    ]
    merged_df = reduce(lambda left, right: left.union(right), yearly_frames)
    return merged_df.dropDuplicates(["EmployeeID", "FiscalYear"])

In [7]:
# List all payroll files.
# Forward slashes keep the paths portable: backslash separators only resolve on Windows.
payroll_files = ["dataset/raw/nycpayroll_2020.csv", "dataset/raw/nycpayroll_2021.csv"]

In [27]:
# Read every listed payroll file and combine them into a single DataFrame
payroll_df = load_data(payroll_files)

# Export the merged data as CSV with a header row, replacing any previous export
payroll_df.write.mode("overwrite").option("header", True).csv("payroll_df.csv")

#### Data Transformation

In [42]:
# Convert AgencyStartDate to a Date column using the M/d/yyyy pattern
agency_start_date = pyspark.sql.functions.to_date(payroll_df["AgencyStartDate"], "M/d/yyyy")
payroll_df = payroll_df.withColumn("AgencyStartDate", agency_start_date)

# Display the leading rows to inspect the converted column
payroll_df.show()

+----------+-------------+--------+--------------------+----------+---------+---------+---------------+-------------------+---------+--------------------+---------------------+----------+---------+------------+----------------+-------+-----------+-------------+
|FiscalYear|PayrollNumber|AgencyID|          AgencyName|EmployeeID| LastName|FirstName|AgencyStartDate|WorkLocationBorough|TitleCode|    TitleDescription|LeaveStatusasofJune30|BaseSalary| PayBasis|RegularHours|RegularGrossPaid|OTHours|TotalOTPaid|TotalOtherPay|
+----------+-------------+--------+--------------------+----------+---------+---------+---------------+-------------------+---------+--------------------+---------------------+----------+---------+------------+----------------+-------+-----------+-------------+
|      2020|           17|    2120|OFFICE OF EMERGEN...|     10001|   GEAGER| VERONICA|     2016-09-12|           BROOKLYN|    40447|EMERGENCY PREPARE...|               ACTIVE|   86005.0|per Annum|      1820.0|    

In [10]:
# Check for Null values
# All per-column null counts are computed in ONE aggregation (a single Spark job)
# instead of running a separate filter + count job for every column.
# when(...) without otherwise() yields NULL for non-matching rows, and count()
# skips NULLs, so each aggregate counts exactly the rows where the column is null.
null_counts = payroll_df.agg(*[
    pyspark.sql.functions.count(
        pyspark.sql.functions.when(payroll_df[column].isNull(), 1)
    ).alias(column)
    for column in payroll_df.columns
]).first()

for column in payroll_df.columns:
    print(column, 'Nulls: ', null_counts[column])

FiscalYear Nulls:  0
PayrollNumber Nulls:  0
AgencyID Nulls:  0
AgencyName Nulls:  0
EmployeeID Nulls:  0
LastName Nulls:  0
FirstName Nulls:  0
AgencyStartDate Nulls:  0
WorkLocationBorough Nulls:  0
TitleCode Nulls:  0
TitleDescription Nulls:  0
LeaveStatusasofJune30 Nulls:  0
BaseSalary Nulls:  0
PayBasis Nulls:  0
RegularHours Nulls:  0
RegularGrossPaid Nulls:  0
OTHours Nulls:  0
TotalOTPaid Nulls:  0
TotalOtherPay Nulls:  0


In [11]:
# Fill null values with defaults
# Replacement value per Spark dtype; columns of any other dtype are left untouched.
default_by_dtype = {
    "string": "Unknown",
    "double": 0.0,
    "float": 0.0,
    "int": 0,
    "bigint": 0,
}

for col_name, dtype in payroll_df.dtypes:
    if dtype in default_by_dtype:
        payroll_df = payroll_df.fillna({col_name: default_by_dtype[dtype]})

In [54]:
# Create Payroll fact Table
# Columns of the fact table, in output order
fact_columns = [
    "PayrollID",
    "FiscalYear",
    "PayrollNumber",
    "EmployeeID",
    "LastName",
    "FirstName",
    "AgencyID",
    "AgencyName",
    "TitleCode",
    "TitleDescription",
    "AgencyStartDate",
    "LeaveStatusasofJune30",
    "BaseSalary",
    "PayBasis",
    "RegularHours",
    "RegularGrossPaid",
    "OTHours",
    "TotalOTPaid",
    "TotalOtherPay",
]

payroll_fact_table = (
    payroll_df.alias("p")
    # Left joins keep every payroll row even when a master table has no match
    .join(employee_df.alias("e"), on=["EmployeeID", "LastName", "FirstName"], how="left")
    .join(agency_df.alias("a"), on=["AgencyID", "AgencyName"], how="left")
    .join(jobtitle_df.alias("t"), on=["TitleCode", "TitleDescription"], how="left")
    # Surrogate key: values are unique and increasing, but not guaranteed consecutive
    .withColumn("PayrollID", monotonically_increasing_id())
    .select(*fact_columns)
)


In [55]:
# Print the first 300 rows of the fact table for inspection
payroll_fact_table.show(n=300)

+---------+----------+-------------+----------+---------------+-----------+--------+--------------------+---------+--------------------+---------------+---------------------+----------+---------+------------+----------------+-------+-----------+-------------+
|PayrollID|FiscalYear|PayrollNumber|EmployeeID|       LastName|  FirstName|AgencyID|          AgencyName|TitleCode|    TitleDescription|AgencyStartDate|LeaveStatusasofJune30|BaseSalary| PayBasis|RegularHours|RegularGrossPaid|OTHours|TotalOTPaid|TotalOtherPay|
+---------+----------+-------------+----------+---------------+-----------+--------+--------------------+---------+--------------------+---------------+---------------------+----------+---------+------------+----------------+-------+-----------+-------------+
|        0|      2020|           17|     10001|         GEAGER|   VERONICA|    2120|OFFICE OF EMERGEN...|    40447|EMERGENCY PREPARE...|     2016-09-12|               ACTIVE|   86005.0|per Annum|      1820.0|        8469

#### Data Loading