In [None]:
# Import Necessary Libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
import os, glob
import dotenv
import psycopg2

In [2]:
# Set JAVA_HOME for this process; PySpark uses it to locate the JVM.
os.environ.update({"JAVA_HOME": "C:/java8"})

In [3]:
# Build (or reuse) the Spark session used by every cell of the payroll ETL
spark = (
    SparkSession.builder
    .appName("PayrollETL")
    .config("spark.sql.shuffle.partitions", "50")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

#### Data Extraction

In [4]:
# Load master (reference) data: employees, agencies and job titles.
# Paths are assembled with os.path.join so the cell runs on Windows and POSIX alike;
# the previous literals mixed '\' and '/' separators and the '\' ones only resolve on Windows.
raw_dir = os.path.join("dataset", "raw")

employee_df = spark.read.csv(os.path.join(raw_dir, "EmpMaster.csv"), header=True, inferSchema=True)
agency_df = spark.read.csv(os.path.join(raw_dir, "AgencyMaster.csv"), header=True, inferSchema=True)
jobtitle_df = spark.read.csv(os.path.join(raw_dir, "TitleMaster.csv"), header=True, inferSchema=True)

In [None]:
# Dynamically locate every payroll extract that has to be merged

# Directory containing the payroll CSV files. Built with os.path.join instead of a
# backslash literal so the glob pattern below is valid on POSIX as well as Windows.
payroll_dir = os.path.join("dataset", "payroll_data")

# Find all payroll CSV files dynamically.
# sorted(): glob returns matches in arbitrary order; a fixed order keeps runs reproducible.
payroll_files = sorted(glob.glob(os.path.join(payroll_dir, "nycpayroll_*.csv")))

# Fail fast if nothing matched, rather than failing later inside Spark
if not payroll_files:
    raise ValueError("No payroll files found in the directory!")

# Load and merge payroll data
def load_payroll_data(files):
    """Load several payroll CSV files and merge them into one DataFrame.

    Args:
        files: Iterable of CSV paths. Each file is read with a header row and
            schema inference.

    Returns:
        A Spark DataFrame holding the rows of all files, de-duplicated on
        ("EmployeeID", "FiscalYear").

    Raises:
        ValueError: If ``files`` is empty (previously this surfaced as an
            opaque IndexError).
    """
    files = list(files)
    if not files:
        raise ValueError("No payroll files were supplied to load_payroll_data().")

    merged_df = None
    for path in files:
        df = spark.read.csv(path, header=True, inferSchema=True)
        # unionByName aligns columns by header name. Plain union() is positional and
        # would silently mix columns if one file lists them in a different order.
        merged_df = df if merged_df is None else merged_df.unionByName(df)

    return merged_df.dropDuplicates(["EmployeeID", "FiscalYear"])

# Build the combined payroll DataFrame from every discovered file
payroll_df = load_payroll_data(files=payroll_files)

# Validation: print the inferred schema, then preview the first 20 rows
payroll_df.printSchema()
payroll_df.show(n=20, truncate=True)

root
 |-- FiscalYear: integer (nullable = true)
 |-- PayrollNumber: integer (nullable = true)
 |-- AgencyID: integer (nullable = true)
 |-- AgencyName: string (nullable = true)
 |-- EmployeeID: integer (nullable = true)
 |-- LastName: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- AgencyStartDate: string (nullable = true)
 |-- WorkLocationBorough: string (nullable = true)
 |-- TitleCode: integer (nullable = true)
 |-- TitleDescription: string (nullable = true)
 |-- LeaveStatusasofJune30: string (nullable = true)
 |-- BaseSalary: double (nullable = true)
 |-- PayBasis: string (nullable = true)
 |-- RegularHours: double (nullable = true)
 |-- RegularGrossPaid: double (nullable = true)
 |-- OTHours: double (nullable = true)
 |-- TotalOTPaid: double (nullable = true)
 |-- TotalOtherPay: double (nullable = true)

+----------+-------------+--------+--------------------+----------+---------------+-----------+---------------+-------------------+---------+---------------

In [None]:
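# Disabled scratch cell, kept for reference: reload the payroll data and export it as CSV.
# NOTE(review): `load_data` is not defined in the visible notebook - presumably the old
# name of `load_payroll_data`; fix the name before re-enabling these lines.
#payroll_df = load_data(payroll_files)

#payroll_df.write.csv("payroll_df.csv", header=True, mode="overwrite")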

#### Data Transformation

In [181]:
# Convert AgencyStartDate (inferred as a string column) to Date datatype
start_date_raw = payroll_df["AgencyStartDate"]
start_date_parsed = pyspark.sql.functions.to_date(start_date_raw, "M/d/yyyy")

# By default to_date yields NULL for values that do not match the pattern, so count
# those silent failures before the original strings are overwritten.
unparsed_rows = payroll_df.filter(start_date_raw.isNotNull() & start_date_parsed.isNull()).count()
if unparsed_rows:
    print(f"WARNING: {unparsed_rows} AgencyStartDate value(s) did not match 'M/d/yyyy' and became NULL")

payroll_df = payroll_df.withColumn("AgencyStartDate", start_date_parsed)

# Preview a few rows only: dumping 300 rows bloats the notebook and stores
# employee names in the saved output.
payroll_df.show(5)

+----------+-------------+--------+--------------------+----------+---------------+-----------+---------------+-------------------+---------+--------------------+---------------------+----------+---------+------------+----------------+-------+-----------+-------------+
|FiscalYear|PayrollNumber|AgencyID|          AgencyName|EmployeeID|       LastName|  FirstName|AgencyStartDate|WorkLocationBorough|TitleCode|    TitleDescription|LeaveStatusasofJune30|BaseSalary| PayBasis|RegularHours|RegularGrossPaid|OTHours|TotalOTPaid|TotalOtherPay|
+----------+-------------+--------+--------------------+----------+---------------+-----------+---------------+-------------------+---------+--------------------+---------------------+----------+---------+------------+----------------+-------+-----------+-------------+
|      2020|           17|    2120|OFFICE OF EMERGEN...|     10001|         GEAGER|   VERONICA|     2016-09-12|           BROOKLYN|    40447|EMERGENCY PREPARE...|               ACTIVE|   860

In [58]:
# Check for Null values.
# Every per-column count is computed in a single aggregation (one Spark job) instead of
# launching a separate filter().count() job for each column.
sql_fn = pyspark.sql.functions
null_counts = payroll_df.select(
    [
        # when() without otherwise() is NULL for non-matching rows, and count() skips NULLs,
        # so this counts exactly the rows where the column is NULL.
        sql_fn.count(sql_fn.when(payroll_df[column].isNull(), 1)).alias(column)
        for column in payroll_df.columns
    ]
).first().asDict()

for column in payroll_df.columns:
    print(column, 'Nulls: ', null_counts[column])

FiscalYear Nulls:  0
PayrollNumber Nulls:  0
AgencyID Nulls:  0
AgencyName Nulls:  0
EmployeeID Nulls:  0
LastName Nulls:  0
FirstName Nulls:  0
AgencyStartDate Nulls:  0
WorkLocationBorough Nulls:  0
TitleCode Nulls:  0
TitleDescription Nulls:  0
LeaveStatusasofJune30 Nulls:  0
BaseSalary Nulls:  0
PayBasis Nulls:  0
RegularHours Nulls:  0
RegularGrossPaid Nulls:  0
OTHours Nulls:  0
TotalOTPaid Nulls:  0
TotalOtherPay Nulls:  0


In [59]:
# Replace nulls with a placeholder chosen by column dtype, one column at a time
default_for_dtype = {
    "string": "Unknown",
    "double": 0.0,
    "float": 0.0,
    "int": 0,
    "bigint": 0,
}
for col_name, dtype in payroll_df.dtypes:
    if dtype not in default_for_dtype:
        continue  # dtypes without a configured default are left untouched
    payroll_df = payroll_df.fillna({col_name: default_for_dtype[dtype]})

In [122]:
# Enrich the payroll rows with the employee, agency and job-title master data.
# Left joins: payroll rows without a master-data match are kept.
payroll_with_employee = payroll_df.join(
    employee_df, on=["EmployeeID", "LastName", "FirstName"], how="left"
)
payroll_with_agency = payroll_with_employee.join(
    agency_df, on=["AgencyID", "AgencyName"], how="left"
)
merged_data = payroll_with_agency.join(
    jobtitle_df, on=["TitleCode", "TitleDescription"], how="left"
)


In [126]:
# Row count of the joined dataset
merged_data.count()

201

In [186]:

# Create Employee Dimension Table
# dropDuplicates() added for consistency with the other dimensions: merged_data holds one
# row per (EmployeeID, FiscalYear), so without it an employee is repeated once per fiscal
# year and any join against this table multiplies rows.
employee_dim = merged_data.select(
    'EmployeeID', 'LastName', 'FirstName', 'WorkLocationBorough', 'LeaveStatusasofJune30'
).dropDuplicates()

#Create Agency Dimension Table
agency_dim = merged_data.select('AgencyID', 'AgencyName').dropDuplicates()
        # .withColumn('AgencyDimID', monotonically_increasing_id()) \
        # .select('AgencyDimID', 'AgencyID', 'AgencyName', 'AgencyStartDate')

#Create Job_Title Dimension Table
jobtitle_dim = merged_data.select('TitleCode', 'TitleDescription').dropDuplicates()
        # .withColumn('TitleID', monotonically_increasing_id()) \
        # .select('TitleID', 'TitleCode', 'TitleDescription')

#Create Time Dimension Table
# NOTE(review): monotonically_increasing_id() derives IDs from the partition layout, so
# TimeID values are unique but not consecutive and may differ between evaluations -
# confirm downstream consumers tolerate that.
time_dim = (
    merged_data.select('FiscalYear').dropDuplicates()
    .withColumn('TimeID', monotonically_increasing_id())
    .select('TimeID', 'FiscalYear')
)
# # payroll_fact_table.write.csv(r"dataset\cleaned_data\nycpayroll.csv", header=True, mode="overwrite")

# Create Payroll_Fact_table
# merged_data already carries the natural keys (EmployeeID, AgencyID, TitleCode), so only
# the TimeID surrogate has to be looked up. The earlier joins back to the employee, agency
# and job-title projections matched on descriptive columns only (names, AgencyName,
# TitleDescription) and therefore duplicated fact rows whenever those values repeat.
payroll_fact_tbl = (
    merged_data
    .join(time_dim, ['FiscalYear'], 'inner')
    .withColumn('PayrollID', monotonically_increasing_id())
    .select(
        'PayrollID', 'EmployeeID', 'AgencyID', 'TitleCode', 'TimeID',
        'PayrollNumber', 'BaseSalary', 'PayBasis', 'AgencyStartDate',
        'RegularHours', 'RegularGrossPaid', 'OTHours', 'TotalOTPaid', 'TotalOtherPay',
    )
)

In [193]:
# Inspect the fact table's column names, types and nullability
payroll_fact_tbl.printSchema()

root
 |-- PayrollID: long (nullable = false)
 |-- EmployeeID: integer (nullable = false)
 |-- AgencyID: integer (nullable = false)
 |-- TitleCode: integer (nullable = false)
 |-- TimeID: long (nullable = false)
 |-- PayrollNumber: integer (nullable = false)
 |-- BaseSalary: double (nullable = false)
 |-- PayBasis: string (nullable = false)
 |-- AgencyStartDate: date (nullable = true)
 |-- RegularHours: double (nullable = false)
 |-- RegularGrossPaid: double (nullable = false)
 |-- OTHours: double (nullable = false)
 |-- TotalOTPaid: double (nullable = false)
 |-- TotalOtherPay: double (nullable = false)



In [194]:
# Inspect the time dimension's column names, types and nullability
time_dim.printSchema()

root
 |-- TimeID: long (nullable = false)
 |-- FiscalYear: integer (nullable = false)



In [189]:
# Save tables to Cleaned_data folder using parquet.
# Output paths are built with os.path.join: the previous backslash literals only resolve to
# nested folders on Windows (elsewhere they name a single oddly-named directory).
cleaned_dir = os.path.join("dataset", "cleaned_data")

# Folder name -> DataFrame, written in this order; existing output is overwritten.
tables_to_save = {
    "employee_dim": employee_dim,
    "agency_dim": agency_dim,
    "jobtitle_dim": jobtitle_dim,
    "time_dim": time_dim,
    "payroll_fact_table": payroll_fact_tbl,
}
for table_name, table_df in tables_to_save.items():
    table_df.write.mode("overwrite").parquet(os.path.join(cleaned_dir, table_name))

#### Data Loading