1️⃣ Import Spark and Create the Session

In [1]:
# Import the SparkSession class from pyspark.sql module
from pyspark.sql import SparkSession

# Build (or reuse) the notebook's SparkSession in local mode.
# - master("local[*]"): run Spark inside this process, using as many worker
#   threads as the machine has logical cores.
# - config(...): memory limits plus extra settings the author added to keep
#   the session from hanging.
# NOTE(review): "spark.hadoop.io.nativeio.NativeIO$Windows.access0" does not
# look like a documented Spark/Hadoop property -- confirm it has any effect.
spark = (
    SparkSession.builder
    .appName("pyspark-assessment")
    .master("local[*]")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.hadoop.io.nativeio.NativeIO$Windows.access0", "false")
    .config("spark.driver.maxResultSize", "1g")
    .getOrCreate()
)

# Keep the console quiet: only ERROR-level messages are logged.
spark.sparkContext.setLogLevel("ERROR")

# Applied on the live session (after creation), not through the builder.
spark.conf.set("spark.hadoop.fs.file.impl.disable.cache", "true")


# Confirmation message for the reader; reached only if the steps above did not raise.
print("Spark session created successfully!")

Spark session created successfully!


2️⃣ Import the Modules

In [2]:
import sys
# Add the parent directory to the module search path so the `src` package
# imported below can be found. The path is relative, so it is resolved against
# the kernel's current working directory.
sys.path.append("../")

# Wildcard imports: every public name of each module is copied into the
# notebook namespace. If two modules export the same name, the later import
# silently wins, so the order of these three lines matters.
# NOTE(review): the helpers called in later cells (handle_null_values,
# add_skill_flags, top_10_job_categories, ...) presumably come from these
# modules; switch to explicit imports once each helper's defining module is
# confirmed.
from src.data_processing import *
from src.feature_engineering import *
from src.kpi_analysis import *

3️⃣ Read Data

In [3]:
# Load the raw job postings. The first row supplies the column names and Spark
# infers each column's type from the data.
# NOTE(review): if free-text columns contain embedded newlines or quotes, the
# reader may also need multiLine/escape options -- verify the row count.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/nyc-jobs.csv")
)


4️⃣ Call Processing Functions in the Correct Order

In [4]:
# Apply the single-argument processing steps one after another; each step
# receives the frame produced by the previous one, so the tuple order is the
# execution order.
for processing_step in (
    handle_null_values,
    clean_salary_columns,
    remove_unnecessary_columns,
    add_degree_flag,
    extract_experience_years,
):
    df = processing_step(df)

# The skill-flag step takes the list of skills as a second argument, so it is
# called separately after the steps above.
skills = ["python", "sql", "aws", "spark"]
df = add_skill_flags(df, skills)


5️⃣ Run KPIs

In [5]:
# Compute each KPI on the processed frame and print its result table, in order:
# job-category counts first, then the per-agency maximum salary.
# NOTE(review): the max_salary output mixes values such as 9.555 and 19.0 with
# six-figure amounts -- possibly hourly vs. annual pay; confirm whether
# salaries are normalised before aggregation.
for kpi_report in (top_10_job_categories, highest_salary_per_agency):
    kpi_report(df).show()

+--------------------+-----+
|        Job Category|count|
+--------------------+-----+
|Engineering, Arch...|  504|
|Technology, Data ...|  313|
|       Legal Affairs|  226|
|Public Safety, In...|  182|
|Building Operatio...|  181|
|Finance, Accounti...|  169|
|Administration & ...|  134|
|Constituent Servi...|  129|
|              Health|  125|
|Policy, Research ...|  124|
+--------------------+-----+

+--------------------+----------+
|              Agency|max_salary|
+--------------------+----------+
|OFFICE OF COLLECT...|     9.555|
|ADMIN FOR CHILDRE...|  117474.5|
|MANHATTAN COMMUNI...|      19.0|
|TAXI & LIMOUSINE ...|  140000.0|
|DEPARTMENT OF BUS...|  111377.0|
|DEPT OF DESIGN & ...|  151795.0|
|FINANCIAL INFO SV...|  135000.0|
|HOUSING PRESERVAT...|  135000.0|
|CIVILIAN COMPLAIN...|  115732.0|
|OFFICE OF MANAGEM...|  117810.0|
|DEPARTMENT OF CIT...|  125000.0|
|DEPT OF HEALTH/ME...| 170133.84|
| BOARD OF CORRECTION|   96222.5|
|   POLICE DEPARTMENT|  217201.0|
|BUSINESS INTEG

6️⃣ Save Processed Data

In [8]:
# Persist the processed frame as Parquet, replacing any previous output at
# this path.
df.write.parquet("../output/processed_nyc_jobs.parquet", mode="overwrite")

In [9]:
# Read the saved Parquet output back and preview it to check the write.
df2 = spark.read.format("parquet").load("../output/processed_nyc_jobs.parquet")
# Same as the default show(): first 20 rows, long cell values truncated.
df2.show(n=20, truncate=True)


+------+--------------------+------------+--------------+--------------------+--------------------+-------------+-----+--------------------+-----------------------------+-----------------+---------------+----------------+--------------------+--------------------+--------------------+-------------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+--------------------+--------------------+--------------------+----------+---------------+----------------+------+---+---+-----+
|Job ID|              Agency|Posting Type|# Of Positions|      Business Title| Civil Service Title|Title Code No|Level|        Job Category|Full-Time/Part-Time indicator|Salary Range From|Salary Range To|Salary Frequency|       Work Location|  Division/Work Unit|     Job Description|Minimum Qual Requirements|    Preferred Skills|            To Apply|         Hours/Shift|     Work Location 1|Residency Requirement|        Posting Da