1️⃣ Import Spark and Create the SparkSession

In [None]:
# SparkSession is the entry point used by the rest of this notebook.
from pyspark.sql import SparkSession

# Build (or reuse) the session:
#   builder       -> starts a SparkSession builder
#   appName(...)  -> application name displayed in the Spark UI
#   getOrCreate() -> hands back the existing session if there is one,
#                    otherwise creates a new one
spark = (
    SparkSession.builder
    .appName("pyspark-assessment")
    .getOrCreate()
)

2️⃣ Import the Modules

In [None]:
# Append "../" (resolved against the current working directory) to the module
# search path so the `src` package imported below can be found.
# NOTE(review): presumably "../" is the project root that contains `src/` —
# this only holds when the kernel's working directory is the notebook folder.
import sys
sys.path.append("../")

# Project helpers used by the later cells.
# NOTE(review): wildcard imports hide which module defines each helper and can
# silently shadow names; consider importing the required functions explicitly.
from src.data_processing import *
from src.feature_engineering import *
from src.kpi_analysis import *

3️⃣ Read Data

In [None]:
# Load the raw NYC jobs CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    "../data/nyc-jobs.csv",
    header=True,
    inferSchema=True,
)

4️⃣ Call Processing Functions in the Correct Order

In [None]:
# Steps that take only the DataFrame, listed in the order they must run.
# Each step receives the output of the previous one.
pipeline_steps = (
    handle_null_values,
    clean_salary_columns,
    remove_unnecessary_columns,
    add_degree_flag,
    extract_experience_years,
)
for apply_step in pipeline_steps:
    df = apply_step(df)

# The skill-flag step runs last and needs the list of skills to flag.
skills = ["python", "sql", "aws", "spark"]
df = add_skill_flags(df, skills)

5️⃣ Run KPIs

In [None]:
# KPI: result of top_10_job_categories on the processed data.
job_category_kpi = top_10_job_categories(df)
job_category_kpi.show()

# KPI: result of highest_salary_per_agency on the processed data.
agency_salary_kpi = highest_salary_per_agency(df)
agency_salary_kpi.show()

6️⃣ Save Processed Data

In [None]:
# Persist the processed DataFrame as Parquet; "overwrite" replaces any
# output already present at the target path.
(
    df.write
    .mode("overwrite")
    .parquet("../output/processed_nyc_jobs.parquet")
)