---
title: Assignment 04
author:
  - name: Dakota Alder
    affiliations:
      - id: bu
        name: Boston University
        city: Boston
        state: MA
number-sections: true
date: '2025-10-05'
date-modified: today
date-format: long
format:
  html:
    theme: cerulean
    toc: true
    toc-depth: 2

execute:
  echo: false
  eval: false
  freeze: auto
---

In [3]:
from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

np.random.seed(42)

pio.renderers.default = "notebook+notebook_connected+vscode"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("data/lightcast_job_postings.csv")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/07 13:52:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [None]:
# Missing Value Treatment



In [4]:
#| eval: true
#| echo: false
#| fig-align: center

from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.types import BooleanType, StringType, IntegerType

# Data Cleaning

regression_df = df.dropna(subset=[
    "SALARY", "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE", "EDUCATION_LEVELS_NAME",
    "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME", "DURATION", "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
]).select("SALARY", "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE", "EDUCATION_LEVELS_NAME",
    "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME", "DURATION", "IS_INTERNSHIP", "COMPANY_IS_STAFFING")

#Categorical Encoding
categorical_columns = ["EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME", "DURATION", "IS_INTERNSHIP", "COMPANY_IS_STAFFING"]

#Convert Boolean to Integer
regression_df = regression_df.withColumn("IS_INTERNSHIP", col("IS_INTERNSHIP").cast(IntegerType()))
regression_df = regression_df.withColumn("COMPANY_IS_STAFFING", col("COMPANY_IS_STAFFING").cast(IntegerType()))

regression_df.show(5, truncate=False)


+------+--------------------+--------------------+------------------------------------------------------------------------------------+----------------------+----------------+--------+-------------+-------------------+
|SALARY|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|EDUCATION_LEVELS_NAME                                                               |EMPLOYMENT_TYPE_NAME  |REMOTE_TYPE_NAME|DURATION|IS_INTERNSHIP|COMPANY_IS_STAFFING|
+------+--------------------+--------------------+------------------------------------------------------------------------------------+----------------------+----------------+--------+-------------+-------------------+
|131100|2                   |2                   |[\n  "Bachelor's degree"\n]                                                         |Full-time (> 32 hours)|[None]          |11      |0            |0                  |
|136950|3                   |3                   |[\n  "Bachelor's degree",\n  "Master's degree",\n  "Ph.D. or professional 

In [5]:

#Clean education levels
from pyspark.sql.functions import regexp_replace, trim
regression_df = regression_df.withColumn("EDUCATION_LEVELS_NAME", trim(regexp_replace(col("EDUCATION_LEVELS_NAME"), r"[\[\]\n]","")))


#Indexing and One-Hot Encoding

indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_idx", handleInvalid='skip') for col in categorical_columns]
encoders = [OneHotEncoder(inputCol=f"{col}_idx", outputCol=f"{col}_vec") for col in categorical_columns]

#Assemble Features
assembler = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
        ] + [f"{col}_vec" for col in categorical_columns],
    outputCol="features"
)

pipeline = Pipeline(stages=indexers + encoders + [assembler])
regression_data = pipeline.fit(regression_df).transform(regression_df)

regression_data.select("SALARY", "features").show(5, truncate=False)





                                                                                

+------+--------------------------------------------------------------------+
|SALARY|features                                                            |
+------+--------------------------------------------------------------------+
|131100|(87,[0,1,2,5,22,24,27,85,86],[2.0,2.0,11.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|136950|(87,[0,1,2,9,22,25,48,85,86],[3.0,3.0,28.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|136950|(87,[0,1,2,9,22,25,48,85,86],[3.0,3.0,28.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|104000|(87,[0,1,2,5,22,24,30,85,86],[3.0,3.0,8.0,1.0,1.0,1.0,1.0,1.0,1.0]) |
|80000 |(87,[0,1,2,5,22,24,64,85,86],[3.0,3.0,37.0,1.0,1.0,1.0,1.0,1.0,1.0])|
+------+--------------------------------------------------------------------+
only showing top 5 rows


In [6]:
#| eval: true
#| echo: false
#| fig-align: center

regression_train_data, regression_test_data = regression_data.randomSplit([0.8, 0.2], seed=42)
print((regression_data.count(), len(regression_data.columns)))
print((regression_train_data.count(), len(regression_train_data.columns)))
print((regression_test_data.count(), len(regression_test_data.columns)))

                                                                                

(2243, 22)


                                                                                

(1848, 22)


[Stage 46:>                                                         (0 + 1) / 1]

(395, 22)


                                                                                