---
# Quarto front matter for the Assignment 04 report: document metadata, HTML
# output settings, and default code-execution options.
title: Assignment 04
author:
  - name: Wei Wang
    affiliations:
      - id: bu
        name: Boston University
        city: Boston
        state: MA
# Number the section headings in the rendered output.
number-sections: true
# Quoted so the value stays a string rather than being parsed as a YAML date.
date: '2025-10-04'
# `today` is a Quarto keyword that resolves at render time.
date-modified: today
date-format: long
format:
  html:
    theme: cerulean
    # Table of contents, limited to two heading levels.
    toc: true
    toc-depth: 2

# Document-wide execution defaults; individual cells override them with
# `#| key: value` option lines at the top of the cell.
execute:
  # Do not include cell source code in the rendered output.
  echo: false
  # NOTE(review): code is not executed by default, so every cell that must run
  # needs `#| eval: true` -- confirm that cells which define variables used by
  # later evaluated cells also opt in.
  eval: false
  freeze: auto
---

# Load the Dataset

In [10]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from pyspark.sql import SparkSession

# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)

# Default Plotly renderer chain used for figures in this document.
pio.renderers.default = "notebook+notebook_connected+vscode"

# Create (or reuse) the Spark session for this analysis.
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Read the job-postings CSV with the header, schema-inference, multi-line and
# escape-character reader options set; one option per line for readability.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", "\"")
    .csv("data/lightcast_job_postings.csv")
)

# Diagnostic output only -- not needed in the final rendered document.
# print("---This is Diagnostic check, No need to print it in the final doc---")

# df.printSchema() # comment this line when rendering the submission
df.show(5)

                                                                                

+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+------

# Feature Engineering

In [12]:
#| eval: true
#| echo: false
#| fig-align: center

# Feature engineering for the salary regression:
#   1. derive a simplified REMOTE_TYPE column on `df`,
#   2. keep only the modelling columns and drop rows with missing values,
#   3. cast the two flag columns to integers,
#   4. index + one-hot encode the categorical columns,
#   5. assemble a base feature vector ("features") and a second vector that
#      also contains squared experience ("features_poly").
# Inputs:  `df` (Spark DataFrame defined in an earlier cell).
# Outputs: `df` (with REMOTE_TYPE added), `regression_df`, `regression_data`,
#          plus the intermediate `indexers`, `encoders`, `assembler_*`, `pipeline`.

from pyspark.sql.functions import col, pow, when
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.types import BooleanType, StringType, IntegerType

# Creating simplified REMOTE_TYPE column: "Remote" stays "Remote",
# "Hybrid Remote" becomes "Hybrid", and every other value is labelled "Onsite".
df = df.withColumn(
    "REMOTE_TYPE",
    when(col("REMOTE_TYPE_NAME") == "Remote", "Remote")
    .when(col("REMOTE_TYPE_NAME") == "Hybrid Remote", "Hybrid")
    .otherwise("Onsite")
)

# Single source of truth for the modelling columns, so the null filter and the
# projection below can never drift apart.
model_columns = [
    "SALARY", "MIN_YEARS_EXPERIENCE", "DURATION",
    "COMPANY_IS_STAFFING", "IS_INTERNSHIP",
    "STATE_NAME", "REMOTE_TYPE",
]

# Drop rows with missing values in relevant columns, then keep only those columns
regression_df = df.dropna(subset=model_columns).select(*model_columns)

# Cast boolean columns to integer so they can be used as numeric features
for flag_col in ("IS_INTERNSHIP", "COMPANY_IS_STAFFING"):
    regression_df = regression_df.withColumn(flag_col, col(flag_col).cast(IntegerType()))

# Categorical columns
categorical_cols = ["STATE_NAME", "REMOTE_TYPE"]

# Index and One-Hot Encode.
# The loop variable is deliberately NOT called `col`: that name is the imported
# pyspark.sql.functions.col, and reusing it is only safe by accident of
# comprehension scoping.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_idx", handleInvalid='skip')
    for name in categorical_cols
]
encoders = [
    OneHotEncoder(inputCol=f"{name}_idx", outputCol=f"{name}_vec")
    for name in categorical_cols
]

# Create squared experience feature
regression_df = regression_df.withColumn("MIN_YEARS_EXPERIENCE_SQ", pow(col("MIN_YEARS_EXPERIENCE"), 2))

# Assemble base features
base_features = [
    "MIN_YEARS_EXPERIENCE", "DURATION", "COMPANY_IS_STAFFING",
    "IS_INTERNSHIP", "STATE_NAME_vec", "REMOTE_TYPE_vec"
]

# Assemble polynomial features (add squared column)
poly_features = base_features + ["MIN_YEARS_EXPERIENCE_SQ"]

assembler_base = VectorAssembler(inputCols=base_features, outputCol="features")
assembler_poly = VectorAssembler(inputCols=poly_features, outputCol="features_poly")

# Build pipeline: indexers first, then encoders (which consume the *_idx
# columns), then both assemblers (which consume the *_vec columns).
pipeline = Pipeline(stages=indexers + encoders + [assembler_base, assembler_poly])

# Fit and transform on the same DataFrame
regression_data = pipeline.fit(regression_df).transform(regression_df)

regression_data.select("SALARY","features","features_poly").show(5, truncate=False)

                                                                                

+------+-----------------------------------------+-------------------------------------------------+
|SALARY|features                                 |features_poly                                    |
+------+-----------------------------------------+-------------------------------------------------+
|192800|(56,[0,1,12,54],[6.0,55.0,1.0,1.0])      |(57,[0,1,12,54,56],[6.0,55.0,1.0,1.0,36.0])      |
|125900|(56,[0,1,15,54],[12.0,18.0,1.0,1.0])     |(57,[0,1,15,54,56],[12.0,18.0,1.0,1.0,144.0])    |
|118560|(56,[0,1,2,25,55],[5.0,20.0,1.0,1.0,1.0])|(57,[0,1,2,25,55,56],[5.0,20.0,1.0,1.0,1.0,25.0])|
|192800|(56,[0,1,11,54],[6.0,55.0,1.0,1.0])      |(57,[0,1,11,54,56],[6.0,55.0,1.0,1.0,36.0])      |
|116500|(56,[0,1,15,54],[12.0,16.0,1.0,1.0])     |(57,[0,1,15,54,56],[12.0,16.0,1.0,1.0,144.0])    |
+------+-----------------------------------------+-------------------------------------------------+
only showing top 5 rows


# Train/Test Split