---
# Quarto front matter for the Assignment 04 report.
title: Assignment 04
author:
  - name: Bhargavi Manyala
    affiliations:
      - id: bu
        name: Boston University
        city: Boston
        state: MA
# Number the section headings in the rendered output.
number-sections: true
# Both dates use the `today` keyword instead of a fixed date, so they are
# expected to follow the render date.
# NOTE(review): confirm this is intended if a fixed submission date matters.
date: today
date-modified: today
date-format: long
# HTML output: cerulean theme, table of contents limited to depth 2.
format:
  html:
    theme: cerulean
    toc: true
    toc-depth: 2
# Code cells run through Jupyter with the kernel named below; that kernel
# must be installed on the machine that renders the document.
engine: jupyter
jupyter: assignment-04-kernel
# Cell defaults: show the source (echo), run it (eval), include results
# (output).
# NOTE(review): `freeze: auto` is presumably meant to reuse stored results
# until the source changes — confirm it takes effect outside a Quarto project.
execute:
  echo: true
  eval: true
  output: true
  freeze: auto
---

# Load the Dataset

In [14]:
from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np
from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, regexp_replace, trim


# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)

# Enable several Plotly renderers at once (names are joined with "+").
pio.renderers.default = "notebook+notebook_connected+vscode"

# Create the Spark session, or reuse one that already exists.
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Read the job-postings CSV. The reader is told that the first row is a
# header, that column types should be inferred, that a record may span
# several lines, and that a double quote is the escape character.
df = spark.read.options(
    header="true",
    inferSchema="true",
    multiLine="true",
    escape="\"",
).csv("data/lightcast_job_postings.csv")

# Make the DataFrame queryable from Spark SQL under the name "job_postings".
df.createOrReplaceTempView("job_postings")
# df.show(5)  # uncomment to preview the first rows


                                                                                

# Feature Engineering

## Selection of Variables

In [None]:
# Column names used for the rest of the analysis.
# NOTE(review): y_col is presumably the modelling target and the others the
# predictors — confirm against the later model cells.
min_years = "MIN_YEARS_EXPERIENCE"
cont_cols = ["MAX_YEARS_EXPERIENCE", "DURATION", "SALARY_FROM"]
cat_cols = ["EMPLOYMENT_TYPE_NAME", "EDUCATION_LEVELS_NAME"]
y_col = "SALARY"

numeric_cols = [y_col, min_years] + cont_cols
required_cols = numeric_cols + cat_cols

# Keep only the columns that are needed.
df_clean = df.select(required_cols)

# --- Cast numeric columns to double ---
# The cast runs BEFORE the NA drop on purpose: a value that cannot be parsed
# as a double becomes null (when Spark's ANSI mode is off), and the dropna
# below then removes that row. Casting after the drop would let such nulls
# slip through to later stages.
for c in numeric_cols:
    df_clean = df_clean.withColumn(c, col(c).cast("double"))

# Drop every row that has a null in any required column.
df_clean = df_clean.dropna(subset=required_cols)

print("Rows before:", df.count(), "| Rows after NA drop:", df_clean.count())

# --- Clean EDUCATION_LEVELS_NAME ---
# Strip square brackets, newlines and double quotes, then trim the
# surrounding whitespace.
df_clean = df_clean.withColumn(
    "EDUCATION_LEVELS_NAME",
    trim(regexp_replace(col("EDUCATION_LEVELS_NAME"), r"[\[\]\n\"]", ""))
)

# --- Final Check ---
print("[OK] Data cleaned and numeric columns casted.")
df_clean.show(5)


                                                                                

Rows before: 72498 | Rows after NA drop: 2243
[OK] Data cleaned and numeric columns casted.
+--------+--------------------+--------------------+--------+-----------+--------------------+---------------------+
|  SALARY|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|DURATION|SALARY_FROM|EMPLOYMENT_TYPE_NAME|EDUCATION_LEVELS_NAME|
+--------+--------------------+--------------------+--------+-----------+--------------------+---------------------+
|131100.0|                 2.0|                 2.0|    11.0|   113400.0|Full-time (> 32 h...|    Bachelor's degree|
|136950.0|                 3.0|                 3.0|    28.0|   115300.0|Full-time (> 32 h...| Bachelor's degree...|
|136950.0|                 3.0|                 3.0|    28.0|   115300.0|Full-time (> 32 h...| Bachelor's degree...|
|104000.0|                 3.0|                 3.0|     8.0|   104000.0|Full-time (> 32 h...|    Bachelor's degree|
| 80000.0|                 3.0|                 3.0|    37.0|    60000.0|Full-time (> 32 