# Load the Dataset


In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

np.random.seed(42)

pio.renderers.default = "notebook+notebook_connected+vscode"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("data/lightcast_job_postings.csv")

# Show Schema and Sample Data
# print("---This is Diagnostic check, No need to print it in the final doc---")

# df.printSchema() # comment this line when rendering the submission
# df.show(5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/07 01:47:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                


# Feature Engineering

In [2]:
#| eval: true
#| echo: false
#| fig-align: center

from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Drop Missing Values in target variable and key features

df = df.dropna(subset=[
    "SALARY", "MIN_YEARS_EXPERIENCE","EMPLOYMENT_TYPE_NAME","REMOTE_TYPE_NAME",
    "DURATION", "IS_INTERNSHIP","COMPANY_IS_STAFFING", "STATE_NAME", "MIN_EDULEVELS_NAME"
])

# Selecting 2 categorical variables
categorical_cols =[
    "REMOTE_TYPE_NAME", "EMPLOYMENT_TYPE_NAME"
]

#Index and One-Hot encode
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_idx", handleInvalid='skip') for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=f"{col}_idx", outputCol=f"{col}_vec") for col in categorical_cols]

# Assemble base features (for GLR and Random Forrest)
from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

assembler = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features"
)

# Build pipeline and transform
pipeline = Pipeline(stages=indexers + encoders + [assembler])
data = pipeline.fit(df).transform(df)

# Create squared term for Polynomical Regression
data = data.withColumn("MIN_YEARS_EXPERIENCE_SQ", pow(col("MIN_YEARS_EXPERIENCE"),2))

#Assemble polynomial features
assembler_poly = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features_poly"
)

data = assembler_poly.transform(data)

data.select("SALARY","features", "features_poly").show(5, truncate=False)

# Split into training and testing sets (80% training, 20% testing)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Check number of records in each
print("Training Data Count:", train_data.count())
print("Testing Data Count:", test_data.count())



                                                                                

+------+--------------------------------------+--------------------------------------+
|SALARY|features                              |features_poly                         |
+------+--------------------------------------+--------------------------------------+
|192800|(9,[0,1,4,7],[6.0,55.0,1.0,1.0])      |(9,[0,1,4,7],[6.0,55.0,1.0,1.0])      |
|125900|(9,[0,1,4,7],[12.0,18.0,1.0,1.0])     |(9,[0,1,4,7],[12.0,18.0,1.0,1.0])     |
|118560|[5.0,20.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0]|[5.0,20.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0]|
|192800|(9,[0,1,4,7],[6.0,55.0,1.0,1.0])      |(9,[0,1,4,7],[6.0,55.0,1.0,1.0])      |
|116500|(9,[0,1,4,7],[12.0,16.0,1.0,1.0])     |(9,[0,1,4,7],[12.0,16.0,1.0,1.0])     |
+------+--------------------------------------+--------------------------------------+
only showing top 5 rows



25/10/07 01:47:46 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Training Data Count: 11604


[Stage 12:>                                                         (0 + 1) / 1]

Testing Data Count: 2812


                                                                                

# Train/Test Split

In [6]:
# Split into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Check number of records in each
print("Training Data Count:", train_data.count())
print("Testing Data Count:", test_data.count())

# Split Data
regression_train, regression_test = regression_data.randomSplit([0.8, 0.2], seed=42)
print((regression_data.count(), len(regression_data.columns)))
print((regression_train.count(), len(regression_train.columns)))
print((regression_test.count(), len(regression_test.columns)))



                                                                                

Training Data Count: 11604


                                                                                

Testing Data Count: 2812


                                                                                

(2243, 14)


                                                                                

(1848, 14)


[Stage 41:>                                                         (0 + 1) / 1]

(395, 14)


                                                                                

I chose the 80/20 split after consulting with AI (CHATGPT SEE APPENDIX 1) basing it on the widely accepted standard for machine learning, which is good for model effectiveness and performance.

# Regression_DF

In [4]:
from pyspark.sql.functions import regexp_replace, trim, col

regression_df = df.select(
    "SALARY",
    "MIN_YEARS_EXPERIENCE",
    "MAX_YEARS_EXPERIENCE",
    "DURATION",
    "IS_INTERNSHIP",
    "COMPANY_IS_STAFFING",
    "REMOTE_TYPE_NAME",
    "EMPLOYMENT_TYPE_NAME",
    "EDUCATION_LEVELS_NAME"
)

regression_df = regression_df.withColumn(
    "EDUCATION_LEVELS_NAME",
    trim(regexp_replace(col("EDUCATION_LEVELS_NAME"), r"[\.\-]", " "))
)

regression_df = regression_df.dropna(subset=[
    "SALARY",
    "MIN_YEARS_EXPERIENCE",
    "MAX_YEARS_EXPERIENCE",
    "DURATION",
    "IS_INTERNSHIP",
    "COMPANY_IS_STAFFING",
    "REMOTE_TYPE_NAME",
    "EMPLOYMENT_TYPE_NAME",
    "EDUCATION_LEVELS_NAME"
])

regression_df.show(5, truncate=False)

+------+--------------------+--------------------+--------+-------------+-------------------+----------------+----------------------+------------------------------------------------------------------------------------+
|SALARY|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|DURATION|IS_INTERNSHIP|COMPANY_IS_STAFFING|REMOTE_TYPE_NAME|EMPLOYMENT_TYPE_NAME  |EDUCATION_LEVELS_NAME                                                               |
+------+--------------------+--------------------+--------+-------------+-------------------+----------------+----------------------+------------------------------------------------------------------------------------+
|131100|2                   |2                   |11      |false        |false              |[None]          |Full-time (> 32 hours)|[\n  "Bachelor's degree"\n]                                                         |
|136950|3                   |3                   |28      |false        |false              |Remote          |Full-time (> 3

# Linear Regression Model (OLS)

In [5]:

from pyspark.sql.functions import regexp_replace, trim

regression_df = regression_df.withColumn("EDUCATION_LEVELS_NAME", trim(regexp_replace(col("EDUCATION_LEVELS_NAME"), r"[\.\-]", " ")))

# Index and One-Hot Encode
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_idx", handleInvalid='skip') for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=f"{col}_idx", outputCol=f"{col}_vec") for col in categorical_cols]

# Assemble base features (for GLR and Random Forrest)
assembler = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features"
)

pipeline_reg = Pipeline(stages=indexers + encoders + [assembler])
regression_data = pipeline_reg.fit(regression_df).transform(regression_df)

regression_data.select("SALARY","features").show(5, truncate=False)



                                                                                

+------+---------------------------------------+
|SALARY|features                               |
+------+---------------------------------------+
|131100|(10,[0,1,2,5,8],[2.0,2.0,11.0,1.0,1.0])|
|136950|(10,[0,1,2,6,8],[3.0,3.0,28.0,1.0,1.0])|
|136950|(10,[0,1,2,6,8],[3.0,3.0,28.0,1.0,1.0])|
|104000|(10,[0,1,2,5,8],[3.0,3.0,8.0,1.0,1.0]) |
|80000 |(10,[0,1,2,5,8],[3.0,3.0,37.0,1.0,1.0])|
+------+---------------------------------------+
only showing top 5 rows

