# Load the Dataset


In [2]:
from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

np.random.seed(42)

pio.renderers.default = "notebook+notebook_connected+vscode"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("data/lightcast_job_postings.csv")

# Show Schema and Sample Data
# print("---This is Diagnostic check, No need to print it in the final doc---")

# df.printSchema() # comment this line when rendering the submission
# df.show(5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/06 21:06:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                


# Feature Engineering

In [None]:
#| eval: true
#| echo: false
#| fig-align: center

from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Drop Missing Values in target variable and key features

df = df.dropna(subset=[
    "SALARY", "MIN_YEARS_EXPERIENCE","EMPLOYMENT_TYPE_NAME","REMOTE_TYPE_NAME",
    "DURATION", "IS_INTERNSHIP","COMPANY_IS_STAFFING", "STATE_NAME", "MIN_EDULEVELS_NAME"
])

# Selecting 2 categorical variables
categorical_cols =[
    "REMOTE_TYPE_NAME", "EMPLOYMENT_TYPE_NAME"
]

#Index and One-Hot encode
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_idx", handleInvalid='skip') for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=f"{col}_idx", outputCol=f"{col}_vec") for col in categorical_cols]

# Assemble base features (for GLR and Random Forrest)
from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

assembler = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features"
)

# Build pipeline and transform
pipeline = Pipeline(stages=indexers + encoders + [assembler])
data = pipeline.fit(df).transform(df)

# Create squared term for Polynomical Regression
data = data.withColumn("MIN_YEARS_EXPERIENCE_SQ", pow(col("MIN_YEARS_EXPERIENCE"),2))

#Assemble polynomial features
assembler_poly = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features_poly"
)

data = assembler_poly.transform(data)

data.select("SALARY","features", "features_poly").show(5, truncate=False)

# Split into training and testing sets (80% training, 20% testing)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Check number of records in each
print("Training Data Count:", train_data.count())
print("Testing Data Count:", test_data.count())



                                                                                

+------+---------------------------------+---------------------------------+
|SALARY|features                         |features_poly                    |
+------+---------------------------------+---------------------------------+
|92962 |(9,[0,1,4,7],[2.0,18.0,1.0,1.0]) |(9,[0,1,4,7],[2.0,18.0,1.0,1.0]) |
|107645|(9,[0,1,7],[10.0,18.0,1.0])      |(9,[0,1,7],[10.0,18.0,1.0])      |
|192800|(9,[0,1,4,7],[6.0,55.0,1.0,1.0]) |(9,[0,1,4,7],[6.0,55.0,1.0,1.0]) |
|125900|(9,[0,1,4,7],[12.0,18.0,1.0,1.0])|(9,[0,1,4,7],[12.0,18.0,1.0,1.0])|
|170000|(9,[0,1,4,7],[6.0,18.0,1.0,1.0]) |(9,[0,1,4,7],[6.0,18.0,1.0,1.0]) |
+------+---------------------------------+---------------------------------+
only showing top 5 rows



                                                                                

Training Data Count: 18966


[Stage 141:>                                                        (0 + 1) / 1]

Testing Data Count: 4731


                                                                                

# Train/Test Split

In [None]:
# Split into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Check number of records in each
print("Training Data Count:", train_data.count())
print("Testing Data Count:", test_data.count())



                                                                                

Training Data Count: 18966


[Stage 147:>                                                        (0 + 1) / 1]

Testing Data Count: 4731


                                                                                

I chose the 80/20 split after consulting with AI (CHATGPT SEE APPENDIX 1) basing it on the widely accepted standard for machine learning, which is good for model effectiveness and performance.

# Linear Regression Model (OLS)

25/10/06 22:11:22 WARN Instrumentation: [9844a335] regParam is zero, which might cause numerical instability and overfitting.
[Stage 157:>                                                        (0 + 1) / 1]

Intercept: 75298.51651262121
Coefficients: [6615.06097689284,-90.88575270939587,-95.6950324882812,-2338.5967976991965,10727.559248773261,13561.321280558403,18360.09331832105,1085.2671075200626,-6512.923684351729]
R²: 0.2632
RMSE: 37291.4951
MAE: 28918.2778


                                                                                

AttributeError: 'LinearRegressionTrainingSummary' object has no attribute 'featureNames'