# Load the Dataset


In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

np.random.seed(42)

pio.renderers.default = "notebook+notebook_connected+vscode"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("data/lightcast_job_postings.csv")

# Show Schema and Sample Data
# print("---This is Diagnostic check, No need to print it in the final doc---")

# df.printSchema() # comment this line when rendering the submission
# df.show(5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/07 03:00:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                


# Feature Engineering

In [2]:
#| eval: true
#| echo: false
#| fig-align: center

from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Drop Missing Values in target variable and key features

df = df.dropna(subset=[
    "SALARY", "MIN_YEARS_EXPERIENCE","EMPLOYMENT_TYPE_NAME","REMOTE_TYPE_NAME",
    "DURATION", "IS_INTERNSHIP","COMPANY_IS_STAFFING", "STATE_NAME", "MIN_EDULEVELS_NAME"
])

# Selecting 2 categorical variables
categorical_cols =[
    "REMOTE_TYPE_NAME", "EMPLOYMENT_TYPE_NAME"
]

#Index and One-Hot encode
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_idx", handleInvalid='skip') for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=f"{col}_idx", outputCol=f"{col}_vec") for col in categorical_cols]

# Assemble base features (for GLR and Random Forrest)
from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

assembler = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features"
)

# Build pipeline and transform
pipeline = Pipeline(stages=indexers + encoders + [assembler])
data = pipeline.fit(df).transform(df)

# Create squared term for Polynomical Regression
data = data.withColumn("MIN_YEARS_EXPERIENCE_SQ", pow(col("MIN_YEARS_EXPERIENCE"),2))

#Assemble polynomial features
assembler_poly = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features_poly"
)

data = assembler_poly.transform(data)

data.select("SALARY","features", "features_poly").show(5, truncate=False)

# Split into training and testing sets (80% training, 20% testing)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Check number of records in each
print("Training Data Count:", train_data.count())
print("Testing Data Count:", test_data.count())



                                                                                

+------+--------------------------------------+--------------------------------------+
|SALARY|features                              |features_poly                         |
+------+--------------------------------------+--------------------------------------+
|192800|(9,[0,1,4,7],[6.0,55.0,1.0,1.0])      |(9,[0,1,4,7],[6.0,55.0,1.0,1.0])      |
|125900|(9,[0,1,4,7],[12.0,18.0,1.0,1.0])     |(9,[0,1,4,7],[12.0,18.0,1.0,1.0])     |
|118560|[5.0,20.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0]|[5.0,20.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0]|
|192800|(9,[0,1,4,7],[6.0,55.0,1.0,1.0])      |(9,[0,1,4,7],[6.0,55.0,1.0,1.0])      |
|116500|(9,[0,1,4,7],[12.0,16.0,1.0,1.0])     |(9,[0,1,4,7],[12.0,16.0,1.0,1.0])     |
+------+--------------------------------------+--------------------------------------+
only showing top 5 rows



25/10/07 03:01:18 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Training Data Count: 11604


[Stage 12:>                                                         (0 + 1) / 1]

Testing Data Count: 2812


                                                                                

# Train/Test Split

In [7]:
# Split into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Check number of records in each
print("Training Data Count:", train_data.count())
print("Testing Data Count:", test_data.count())

# Split Data
regression_train, regression_test = regression_data.randomSplit([0.8, 0.2], seed=42)
print((regression_data.count(), len(regression_data.columns)))
print((regression_train.count(), len(regression_train.columns)))
print((regression_test.count(), len(regression_test.columns)))



                                                                                

Training Data Count: 11604


                                                                                

Testing Data Count: 2812


                                                                                

(2243, 14)


                                                                                

(1848, 14)


[Stage 41:>                                                         (0 + 1) / 1]

(395, 14)


                                                                                

- I chose the 80/20 split after consulting with AI (CHATGPT SEE APPENDIX 1) basing it on the widely accepted standard for machine learning, which is good for model effectiveness and performance.

# Regression_DF

In [8]:
from pyspark.sql.functions import regexp_replace, trim, col
from pyspark.sql.types import IntegerType

regression_df = df.select(
    "SALARY",
    "MIN_YEARS_EXPERIENCE",
    "MAX_YEARS_EXPERIENCE",
    "DURATION",
    "IS_INTERNSHIP",
    "COMPANY_IS_STAFFING",
    "REMOTE_TYPE_NAME",
    "EMPLOYMENT_TYPE_NAME",
    "EDUCATION_LEVELS_NAME"
)

regression_df = regression_df.withColumn(
    "EDUCATION_LEVELS_NAME",
    trim(regexp_replace(col("EDUCATION_LEVELS_NAME"), r"[\.\-]", " "))
)

regression_df = regression_df.dropna(subset=[
    "SALARY",
    "MIN_YEARS_EXPERIENCE",
    "MAX_YEARS_EXPERIENCE",
    "DURATION",
    "IS_INTERNSHIP",
    "COMPANY_IS_STAFFING",
    "REMOTE_TYPE_NAME",
    "EMPLOYMENT_TYPE_NAME",
    "EDUCATION_LEVELS_NAME"
])

# Convert Duration to numeric (in days)
regression_df =regression_df.withColumn("DURATION", col("DURATION").cast(IntegerType()))

regression_df.show(5, truncate=False)

+------+--------------------+--------------------+--------+-------------+-------------------+----------------+----------------------+------------------------------------------------------------------------------------+
|SALARY|MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|DURATION|IS_INTERNSHIP|COMPANY_IS_STAFFING|REMOTE_TYPE_NAME|EMPLOYMENT_TYPE_NAME  |EDUCATION_LEVELS_NAME                                                               |
+------+--------------------+--------------------+--------+-------------+-------------------+----------------+----------------------+------------------------------------------------------------------------------------+
|131100|2                   |2                   |11      |false        |false              |[None]          |Full-time (> 32 hours)|[\n  "Bachelor's degree"\n]                                                         |
|136950|3                   |3                   |28      |false        |false              |Remote          |Full-time (> 3

# Linear Regression Model (OLS)

In [9]:

from pyspark.sql.functions import regexp_replace, trim

regression_df = regression_df.withColumn("EDUCATION_LEVELS_NAME", trim(regexp_replace(col("EDUCATION_LEVELS_NAME"), r"[\.\-]", " ")))

# Index and One-Hot Encode
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_idx", handleInvalid='skip') for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=f"{col}_idx", outputCol=f"{col}_vec") for col in categorical_cols]

# Assemble base features (for GLR and Random Forrest)
assembler = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features"
)

pipeline_reg = Pipeline(stages=indexers + encoders + [assembler])
regression_data = pipeline_reg.fit(regression_df).transform(regression_df)

regression_data.select("SALARY","features").show(5, truncate=False)



                                                                                

+------+---------------------------------------+
|SALARY|features                               |
+------+---------------------------------------+
|131100|(10,[0,1,2,5,8],[2.0,2.0,11.0,1.0,1.0])|
|136950|(10,[0,1,2,6,8],[3.0,3.0,28.0,1.0,1.0])|
|136950|(10,[0,1,2,6,8],[3.0,3.0,28.0,1.0,1.0])|
|104000|(10,[0,1,2,5,8],[3.0,3.0,8.0,1.0,1.0]) |
|80000 |(10,[0,1,2,5,8],[3.0,3.0,37.0,1.0,1.0])|
+------+---------------------------------------+
only showing top 5 rows



# Linear Regression

In [None]:
from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression

feature_names = assembler.getInputCols()

glr = GeneralizedLinearRegression(
    featuresCol="features",
    labelCol="SALARY",
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.3
)

glr_model = glr.fit(regression_data)
summary = glr_model.summary


                                                                                

In [11]:
# Coefficients and Intercept
print("Intercept: {:.4f}".format(glr_model.intercept))
print("Coefficients: ")
for i, coef in enumerate(glr_model.coefficients):
    print(f"  Feature {i + 1}: {coef:.4f}")

Intercept: 72867.8012
Coefficients: 
  Feature 1: 6788.4217
  Feature 2: -30.2179
  Feature 3: -7430.8468
  Feature 4: -462.8348
  Feature 5: 9376.7518
  Feature 6: 10966.5333
  Feature 7: 24479.1926
  Feature 8: 873.5864
  Feature 9: -1045.6683


In [12]:
# Summary stats

print("\n---Regression Summary---")
print("Coefficient Standard Errors:", [f"{val:.4f}" for val in summary.coefficientStandardErrors])
print("T-values:", [f"{val:.4f}" for val in summary.tValues])
print("P-values:", [f"{val:.4f}" for val in summary.pValues])


---Regression Summary---


[Stage 53:>                                                         (0 + 1) / 1]

Coefficient Standard Errors: ['101.7731', '23.4317', '6719.3065', '1059.4688', '2390.1405', '2457.1968', '3083.8552', '2864.5937', '3513.3919', '3466.3799']
T-values: ['66.7015', '-1.2896', '-1.1059', '-0.4369', '3.9231', '4.4630', '7.9379', '0.3050', '-0.2976', '21.0213']
P-values: ['0.0000', '0.1972', '0.2688', '0.6622', '0.0001', '0.0000', '0.0000', '0.7604', '0.7660', '0.0000']


                                                                                

In [13]:
print(f"nullDeviance: {summary.nullDeviance:.4f}")
print(f"Residual DF Null: {summary.residualDegreeOfFreedom}")
print(f"Residual DF: {summary.residualDegreeOfFreedom}")
print(f"AIC: {summary.aic:.4f}")
print(f"Deviance: {summary.deviance:.4f}")

                                                                                

nullDeviance: 21009946856970.0977
Residual DF Null: 11594
Residual DF: 11594


[Stage 57:>                                                         (0 + 1) / 1]

AIC: 276466.6390
Deviance: 15080861035926.1094


                                                                                

In [14]:
# Pull feature names directly from Java backend

feature_names = summary._call_java("featureNames")

# Construct full table including intercept
features = ["Intercept"] + feature_names
coef = [glr_model.intercept] + list(glr_model.coefficients)
se = list(summary.coefficientStandardErrors)
tvals = list(summary.tValues)
pvals = list(summary.pValues)

print("Length of features:", len(features))
print("Length of coefficients:", len(coef))
print("Length of standard errors:", len(se))
print("Length of t-values:", len(tvals))
print("Length of p-values:", len(pvals))

Length of features: 10
Length of coefficients: 10
Length of standard errors: 10
Length of t-values: 10
Length of p-values: 10


In [15]:
import pandas as pd
from tabulate import tabulate
from IPython.display import HTML

coef_table = pd.DataFrame({
    "Features": features,
    "Estimate": [f"{v:.4f}" if v is not None else None for v in coef],
    "Std. Error": [f"{v:.4f}" if v is not None else None for v in se],
    "t value": [f"{v:.4f}" if v is not None else None for v in tvals],
    "P-value": [f"{v:.4f}" if v is not None else None for v in pvals]
})

# Save Report
coef_table.to_csv("output/glr_summary.csb", index=False)

# Optional pretty print

HTML (coef_table.to_html())

Unnamed: 0,Features,Estimate,Std. Error,t value,P-value
0,Intercept,72867.8012,101.7731,66.7015,0.0
1,MIN_YEARS_EXPERIENCE,6788.4217,23.4317,-1.2896,0.1972
2,DURATION,-30.2179,6719.3065,-1.1059,0.2688
3,IS_INTERNSHIP,-7430.8468,1059.4688,-0.4369,0.6622
4,COMPANY_IS_STAFFING,-462.8348,2390.1405,3.9231,0.0001
5,REMOTE_TYPE_NAME_vec_[None],9376.7518,2457.1968,4.463,0.0
6,REMOTE_TYPE_NAME_vec_Remote,10966.5333,3083.8552,7.9379,0.0
7,REMOTE_TYPE_NAME_vec_Hybrid Remote,24479.1926,2864.5937,0.305,0.7604
8,EMPLOYMENT_TYPE_NAME_vec_Full-time (> 32 hours),873.5864,3513.3919,-0.2976,0.766
9,EMPLOYMENT_TYPE_NAME_vec_Part-time (â‰¤ 32 hours),-1045.6683,3466.3799,21.0213,0.0


- Performance

# Generalized Linear Regression Summary
- 

# Polynomical Regression

In [28]:
#| eval: true
#| echo: false
#| fig-align: center

# Index and One-Hot encode

poly_data = regression_data.withColumn("MIN_YEARS_EXPERIENCE_SQ", pow(col("MIN_YEARS_EXPERIENCE"),2))

assembler_poly = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "MIN_YEARS_EXPERIENCE_SQ", "MAX_YEARS_EXPERIENCE", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features_poly"
)  

poly_data = assembler_poly.transform(poly_data)

poly_data.select("SALARY","features_poly").show(5, truncate=False)

+------+---------------------------------------------+
|SALARY|features_poly                                |
+------+---------------------------------------------+
|131100|(11,[0,1,2,3,6,9],[2.0,4.0,2.0,11.0,1.0,1.0])|
|136950|(11,[0,1,2,3,7,9],[3.0,9.0,3.0,28.0,1.0,1.0])|
|136950|(11,[0,1,2,3,7,9],[3.0,9.0,3.0,28.0,1.0,1.0])|
|104000|(11,[0,1,2,3,6,9],[3.0,9.0,3.0,8.0,1.0,1.0]) |
|80000 |(11,[0,1,2,3,6,9],[3.0,9.0,3.0,37.0,1.0,1.0])|
+------+---------------------------------------------+
only showing top 5 rows



In [29]:
# Split Data
polyreg_train, polyreg_test = poly_data.randomSplit([0.8, 0.2], seed=42)
print((poly_data.count(), len(poly_data.columns)))
print((polyreg_train.count(), len(polyreg_train.columns)))
print((polyreg_test.count(), len(polyreg_test.columns)))


                                                                                

(2243, 16)


                                                                                

(1848, 16)


[Stage 86:>                                                         (0 + 1) / 1]

(395, 16)


                                                                                

In [30]:
from pyspark.ml.regression import GeneralizedLinearRegression, LinearRegression

feature_names = assembler.getInputCols()

poly_glr_max_years_model = GeneralizedLinearRegression(
    featuresCol="features",
    labelCol="SALARY",
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.3
)

poly_glr_max_years_model = poly_glr_max_years_model.fit(poly_data)
poly_summary = poly_glr_max_years_model.summary

                                                                                

In [31]:
# Coefficients and Intercept
print("Intercept: {:.4f}".format(poly_glr_max_years_model.intercept))
print("Coefficients: ")
for i, coef in enumerate(poly_glr_max_years_model.coefficients):
    print(f"  Feature {i + 1}: {coef:.4f}")

    # Summary stats

print("\n---Regression Summary---")
print("Coefficient Standard Errors:", [f"{val:.4f}" for val in poly_summary.coefficientStandardErrors])
print("T-values:", [f"{val:.4f}" for val in poly_summary.tValues])
print("P-values:", [f"{val:.4f}" for val in poly_summary.pValues])

Intercept: 75819.4000
Coefficients: 
  Feature 1: 4215.6844
  Feature 2: 4215.6844
  Feature 3: -23.7725
  Feature 4: -23942.2288
  Feature 5: -98.6466
  Feature 6: 6430.6643
  Feature 7: 16463.9439
  Feature 8: 655.1948
  Feature 9: -9251.7034
  Feature 10: -12761.7200

---Regression Summary---


[Stage 90:>                                                         (0 + 1) / 1]

Coefficient Standard Errors: ['57432.0320', '57432.0320', '40.9532', '12518.4270', '1651.7511', '5327.5134', '5421.5678', '6892.7578', '7503.5432', '8182.7045', '9011.3793']
T-values: ['0.0734', '0.0734', '-0.5805', '-1.9126', '-0.0597', '1.2071', '3.0367', '0.0951', '-1.2330', '-1.5596', '8.4137']
P-values: ['0.9415', '0.9415', '0.5617', '0.0559', '0.9524', '0.2275', '0.0024', '0.9243', '0.2177', '0.1190', '0.0000']


                                                                                

In [32]:
print(f"nullDeviance: {poly_summary.nullDeviance:.4f}")
print(f"Residual DF Null: {poly_summary.residualDegreeOfFreedom}")
print(f"Residual DF: {poly_summary.residualDegreeOfFreedom}")
print(f"AIC: {poly_summary.aic:.4f}")
print(f"Deviance: {poly_summary.deviance:.4f}")

[Stage 93:>                                                         (0 + 1) / 1]

nullDeviance: 2728391925894.5732
Residual DF Null: 2232
Residual DF: 2232


[Stage 94:>                                                         (0 + 1) / 1]

AIC: 52299.8485
Deviance: 1738269402130.5212


                                                                                

In [34]:
# Pull feature names directly from Java backend

feature_names = poly_summary._call_java("featureNames")

# Construct full table including intercept
poly_features = ["Intercept"] + feature_names
poly_coef = [poly_glr_max_years_model.intercept] + list(poly_glr_max_years_model.coefficients)
poly_se = list(poly_summary.coefficientStandardErrors)
poly_tvals = list(poly_summary.tValues)
poly_pvals = list(poly_summary.pValues)

print("Length of features:", len(poly_features))
print("Length of coefficients:", len(poly_coef))
print("Length of standard errors:", len(poly_se))
print("Length of t-values:", len(poly_tvals))
print("Length of p-values:", len(poly_pvals))

Length of features: 11
Length of coefficients: 11
Length of standard errors: 11
Length of t-values: 11
Length of p-values: 11


In [35]:
import pandas as pd
from tabulate import tabulate
from IPython.display import HTML

poly_coef_table = pd.DataFrame({
    "Features": poly_features,
    "Estimate": [f"{v:.4f}" if v is not None else None for v in poly_coef],
    "Std. Error": [f"{v:.4f}" if v is not None else None for v in poly_se],
    "t value": [f"{v:.4f}" if v is not None else None for v in poly_tvals],
    "P-value": [f"{v:.4f}" if v is not None else None for v in poly_pvals]
})

# Save Report
poly_coef_table.to_csv("output/poly_summary.csb", index=False)

# Optional pretty print

HTML (poly_coef_table.to_html())

Unnamed: 0,Features,Estimate,Std. Error,t value,P-value
0,Intercept,75819.4,57432.032,0.0734,0.9415
1,MIN_YEARS_EXPERIENCE,4215.6844,57432.032,0.0734,0.9415
2,MAX_YEARS_EXPERIENCE,4215.6844,40.9532,-0.5805,0.5617
3,DURATION,-23.7725,12518.427,-1.9126,0.0559
4,IS_INTERNSHIP,-23942.2288,1651.7511,-0.0597,0.9524
5,COMPANY_IS_STAFFING,-98.6466,5327.5134,1.2071,0.2275
6,REMOTE_TYPE_NAME_vec_[None],6430.6643,5421.5678,3.0367,0.0024
7,REMOTE_TYPE_NAME_vec_Remote,16463.9439,6892.7578,0.0951,0.9243
8,REMOTE_TYPE_NAME_vec_Hybrid Remote,655.1948,7503.5432,-1.233,0.2177
9,EMPLOYMENT_TYPE_NAME_vec_Full-time (> 32 hours),-9251.7034,8182.7045,-1.5596,0.119
