---
title: Assignment 04
author:
  - name: Bhargavi Manyala
    affiliations:
      - id: bu
        name: Boston University
        city: Boston
        state: MA
number-sections: true
date: today
date-modified: today
date-format: long
format:
  html:
    theme: cerulean
    toc: true
    toc-depth: 2
  pdf: 
    embed-resources: true
    toc-depth: 2
    geometry: 
      - landscape
      - margin=0.5in
engine: jupyter
jupyter: assignment-04-kernel
execute:
  echo: true
  eval: true
  output: true
  freeze: auto
---



 # Load the Dataset

In [68]:
from pyspark.sql import SparkSession
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np


np.random.seed(42)

pio.renderers.default = "notebook+notebook_connected+vscode"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiLine", "true")
    .option("escape", "\"")  
    .csv("data/lightcast_job_postings.csv")

)

df.createOrReplaceTempView("job_postings")
#df.show(5)


                                                                                

### Missing Value Treatment

In [69]:
# Missing Value Treatment
from pyspark.sql import Window
from pyspark.sql.functions import col, when, isnan, count, expr, median
from pyspark.sql import functions as F

# Calculate overall median salary
overall_median_salarly = df.approxQuantile("SALARY", [0.5], 0.01)[0]

median_by_employment_type = df.groupBy("EMPLOYMENT_TYPE").agg(expr("percentile_approx(SALARY, 0.5)").alias("median_salary_emp_type"))
median_by_employment_type_name = df.groupBy("EMPLOYMENT_TYPE_NAME").agg(expr("percentile_approx(SALARY, 0.5)").alias("median_salary_emp_type_name"))

# Join median values back to the original dataframe
df_salary_imputed = df.join(median_by_employment_type, on="EMPLOYMENT_TYPE", how = "left").join(median_by_employment_type_name, on="EMPLOYMENT_TYPE_NAME", how = "left")


# Replace missing SALARY values
df_salary_imputed=df_salary_imputed.withColumn("SALARY", when(col("SALARY").isNull(), 
                                when (col("median_salary_emp_type").isNotNull(), col("median_salary_emp_type"))
                                .when(col("median_salary_emp_type_name").isNotNull(), col("median_salary_emp_type_name"))
                                .otherwise(overall_median_salarly)
).otherwise(col("SALARY"))) 




                                                                                

# Feature Engineering

### Take columns needed for anlaysis

In [70]:

from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.types import BooleanType, StringType, IntegerType
from pyspark.sql.functions import regexp_replace, trim

# Drop rows with NA values 
regression_df = df_salary_imputed.dropna(subset=[
    "SALARY", "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE",
    "EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME",
    "DURATION", "IS_INTERNSHIP", "COMPANY_IS_STAFFING","median_salary_emp_type", "median_salary_emp_type_name"
]).select(
    "SALARY", "MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE",
    "EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME",
    "DURATION", "IS_INTERNSHIP", "COMPANY_IS_STAFFING","median_salary_emp_type", "median_salary_emp_type_name"
   
)

# Cast Duration to integer
regression_df = regression_df.withColumn("DURATION", col("DURATION").cast(IntegerType()))



### Clean categorical columns 


In [71]:
# Categorical columns
categorical_cols = [ "EDUCATION_LEVELS_NAME", "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME", "IS_INTERNSHIP", "COMPANY_IS_STAFFING"]

# Cast boolean columns to integer
regression_df = regression_df.withColumn("IS_INTERNSHIP", col("IS_INTERNSHIP").cast(IntegerType()))
regression_df = regression_df.withColumn("COMPANY_IS_STAFFING", col("COMPANY_IS_STAFFING").cast(IntegerType()))


# Clean Remote Type Name
regression_df = regression_df.withColumn(
    "REMOTE_TYPE_NAME",
    when(col("REMOTE_TYPE_NAME") == "Remote", "Remote")
    .when(col("REMOTE_TYPE_NAME") == "[None]", "Undefined")
    .when(col("REMOTE_TYPE_NAME") == "Not Remote", "On Premise")
    .when(col("REMOTE_TYPE_NAME") == "Hybrid Remote", "Hybrid")
    .when(col("REMOTE_TYPE_NAME").isNull(), "On Premise")
    .otherwise(col("REMOTE_TYPE_NAME"))
)

# Clean Employment Type Name
regression_df = regression_df.withColumn(
    "EMPLOYMENT_TYPE_NAME",
    when(col("EMPLOYMENT_TYPE_NAME") == "Part-time / full-time", "Flexible")
    .when(col("EMPLOYMENT_TYPE_NAME") == "Part-time (â‰¤ 32 hours)", "Parttime")
    .when(col("EMPLOYMENT_TYPE_NAME") == "Full-time (> 32 hours)", "Fulltime")
    .when(col("EMPLOYMENT_TYPE_NAME").isNull(), "Fulltime")
    .otherwise(col("EMPLOYMENT_TYPE_NAME"))
)

# Clean Education Levels
regression_df = regression_df.withColumn(
    "EDUCATION_LEVELS_NAME",
    trim(regexp_replace(col("EDUCATION_LEVELS_NAME"), r"[\[\]\n]", ""))
)


regression_df.show(5, truncate=False)



[Stage 1071:>               (0 + 1) / 1][Stage 1072:>               (0 + 1) / 1]

+--------+--------------------+--------------------+---------------------+--------------------+----------------+--------+-------------+-------------------+----------------------+---------------------------+
|SALARY  |MIN_YEARS_EXPERIENCE|MAX_YEARS_EXPERIENCE|EDUCATION_LEVELS_NAME|EMPLOYMENT_TYPE_NAME|REMOTE_TYPE_NAME|DURATION|IS_INTERNSHIP|COMPANY_IS_STAFFING|median_salary_emp_type|median_salary_emp_type_name|
+--------+--------------------+--------------------+---------------------+--------------------+----------------+--------+-------------+-------------------+----------------------+---------------------------+
|116500.0|2                   |2                   |"Bachelor's degree"  |Fulltime            |Undefined       |6       |0            |0                  |116500                |116500                     |
|116500.0|7                   |7                   |"No Education Listed"|Fulltime            |Undefined       |18      |0            |1                  |116500           

                                                                                

### Final Features Structure

In [72]:
# Index and One-Hot Encode
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_idx", handleInvalid="skip") for col in categorical_cols]
encoders = [OneHotEncoder(inputCol=f"{col}_idx", outputCol=f"{col}_vec") for col in categorical_cols]

#Assemble base features
assembler = VectorAssembler(inputCols=["MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE", "DURATION", "IS_INTERNSHIP", "COMPANY_IS_STAFFING"] + [f"{col}_vec" for col in categorical_cols], outputCol="features")


pipeline = Pipeline(stages=indexers + encoders + [assembler])
regression_data =  pipeline.fit(regression_df).transform(regression_df)
regression_data.select("SALARY","features").show(5, truncate=False)

[Stage 1130:>                                                       (0 + 1) / 1]

+--------+-------------------------------------------------------+
|SALARY  |features                                               |
+--------+-------------------------------------------------------+
|117500.0|(30,[0,1,2,7,25,28,29],[3.0,3.0,14.0,1.0,1.0,1.0,1.0]) |
|100000.0|(30,[0,1,2,5,25,28,29],[3.0,3.0,42.0,1.0,1.0,1.0,1.0]) |
|100000.0|(30,[0,1,2,5,25,28,29],[3.0,3.0,20.0,1.0,1.0,1.0,1.0]) |
|100000.0|(30,[0,1,2,10,25,28,29],[1.0,1.0,42.0,1.0,1.0,1.0,1.0])|
|162050.0|(30,[0,1,2,6,28,29],[5.0,5.0,33.0,1.0,1.0,1.0])        |
+--------+-------------------------------------------------------+
only showing top 5 rows



                                                                                

# Train/Test Split

In [73]:
# Split Data
regression_train, regression_test = regression_data.randomSplit([0.8, 0.2], seed=42)

print((regression_data.count(), len(regression_data.columns)))
print((regression_train.count(), len(regression_train.columns)))
print((regression_test.count(), len(regression_test.columns)))

                                                                                

(5039, 22)


                                                                                

(4070, 22)


[Stage 1165:>                                                       (0 + 1) / 1]

(969, 22)


                                                                                

# Linear Regression

In [74]:
from pyspark.ml.regression import GeneralizedLinearRegression

feature_names = assembler.getInputCols()

glr = GeneralizedLinearRegression(
    featuresCol="features",
    labelCol="SALARY",
    family="gaussian",
    link="identity",  
    maxIter=10,
    regParam=0.3
)

glr_model = glr.fit(regression_train)
summary = glr_model.summary

                                                                                

### Coefficients and Intercept

In [75]:
# Coefficients and Intercept
print("Intercept: {:.4f}".format(glr_model.intercept))
print("Coefficients:")
for i, coef in enumerate(glr_model.coefficients):
    print(f"Feature {i + 1}: {coef:.4f}")

Intercept: 85652.7724
Coefficients:
Feature 1: 1611.6790
Feature 2: 1611.6790
Feature 3: 28.5445
Feature 4: 1121.2885
Feature 5: 18.3956
Feature 6: 1247.2920
Feature 7: 4395.2939
Feature 8: 9613.9354
Feature 9: -25786.3100
Feature 10: 14121.9131
Feature 11: -10802.3370
Feature 12: -1697.8713
Feature 13: 9148.5643
Feature 14: -12465.7922
Feature 15: 1354.9289
Feature 16: -16520.4017
Feature 17: 33093.9737
Feature 18: -9792.1286
Feature 19: -5104.6947
Feature 20: 3404.4200
Feature 21: 5003.3703
Feature 22: -27508.5112
Feature 23: 2140.9090
Feature 24: 6459.6838
Feature 25: -6724.0612
Feature 26: 3701.7083
Feature 27: 7847.9024
Feature 28: 2132.7804
Feature 29: -1121.2880
Feature 30: -18.3956


### Regression Summary

In [76]:
# Summary stats
print("\n--- Regression Summary ---")
print("Coefficient Standard Errors:", [f"{val:.4f}" for val in summary.coefficientStandardErrors])
print("T-Values:", [f"{val:.4f}" for val in summary.tValues])
print("P-Values:", [f"{val:.4f}" for val in summary.pValues])


--- Regression Summary ---


[Stage 1187:>                                                       (0 + 1) / 1]

Coefficient Standard Errors: ['25284.0846', '25284.0846', '23.0776', '997287.1423', '178878.9931', '20925.2354', '20941.6279', '20945.0333', '21085.0222', '21087.7503', '21103.9802', '21115.3979', '21275.7023', '21353.2901', '21581.0996', '21655.1516', '21756.4357', '22072.8849', '22089.2698', '22646.1041', '22407.8241', '25731.0670', '25725.8832', '3160.6398', '3736.1231', '3336.2900', '3407.5906', '3790.8019', '997287.1423', '178878.9931', '1013428.7606']
T-Values: ['0.0637', '0.0637', '1.2369', '0.0011', '0.0001', '0.0596', '0.2099', '0.4590', '-1.2230', '0.6697', '-0.5119', '-0.0804', '0.4300', '-0.5838', '0.0628', '-0.7629', '1.5211', '-0.4436', '-0.2311', '0.1503', '0.2233', '-1.0691', '0.0832', '2.0438', '-1.7997', '1.1095', '2.3031', '0.5626', '-0.0011', '-0.0001', '0.0845']
P-Values: ['0.9492', '0.9492', '0.2162', '0.9991', '0.9999', '0.9525', '0.8338', '0.6463', '0.2214', '0.5031', '0.6088', '0.9359', '0.6672', '0.5594', '0.9499', '0.4456', '0.1283', '0.6573', '0.8173', '0.88

                                                                                

### Dispersion Summary

In [77]:
# print(f"\nDispersion: {summary.dispersion:.4f}")
print(f"Null Deviance: {summary.nullDeviance:.4f}")
print(f"Residual DF Null: {summary.residualDegreeOfFreedomNull}")
print(f"Deviance: {summary.deviance:.4f}")
print(f"Residual DF: {summary.residualDegreeOfFreedom}")
print(f"AIC: {summary.aic:.4f}")

                                                                                

Null Deviance: 2321265663379.3027
Residual DF Null: 4069
Deviance: 1808610663869.2112
Residual DF: 4039


[Stage 1203:>                                                       (0 + 1) / 1]

AIC: 92656.7396


                                                                                

### Diagnostics

In [78]:
feature_names = summary._call_java("featureNames")


features = ["Intercept"] + list(feature_names)
coefs    = [glr_model.intercept] + list(glr_model.coefficients)
se       = list(summary.coefficientStandardErrors)
tvals    = list(summary.tValues)
pvals    = list(summary.pValues)


print("--- This is a diagnostic check, no need to print in the final doc ---")
print("Length of features:", len(features))
print("Length of coefs:", len(coefs))
print("Length of se:", len(se))
print("Length of tvals:", len(tvals))
print("Length of pvals:", len(pvals))

--- This is a diagnostic check, no need to print in the final doc ---
Length of features: 31
Length of coefs: 31
Length of se: 31
Length of tvals: 31
Length of pvals: 31


### Tabulate

In [79]:
import pandas as pd
from tabulate import tabulate
from IPython.display import HTML

coef_table = pd.DataFrame({
    "Feature": features,
    "Estimate": [f"{v:.4f}" if v is not None else None for v in coefs],
    "Std Error": [f"{v:.4f}" if v is not None else None for v in se],
    "t-stat": [f"{v:.4f}" if v is not None else None for v in tvals],
    "P-Value": [f"{v:.4f}" if v is not None else None for v in pvals]
})


coef_table.to_csv("output/glr_summary.csv", index=False)


HTML(coef_table.to_html())

Unnamed: 0,Feature,Estimate,Std Error,t-stat,P-Value
0,Intercept,85652.7724,25284.0846,0.0637,0.9492
1,MIN_YEARS_EXPERIENCE,1611.679,25284.0846,0.0637,0.9492
2,MAX_YEARS_EXPERIENCE,1611.679,23.0776,1.2369,0.2162
3,DURATION,28.5445,997287.1423,0.0011,0.9991
4,IS_INTERNSHIP,1121.2885,178878.9931,0.0001,0.9999
5,COMPANY_IS_STAFFING,18.3956,20925.2354,0.0596,0.9525
6,"EDUCATION_LEVELS_NAME_vec_""Bachelor's degree""",1247.292,20941.6279,0.2099,0.8338
7,"EDUCATION_LEVELS_NAME_vec_""No Education Listed""",4395.2939,20945.0333,0.459,0.6463
8,"EDUCATION_LEVELS_NAME_vec_""Bachelor's degree"", ""Master's degree""",9613.9354,21085.0222,-1.223,0.2214
9,"EDUCATION_LEVELS_NAME_vec_""High school or GED"", ""Bachelor's degree""",-25786.31,21087.7503,0.6697,0.5031
