---
title: Assignment 04
subtitle: https://github.com/met-ad-688/assignment-04-leoliu36.git
author:
  - name: Leo Liu
    affiliations:
      - id: bu
        name: Boston University
        city: Boston
        state: MA
number-sections: true
date: '2025-9-30'
format:
  docx:
    toc: true
    number-sections: true
date-modified: today
date-format: long
execute:
  echo: false
  eval: true
  freeze: auto
---

Feature Engineering and Missing Value Imputation

In [None]:
# DATA LOADING & SETUP
import pandas as pd
import plotly.express as px
import plotly.io as pio
from pyspark.sql import SparkSession
import re
import numpy as np
import plotly.graph_objects as go
from pyspark.sql.functions import col, split, explode, regexp_replace, transform, when, trim, monotonically_increasing_id, pow, length, sum as spark_sum
from pyspark.sql import functions as F
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression

np.random.seed(42)

pio.renderers.default = "notebook"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine", "true").option("quote", "\"").option("escape", "\"").csv("../data/lightcast_job_postings.csv")
df.createOrReplaceTempView("job_postings")

# Show Schema and Sample Data
#print("---This is Diagnostic check, No need to print it in the final doc---")
#df.printSchema() # comment this line when rendering the submission
df.show(5)

                                                                                

+--------------------+-----------------+----------------------+----------+--------+---------+--------+--------------------+--------------------+--------------------+-----------+-------------------+--------------------+--------------------+---------------+----------------+--------+--------------------+-----------+-------------------+----------------+---------------------+-------------+-------------------+-------------+------------------+---------------+--------------------+--------------------+--------------------+-------------+------+-----------+----------------+-------------------+---------+-----------+--------------------+--------------------+-------------+------+--------------+-----+--------------------+-----+----------+---------------+--------------------+---------------+--------------------+------------+--------------------+------------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+--------------------+------+------

In [None]:
# DATA CLEANING
# Drop columns that are not needed for this analysis 
columns_to_drop = [
  # tracking & other metadata
    "ID", "LAST_UPDATED_DATE", "LAST_UPDATED_TIMESTAMP", "DUPLICATES",
    "SOURCE_TYPES", "SOURCES", "URL", "ACTIVE_URLS", "ACTIVE_SOURCES_INFO", "MODELED_EXPIRED", "MODELED_DURATION", "TITLE_RAW", "ORIGINAL_PAY_PERIOD"
  # outdated NAICS and SOC codes
    "NAICS2", "NAICS2_NAME", "NAICS3", "NAICS3_NAME",
    "NAICS4", "NAICS4_NAME", "NAICS5", "NAICS5_NAME",
    "NAICS6", "NAICS6_NAME", 
    "SOC_2", "SOC_2_NAME", "SOC_3", "SOC_3_NAME",
    "SOC_4", "SOC_4_NAME", "SOC_5", "SOC_5_NAME",
    "SOC_2021_2", "SOC_2021_2_NAME", "SOC_2021_3", "SOC_2021_3_NAME",
    "SOC_2021_5", "SOC_2021_5_NAME",
    "NAICS_2022_2", "NAICS_2022_2_NAME", "NAICS_2022_3", "NAICS_2022_3_NAME",
    "NAICS_2022_4", "NAICS_2022_4_NAME", "NAICS_2022_5", "NAICS_2022_5_NAME"
  # Location encodings
    "COUNTY_OUTGOING", "COUNTY_NAME_OUTGOING",
    "COUNTY_INCOMING", "COUNTY_NAME_INCOMING",
    "MSA_OUTGOING", "MSA_NAME_OUTGOING",
    "MSA_INCOMING", "MSA_NAME_INCOMING"
]

# Drop columns 
df = df.drop(*columns_to_drop)

# Show resulting schema
df.printSchema()

root
 |-- POSTED: string (nullable = true)
 |-- EXPIRED: string (nullable = true)
 |-- DURATION: integer (nullable = true)
 |-- BODY: string (nullable = true)
 |-- COMPANY: integer (nullable = true)
 |-- COMPANY_NAME: string (nullable = true)
 |-- COMPANY_RAW: string (nullable = true)
 |-- COMPANY_IS_STAFFING: boolean (nullable = true)
 |-- EDUCATION_LEVELS: string (nullable = true)
 |-- EDUCATION_LEVELS_NAME: string (nullable = true)
 |-- MIN_EDULEVELS: integer (nullable = true)
 |-- MIN_EDULEVELS_NAME: string (nullable = true)
 |-- MAX_EDULEVELS: integer (nullable = true)
 |-- MAX_EDULEVELS_NAME: string (nullable = true)
 |-- EMPLOYMENT_TYPE: integer (nullable = true)
 |-- EMPLOYMENT_TYPE_NAME: string (nullable = true)
 |-- MIN_YEARS_EXPERIENCE: integer (nullable = true)
 |-- MAX_YEARS_EXPERIENCE: integer (nullable = true)
 |-- IS_INTERNSHIP: boolean (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- REMOTE_TYPE: integer (nullable = true)
 |-- REMOTE_TYPE_NAME: string (nul

In [None]:
# Define columns for EDA: 
# dependent variable: SALARY
# indepdendent variable: MIN_YEARS_EXPERIENCE, SALARY_FROM, SALARY_TO, DURATION 
# categorical variables: COMPANY_IS_STAFFING, IS_INTERNSHIP, REMOTE_TYPE_NAME, EMPLOYMENT_TYPE_NAME, MIN_EDULEVELS_NAME, MAX_EDULEVELS_NAME, STATE_NAME

from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

eda_columns = [
    "SALARY",
    "MIN_YEARS_EXPERIENCE", "DURATION",
    "COMPANY_IS_STAFFING", "IS_INTERNSHIP", "REMOTE_TYPE_NAME", "EMPLOYMENT_TYPE_NAME",
    "MIN_EDULEVELS_NAME", "STATE_NAME"
]
df_eda = df.select(eda_columns)
df_eda.show(5, truncate=False)

+------+--------------------+--------+-------------------+-------------+----------------+----------------------+-------------------+----------+
|SALARY|MIN_YEARS_EXPERIENCE|DURATION|COMPANY_IS_STAFFING|IS_INTERNSHIP|REMOTE_TYPE_NAME|EMPLOYMENT_TYPE_NAME  |MIN_EDULEVELS_NAME |STATE_NAME|
+------+--------------------+--------+-------------------+-------------+----------------+----------------------+-------------------+----------+
|NULL  |2                   |6       |false              |false        |[None]          |Full-time (> 32 hours)|Bachelor's degree  |Arkansas  |
|NULL  |3                   |NULL    |true               |false        |Remote          |Full-time (> 32 hours)|No Education Listed|Maine     |
|NULL  |5                   |35      |false              |false        |[None]          |Full-time (> 32 hours)|Bachelor's degree  |Texas     |
|NULL  |3                   |48      |false              |false        |[None]          |Full-time (> 32 hours)|No Education Listed|Ariz

In [33]:
from pyspark.sql.functions import col, split, explode, regexp_replace, transform, when, trim, monotonically_increasing_id, pow, length, sum as spark_sum
import hvplot.pandas

# Visualize the percentage of missing values for each column
df_na = df_eda.select([
    spark_sum(
        when(col(c).isNull() | (length(trim(col(c))) == 0), 1)
    ).alias(c)
    for c in df_eda.columns
])

df_na_pd = df_na.toPandas().T.reset_index()
df_na_pd.columns = ["column", "missing_count"]

total_rows = df.count()
df_na_pd["missing_pct"] = df_na_pd["missing_count"] / total_rows * 100


df_na_pd.sort_values("missing_pct", ascending=False).hvplot.bar(
    x="column",
    y="missing_pct",
    title="Percentage of Missing Values by Column",
    xlabel="Column Name",
    ylabel="Percentage of Missing Values",
    rot=45,
    height=600,
    width=1000
)

                                                                                

In [34]:
import pandas as pd
import hvplot.pandas  # make sure this is imported for hvplot support

# Sample a small fraction of the data and convert to Pandas
df_sample = df_eda.sample(fraction=0.05, seed=42).toPandas()

# Create a boolean mask of missing values
missing_mask = df_sample.isnull()

# Melt the mask into long-form format
missing_long = (
    missing_mask.reset_index()
    .melt(id_vars="index", var_name="column", value_name="is_missing")
)

# Convert boolean to int (True → 1, False → 0)
missing_long["is_missing"] = missing_long["is_missing"].astype(int)

# Plot heatmap
missing_long.hvplot.heatmap(
    x="column", y="index", C="is_missing",
    cmap="Reds", colorbar=False,
    width=900, height=700,
    title="Heatmap of Missing Values (Sample)"
).opts(xrotation=45)


                                                                                

In [None]:
from pyspark.sql.functions import countDistinct

# Count number of unique values per column
df_eda.select([
    countDistinct(c).alias(c + "_nunique")
    for c in df_eda.columns
]).show(truncate=False)

# Select REMOTE_TYPE_NAME and MIN_EDULEVELS_NAME as the two categorical columns for further inspection
categorical_cols = [
    #"STATE_NAME", 
    "REMOTE_TYPE_NAME", 
    #"EMPLOYMENT_TYPE_NAME",
    "MIN_EDULEVELS_NAME",
    #"COMPANY_IS_STAFFING", "IS_INTERNSHIP"
]

for colname in categorical_cols:
    print(f"\n---- {colname} ----")
    df_eda.select(colname).distinct().show(50, truncate=False)

                                                                                

+--------------+----------------------------+----------------+---------------------------+---------------------+------------------------+----------------------------+--------------------------+------------------+
|SALARY_nunique|MIN_YEARS_EXPERIENCE_nunique|DURATION_nunique|COMPANY_IS_STAFFING_nunique|IS_INTERNSHIP_nunique|REMOTE_TYPE_NAME_nunique|EMPLOYMENT_TYPE_NAME_nunique|MIN_EDULEVELS_NAME_nunique|STATE_NAME_nunique|
+--------------+----------------------------+----------------+---------------------------+---------------------+------------------------+----------------------------+--------------------------+------------------+
|6052          |16                          |60              |2                          |2                    |4                       |3                           |6                         |51                |
+--------------+----------------------------+----------------+---------------------------+---------------------+------------------------+-----------

                                                                                

+----------------+
|REMOTE_TYPE_NAME|
+----------------+
|Remote          |
|[None]          |
|Not Remote      |
|Hybrid Remote   |
|NULL            |
+----------------+


---- MIN_EDULEVELS_NAME ----


[Stage 137:>                                                        (0 + 1) / 1]

+----------------------------+
|MIN_EDULEVELS_NAME          |
+----------------------------+
|Bachelor's degree           |
|Ph.D. or professional degree|
|High school or GED          |
|Master's degree             |
|No Education Listed         |
|Associate degree            |
|NULL                        |
+----------------------------+



                                                                                

In [36]:
# For REMOTE_TYPE_NAME replace Remote with Remote, [None] with undefined, Not Remote with On Premise, Hybrid Remote with Hybrid, and Null with On Premise 

from pyspark.sql.functions import col, when

df_eda = df_eda.withColumn(
    "REMOTE_TYPE_NAME",
    when(col("REMOTE_TYPE_NAME") == "Remote", "Remote")
    .when(col("REMOTE_TYPE_NAME") == "[None]", "On Premise")
    .when(col("REMOTE_TYPE_NAME") == "Not Remote", "On Premise")
    .when(col("REMOTE_TYPE_NAME") == "Hybrid Remote", "Hybrid")
    .when(col("REMOTE_TYPE_NAME").isNull(), "On Premise")
    .otherwise(col("REMOTE_TYPE_NAME"))
)

# create a temporary SQL view if using Spark SQL queries later
df_eda.createOrReplaceTempView("df_eda")
print(f"\n---- Distinct Values in REMOTE_TYPE_NAME ----")
df_eda.select("REMOTE_TYPE_NAME").distinct().show(10, truncate=False)


---- Distinct Values in REMOTE_TYPE_NAME ----


[Stage 140:>                                                        (0 + 1) / 1]

+----------------+
|REMOTE_TYPE_NAME|
+----------------+
|Remote          |
|On Premise      |
|Hybrid          |
+----------------+



                                                                                

In [None]:
# For MIN_EDULEVELS_NAME, replace null with No Education Listed

from pyspark.sql.functions import col, when

df_eda = df_eda.withColumn(
    "MIN_EDULEVELS_NAME",
    when(col("MIN_EDULEVELS_NAME").isNull(), "No Education Listed")
    .otherwise(col("MIN_EDULEVELS_NAME"))
)

# create a temporary SQL view if using Spark SQL queries later
df_eda.createOrReplaceTempView("df_eda")
print(f"\n---- Distinct Values in MIN_EDULEVELS_NAME ----")
df_eda.select("MIN_EDULEVELS_NAME").distinct().show(10, truncate=False)


---- Distinct Values in MIN_EDULEVELS_NAME ----


[Stage 143:>                                                        (0 + 1) / 1]

+----------------------------+
|MIN_EDULEVELS_NAME          |
+----------------------------+
|Bachelor's degree           |
|Ph.D. or professional degree|
|High school or GED          |
|Master's degree             |
|No Education Listed         |
|Associate degree            |
+----------------------------+



                                                                                

In [38]:
# For EMPLOYMENT_TYPE_NAME replace Remote with Remote, [None] with undefined, Not Remote with On Premise, Hybrid Remote with Hybrid, and Null with On Premise 

#from pyspark.sql.functions import col, when

#df_eda = df_eda.withColumn(
#    "EMPLOYMENT_TYPE_NAME",
#    when(col("EMPLOYMENT_TYPE_NAME") == "Part-time / full-time", "Flexible")
#    .when(col("EMPLOYMENT_TYPE_NAME") == "Part-time (â‰¤ 32 hours)", "Part Time")
#    .when(col("EMPLOYMENT_TYPE_NAME") == "Full-time (> 32 hours)", "Full Time")
#    .when(col("EMPLOYMENT_TYPE_NAME").isNull(), "Full Time")
#    .otherwise(col("EMPLOYMENT_TYPE_NAME"))
#)

# create a temporary SQL view if using Spark SQL queries later
#df_eda.createOrReplaceTempView("df_eda")
#print(f"\n---- Distinct Values in EMPLOYMENT_TYPE_NAME ----")
#df_eda.select("EMPLOYMENT_TYPE_NAME").distinct().show(10, truncate=False)

In [None]:
# Calculate median for DURATION
median_DURATION = df_eda.approxQuantile("DURATION", [0.5], 0.01)[0]

# Check for nulls in DURATION and impute with median

df_eda = df_eda.withColumn(
    "DURATION",
    when(col("DURATION").isNull(), median_DURATION)
    .otherwise(col("DURATION"))
)

                                                                                

In [None]:
# FEATURE ENGINEERING

from pyspark.sql.functions import col, pow
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Drop rows with NA Values 
df_eda = df_eda.dropna(subset=[
    "SALARY",
    "MIN_YEARS_EXPERIENCE", "DURATION",
    "COMPANY_IS_STAFFING", "IS_INTERNSHIP", "REMOTE_TYPE_NAME", "EMPLOYMENT_TYPE_NAME",
    "MIN_EDULEVELS_NAME", "STATE_NAME"
])

# Define categorical columns to encode
categorical_cols = [
    "MIN_EDULEVELS_NAME", 
    "REMOTE_TYPE_NAME"
]

# Index and One-Hot Encode
indexers = [
    StringIndexer(
        inputCol=col, 
        outputCol=f"{col}_idx", 
        handleInvalid="skip"
    ) for col in categorical_cols
]

encoders = [
    OneHotEncoder(
        inputCol=f"{col}_idx", 
        outputCol=f"{col}_vec"
    ) for col in categorical_cols
]

In [None]:
pipeline = Pipeline(stages=indexers+encoders)
index_df = pipeline.fit(df_eda).transform(df_eda)
index_df.select("MIN_EDULEVELS_NAME", "MIN_EDULEVELS_NAME_idx","MIN_EDULEVELS_NAME_vec", "REMOTE_TYPE_NAME","REMOTE_TYPE_NAME_idx","REMOTE_TYPE_NAME_vec").show()

In [None]:
# Assemble base features (for GLR and Random Forest)
assembler = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features"
)

# Build pipeline and transform df_eda
pipeline = Pipeline(stages=indexers + encoders + [assembler])
data = pipeline.fit(df_eda).transform(df_eda)

# Create squared term for Polynomial Regression
#data = data.withColumn("MIN_YEARS_EXPERIENCE_SQ", pow(col("MIN_YEARS_EXPERIENCE"), 2))

# Assemble polynomial features
assembler_poly = VectorAssembler(
    inputCols=[
        "MIN_YEARS_EXPERIENCE", "MIN_YEARS_EXPERIENCE_SQ", "DURATION",
        "IS_INTERNSHIP", "COMPANY_IS_STAFFING"
    ] + [f"{col}_vec" for col in categorical_cols],
    outputCol="features_poly"
)

# Transform with polynomial features
data = assembler_poly.transform(data)

# Show sample of features and label
data.select("SALARY", "features", "features_poly").show(5, truncate=False)

                                                                                

+------+----------------------------------+-------------------------------------------+
|SALARY|features                          |features_poly                              |
+------+----------------------------------+-------------------------------------------+
|92962 |(11,[0,1,4,9],[2.0,18.0,1.0,1.0]) |(12,[0,1,2,5,10],[2.0,4.0,18.0,1.0,1.0])   |
|107645|(11,[0,1,7,9],[10.0,18.0,1.0,1.0])|(12,[0,1,2,8,10],[10.0,100.0,18.0,1.0,1.0])|
|192800|(11,[0,1,4,9],[6.0,55.0,1.0,1.0]) |(12,[0,1,2,5,10],[6.0,36.0,55.0,1.0,1.0])  |
|125900|(11,[0,1,6,9],[12.0,18.0,1.0,1.0])|(12,[0,1,2,7,10],[12.0,144.0,18.0,1.0,1.0])|
|170000|(11,[0,1,5,9],[6.0,18.0,1.0,1.0]) |(12,[0,1,2,6,10],[6.0,36.0,18.0,1.0,1.0])  |
+------+----------------------------------+-------------------------------------------+
only showing top 5 rows


In [50]:
# Split the data into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

# Confirm the split sizes
print(f"Training rows: {train_data.count()}")
print(f"Testing rows: {test_data.count()}")

                                                                                

Training rows: 18966


[Stage 227:>                                                        (0 + 1) / 1]

Testing rows: 4731


                                                                                

In [56]:
from pyspark.ml.regression import GeneralizedLinearRegression

feature_names = assembler.getInputCols()

# Define Generalized Linear Regression model
glr = GeneralizedLinearRegression(
    featuresCol="features",
    labelCol="SALARY",
    family="gaussian",     # continuous outcome, normal distribution
    link="identity",       # standard linear regression
    maxIter=10,            # number of iterations 
    regParam=0.3           # regularization parameter (lambda)
)

# Fit model on training data
glr_model = glr.fit(train_data)

# Get model summary
summary = glr_model.summary

# Coefficients and Intercept
# Intercept represents the predicted salary when all input features = 0.
# Coefficient represents the estimated change in salary per one-unit change in respective feature (while everything else is constant).
print("Intercept: {:.4f}".format(glr_model.intercept))
print("Coefficients:")
for i, coef in enumerate(glr_model.coefficients):
    print(f" Feature {i+1}: {coef:.4f}")

# Regression Summary
#Smaller coefficient standard error means more reliable estimates
print("\n--- Regression Summary: ---")
print("Coefficient Standard Errors:",[f"{val:.4f}" for val in summary.coefficientStandardErrors])

# T-values measures how many standard errors the coefficient is away from 0; >2 is statistically significant
# P-values measure the probability that the coefficient is actually 0; <0.05 is statistically significant
print("T Values:", [f"{val:.4f}" for val in summary.tValues])
print("P Values:", [f"{val:.4f}" for val in summary.pValues])

# Dispersion - variance of residuals
print(f"\nDispersion: {summary.dispersion:.4f}")
# Null Deviance - how much “unexplained” variation exists with no predictors.
print(f"Null Deviance: {summary.nullDeviance:.4f}")
# Residual DF Null - how many data points are left after accounting for 1 parameter (the mean)
print(f"Residual DF Null: {summary.residualDegreeOfFreedomNull}")
# Residual Deviance - Variation not explained by the model. Smaller deviance = better fit.
print(f"Deviance: {summary.deviance:.4f}")
print(f"Residual DF: {summary.residualDegreeOfFreedom}")
# AIC - metric for model quality that balances goodness of fit and model complexity. Lower AIC = better model
print(f"AIC: {summary.aic:.4f}")

                                                                                

Intercept: 118077.7423
Coefficients:
 Feature 1: 7441.2826
 Feature 2: -73.8928
 Feature 3: -3048.7849
 Feature 4: -3147.0268
 Feature 5: -31047.8353
 Feature 6: -27419.8882
 Feature 7: -65370.0192
 Feature 8: -65034.1694
 Feature 9: 5845.2032
 Feature 10: -1291.2053
 Feature 11: -1726.5664

--- Regression Summary: ---


                                                                                

Coefficient Standard Errors: ['80.9467', '21.9980', '4744.7230', '876.2733', '7975.2611', '7997.4419', '8037.8830', '8030.3752', '8100.2761', '1473.5361', '1535.6262', '8125.5372']
T Values: ['91.9281', '-3.3591', '-0.6426', '-3.5914', '-3.8930', '-3.4286', '-8.1327', '-8.0985', '0.7216', '-0.8763', '-1.1243', '14.5317']
P Values: ['0.0000', '0.0008', '0.5205', '0.0003', '0.0001', '0.0006', '0.0000', '0.0000', '0.4705', '0.3809', '0.2609', '0.0000']

Dispersion: 1210670973.7533


                                                                                

Null Deviance: 35794690345776.1094
Residual DF Null: 18965
Deviance: 22947057636519.6797
Residual DF: 18954


[Stage 244:>                                                        (0 + 1) / 1]

AIC: 450500.4524


                                                                                

In [None]:
from IPython.display import HTML
import pandas as pd

# Step 1: Get feature names from GLR summary (Java backend)
feature_names = summary._call_java("featureNames")
features = ["Intercept"] + feature_names

# Step 2: Build all stats lists (include intercept)
coefs = [glr_model.intercept] + list(glr_model.coefficients)
se = list(summary.coefficientStandardErrors)
tvals = list(summary.tValues)
pvals = list(summary.pValues)

# Step 3: Sanity check
print("--- This is a diagnostic check ---")
print("Length of features:", len(features))
print("Length of coefs:", len(coefs))
print("Length of se:", len(se))
print("Length of tvals:", len(tvals))
print("Length of pvals:", len(pvals))

# Step 4: Create summary DataFrame
coef_table = pd.DataFrame({
    "Feature": features,
    "Estimate": [f"{v:.4f}" if v is not None else None for v in coefs],
    "Std Error": [f"{v:.4f}" if v is not None else None for v in se],
    "t-Value": [f"{v:.4f}" if v is not None else None for v in tvals],
    "p-Value": [f"{v:.4f}" if v is not None else None for v in pvals]
})

# Step 5: Export to CSV
coef_table.to_csv("output/glr_summary.csv", index=False)

# Step 6: Display in notebook
HTML(coef_table.to_html(index=False))

--- This is a diagnostic check ---
Length of features: 12
Length of coefs: 12
Length of se: 12
Length of tvals: 12
Length of pvals: 12


Feature,Estimate,Std Error,t-Value,p-Value
Intercept,118077.7423,80.9467,91.9281,0.0
MIN_YEARS_EXPERIENCE,7441.2826,21.998,-3.3591,0.0008
DURATION,-73.8928,4744.723,-0.6426,0.5205
IS_INTERNSHIP,-3048.7849,876.2733,-3.5914,0.0003
COMPANY_IS_STAFFING,-3147.0268,7975.2611,-3.893,0.0001
MIN_EDULEVELS_NAME_vec_Bachelor's degree,-31047.8353,7997.4419,-3.4286,0.0006
MIN_EDULEVELS_NAME_vec_No Education Listed,-27419.8882,8037.883,-8.1327,0.0
MIN_EDULEVELS_NAME_vec_Associate degree,-65370.0192,8030.3752,-8.0985,0.0
MIN_EDULEVELS_NAME_vec_High school or GED,-65034.1694,8100.2761,0.7216,0.4705
MIN_EDULEVELS_NAME_vec_Master's degree,5845.2032,1473.5361,-0.8763,0.3809
