In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
import pandas as pd

# --- Spark session ----------------------------------------------------------
# getOrCreate() returns the active session if one exists, otherwise builds one.
spark = (
    SparkSession.builder
    .appName("Lightcast Salary Model")
    .getOrCreate()
)

# --- Load the job-postings CSV ----------------------------------------------
# The first row is read as the header and column types are inferred by Spark.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("data/lightcast_job_postings.csv")
)

# SALARY plus the three source columns that feed the feature vector below.
MODEL_COLUMNS = [
    "SALARY",
    "MIN_YEARS_EXPERIENCE",
    "MAX_YEARS_EXPERIENCE",
    "EMPLOYMENT_TYPE_NAME",
]

# --- Quick inspection -------------------------------------------------------
# Print the full inferred schema, then preview five rows of the model columns.
df.printSchema()
df.select(*MODEL_COLUMNS).show(5)

# --- Null filtering ---------------------------------------------------------
# Restrict to the model columns first so that nulls in unrelated columns do
# not cause a row to be discarded.
df_clean = df.select(*MODEL_COLUMNS).dropna()

# --- Categorical encoding ---------------------------------------------------
# Employment type: string -> numeric index -> one-hot vector.
indexer = StringIndexer(
    inputCol="EMPLOYMENT_TYPE_NAME",
    outputCol="EMP_TYPE_IDX",
)
encoder = OneHotEncoder(
    inputCols=["EMP_TYPE_IDX"],
    outputCols=["EMP_TYPE_VEC"],
)

# --- Feature assembly -------------------------------------------------------
# Combine the two experience columns and the one-hot vector into "features".
assembler = VectorAssembler(
    inputCols=["MIN_YEARS_EXPERIENCE", "MAX_YEARS_EXPERIENCE", "EMP_TYPE_VEC"],
    outputCol="features",
)

# --- Preprocessing pipeline -------------------------------------------------
# NOTE(review): the stages are fitted on the whole cleaned dataset, i.e.
# before the train/test split below - confirm that this is intended.
pipeline = Pipeline(stages=[indexer, encoder, assembler])
model = pipeline.fit(df_clean)
final_df = (
    model
    .transform(df_clean)
    .select("features", "SALARY")
)

# --- Train/test split -------------------------------------------------------
# 80/20 random split with a fixed seed.
train_data, test_data = final_df.randomSplit(
    [0.8, 0.2],
    seed=42,
)
