In [None]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
import plotly.express as px
from pyspark.sql import functions as F

In [None]:
# Start (or reuse) the Spark session used by every cell below.
spark = SparkSession.builder.appName('assignment03').getOrCreate()

# Load the raw job postings with a header row and inferred column types.
#
# multiLine=True: a later cell strips "\n"/"\r" out of EDUCATION_LEVELS_NAME,
# which indicates that quoted fields span several physical lines. Without this
# option Spark treats every physical line as a record, so such rows are split
# and their columns shift.
#
# escape='"': treat a doubled quote ("") inside a quoted field as a literal
# quote instead of relying on Spark's default backslash escape.
# NOTE(review): assumes the export uses RFC 4180-style quoting - confirm
# against the actual file.
df = spark.read.csv(
    'data/lightcast_job_postings.csv',
    header=True,
    inferSchema=True,
    multiLine=True,
    escape='"',
)
df.show(5)

In [None]:
# Clean salary columns and compute medians

# Cast both salary bounds to double so the arithmetic below is numeric.
df2 = df.withColumn("SALARY_FROM", F.col("SALARY_FROM").cast("double")) \
        .withColumn("SALARY_TO", F.col("SALARY_TO").cast("double"))

# Midpoint of the posted range; null whenever either bound is null.
df2 = df2.withColumn("Average_Salary", (F.col("SALARY_FROM") + F.col("SALARY_TO"))/2)

# Approximate medians (relative error 0.01) for all three columns in a single
# approxQuantile call instead of three separate ones.
# approxQuantile ignores nulls and returns an empty list for a column that has
# no non-null values, so fall back to None rather than raising IndexError.
salary_columns = ["SALARY_FROM", "SALARY_TO", "Average_Salary"]
salary_quantiles = df2.approxQuantile(salary_columns, [0.5], 0.01)
median_from, median_to, median_avg = [
    quantile[0] if quantile else None for quantile in salary_quantiles
]

print("Median From:", median_from)
print("Median To:", median_to)
print("Median Avg:", median_avg)
print("Row count:", df2.count())


In [None]:
# Normalise the education label: drop every line-feed / carriage-return
# character, then preview up to 20 distinct cleaned values at full width.
df2 = df2.withColumn(
    "EDUCATION_LEVELS_NAME",
    F.regexp_replace(F.col("EDUCATION_LEVELS_NAME"), "[\n\r]", ""),
)
(
    df2
    .select("EDUCATION_LEVELS_NAME")
    .distinct()
    .show(20, truncate=False)
)


In [None]:
# Average salary by industry
TOP_N_INDUSTRIES = 15  # number of industries (bars) shown in the chart

# Mean of Average_Salary per industry; F.avg skips rows whose salary is null.
industry_avg = df2.groupBy("INDUSTRY_NAME").agg(F.avg("Average_Salary").alias("avg_salary"))

# Rank and trim inside Spark so only the rows that are plotted get collected
# to the driver. Industries with no salary data (null average) sort last.
industry_pd = (
    industry_avg
    .orderBy(F.col("avg_salary").desc_nulls_last())
    .limit(TOP_N_INDUSTRIES)
    .toPandas()
)

# Bar chart with human-readable axis labels instead of raw column names.
fig = px.bar(
    industry_pd,
    x="INDUSTRY_NAME",
    y="avg_salary",
    title=f"Top {TOP_N_INDUSTRIES} Industries by Average Salary",
    labels={"INDUSTRY_NAME": "Industry", "avg_salary": "Average salary"},
)
fig.show()
