In [None]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
import plotly.express as px
from pyspark.sql import functions as F

In [None]:
# Start (or reuse) the Spark session used by every cell below.
spark = SparkSession.builder.appName('assignment03').getOrCreate()

# Load the job-postings CSV: the first row is the header and Spark infers column types.
# multiLine=True keeps quoted fields that contain line breaks inside a single record;
# a later cell strips "\n"/"\r" from EDUCATION_LEVELS_NAME, so such fields are expected,
# and the reader's default (one record per physical line) would split them into
# separate, malformed rows.
# NOTE(review): escape='"' assumes embedded quotes are written as doubled quotes
# (RFC 4180 style) rather than backslash-escaped -- confirm against the raw file.
df = spark.read.csv(
    'data/lightcast_job_postings.csv',
    header=True,
    inferSchema=True,
    multiLine=True,
    escape='"',
)

# Preview the first five rows.
df.show(5)

In [None]:
# Clean salary columns and compute medians

def _approx_median(frame, column, relative_error=0.01):
    """Return the approximate median of `column` in `frame`, or None if it has none.

    `approxQuantile` ignores nulls and returns an empty list when the column holds
    no non-null values, so indexing its result unconditionally raises IndexError.
    """
    quantiles = frame.approxQuantile(column, [0.5], relative_error)
    return quantiles[0] if quantiles else None


# Cast both salary bounds to double so the arithmetic and quantiles below are numeric.
df2 = df.withColumn("SALARY_FROM", F.col("SALARY_FROM").cast("double")) \
        .withColumn("SALARY_TO", F.col("SALARY_TO").cast("double"))

# Midpoint of the posted range; the result is null whenever either bound is null.
df2 = df2.withColumn("Average_Salary", (F.col("SALARY_FROM") + F.col("SALARY_TO")) / 2)

median_from = _approx_median(df2, "SALARY_FROM")
median_to = _approx_median(df2, "SALARY_TO")
median_avg = _approx_median(df2, "Average_Salary")

print("Median From:", median_from)
print("Median To:", median_to)
print("Median Avg:", median_avg)
print("Row count:", df2.count())


In [None]:
# Clean education levels: delete every line-feed / carriage-return character from
# EDUCATION_LEVELS_NAME, then print up to 20 distinct cleaned values in full.
df2 = df2.withColumn(
    "EDUCATION_LEVELS_NAME",
    F.regexp_replace(F.col("EDUCATION_LEVELS_NAME"), "[\n\r]", ""),
)
(
    df2.select("EDUCATION_LEVELS_NAME")
       .distinct()
       .show(20, truncate=False)
)


In [None]:
# Average salary by industry
# Mean of Average_Salary per INDUSTRY_NAME is computed in Spark; the per-industry
# result is collected to pandas, sorted high-to-low, and cut to the top 15 rows.
industry_avg = df2.groupBy("INDUSTRY_NAME").agg(
    F.avg("Average_Salary").alias("avg_salary")
)
industry_pd = (
    industry_avg.toPandas()
    .sort_values("avg_salary", ascending=False)
    .head(15)
)

# Bar chart of the 15 industries with the highest mean salary.
fig = px.bar(
    industry_pd,
    x="INDUSTRY_NAME",
    y="avg_salary",
    title="Top 15 Industries by Average Salary",
)
fig.show()


In [None]:
# Average salary by education level
# One row per EDUCATION_LEVELS_NAME holding the mean of Average_Salary, collected
# to pandas and ordered from the highest mean to the lowest.
edu_avg = (
    df2.groupBy("EDUCATION_LEVELS_NAME")
       .agg(F.avg(F.col("Average_Salary")).alias("avg_salary"))
)
edu_pd = edu_avg.toPandas().sort_values(by="avg_salary", ascending=False)

# Bar chart with one bar per education level.
fig = px.bar(
    data_frame=edu_pd,
    x="EDUCATION_LEVELS_NAME",
    y="avg_salary",
    title="Average Salary by Education Level",
)
fig.show()


In [None]:
# Average salary by occupation
# Mean of Average_Salary for each OCCUPATION_NAME; after collecting to pandas the
# rows are sorted by that mean (descending) and only the first 15 are kept.
occupation_avg = (
    df2.groupBy("OCCUPATION_NAME")
       .agg(F.avg(F.col("Average_Salary")).alias("avg_salary"))
)
occupation_pd = (
    occupation_avg.toPandas()
    .sort_values(by="avg_salary", ascending=False)
    .head(15)
)

# Bar chart of the 15 occupations with the highest mean salary.
fig = px.bar(
    data_frame=occupation_pd,
    x="OCCUPATION_NAME",
    y="avg_salary",
    title="Top 15 Occupations by Average Salary",
)
fig.show()


In [None]:
# Convert POSTED_DATE to proper date
df3 = df2.withColumn("POSTED_DATE", F.to_date("POSTED_DATE", "yyyy-MM-dd"))

# Count postings per month.
# Rows whose POSTED_DATE is null after the conversion are excluded first; otherwise
# groupBy puts them in a null "month" bucket that sorts ahead of the real months and
# shows up as a spurious point on the chart. df3 itself is left unfiltered.
trend = (
    df3.filter(F.col("POSTED_DATE").isNotNull())
       .groupBy(F.date_format("POSTED_DATE", "yyyy-MM").alias("month"))
       .count()
       .orderBy("month")
)

trend_pd = trend.toPandas()

# Line chart of the monthly posting counts, in chronological order.
fig = px.line(trend_pd, x="month", y="count",
              title="Job Postings Trend Over Time",
              labels={"month": "Month", "count": "Number of Postings"})
fig.update_xaxes(type='category')  # keep months readable
fig.show()


In [None]:
# Top job titles by count
# Postings are counted per JOB_TITLE in Spark, ordered by that count (descending),
# and only the first 15 rows are collected to pandas.
title_counts = (
    df2.groupBy("JOB_TITLE")
       .count()
       .orderBy(F.col("count").desc())
       .limit(15)
)
title_pd = title_counts.toPandas()

# Bar chart of the 15 most frequent job titles.
fig = px.bar(
    data_frame=title_pd,
    x="JOB_TITLE",
    y="count",
    title="Top 15 Job Titles by Frequency",
)
fig.show()


In [None]:
# Top 15 States by job posting count
# Postings are counted per STATE in Spark, ordered by that count (descending),
# and only the first 15 rows are collected to pandas.
state_counts = (
    df2.groupBy("STATE")
       .count()
       .orderBy(F.col("count").desc())
       .limit(15)
)
state_pd = state_counts.toPandas()

# Bar chart of the 15 states with the most postings.
fig = px.bar(
    data_frame=state_pd,
    x="STATE",
    y="count",
    title="Top 15 States by Job Postings",
)
fig.show()


In [None]:
# Top 15 Cities by job posting count
# Postings are counted per CITY in Spark, ordered by that count (descending),
# and only the first 15 rows are collected to pandas.
city_counts = (
    df2.groupBy("CITY")
       .count()
       .orderBy(F.col("count").desc())
       .limit(15)
)
city_pd = city_counts.toPandas()

# Bar chart of the 15 cities with the most postings.
fig = px.bar(
    data_frame=city_pd,
    x="CITY",
    y="count",
    title="Top 15 Cities by Job Postings",
)
fig.show()


In [None]:
# Top 20 most common skills in postings
# Postings are counted per SKILL_NAME in Spark, ordered by that count (descending),
# and only the first 20 rows are collected to pandas.
skill_counts = (
    df2.groupBy("SKILL_NAME")
       .count()
       .orderBy(F.col("count").desc())
       .limit(20)
)
skill_pd = skill_counts.toPandas()

# Bar chart of the 20 most frequent SKILL_NAME values.
fig = px.bar(
    data_frame=skill_pd,
    x="SKILL_NAME",
    y="count",
    title="Top 20 Skills by Frequency",
)
fig.show()


In [None]:
# Average salary by skill (filter out nulls)
# Rows with a null SKILL_NAME or a null SALARY are dropped before aggregating, so the
# ranking contains neither a null-skill bucket nor skills without any salary data.
# NOTE(review): the industry/education/occupation cells average "Average_Salary",
# while this cell averages "SALARY" -- confirm which measure is intended.
skill_salary = (
    df2.filter(F.col("SKILL_NAME").isNotNull() & F.col("SALARY").isNotNull())
       .groupBy("SKILL_NAME")
       .agg(F.avg("SALARY").alias("avg_salary"))
       .orderBy(F.desc("avg_salary"))
       .limit(15)
)
skill_salary_pd = skill_salary.toPandas()

# Bar chart of the 15 skills with the highest mean salary.
fig = px.bar(skill_salary_pd, x="SKILL_NAME", y="avg_salary",
             title="Top 15 Skills by Average Salary")
fig.show()


In [None]:
# Group by occupation: count of postings and average salary
# For each OCCUPATION value: number of rows and mean SALARY. The result is ordered
# by the row count (descending) and limited to 20 rows before collecting to pandas.
# NOTE(review): an earlier cell groups by "OCCUPATION_NAME" and averages
# "Average_Salary"; this cell uses "OCCUPATION" and "SALARY" -- confirm both are intended.
occ_stats = (
    df2.groupBy("OCCUPATION")
       .agg(
           F.count("*").alias("posting_count"),
           F.avg("SALARY").alias("avg_salary"),
       )
       .orderBy(F.col("posting_count").desc())
       .limit(20)
)

occ_stats_pd = occ_stats.toPandas()

# Scatter of posting volume against mean salary, each point labelled with its
# OCCUPATION value above the marker.
fig = px.scatter(
    data_frame=occ_stats_pd,
    x="posting_count",
    y="avg_salary",
    text="OCCUPATION",
    title="Postings vs. Average Salary (Top 20 Occupations)",
    labels={"posting_count": "Number of Postings", "avg_salary": "Average Salary"},
)
fig.update_traces(textposition="top center")
fig.show()


In [None]:
# Count + average salary for skills
# For each SKILL_NAME value: number of rows and mean SALARY. The result is ordered
# by the row count (descending) and limited to 30 rows before collecting to pandas.
skill_stats = (
    df2.groupBy("SKILL_NAME")
       .agg(
           F.count("*").alias("posting_count"),
           F.avg("SALARY").alias("avg_salary"),
       )
       .orderBy(F.col("posting_count").desc())
       .limit(30)
)

skill_stats_pd = skill_stats.toPandas()

# Scatter of posting volume against mean salary, each point labelled with its
# SKILL_NAME value above the marker.
fig = px.scatter(
    data_frame=skill_stats_pd,
    x="posting_count",
    y="avg_salary",
    text="SKILL_NAME",
    title="Skill Demand vs. Salary (Top 30 Skills)",
    labels={"posting_count": "Number of Postings", "avg_salary": "Average Salary"},
)
fig.update_traces(textposition="top center")
fig.show()


In [None]:
# Print the fixed, hand-written summary of findings, one statement per output line.
# These strings are static text; none of them is computed from the data above.
print(
    "✅ Assignment 03 Summary",
    "1. Salary distributions show large variance across industries and occupations.",
    "2. Certain industries (e.g., Tech, Finance) consistently offer higher average salaries.",
    "3. Some skills are extremely in-demand (like SQL, Python), but salary premiums vary.",
    "4. High-demand skills don’t always equal high pay — strategic niche skills often pay more.",
    "5. Location matters: states like California and New York dominate high-paying postings.",
    "6. Trends over time show growth in postings for tech-heavy roles.",
    "7. Correlation analysis reveals that many postings don’t guarantee top salaries — skills and specialization drive pay.",
    sep="\n",
)
