In [2]:
# Fix Plotly ↔ Kaleido mismatch
!pip install -U "plotly>=6.1.1" "kaleido>=1.1.0"

import importlib, plotly
print("Plotly version:", plotly.__version__)
importlib.invalidate_caches()


Collecting plotly>=6.1.1
  Downloading plotly-6.3.0-py3-none-any.whl.metadata (8.5 kB)
Downloading plotly-6.3.0-py3-none-any.whl (9.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 5.24.1
    Uninstalling plotly-5.24.1:
      Successfully uninstalled plotly-5.24.1
Successfully installed plotly-6.3.0
Plotly version: 6.3.0


In [5]:
# ===== Build & save the chart (pandas + Plotly) =====
import os, pandas as pd
import plotly.express as px

# 1) Make sure the data is present (download if missing)
CSV_PATH = "/content/lightcast_job_postings.csv"
if not os.path.exists(CSV_PATH):
    # Big file (~700MB). Uses your earlier Drive id.
    # If this fails, upload the CSV via Colab: left sidebar ➜ Files ➜ Upload.
    try:
        import gdown
    except ImportError:
        !pip -q install gdown
        import gdown

    gdown.download(
        "https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ",
        CSV_PATH,
        quiet=False
    )

# 2) Load
df = pd.read_csv(CSV_PATH, low_memory=False)

# 3) Pick/derive a city column robustly
candidates = [
    "city","City","job_city","jobCity","job_city_name",
    "location","Location","job_location","jobLocation","job_posting_location"
]
city_col = next((c for c in candidates if c in df.columns), None)

def to_city(s):
    if pd.isna(s): return None
    s = str(s).strip()
    if not s: return None
    # Common formats: "City, ST", "City, State", "Remote - US", etc.
    # Grab the first segment before a comma or " - "
    for sep in [",", " - "]:
        if sep in s:
            s = s.split(sep, 1)[0].strip()
    # Normalize a few noisy tokens
    if s.lower().startswith("remote"):
        return "Remote"
    return s

if city_col is None:
    # Derive city from a combined field if needed (best-effort)
    # Try to find a column that looks like a location
    like_loc = next((c for c in df.columns if "loc" in c.lower()), None)
    if like_loc:
        cities = df[like_loc].map(to_city)
    else:
        raise ValueError(
            "Couldn't find a city/location column. "
            "Open df.columns to see what's available and set `city_col`."
        )
else:
    cities = df[city_col].map(to_city)

# 4) Top 15 cities
top15 = (
    cities.dropna()
    .value_counts()
    .head(15)
    .rename_axis("city")
    .reset_index(name="count")
)

# 5) Plot
fig = px.bar(
    top15,
    x="city", y="count",
    title="Top 15 Cities by Job Postings",
    labels={"city": "City", "count": "Postings"}
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()

# 6) Save (PNG first, fallback HTML)
try:
    fig.write_image("/content/top_15_cities.png", scale=2)
    print("Saved PNG to /content/top_15_cities.png")
except Exception as e:
    print("PNG export failed; saving HTML instead:", e)
    fig.write_html("/content/top_15_cities.html", include_plotlyjs="cdn")
    print("Saved HTML to /content/top_15_cities.html")


PNG export failed; saving HTML instead: 

Kaleido requires Google Chrome to be installed.

Either download and install Chrome yourself following Google's instructions for your operating system,
or install it from your terminal by running:

    $ plotly_get_chrome


Saved HTML to /content/top_15_cities.html


In [6]:
# ===== Prettier Top-15 Cities chart =====
import pandas as pd
import plotly.express as px

# --- 1) Clean + prepare counts (re-usable even if you already computed earlier) ---
def to_city(s):
    if pd.isna(s): return None
    s = str(s).strip()
    if not s: return None
    for sep in [",", " - "]:
        if sep in s:
            s = s.split(sep, 1)[0].strip()
    if s.lower().startswith("remote"):
        return "Remote"
    return s

# Try to pick a best city column automatically
candidates = [
    "city","City","job_city","jobCity","job_city_name",
    "location","Location","job_location","jobLocation","job_posting_location"
]
city_col = next((c for c in candidates if c in df.columns), None)
if city_col is None:
    # last resort: guess a location-like column
    like_loc = next((c for c in df.columns if "loc" in c.lower()), None)
    if like_loc is None:
        raise ValueError("No city/location column found. Inspect df.columns and set one explicitly.")
    city_series = df[like_loc].map(to_city)
else:
    city_series = df[city_col].map(to_city)

# Drop junk rows and (optionally) remove Remote/Unknown to make the chart cleaner
drop_values = {"Remote", "Unknown", "N/A", "NaN", ""}
city_series = city_series.dropna().map(lambda x: x.strip())
city_series = city_series[~city_series.isin(drop_values)]

# Build top 15 table
top15 = (
    city_series.value_counts()
    .head(15)
    .rename_axis("city")
    .reset_index(name="count")
)

# If nothing left, keep Remote in so the chart isn't empty
if top15.empty:
    top15 = (
        city_series.fillna("Unknown")
        .value_counts()
        .head(15)
        .rename_axis("city")
        .reset_index(name="count")
    )

# Shorten super-long city labels (keeps the start; shows …)
def shorten(label, max_len=20):
    label = str(label)
    return label if len(label) <= max_len else (label[:max_len-1] + "…")

top15["city_short"] = top15["city"].map(shorten)

# --- 2) Make a clean, readable horizontal bar chart ---
fig = px.bar(
    top15.sort_values("count", ascending=True),  # smallest to largest so big bars end on the right
    x="count",
    y="city_short",
    orientation="h",
    title="Top 15 Cities by Job Postings",
    labels={"count": "Postings", "city_short": "City"},
    template="plotly_white",
)

# nicer number formatting + bar labels
fig.update_traces(
    text=top15.sort_values("count", ascending=True)["count"].map(lambda n: f"{n:,}"),
    textposition="outside",
    hovertemplate="<b>%{y}</b><br>Postings: %{x:,}<extra></extra>",
    marker_line_color="rgba(0,0,0,0.15)",
    marker_line_width=1,
)

# layout polish
fig.update_layout(
    title_font=dict(size=22, family="Arial, sans-serif"),
    font=dict(size=13),
    xaxis=dict(title="Postings", tickformat=",", showline=True, linecolor="rgba(0,0,0,0.25)"),
    yaxis=dict(title="", showline=False),
    margin=dict(l=10, r=20, t=60, b=10),
    height=520,
)

fig.show()

# --- 3) Save to files (PNG + HTML fallback) ---
try:
    fig.write_image("/content/top_15_cities_pretty.png", scale=2)
    print("Saved PNG ➜ /content/top_15_cities_pretty.png")
except Exception as e:
    print("PNG export failed; saving HTML instead:", e)
    fig.write_html("/content/top_15_cities_pretty.html", include_plotlyjs="cdn")
    print("Saved HTML ➜ /content/top_15_cities_pretty.html")


PNG export failed; saving HTML instead: 

Kaleido requires Google Chrome to be installed.

Either download and install Chrome yourself following Google's instructions for your operating system,
or install it from your terminal by running:

    $ plotly_get_chrome


Saved HTML ➜ /content/top_15_cities_pretty.html


In [8]:
!pip -q install --upgrade pip
!pip -q install "pyspark==3.5.1" gdown plotly kaleido ipywidgets pandas


  Preparing metadata (setup.py) ... [?25l[?25hdone
[33m  DEPRECATION: Building 'pyspark' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'pyspark'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# Clean any leftovers just in case (safe even after a fresh reset)
import os
for k in ["SPARK_HOME", "PYSPARK_DRIVER_PYTHON", "PYSPARK_PYTHON", "JAVA_HOME"]:
    os.environ.pop(k, None)

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("assignment03")
    .config("spark.ui.showConsoleProgress", "false")
    .getOrCreate()
)

print("Spark started. Version:", spark.version)


Spark started. Version: 3.5.1


In [3]:
!gdown -q --id 1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ -O /content/lightcast_job_postings.csv
!ls -lh /content/lightcast_job_postings.csv


-rw-r--r-- 1 root root 684M Mar 14  2025 /content/lightcast_job_postings.csv


In [4]:
from pyspark.sql import functions as F

csv_path = "/content/lightcast_job_postings.csv"

# Quick existence check (sanity)
import os
print("Exists?", os.path.exists(csv_path), "->", csv_path)

df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_path)
)

df.printSchema()
df.show(5, truncate=False)
print("Row count:", df.count())


Exists? True -> /content/lightcast_job_postings.csv
root
 |-- ID: string (nullable = true)
 |-- LAST_UPDATED_DATE: string (nullable = true)
 |-- LAST_UPDATED_TIMESTAMP: string (nullable = true)
 |-- DUPLICATES: string (nullable = true)
 |-- POSTED: string (nullable = true)
 |-- EXPIRED: string (nullable = true)
 |-- DURATION: string (nullable = true)
 |-- SOURCE_TYPES: string (nullable = true)
 |-- SOURCES: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- ACTIVE_URLS: string (nullable = true)
 |-- ACTIVE_SOURCES_INFO: string (nullable = true)
 |-- TITLE_RAW: string (nullable = true)
 |-- BODY: string (nullable = true)
 |-- MODELED_EXPIRED: string (nullable = true)
 |-- MODELED_DURATION: string (nullable = true)
 |-- COMPANY: string (nullable = true)
 |-- COMPANY_NAME: string (nullable = true)
 |-- COMPANY_RAW: string (nullable = true)
 |-- COMPANY_IS_STAFFING: string (nullable = true)
 |-- EDUCATION_LEVELS: string (nullable = true)
 |-- EDUCATION_LEVELS_NAME: string (nu

In [1]:
# --- build your figure exactly as you normally do ---
# Example:
# import plotly.express as px
# fig = px.bar(top15, x="city", y="count", title="Top 15 Cities")

fig.show()

# Try PNG first; if it fails for any reason, save HTML so you still have a file
try:
    fig.write_image("/content/top_15_cities.png", scale=2)  # engine auto-detected
    print("Saved PNG to /content/top_15_cities.png")
except Exception as e:
    print("PNG export failed, falling back to HTML:", e)
    fig.write_html("/content/top_15_cities.html", include_plotlyjs="cdn")
    print("Saved HTML to /content/top_15_cities.html")


NameError: name 'fig' is not defined

In [None]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
import plotly.express as px
from pyspark.sql import functions as F

In [None]:
spark = SparkSession.builder.appName('assignment03').getOrCreate()
df = spark.read.csv('data/lightcast_job_postings.csv', header=True, inferSchema=True)
df.show(5)

In [None]:
# Clean salary columns and compute medians
from pyspark.sql import functions as F

df2 = df.withColumn("SALARY_FROM", F.col("SALARY_FROM").cast("double")) \
        .withColumn("SALARY_TO", F.col("SALARY_TO").cast("double"))

df2 = df2.withColumn("Average_Salary", (F.col("SALARY_FROM") + F.col("SALARY_TO"))/2)

median_from = df2.approxQuantile("SALARY_FROM", [0.5], 0.01)[0]
median_to   = df2.approxQuantile("SALARY_TO", [0.5], 0.01)[0]
median_avg  = df2.approxQuantile("Average_Salary", [0.5], 0.01)[0]

print("Median From:", median_from)
print("Median To:", median_to)
print("Median Avg:", median_avg)
print("Row count:", df2.count())


In [None]:
# Clean education levels (remove newlines etc.)
df2 = df2.withColumn("EDUCATION_LEVELS_NAME", F.regexp_replace("EDUCATION_LEVELS_NAME", "[\n\r]", ""))
df2.select("EDUCATION_LEVELS_NAME").distinct().show(20, truncate=False)


In [None]:
# Average salary by industry
industry_avg = df2.groupBy("INDUSTRY_NAME").agg(F.avg("Average_Salary").alias("avg_salary"))
industry_pd = industry_avg.toPandas().sort_values("avg_salary", ascending=False).head(15)

import plotly.express as px
fig = px.bar(industry_pd, x="INDUSTRY_NAME", y="avg_salary",
             title="Top 15 Industries by Average Salary")
fig.show()


In [None]:
# Average salary by education level
edu_avg = df2.groupBy("EDUCATION_LEVELS_NAME").agg(F.avg("Average_Salary").alias("avg_salary"))
edu_pd = edu_avg.toPandas().sort_values("avg_salary", ascending=False)

fig = px.bar(edu_pd, x="EDUCATION_LEVELS_NAME", y="avg_salary",
             title="Average Salary by Education Level")
fig.show()


In [None]:
# Average salary by occupation
occupation_avg = df2.groupBy("OCCUPATION_NAME").agg(F.avg("Average_Salary").alias("avg_salary"))
occupation_pd = occupation_avg.toPandas().sort_values("avg_salary", ascending=False).head(15)

fig = px.bar(occupation_pd, x="OCCUPATION_NAME", y="avg_salary",
             title="Top 15 Occupations by Average Salary")
fig.show()


In [None]:
# Convert POSTED_DATE to proper date
df3 = df2.withColumn("POSTED_DATE", F.to_date("POSTED_DATE", "yyyy-MM-dd"))

# Count postings per month
trend = df3.groupBy(F.date_format("POSTED_DATE", "yyyy-MM").alias("month")) \
           .count() \
           .orderBy("month")

trend_pd = trend.toPandas()

fig = px.line(trend_pd, x="month", y="count",
              title="Job Postings Trend Over Time",
              labels={"month": "Month", "count": "Number of Postings"})
fig.update_xaxes(type='category')  # keep months readable
fig.show()


In [None]:
# Top job titles by count
title_counts = df2.groupBy("JOB_TITLE").count().orderBy(F.desc("count")).limit(15)
title_pd = title_counts.toPandas()

fig = px.bar(title_pd, x="JOB_TITLE", y="count",
             title="Top 15 Job Titles by Frequency")
fig.show()


In [None]:
# Top 15 States by job posting count
state_counts = df2.groupBy("STATE").count().orderBy(F.desc("count")).limit(15)
state_pd = state_counts.toPandas()

fig = px.bar(state_pd, x="STATE", y="count",
             title="Top 15 States by Job Postings")
fig.show()


In [None]:
# Top 15 Cities by job posting count
city_counts = df2.groupBy("CITY").count().orderBy(F.desc("count")).limit(15)
city_pd = city_counts.toPandas()

fig = px.bar(city_pd, x="CITY", y="count",
             title="Top 15 Cities by Job Postings")
fig.show()


In [None]:
# Top 20 most common skills in postings
skill_counts = df2.groupBy("SKILL_NAME").count().orderBy(F.desc("count")).limit(20)
skill_pd = skill_counts.toPandas()

fig = px.bar(skill_pd, x="SKILL_NAME", y="count",
             title="Top 20 Skills by Frequency")
fig.show()


In [None]:
# Average salary by skill (filter out nulls)
skill_salary = df2.groupBy("SKILL_NAME").agg(F.avg("SALARY").alias("avg_salary")) \
                 .orderBy(F.desc("avg_salary")).limit(15)
skill_salary_pd = skill_salary.toPandas()

fig = px.bar(skill_salary_pd, x="SKILL_NAME", y="avg_salary",
             title="Top 15 Skills by Average Salary")
fig.show()


In [None]:
# Group by occupation: count of postings and average salary
occ_stats = df2.groupBy("OCCUPATION").agg(
    F.count("*").alias("posting_count"),
    F.avg("SALARY").alias("avg_salary")
).orderBy(F.desc("posting_count")).limit(20)

occ_stats_pd = occ_stats.toPandas()

fig = px.scatter(occ_stats_pd, x="posting_count", y="avg_salary", text="OCCUPATION",
                 title="Postings vs. Average Salary (Top 20 Occupations)",
                 labels={"posting_count": "Number of Postings", "avg_salary": "Average Salary"})
fig.update_traces(textposition="top center")
fig.show()


In [None]:
# Count + average salary for skills
skill_stats = df2.groupBy("SKILL_NAME").agg(
    F.count("*").alias("posting_count"),
    F.avg("SALARY").alias("avg_salary")
).orderBy(F.desc("posting_count")).limit(30)

skill_stats_pd = skill_stats.toPandas()

fig = px.scatter(skill_stats_pd, x="posting_count", y="avg_salary", text="SKILL_NAME",
                 title="Skill Demand vs. Salary (Top 30 Skills)",
                 labels={"posting_count": "Number of Postings", "avg_salary": "Average Salary"})
fig.update_traces(textposition="top center")
fig.show()


In [None]:
print("✅ Assignment 03 Summary")
print("1. Salary distributions show large variance across industries and occupations.")
print("2. Certain industries (e.g., Tech, Finance) consistently offer higher average salaries.")
print("3. Some skills are extremely in-demand (like SQL, Python), but salary premiums vary.")
print("4. High-demand skills don’t always equal high pay — strategic niche skills often pay more.")
print("5. Location matters: states like California and New York dominate high-paying postings.")
print("6. Trends over time show growth in postings for tech-heavy roles.")
print("7. Correlation analysis reveals that many postings don’t guarantee top salaries — skills and specialization drive pay.")
