In [2]:
#Task 2: Salary Distribution by Employment Type (Box Plot)
# Import libraries
import pandas as pd
import plotly.express as px
import plotly.io as pio

# Set default Plotly theme
pio.templates.default = "plotly_white"

# Load dataset
df = pd.read_csv("data/lightcast_job_postings.csv")

# Convert salary column to numeric
df["SALARY_FROM"] = pd.to_numeric(df["SALARY_FROM"], errors="coerce")

# Drop missing or zero salary values
df_clean = df[(df["SALARY_FROM"] > 0) & df["EMPLOYMENT_TYPE_NAME"].notnull()]

# Create a box plot

fig = px.box(
    df_clean,
    x="EMPLOYMENT_TYPE_NAME",
    y="SALARY_FROM",
    color="EMPLOYMENT_TYPE_NAME",
    title="Salary Distribution by Employment Type",
    labels={"SALARY_FROM": "Salary (USD)", "EMPLOYMENT_TYPE_NAME": "Employment Type"},
    template="plotly_white"
)

# Customize font and layout
fig.update_layout(
    font=dict(family="Helvetica Neue", size=14, color="#333"),
    title_font=dict(size=22),
    xaxis_title="Employment Type",
    yaxis_title="Salary (USD)",
    showlegend=False
)
# Save chart to SVG file
fig.write_image("output/salary_by_employment_type.svg")

# Display chart
fig.show()

In [3]:
import pandas as pd
import plotly.express as px

# Load dataset
df = pd.read_csv("data/lightcast_job_postings.csv")

# Filter out rows with missing or zero salary
df_salary = df[df["SALARY_FROM"] > 0]

# Find top 10 industries by count
top_industries = df_salary["NAICS_2022_6_NAME"].value_counts().head(10).index
df_top_industries = df_salary[df_salary["NAICS_2022_6_NAME"].isin(top_industries)]

# Create box plot
fig = px.box(
    df_top_industries,
    x="NAICS_2022_6_NAME",
    y="SALARY_FROM",
    color="NAICS_2022_6_NAME",
    title="Salary Distribution by Industry (Top 10)",
)

# Customize layout
fig.update_layout(
    font=dict(family="Helvetica Neue", size=14, color="#333"),
    title_font=dict(size=22),
    xaxis_title="Industry",
    yaxis_title="Salary (USD)",
    showlegend=False
)

# Optional: Save to SVG
fig.write_image("output/salary_by_industry.svg")

# Display the plot
fig.show()

In [5]:
# Task 4: Job Posting Trends Over Time 📈
import pandas as pd
import plotly.express as px

# Read CSV again if needed
df = pd.read_csv("data/lightcast_job_postings.csv")

# Convert POSTED to datetime
df['POSTED'] = pd.to_datetime(df['POSTED'], errors='coerce')

# Drop rows with null POSTED
df = df.dropna(subset=['POSTED'])

# Count job postings by day
trend_data = df['POSTED'].value_counts().sort_index()
trend_df = trend_data.reset_index()
trend_df.columns = ['Date', 'Job Count']

# Plot
fig = px.line(trend_df, x='Date', y='Job Count',
              title='Job Posting Trends Over Time')

# Customize layout
fig.update_layout(
    font=dict(family="Helvetica Neue", size=14, color="#333"),
    title_font=dict(size=22),
    xaxis_title="Date",
    yaxis_title="Number of Job Postings",
    showlegend=False
)

# Save and show
fig.write_image("output/job_trend.svg")
fig.show()

In [6]:
# Task 5: Top 10 Job Titles by Count 📊
top_titles = df['TITLE_NAME'].value_counts().nlargest(10).reset_index()
top_titles.columns = ['Job Title', 'Count']

fig = px.bar(top_titles, x='Job Title', y='Count',
             title='Top 10 Job Titles by Count',
             text='Count')

fig.update_layout(
    font=dict(family="Helvetica Neue", size=14, color="#333"),
    title_font=dict(size=22),
    xaxis_tickangle=-45
)

# Save and show
fig.write_image("output/top_10_job_titles.svg")
fig.show()

In [7]:
# Task 6: Remote vs On-Site Job Postings 🥧
remote_counts = df['REMOTE_TYPE_NAME'].value_counts().reset_index()
remote_counts.columns = ['Remote Type', 'Count']

fig = px.pie(remote_counts, names='Remote Type', values='Count',
             title='Remote vs On-Site Job Postings')

fig.update_layout(
    font=dict(family="Helvetica Neue", size=14, color="#333"),
    title_font=dict(size=22)
)

# Save and show
fig.write_image("output/remote_vs_onsite.svg")
fig.show()

In [12]:
import pandas as pd
import plotly.express as px
import os

# Load dataset
df = pd.read_csv("data/lightcast_job_postings.csv")

# Drop rows with missing skills or industry info
df_skills = df.dropna(subset=["SKILLS_NAME", "NAICS2_NAME"])

# Expand SKILLS_NAME: assume it's a semicolon-separated string
df_skills["SKILLS_NAME"] = df_skills["SKILLS_NAME"].astype(str)
df_skills["SKILLS_LIST"] = df_skills["SKILLS_NAME"].str.split(";")

# Explode skills into separate rows
df_exploded = df_skills.explode("SKILLS_LIST")

# Group by industry and skill, count occurrences
skill_counts = df_exploded.groupby(["NAICS2_NAME", "SKILLS_LIST"]).size().reset_index(name="Count")

# Select top 6 industries by total postings
top_industries = skill_counts.groupby("NAICS2_NAME")["Count"].sum().nlargest(6).index.tolist()
skill_counts_top = skill_counts[skill_counts["NAICS2_NAME"].isin(top_industries)]

# Within each industry, get top 10 skills
top_skills_by_industry = (
    skill_counts_top.groupby("NAICS2_NAME")
    .apply(lambda x: x.nlargest(10, "Count"))
    .reset_index(drop=True)
)

# Plot stacked bar chart
fig = px.bar(
    top_skills_by_industry,
    x="NAICS2_NAME",
    y="Count",
    color="SKILLS_LIST",
    title="Top 10 Skills in Top 6 Industries",
    labels={"NAICS2_NAME": "Industry", "SKILLS_LIST": "Skill", "Count": "Skill Frequency"},
)

# Customize layout
fig.update_layout(
    font=dict(family="Helvetica Neue", size=14, color="#333"),
    title_font=dict(size=22),
    barmode="stack",
    showlegend=True
)

# Save chart
os.makedirs("output", exist_ok=True)
fig.write_image("output/skills_by_industry.svg")

# Show chart
fig.show()





In [14]:
import pandas as pd
import plotly.express as px
import os

# Load dataset
df = pd.read_csv("data/lightcast_job_postings.csv")

# Drop rows with missing values in ONET_NAME or SALARY_FROM
df_bubble = df.dropna(subset=["ONET_NAME", "SALARY_FROM"])

# Convert salary to float
df_bubble["SALARY_FROM"] = df_bubble["SALARY_FROM"].astype(float)

# Group by occupation, calculate median salary and job count
bubble_data = (
    df_bubble.groupby("ONET_NAME")
    .agg(MEDIAN_SALARY=("SALARY_FROM", "median"), JOB_COUNT=("ONET_NAME", "count"))
    .reset_index()
)

# Filter to top 20 occupations by job count
bubble_data = bubble_data.sort_values("JOB_COUNT", ascending=False).head(20)

# Plot bubble chart
fig = px.scatter(
    bubble_data,
    x="ONET_NAME",
    y="MEDIAN_SALARY",
    size="JOB_COUNT",
    color="ONET_NAME",
    title="Median Salary by ONET Occupation Type (Top 20 by Count)",
    labels={
        "ONET_NAME": "Occupation",
        "MEDIAN_SALARY": "Median Salary (USD)",
        "JOB_COUNT": "Job Count"
    },
)

# Customize layout
fig.update_layout(
    font=dict(family="Helvetica Neue", size=14),
    title_font=dict(size=22),
    showlegend=False,
    xaxis_tickangle=-45,
)

# Save chart

fig.write_image("output/bubble_onet_salary.svg")

# Show chart
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [15]:
import pandas as pd
import plotly.graph_objects as go
import os

# Load dataset
df = pd.read_csv("data/lightcast_job_postings.csv")

# Drop missing values for source and target nodes
df_sankey = df.dropna(subset=["SOC_2021_2_NAME", "SOC_2021_3_NAME"])

# Count transitions between level 2 and level 3 SOC occupations
sankey_data = (
    df_sankey.groupby(["SOC_2021_2_NAME", "SOC_2021_3_NAME"])
    .size()
    .reset_index(name="count")
)

# Filter top 15 flows for readability
sankey_data = sankey_data.sort_values("count", ascending=False).head(15)

# Create unique list of node labels
all_labels = list(pd.unique(sankey_data[["SOC_2021_2_NAME", "SOC_2021_3_NAME"]].values.ravel()))
label_to_index = {label: i for i, label in enumerate(all_labels)}

# Map labels to indices
sankey_data["source_id"] = sankey_data["SOC_2021_2_NAME"].map(label_to_index)
sankey_data["target_id"] = sankey_data["SOC_2021_3_NAME"].map(label_to_index)

# Build Sankey figure
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=all_labels,
        color="blue"
    ),
    link=dict(
        source=sankey_data["source_id"],
        target=sankey_data["target_id"],
        value=sankey_data["count"]
    )
)])

fig.update_layout(
    title_text="Career Pathway Trends: SOC_2021_2_NAME → SOC_2021_3_NAME",
    font=dict(size=14, color="black")
)

fig.write_image("output/sankey_career_pathways.svg")

# Show chart
fig.show()