In [2]:
!pip install requests beautifulsoup4 psycopg2-binary python-dotenv




In [9]:
import os
import json
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
import psycopg2
from psycopg2.extras import execute_values
from dotenv import load_dotenv

load_dotenv()

LOCAL_BRONZE_PATH = "bronze_jobs.json"
LOCAL_SCRAPED_SAVE_PATH = "scraped_raw.json"
MAX_JOB_AGE_DAYS = 7
KEYWORDS = ["engineer", "software", "developer", "audit", "analyst"]


In [24]:
def save_json(path, data):
    with open(path, "w") as f:
        json.dump(data, f, indent=2)

def load_json(path):
    if not os.path.exists(path):
        return []
    with open(path, "r") as f:
        return json.load(f)

from datetime import datetime, timezone

def timestamp():
    return datetime.now(timezone.utc).isoformat()



In [25]:
import re
from bs4 import BeautifulSoup
import requests

BASE = "https://www.governmentjobs.com"
JOB_URL_RE = re.compile(r"^/jobs/\d")  # href starts with /jobs/<digit>

def search_govjobs(keyword: str):
    url = f"{BASE}/jobs?keyword={keyword}"
    print("Searching:", url)

    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    jobs = []

    # Look for ANY anchor that looks like a job detail URL
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if not JOB_URL_RE.match(href):
            continue

        title = a.get_text(strip=True)
        # Quick sanity check to skip garbage links
        if not title or "Job Alert" in title or "Loading Job Details" in title:
            continue

        full_url = BASE + href
        jobs.append({
            "job_url": full_url,
            "title": title,
        })

    print(f"  found {len(jobs)} job URLs for keyword='{keyword}'")
    return jobs




In [26]:
search_govjobs("software")[:5]


Searching: https://www.governmentjobs.com/jobs?keyword=software
  found 10 job URLs for keyword='software'


[{'job_url': 'https://www.governmentjobs.com/jobs/5148422-0/software-application-specialist',
  'title': 'Software Application Specialist'},
 {'job_url': 'https://www.governmentjobs.com/jobs/120764-1/software-development-part-time-faculty',
  'title': 'Software Development, Part time Faculty'},
 {'job_url': 'https://www.governmentjobs.com/jobs/5126271-0/software-analyst-i-ii-onsite-only-business-and-services-track',
  'title': 'Software Analyst I/II (Onsite Only) (Business and Services Track)'},
 {'job_url': 'https://www.governmentjobs.com/jobs/5144458-0/senior-software-developer',
  'title': 'Senior Software Developer*'},
 {'job_url': 'https://www.governmentjobs.com/jobs/5041458-0/senior-software-developer',
  'title': 'Senior Software Developer'}]

In [27]:
import re

CONTROL_CHARS = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]")

def scrape_job_detail(url):
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    script = soup.find("script", {"type": "application/ld+json"})
    if not script or not script.string:
        print("No JSON-LD found for", url)
        return None

    raw = script.string
    # Remove bad control chars
    raw = CONTROL_CHARS.sub(" ", raw)

    try:
        data = json.loads(raw, strict=False)
    except json.JSONDecodeError as e:
        print("JSON decode failed for", url, "=>", e)
        return None

    if isinstance(data, list):
        data = data[0]

    return {
        "job_id": url.rstrip("/").split("/")[-2],
        "title": data.get("title"),
        "company": data.get("hiringOrganization", {}).get("name", ""),
        "location": data.get("jobLocation", {}).get("address", {}).get("addressLocality", ""),
        "description": data.get("description", ""),
        "posted_at": data.get("datePosted", timestamp()),
        "url": url,
    }




In [28]:
def scrape_all_keywords():
    results = []
    seen = set()

    for kw in KEYWORDS:
        search_results = search_govjobs(kw)
        for job in search_results:
            url = job["job_url"]
            if url in seen:
                continue

            seen.add(url)

            try:
                detail = scrape_job_detail(url)
                if detail:
                    detail["scraped_at"] = timestamp()
                    results.append(detail)
            except Exception as e:
                print("Error scraping detail:", url, e)

    print(f"Total jobs scraped: {len(results)}")
    return results

scraped_jobs = scrape_all_keywords()
save_json("scraped_raw.json", scraped_jobs)


Searching: https://www.governmentjobs.com/jobs?keyword=engineer
  found 10 job URLs for keyword='engineer'
Searching: https://www.governmentjobs.com/jobs?keyword=software
  found 10 job URLs for keyword='software'
Searching: https://www.governmentjobs.com/jobs?keyword=developer
  found 10 job URLs for keyword='developer'
Searching: https://www.governmentjobs.com/jobs?keyword=audit
  found 10 job URLs for keyword='audit'
Searching: https://www.governmentjobs.com/jobs?keyword=analyst
  found 10 job URLs for keyword='analyst'
Total jobs scraped: 48


In [30]:
# Global config (re-run this after any kernel restart)

BRONZE_PATH = "bronze_jobs.json"
SILVER_PATH = "silver_jobs.json"
GOLD_PATH = "gold_jobs.json"

# ONLY needed if you're using keyword scraping
KEYWORDS = ["engineer", "software", "developer", "audit", "analyst"]


In [34]:
# Load bronze
bronze = load_json(BRONZE_PATH)

# Append today's scrape
bronze.extend(scraped_jobs)
save_json(BRONZE_PATH, bronze)

# ---- Silver: latest job version ----
latest = {}
for job in bronze:
    key = job["job_id"]
    ts = datetime.fromisoformat(job["scraped_at"])
    if key not in latest or ts > datetime.fromisoformat(latest[key]["scraped_at"]):
        latest[key] = job

silver = list(latest.values())

# ---- Gold: only recent postings ----
from datetime import datetime, timedelta, timezone

# ---- Gold: only recent postings ----
cutoff = datetime.now(timezone.utc) - timedelta(days=MAX_JOB_AGE_DAYS)

def parse_ts(ts_str: str) -> datetime:
    dt = datetime.fromisoformat(ts_str)
    # If for some reason it's naive, assume UTC
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt

gold = [
    j for j in silver
    if parse_ts(j["scraped_at"]) >= cutoff
]


print("Bronze:", len(bronze))
print("Silver:", len(silver))
print("Gold (active):", len(gold))

save_json("silver_jobs.json", silver)
save_json("gold_jobs.json", gold)



Bronze: 192
Silver: 48
Gold (active): 48


In [35]:
import pandas as pd

df_gold = pd.DataFrame(gold)
df_gold



Unnamed: 0,job_id,title,company,location,description,posted_at,url,scraped_at
0,5108053-0,County Engineer,New Hanover County,"New Hanover County, NC","New Hanover County is seeking an experienced, ...",2025-10-13,https://www.governmentjobs.com/jobs/5108053-0/...,2025-11-25T17:22:22.421635+00:00
1,5121231-0,Engineer (Planning),Orange County Sanitation District,"CA 92708, CA",&lt;p style=&quot;margin: 0&quot;&gt;&lt;span ...,2025-10-29,https://www.governmentjobs.com/jobs/5121231-0/...,2025-11-25T17:22:22.760859+00:00
2,4912084-0,Senior Engineer,Klickitat Public Utility,"Goldendale, WA 98620","This position is responsible to plan, scope, s...",2025-04-17,https://www.governmentjobs.com/jobs/4912084-0/...,2025-11-25T17:22:23.100193+00:00
3,5128516-0,Senior Civil Engineer,City of Davis,"Davis, CA",&lt;p style=&quot;margin: 0 0 0.0001pt; text-a...,2025-10-30,https://www.governmentjobs.com/jobs/5128516-0/...,2025-11-25T17:22:23.433317+00:00
4,5117584-0,ENGINEER 1/ENGINEER 2 SEWER UTILITIES,Kitsap County,"Bremerton, WA",&lt;div style=&quot;text-align: left&quot;&gt;...,2025-11-04,https://www.governmentjobs.com/jobs/5117584-0/...,2025-11-25T17:22:23.858116+00:00
5,5109149-0,Electrical Engineer III - Systems,Lewis County Public Utility District,"Chehalis, WA",&lt;p&gt;The Electrical Engineer III - Systems...,2025-10-10,https://www.governmentjobs.com/jobs/5109149-0/...,2025-11-25T17:22:24.188728+00:00
6,4880582-0,Senior Civil Engineer - Solid Waste,City of Winston-Salem,"Winston-Salem, NC",&lt;p&gt;Performs difficult professional work ...,2025-03-23,https://www.governmentjobs.com/jobs/4880582-0/...,2025-11-25T17:22:24.547473+00:00
7,4937456-0,CIVIL ENGINEER - CIP ($5000 Hiring Incentive) ...,City of Surprise,"Surprise, AZ",&lt;div style=&quot;text-align: left&quot;&gt;...,2025-06-20,https://www.governmentjobs.com/jobs/4937456-0/...,2025-11-25T17:22:24.911875+00:00
8,5134955-0,Engineer,Travis County,"Austin, TX",&lt;p&gt;Functions as a project manager over h...,2025-11-09,https://www.governmentjobs.com/jobs/5134955-0/...,2025-11-25T17:22:25.320356+00:00
9,129910-1,Engineer,City of Barberton,"Barberton, OH",&lt;p&gt;The City of Barberton is now acceptin...,2025-09-05,https://www.governmentjobs.com/jobs/129910-1/e...,2025-11-25T17:22:25.670420+00:00
