In [1]:
from datetime import datetime
from typing import Optional, Set, Tuple

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

In [2]:
# Search parameters interpolated into the Jobnet query URL below.
title = "Data Scientist"  # sent as `searchString`
postal = 8000             # sent as `postalCode`
km_dist = 75              # sent as `kmRadius`
num_jobs = 25             # sent as `resultsPerPage`

In [4]:
# Query the Jobnet search endpoint for the first page of results, newest first.
# The query is passed via `params` so requests URL-encodes every value; titles
# containing characters such as '&' or '+' are then transmitted correctly.
search_params = {
    "resultsPerPage": num_jobs,
    "pageNumber": 1,
    "orderType": "PublicationDate",
    "kmRadius": km_dist,
    "searchString": title,
    "postalCode": postal,
}
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get("https://jobnet.dk/bff/FindJob/Search", params=search_params, timeout=30)
list_url = response.url  # final, fully encoded URL of the response
if response.status_code != 200:
    # Stop here: the following cell calls response.json() unconditionally and
    # would otherwise try to parse an error page.
    raise RuntimeError(f"Error fetching job listings (status code: {response.status_code})")

In [6]:
# Decode the JSON body of the search response.
response_dict = response.json()
# The job ads are stored under the "jobAds" key; fall back to an empty list
# when the key is absent so downstream code can still iterate.
postings = response_dict.get("jobAds", [])

[{'country': 'Danmark',
  'municipality': 'Aarhus',
  'postalCode': 8000,
  'postalDistrictName': 'Aarhus C',
  'hiringOrgName': 'Formalize ApS',
  'occupation': 'IT-ingeniør',
  'conceptUriDa': 'http://data.star.dk/esco/occupation/93c9c217-48a0-4e70-9f1e-01a859665578',
  'workHourPartTime': False,
  'jobAdId': 'f1232997-860f-402a-bf2c-2ff5fd907722',
  'jobAdUrl': '',
  'workPlaceAddress': '',
  'cvr': '42045136',
  'title': 'Staff Software Engineer',
  'description': "Are you looking for a big opportunity and want to invest heavily in your career while being a part of scaling a technical organisation? If so, keep on reading! We are looking for a Staff Software Engineer for our growing Product team.<br><br><strong>The opportunity and what we offer</strong><br><br>Formalize is on a scaling journey with significant traction across Europe. As our new Staff Software Engineer, you'll have the opportunity to learn, grow, and tackle significant problems to make a meaningful impact in our Prod

In [None]:
_JOBNET_SEARCH_URL = "https://jobnet.dk/bff/FindJob/Search"
_JOBNET_LEGACY_DETAIL_URL = "https://job.jobnet.dk/CV/FindWork/JobDetailJson"
_JOBNET_TIMEOUT_S = 30  # seconds; applied to every HTTP request made below


def _first_value(record, *keys):
    """Return the first value in ``record`` under ``keys`` that is neither None nor "", else None."""
    for key in keys:
        value = record.get(key)
        if value is not None and value != "":
            return value
    return None


def _html_to_text(markup):
    """Flatten an HTML fragment into newline-joined, whitespace-stripped text ("" for empty input)."""
    soup = BeautifulSoup(markup or "", 'html.parser')
    return "\n".join(text.strip() for text in soup.stripped_strings if text.strip())


def _as_utc_timestamp(value):
    """Coerce ``value`` to a tz-aware UTC pandas Timestamp; None when missing or unparseable.

    Naive inputs are interpreted as UTC (that is what ``utc=True`` does), which
    makes naive and aware datetimes safely comparable.
    """
    if value is None or value == "":
        return None
    try:
        parsed = pd.to_datetime(value, utc=True, errors='coerce')
    except (TypeError, ValueError, OverflowError):
        return None
    return None if pd.isna(parsed) else parsed


def _jobnet_description(job_dict):
    """Return a plain-text description for one posting, or None when unavailable.

    Uses the HTML ``description`` carried in the search payload when present.
    Otherwise, for legacy-style records (``ID`` present and ``IsExternal``
    falsy), it falls back to the legacy detail endpoint.
    """
    inline_html = job_dict.get("description")
    if inline_html:
        return _html_to_text(inline_html) or None

    legacy_id = job_dict.get("ID")
    if legacy_id is None or job_dict.get("IsExternal"):
        return None
    try:
        job_response = requests.get(
            _JOBNET_LEGACY_DETAIL_URL,
            params={"id": legacy_id, "previewtoken": ""},
            timeout=_JOBNET_TIMEOUT_S,
        )
    except requests.RequestException:
        # Best effort: a failed detail lookup must not abort the whole scrape.
        return None
    if job_response.status_code != 200:
        return None
    data = job_response.json()
    return _html_to_text(data.get("FormattedPurpose")) or data.get("Description")


def jobnet_scraper(title, postal, km_dist, num_jobs, existing_keys: Optional[Set[Tuple[str,str]]] = None, cutoff_dt: Optional[datetime] = None):
    """Fetch the newest Jobnet postings for a search and return them as a DataFrame.

    Parameters
    ----------
    title : search string sent as ``searchString``.
    postal : postal code sent as ``postalCode``.
    km_dist : search radius sent as ``kmRadius``.
    num_jobs : page size sent as ``resultsPerPage`` (only page 1 is requested).
    existing_keys : optional set of ``(company, title)`` pairs already stored.
        Iteration stops at the first posting whose pair is in this set.
    cutoff_dt : optional datetime. Iteration stops at the first posting whose
        parsed posting time is older than it. A naive value is treated as UTC.

    Returns
    -------
    pandas.DataFrame with columns ``title``, ``company``, ``location``,
    ``time_posted``, ``url``, ``employment_type``, ``full_or_part_time``,
    ``description`` and ``job_board`` (always ``'jobnet'``). When the request
    fails or nothing is collected, an empty frame with only the ``job_board``
    column is returned.

    Notes
    -----
    Both early-exit rules rely on the request's ``orderType=PublicationDate``
    returning newest postings first - NOTE(review): confirm the sort direction.
    Fields missing from a posting are stored as None instead of raising.
    """
    print(f"[scrape] Jobnet: fetching up to {num_jobs} jobs for '{title}' near {postal} (r={km_dist}km)")
    query = {
        "resultsPerPage": num_jobs,
        "pageNumber": 1,
        "orderType": "PublicationDate",
        "kmRadius": km_dist,
        "searchString": title,  # requests handles the URL encoding
        "postalCode": postal,
    }
    try:
        response = requests.get(_JOBNET_SEARCH_URL, params=query, timeout=_JOBNET_TIMEOUT_S)
    except requests.RequestException as exc:
        print(f"[scrape] Jobnet: failed to fetch job listings ({exc})")
        return pd.DataFrame(columns=['job_board'])

    if response.status_code != 200:
        print(f"[scrape] Jobnet: failed to fetch job listings (status code: {response.status_code})")
        return pd.DataFrame(columns=['job_board'])
    response_dict = response.json()

    # The search endpoint answers with a "jobAds" list of camelCase records;
    # the PascalCase "JobPositionPostings" layout is kept as a fallback.
    postings = response_dict.get("jobAds") or response_dict.get("JobPositionPostings") or []
    if existing_keys is None:
        existing_keys = set()
    cutoff_ts = _as_utc_timestamp(cutoff_dt)  # normalised once, outside the loop

    job_list = []
    early_reason = None
    for job_dict in tqdm(postings, desc="Jobnet jobs", unit="job"):
        job_post = {
            "title": _first_value(job_dict, "title", "Title"),
            "company": _first_value(job_dict, "hiringOrgName", "HiringOrgName"),
            "location": _first_value(job_dict, "postalDistrictName", "municipality", "WorkPlaceCity"),
            # NOTE(review): no date field is visible in the sample payload; the
            # camelCase key names here are assumptions - confirm against a full response.
            "time_posted": _first_value(job_dict, "publicationDate", "postingCreated", "PostingCreated"),
            "url": _first_value(job_dict, "jobAdUrl", "Url"),
            # NOTE(review): "employmentType" is likewise an assumed key name - confirm.
            "employment_type": _first_value(job_dict, "employmentType", "EmploymentType"),
            # "workHourPartTime" is a boolean in the camelCase payload, whereas the
            # legacy "WorkHours" value is passed through as received.
            "full_or_part_time": _first_value(job_dict, "workHourPartTime", "WorkHours"),
            "description": None,
        }

        # Early-termination checks run before any detail lookup so that no
        # extra HTTP request is spent on a posting that will be discarded.
        key = (job_post["company"], job_post["title"])
        if key in existing_keys:
            early_reason = "first existing key encountered (sorted list)"
            break
        posted_ts = _as_utc_timestamp(job_post["time_posted"])
        if cutoff_ts is not None and posted_ts is not None and posted_ts < cutoff_ts:
            early_reason = "job older than last scrape timestamp"
            break

        job_post["description"] = _jobnet_description(job_dict)
        job_list.append(job_post)

    df = pd.DataFrame(job_list)
    if not df.empty:
        df['job_board'] = 'jobnet'
    else:
        df = pd.DataFrame(columns=['job_board'])
    if early_reason:
        print(f"[scrape] Jobnet: early termination - {early_reason}; collected {len(job_list)} new rows")
    print(f"[scrape] Jobnet: dataframe rows={len(df)}")
    return df
