In [1]:
%pip install scrapingbee

Note: you may need to restart the kernel to use updated packages.


# Job scraping


### Importing libraries

Explanation and thoughts: look at the comments respectively and see report.pdf

In [2]:
from datetime import datetime # import this module to have unique folder names
from scrapingbee import ScrapingBeeClient
import os #import os module to interact with the operating system
import re #importing regular expressions
from dotenv import load_dotenv


### setting up scrapingbee client and variables

Explanation and thoughts: look at the comments respectively and see report.pdf

In [3]:
load_dotenv()
api_key = os.getenv("SCRAPINGBEE_API_KEY")

if not api_key:
    raise RuntimeError("SCRAPINGBEE_API_KEY not found in .env!")

client = ScrapingBeeClient(api_key=api_key)
now = datetime.now()
formatted = now.strftime("%Y_%m_%d___%H_%M_%S")
file_name = f"scraped_jobs_{formatted}"
os.makedirs(file_name)
 
job_ids_list = []
upper_bound = 2100 #max jobs to scrape
 #regular expression pattern to extract job ids from the HTML content
job_id_extract_pattern = re.compile(r'href=["\'](?:/)?de/stellenangebote/detail/([^/"\']+)/["\']') 
# I only care about jobs in these 2 fields
job_fields = ["informatik-telekommunikation", "finanzen-treuhand-immobilien"]

### Fetching all pages of our job field and collecting all job_ids for each page

Explanation and thoughts: look at the comments respectively and see report.pdf

In [4]:
#the max pages for informatics and telecommunication is 100, so I set the range accordingly
# I stop earlier if I reach the upper_bound of jobs to scrape
stop = False
for job_field in job_fields:
    if stop:
        break
    for page in range(1, 101):
        if stop:
            break
        url = f"https://www.jobs.ch/de/stellenangebote/{job_field}/?page={page}"
        print(f"scrapingbee get request: {url}")
        res = client.get(url, params={'render_js': 'true'})
        job_page = res.content.decode('utf-8')
        page_job_ids = job_id_extract_pattern.findall(job_page)
        
        if not page_job_ids:
            print("No more jobs found, going to next job_field.")
            break
        else:
            print(f"Saw {len(page_job_ids)} jobs_ids on the page: {url}")

        for job_id in page_job_ids:
            if job_id in job_ids_list:
                print(f"job_id: {job_id} already in the list, skipping it")
                continue
            else:
                print(f"Jobfield: {job_field} page: {page} adding job_id: {job_id} --->  {len(job_ids_list)} out of {upper_bound}")
                job_ids_list.append(job_id)
                if len(job_ids_list) >= upper_bound:
                    stop = True
                    break

if len(job_ids_list) >= upper_bound:
    print("reached the upper bound, proceeding with the jobs we have.")
else: 
    print("Did not reach the upper bound, proceeding with the jobs we have.")


scrapingbee get request: https://www.jobs.ch/de/stellenangebote/informatik-telekommunikation/?page=1
Saw 20 jobs_ids on the page: https://www.jobs.ch/de/stellenangebote/informatik-telekommunikation/?page=1
Jobfield: informatik-telekommunikation page: 1 adding job_id: a0826a2a-d355-464c-8166-97856a513ed1 --->  0 out of 2100
Jobfield: informatik-telekommunikation page: 1 adding job_id: 9caf8d90-7266-4a54-a5d0-c8a0d8c52286 --->  1 out of 2100
Jobfield: informatik-telekommunikation page: 1 adding job_id: 6e1fe010-2d19-49f8-beb0-0ed8e21bb1e2 --->  2 out of 2100
Jobfield: informatik-telekommunikation page: 1 adding job_id: 1a564905-c883-472a-9605-7b8a21dc6c47 --->  3 out of 2100
Jobfield: informatik-telekommunikation page: 1 adding job_id: 735bef36-ed3c-485d-bf1b-04e1759457ca --->  4 out of 2100
Jobfield: informatik-telekommunikation page: 1 adding job_id: dc021ca1-b670-4138-93ae-2448aa9d8d1b --->  5 out of 2100
Jobfield: informatik-telekommunikation page: 1 adding job_id: 8ce1943f-01be-4148

### Downloading specific website for each previously collected job id

Explanation and thoughts: look at the comments respectively and see report.pdf

In [5]:
# Once we have all links collected we can download the specific page
index = 1
for job_id in job_ids_list:
    try:
        # constructing the specific job offer url
        url = f"https://www.jobs.ch/de/stellenangebote/detail/{job_id}/"
        print(f"scrapingbee get request {url} ---> {index} out of {len(job_ids_list)}")
        response = client.get(url, params={'render_js': 'true'})
        with open(f"{file_name}/stellenausschreibung_{index}.html", "wb") as f:
            f.write(response.content)
    except Exception as e:
        print(f"Error during scrapingbee get requst link:{job_id}, error: {e}")
    index+=1
print(f"scraping completed, total jobs downloaded: {len(job_ids_list)}")

scrapingbee get request https://www.jobs.ch/de/stellenangebote/detail/a0826a2a-d355-464c-8166-97856a513ed1/ ---> 1 out of 2100
scrapingbee get request https://www.jobs.ch/de/stellenangebote/detail/9caf8d90-7266-4a54-a5d0-c8a0d8c52286/ ---> 2 out of 2100
scrapingbee get request https://www.jobs.ch/de/stellenangebote/detail/6e1fe010-2d19-49f8-beb0-0ed8e21bb1e2/ ---> 3 out of 2100
scrapingbee get request https://www.jobs.ch/de/stellenangebote/detail/1a564905-c883-472a-9605-7b8a21dc6c47/ ---> 4 out of 2100
scrapingbee get request https://www.jobs.ch/de/stellenangebote/detail/735bef36-ed3c-485d-bf1b-04e1759457ca/ ---> 5 out of 2100
scrapingbee get request https://www.jobs.ch/de/stellenangebote/detail/dc021ca1-b670-4138-93ae-2448aa9d8d1b/ ---> 6 out of 2100
scrapingbee get request https://www.jobs.ch/de/stellenangebote/detail/8ce1943f-01be-4148-a6e1-d300a27ff8cd/ ---> 7 out of 2100
scrapingbee get request https://www.jobs.ch/de/stellenangebote/detail/6dce5f21-f7c1-4eac-89c4-aad7001d2828/ ---

# report.pdf

**is located in the same directory as this file**
