# Job posting gathering

In [32]:
import numpy as np
import json
import requests
from bs4 import BeautifulSoup
import re
import time
import itertools
from selenium import webdriver

import methods.scrapefunctions as sfuncs
import methods.urls as urlfuncs

## Website: karriere.at

In [26]:
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import methods.scrapefunctions as sfuncs
import methods.urls as urlfuncs

def load_job_karriereat(url, driver=None, close_popup=False, wait_time_popup=2, wait=1, close_driver=True, return_url=False):
    """Open a single karriere.at job posting and return its parsed HTML.

    Args:
        url: Address of the job posting page to load.
        driver: Selenium WebDriver to reuse. A new Firefox instance is
            started when this is None.
        close_popup: When true, ask ``sfuncs.close_popup`` to click the
            element matching ``.onetrust-close-btn-handler`` (presumably the
            cookie-consent banner) before reading the page.
        wait_time_popup: Forwarded to ``sfuncs.close_popup`` as ``click_wait``.
        wait: Seconds to sleep before the page source is read.
        close_driver: When true, the driver is quit before this function
            exits, including when loading the page fails.
        return_url: When true, return ``(soup, url)`` instead of just the soup.

    Returns:
        A ``BeautifulSoup`` of the page source, or a ``(BeautifulSoup, url)``
        tuple when ``return_url`` is true.
    """
    if driver is None:
        driver = webdriver.Firefox()
    button_selector = ".onetrust-close-btn-handler"
    try:
        driver.get(url)
        if close_popup:
            sfuncs.close_popup(button_selector, click_wait=wait_time_popup, post_click_wait=0,
                               driver=driver, close_driver=False, open_page=False)
        time.sleep(wait)
        job_soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Quit in `finally` so a failed navigation or popup click does not
        # leave an orphaned browser process behind.
        if close_driver:
            driver.quit()

    if return_url:
        return job_soup, url
    return job_soup

def load_jobs_karriereat(urls, driver=None, close_popup=False, wait_time_popup=2, wait=1, close_driver=True, return_urls=False):
    """Load several karriere.at job postings with one shared browser.

    Args:
        urls: Iterable of job posting addresses, visited in order.
        driver: Selenium WebDriver to reuse. A new Firefox instance is
            started when this is None.
        close_popup: Forwarded to ``load_job_karriereat`` for every URL.
        wait_time_popup: Forwarded to ``load_job_karriereat`` for every URL.
        wait: Forwarded to ``load_job_karriereat`` for every URL.
        close_driver: When true, the driver is quit once all URLs are
            processed, including when one of the loads fails.
        return_urls: When true, also return the list of visited URLs, aligned
            index-by-index with the returned soups.

    Returns:
        A list of soups, or ``(soups, urls)`` when ``return_urls`` is true.
    """
    if driver is None:
        driver = webdriver.Firefox()
    all_soups = []
    returned_urls = []

    try:
        for url in urls:
            # The single-page loader only ever echoes back the URL it was
            # given, so the URL is recorded here instead of branching on
            # `return_urls` for two otherwise identical calls.
            soup = load_job_karriereat(url, driver=driver, close_popup=close_popup,
                                       wait_time_popup=wait_time_popup, wait=wait,
                                       close_driver=False, return_url=False)
            all_soups.append(soup)
            returned_urls.append(url)
    finally:
        # Release the browser even if one of the pages raised mid-loop.
        if close_driver:
            driver.quit()

    if return_urls:
        return all_soups, returned_urls
    return all_soups

def load_page_karriereat(url, driver=None, close_popup=True, wait_time_popup=12, post_click_wait=1.5, close_driver=True):
    """Open a karriere.at result page and expand it via its load-more button.

    Args:
        url: Address of the search result page.
        driver: Selenium WebDriver to reuse. A new Firefox instance is
            started when this is None.
        close_popup: When true, ask ``sfuncs.close_popup`` to click the
            element matching ``.onetrust-close-btn-handler`` (presumably the
            cookie-consent banner) first.
        wait_time_popup: Forwarded to ``sfuncs.close_popup`` as ``click_wait``.
        post_click_wait: Forwarded to ``sfuncs.close_popup`` as
            ``post_click_wait``.
        close_driver: When true, the driver is quit before this function
            exits, including when loading the page fails.

    Returns:
        Whatever ``sfuncs.press_button_until_gone`` returns for the
        ``.m-loadMoreJobsButton__button`` selector (presumably the soup of
        the fully expanded page; confirm against that helper).
    """
    if driver is None:
        driver = webdriver.Firefox()
    try:
        driver.get(url)
        if close_popup:
            sfuncs.close_popup(".onetrust-close-btn-handler", click_wait=wait_time_popup,
                               post_click_wait=post_click_wait,
                               driver=driver, close_driver=False, open_page=False)
        final_page_soup = sfuncs.press_button_until_gone(".m-loadMoreJobsButton__button", wait_time=4, pre_click_wait=1,
                                                         driver=driver, close_driver=False, open_page=False)
    finally:
        # Quit in `finally` so an exception while clicking through the page
        # does not leak the browser process.
        if close_driver:
            driver.quit()
    return final_page_soup

def load_pages_karriereat(urls, driver=None, close_popup="first", click_wait=4, close_driver=True, first_popup_wait=15):
    """Load several karriere.at result pages with one shared browser.

    Args:
        urls: Iterable of result page addresses, visited in order.
        driver: Selenium WebDriver to reuse. A new Firefox instance is
            started when this is None.
        close_popup: Popup policy. ``"first"`` closes the popup only on the
            first page, ``"all"`` or ``True`` on every page, ``"none"`` or
            ``False`` never. Any other value behaves like closing on every
            page but without the longer first-page wait.
        click_wait: Value passed to ``load_page_karriereat`` as
            ``wait_time_popup`` for every page that does not get the
            first-page wait.
        close_driver: When true, the driver is quit once all pages are
            processed, including when one of the loads fails.
        first_popup_wait: Value passed as ``wait_time_popup`` for the first
            page when the popup is closed there (``"first"``/``"all"``/
            ``True``). Defaults to the previously hard-coded 15.

    Returns:
        A list with one ``load_page_karriereat`` result per URL.
    """
    if driver is None:
        driver = webdriver.Firefox()
    all_soups = []
    # Unrecognised `close_popup` values never reassign this flag, so the
    # popup keeps being closed on every page.
    close_popup_bool = True
    try:
        for i, url in enumerate(urls):
            is_first = i == 0
            wait = click_wait
            if close_popup == "first":
                close_popup_bool = is_first
                if is_first:
                    wait = first_popup_wait
            elif close_popup in ("all", True):
                close_popup_bool = True
                if is_first:
                    wait = first_popup_wait
            elif close_popup in ("none", False):
                close_popup_bool = False

            soup = load_page_karriereat(url, driver=driver, close_popup=close_popup_bool,
                                        wait_time_popup=wait, close_driver=False)
            all_soups.append(soup)
    finally:
        # Release the browser even if one of the pages raised mid-loop.
        if close_driver:
            driver.quit()
    return all_soups

def gather_data_karriereat(driver=None, close_driver=True):
    """Scrape karriere.at job postings for a fixed set of data/ML/BI searches.

    One search URL is built per title keyword (all for the location
    ``wien-und-umgebung``), every result page is loaded, the posting links
    are collected and de-duplicated by the numeric id in their URL, and each
    posting page is then visited to extract its description text.

    Args:
        driver: Selenium WebDriver to reuse. A new Firefox instance is
            started when this is None.
        close_driver: When true, the driver is quit before this function
            exits, including when scraping fails part-way.

    Returns:
        A dict mapping posting id (string of digits) to a dict with the keys
        ``"title"``, ``"url"``, ``"source"``, ``"id"`` and ``"description"``.
        ``"description"`` is None when the description container is not found
        on the posting page.
    """
    titlewords = [
        "machine-learning", "machine-learning-engineer", "machine-learning-scientist",
        "ML-scientist", "ML-engineer", "ML-researcher", "ML-developer", "ML-AI",
        "AI-engineer", "AI-scientist", "AI-researcher", "AI-developer", "AI-ML",
        "data-science", "data-scientist", "data-mining",
        "data-engineer", "data-engineering", "data-engineering-developer",
        "data-analysis", "data-analytics", "data-analyst",
        "business-intelligence", "business-intelligence-analyst", "bi-analyst", "business-analyst",
    ]
    locations = ["wien-und-umgebung"] * len(titlewords)
    urls_links = urlfuncs.urls_builder('https://www.karriere.at/jobs', [titlewords, locations], zipped = True, all_combinations = False)
    pattern = 'div.m-jobsListItem__container div.m-jobsListItem__dataContainer h2.m-jobsListItem__title a.m-jobsListItem__titleLink'
    id_pattern = re.compile(r'\d+')

    if driver is None:
        driver = webdriver.Firefox()
    try:
        soups = load_pages_karriereat(urls_links, driver=driver, close_popup="first", close_driver=False)

        postings = {}
        for soup in soups:
            for link in soup.select(pattern):
                href = link.get("href")
                id_match = id_pattern.search(href) if href else None
                if id_match is None:
                    # Without a numeric id there is nothing to key the
                    # posting on; skip the link instead of crashing.
                    continue
                posting_id = id_match.group()
                if posting_id in postings:
                    # The same posting shows up under several search terms.
                    continue
                postings[posting_id] = {"title": link.text.strip(), "url": href,
                                        "source": "karriere.at", "id": posting_id}

        url_jobs = [posting["url"] for posting in postings.values()]
        soups, returned_urls = load_jobs_karriereat(url_jobs, driver=driver, close_driver=False, return_urls=True)

        for url, soup in zip(returned_urls, soups):
            # Every returned URL came from `postings`, so it contains an id.
            posting_id = id_pattern.search(url).group()
            description = soup.find("div", class_="m-jobContent__jobText m-jobContent__jobText--standalone")
            if description:
                description = description.text.strip()
            postings[posting_id]["description"] = description
    finally:
        # Release the browser even if a page load or parse step raised.
        if close_driver:
            driver.quit()
    return postings

In [27]:
# Run the full karriere.at scrape with default arguments: a fresh Firefox
# driver is started and quit again inside the call. The result maps posting
# id -> posting record.
postings = gather_data_karriereat()

A summary of what we have gathered:

In [39]:
# Collect the non-empty descriptions once instead of re-filtering the
# postings for every statistic.
descriptions = [posting["description"] for posting in postings.values() if posting.get("description")]
description_lengths = [len(text) for text in descriptions]
description_word_counts = [len(text.split()) for text in descriptions]

num_postings = len(postings)
num_postings_with_description = len(descriptions)
description_length_sum = np.sum(description_lengths, dtype=int)
description_words_sum = np.sum(description_word_counts, dtype=int)
# np.mean of an empty list is nan and emits a RuntimeWarning, so report 0.0
# when no posting has a description.
description_length_avg = np.mean(description_lengths) if descriptions else 0.0
description_words_avg = np.mean(description_word_counts) if descriptions else 0.0

print(f"Number of postings: {num_postings}")
print(f"Number of postings with description: {num_postings_with_description}")
print(f"Total number of characters: {description_length_sum}")
print(f"Average number of characters (excluding 0-length descriptions): {description_length_avg}")
print(f"Total number of words: {description_words_sum}")
print(f"Average number of words (excluding 0-length descriptions): {description_words_avg}")

Number of postings: 462
Number of postings with description: 460
Total number of characters: 1783171
Average number of characters (excluding 0-length descriptions): 3876.458695652174
Total number of words: 226275
Average number of words (excluding 0-length descriptions): 491.9021739130435


An example:

In [60]:
# Title fragments (lower-case) marking postings that should be dropped.
banned_words = ["manager", "management", "professor", "team leader", "teamleader", "teamleiter", "team leiter",
                "internship", "jurist", "lawyer", "auditor"]
# Keep a posting only when none of the banned fragments occurs anywhere in
# its lower-cased title (plain substring match).
postings_filtered = {
    posting_id: posting
    for posting_id, posting in postings.items()
    if all(word not in posting["title"].lower() for word in banned_words)
}
# Notebook display: (kept after filtering, total scraped).
len(postings_filtered), len(postings)

(352, 462)

In [53]:
# List the titles of all scraped postings (unfiltered) for a quick manual
# look; the bare name on the last line makes the notebook display the list.
titles = [posting["title"] for posting in postings.values()]
titles

['Consultant Technology Strategy & Advisory (all genders)',
 'IT-Consultant Artificial Intelligence (AI) (all genders)',
 'Conversational AI Specialist (all humans)',
 'Assistant Professor with tenure track to establish a Research Group for Artificial Intelligence / Machine Learning in the Life Sciences',
 'Games Analyst (f/m/d)',
 'Senior Full Stack Data Scientist_in',
 'Data Scientist_in',
 'DevOps Engineer (w/m/x)',
 'DevOps Engineer (w/m/x)',
 'KI-Experte für Softwarelösungen (m/w/d) - AI Solution Architect',
 'Teamleiter*in Data und Analytics',
 'Digitalisierungsexpert:in Energiewirtschaft mit Schwerpunkt Kurzfristprognose',
 'Data Scientist (m/w/d) - Data Engineer',
 'Data Engineer - Digital Services (m/f/d)',
 'Talent mit Unternehmergeist (m/w/d)',
 '(Senior) Consultant (w/m/d) ML und LLM Engineering',
 'IT Consultant (all genders) – Digitalisierung',
 'Consulting für PhysikerIn und MathematikerIn (m/w/d)',
 'Systemarchitekt*in Geoinformation',
 'IT-Netzwerkspezialist (w/m/x)',


In [44]:
# Look up one posting by its id to inspect the complete stored record.
postings["7379068"]

{'title': 'IT-Netzwerkspezialist (w/m/x)',
 'url': 'https://www.karriere.at/jobs/7379068',
 'source': 'karriere.at',
 'id': '7379068',
 'description': 'IT-Netzwerkspezialist (w/m/x)Internationales Umfeld#Cisco#Linux#CloudWien 30 - 40% Homeoffice EUR 3.500 - EUR 5.000 Vollzeit Job Nr. EPIN40651 smart bewerben Deine zukünftige RolleAls IT-Netzwerkspezialist übernimmst du die Verantwortung für die Konfiguration, Überwachung und Wartung der Netzwerksysteme auf Linux-Basis. Zu deinen Aufgaben gehören: Firewall-Management, regelmäßige Audits sowie Verwaltung der Firewall-Infrastruktur, um sicherzustellen, dass die Netzwerksicherheit stets auf dem neuesten Stand ist Netzwerkvisualisierung, Erstellung und Pflege von Netzwerk-Mappings, um die Netzwerkarchitektur zu optimieren Unterstützung vor Ort, damit alle Systeme reibungslos laufen und um bei technischen Herausforderungen schnell regieren zu können Entwicklung und Pflege von Shell-Skripten Innovative IT-Projekte wie Secure Edge Networking u

In [41]:
# Save data
from pathlib import Path

date_today = time.strftime("%Y-%m-%d")

# One file per day, named after the current date. open() does not create
# missing directories, so make sure the target folder exists first.
save_path = Path(f"source/save/postings_{date_today}.json")
save_path.parent.mkdir(parents=True, exist_ok=True)

with open(save_path, "w", encoding="utf-8") as f:
    json.dump(postings, f)

## TieTalent

**Work in progress**

In [79]:
# Reuse one session for all TieTalent requests and send a browser-like
# User-Agent header with them.
session = requests.Session()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Pre-encoded search URL: the `search` query parameter carries the position
# filters and the Vienna location filter.
url = 'https://tietalent.com/en/jobs?search=positions%5B0%5D%3DData_Engineer_5%26positions%5B1%5D%3DData_Analyst_36%26positions%5B2%5D%3DData_Scientist_37%26positions%5B3%5D%3DMachine_Learning_7%26positions%5B4%5D%3DBusiness_Intelligence_39%26positions%5B5%5D%3DNLP_14%26locations%5B0%5D%3DVienna_Vienna_Austria_304'
# requests never times out by default, so set a limit to avoid hanging the
# notebook on an unresponsive server.
response = session.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page later.
response.raise_for_status()

Here, no scrolling is needed, but we will also want to check pages 2, 3, ... of the results.

In [80]:
# Parse the raw response body of the TieTalent search page with
# BeautifulSoup's 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')