In [8]:
#!pip install curl_cffi
import pandas as pd
import plotly as pl
from bs4 import BeautifulSoup
from curl_cffi import requests as cureq
from datetime import datetime, timedelta

url = "data/combined_job_offers.csv"
url2 = "data/combined_job_offers3.csv"

df1 = pd.read_csv(url)
df2 = pd.read_csv(url2)

df3 = pd.concat([df1,df2],axis=0)

pd.set_option('display.max_columns', None)

In [9]:
# cleaning
# 1 dropping not needed columns
df3.drop(columns =['repost_date', 'email', 'job_desc'], inplace=True)
# 2 Renaming 'link'
df3.rename(columns={'link': 'source'}, inplace=True)
# 3 replace all links with LinkedIn
# apply lambda for each cell replace all string
df3['source'] = df3['source'].apply(lambda x: 'LinkedIn')# 1 drop columns df3

In [10]:
def scrape_stepstone(job_title, page, language, worktime, sector):
    
    lang_filter = f"&action=facet_selected%3bdetectedLanguages%3bde&fdl={language}" # must be 'de' or 'en'
    worktime_filter = f"&action=facet_selected%3bworktypes%3b8000{worktime}" # must be '1' for fulltime or '2' for parttime
    sector_filter = f"&action=facet_selected%3bsectors%3b21000&se={sector}" # 21000 for 'it & internet', 23000 for 'bwl/business' 15000 for 'retail', 19001 for 'bank', 19002 for 'finance'  
    if worktime == "1":
        test = "vollzeit"
    elif worktime == "2":
        test = "teilzeit"
    # compare number of existing pages to user entries
    #url = f"https://www.stepstone.de/jobs/{job_title}/in-berlin?radius=10&page={page}{lang_filter}{worktime_filter}{worktime_filter}" # job_title must be tile with '-' separator
    
    url = f"https://www.stepstone.de/jobs/{test}/{job_title}/in-berlin?radius=10&action=facet_selected%3bworktypes%3b80001&fdl={language}&se={sector}"
    
    response = cureq.get(url, impersonate='chrome')
    soup = BeautifulSoup(response.content, "html.parser")
    return soup

def handle_date(stepstone_post_date):
    sliced = stepstone_post_date.split(" ", 2)
    date_number = int(sliced[1])
    time_format = sliced[2]
    date = datetime.now()
    
    match time_format:
        case "Stunden" | "Stunde":
            date = date - timedelta(hours=date_number)
        case "Tagen" | "Tag":
            date = date - timedelta(days=date_number)
        case "Wochen" | "Woche":
            date = date - timedelta(weeks=date_number)
        case "Monate" | "Monat":
            date = date - timedelta(months=date_number)
        case _:
           date = date
    return date

In [11]:
# scrape for testing purpose
test = scrape_stepstone('data-analyst', '2', 'de', '2', '21000')

In [12]:
# get all job listings
itest = test.find_all(['article'], attrs={"class": 'res-1p8f8en'})

# scrape job titles
job_title_list = []
for item in itest:
    job_title_list.append(item.find(['div'], attrs={'class': 'res-nehv70'}).get_text())

# scrape company names
company_name_list = []
for item in itest:
    company_name_list.append(item.find(['span'], attrs={'class': 'res-btchsq'}).get_text())

# scrape post date
post_date_list = []
for item in itest:
    post_date_list.append(handle_date(item.find(['time']).get_text()))

# scrape salary - only possible with login
salary_list = []
for item in itest:
    item.find_all(['div'], attrs={'class': 'res-lgmafx'})[0]
    #salary_list.append(item.find(['span'], attrs={'class': 'res-1fad2gj'}).get_text())

# scrape remote 
remote_list = []
for item in itest:
    result = item.find_all(['div'], attrs={'class': 'res-lgmafx'})
    remote = result[0].find(['span'], attrs={'class': 'res-1qh7elo'})
    try:
        remote_exists = remote.find(['span'], attrs={'class': 'res-btchsq'}).get_text()
        remote_list.append(remote_exists)
    except AttributeError:
        remote_exits = "on-site"
        remote_list.append(remote_exists)
        
    
print(len(job_title_list))
print(len(remote_list))
print(company_name_list)
remote_list

25
25
['Sopra Steria', 'HMS Analytical Software GmbH', 'HMS Analytical Software GmbH', 'Atruvia AG', 'FUNKE Works GmbH', 'inovex GmbH', 'Atruvia AG', 'ParshipMeet Group', 'inovex GmbH', 'Atruvia AG', 'BridgingIT GmbH', 'BridgingIT GmbH', 'NTT DATA Business Solutions AG', 'NTT DATA Business Solutions AG', 'NTT DATA Business Solutions AG', 'NTT DATA Business Solutions AG', 'NTT DATA Business Solutions AG', 'NTT DATA Business Solutions AG', 'NTT DATA Business Solutions AG', 'ecratum GmbH', 'NTT DATA Business Solutions AG', 'NTT DATA Business Solutions AG', 'NTT DATA Business Solutions AG', 'HMS Analytical Software GmbH', 'LIQID Investments GmbH']


['Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office',
 'Teilweise Home-Office']

In [63]:
#1 stepstone wrangling: created stepstone_df 
column_dict = {"job_title": job_title_list, "company_name":company_name_list, "post_date":post_date_list, "job_remote": remote_list}
stepstone_df = pd.DataFrame(column_dict)

#2 stepstone wrangling: extract_job_type

import re
job_level = []
def extract_job_type(title):
    keywords = ['Senior', 'Werkstudent', 'Junior', 'Consultant', 'Developer', 'Mid', 'Intern', 'Lead', 'Manager', 'Internship', 'Entry level','Director', 'Mid-Senior']
    for keyword in keywords:
        if re.search (r'\b' + re.escape(keyword) + r'\b', title, re.IGNORECASE):
            return keyword
    return 'Unknown'

stepstone_df['job_level'] = stepstone_df['job_title'].apply(extract_job_type)

In [62]:
# 3 linkedin wrangling:extract_job_type for NaN with keywords and 'Unknown'
import re
df3['job_level']
def replace_nan_with_keywords(row):
    keywords = ['Senior', 'Mid-Senior level', 'Associate', 'Entry', 'Entry level','Werkstudent', 'Junior', 'Consultant', 'Developer', 'Mid', 'Intern', 'Lead', 'Manager', 'Internship', 'Entry level','Director', 'Mid-Senior']
    
    if pd.isna(row['job_level']):
        for keyword in keywords:
            if re.search(r'\b' + re.escape(keyword) + r'\b', row['job_title'], re.IGNORECASE):
                return keyword
        return 'Unknown'
    else:
        return row['job_level']

df3['job_level'] = df3.apply(replace_nan_with_keywords, axis=1)

In [71]:
# Possible API for company size by employee count

import requests
from bs4 import BeautifulSoup

# URL of the webpage
url = 'https://www.destatis.de/EN/Themes/Economic-Sectors-Enterprises/Enterprises/Small-Sized-Enterprises-Medium-Sized-Enterprises/ExplanatorySME.html'

# Send a request to fetch the HTML of the page
response = requests.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Example of finding a table (assuming you're targeting specific tables)
# You may need to adjust the selectors according to the actual structure
tables = soup.find_all('table')

# Depending on the structure of the tables, you may want to inspect manually and adjust
# Assuming we're interested in the first table
if tables:
    table = tables[0]  # Or use tables[x] if not the first

    # Extract table rows
    rows = table.find_all('tr')
    for row in rows:
        # Extract the columns (elements) in each row
        cols = row.find_all('td')
        size_class = cols[0].get_text(strip=True) if len(cols) > 0 else ''
        persons_employed = cols[1].get_text(strip=True) if len(cols) > 1 else ''
        # annual_turnover = cols[2].get_text(strip=True) if len(cols) > 2 else ''
        
        # If looking for specific size classes
        if size_class in ["Micro-enterprises", "Small enterprises", "Medium-sized enterprises"]:
            print(f"Size Class: {size_class}, Persons Employed: {persons_employed}") #, Annual Turnover: {annual_turnover}


Size Class: Micro-enterprises, Persons Employed: Up to 9


username = ""
password = ""
credentials = {"username": username, "password": password}
response = cureq.get(url, data = credentials, impersonate='chrome')