# Web Scraper: Occupation Data from www.berufsberatung.ch

Note that the HTML content in 'berufsberatung_occupations_de.html' was saved manually from https://www.berufsberatung.ch/dyn/show/1893.  
Alternatively, the page can be retrieved programmatically with Python and Selenium (ChromeDriver).  

## Libraries and settings

In [1]:
# Libraries
import html
import json
import os
import re
import time
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Absolute project directory; data files are addressed relative to it
path = '/home/ec2-user/SageMaker/career_counseling_chatbot'

# Silence all Python warnings for the rest of the session
import warnings
warnings.filterwarnings(action="ignore")

# Report where the kernel is running
working_dir = os.getcwd()
print("Current working directory:", working_dir)

Current working directory: /home/ec2-user/SageMaker/career_counseling_chatbot/notebooks


## Extract occupation URLs 

In [2]:
# Function to extract occupation links from already-loaded HTML content
def extract_occupation_urls(html_content):
    """Extract occupation URLs from HTML content.

    Collects every anchor tagged with the CSS class 'ajaxUpadteHrefIdx' and
    turns it into a record with the occupation ID, display name and absolute
    URL. Anchors that carry no 'href' attribute are skipped.

    Args:
        html_content: HTML content as string

    Returns:
        List of dictionaries with the keys
        'id'   - digits following 'id=' in the link (string), or None when the
                 link has no such parameter,
        'name' - link text with surrounding whitespace removed,
        'url'  - site base URL followed by the link target.
    """
    # Create a BeautifulSoup object from the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # NOTE(review): the class name spelling ('Upadte') is kept exactly as in the
    # original selector - presumably it mirrors the site's markup; verify before
    # "correcting" it.
    links = soup.find_all('a', class_='ajaxUpadteHrefIdx')

    base_url = "https://www.berufsberatung.ch"
    occupation_data = []

    for link in links:
        # An anchor without href cannot be turned into a URL; skip it instead of
        # aborting the whole extraction with a KeyError
        raw_href = link.get('href')
        if raw_href is None:
            continue

        # Get the relative URL and decode HTML entities
        relative_url = html.unescape(raw_href)

        # Extract the occupation ID from the URL
        id_match = re.search(r'id=(\d+)', relative_url)
        occupation_id = id_match.group(1) if id_match else None

        # Store the data
        occupation_data.append({
            'id': occupation_id,
            'name': link.text.strip(),
            'url': f"{base_url}{relative_url}"
        })

    return occupation_data


# Load the saved occupation index page from disk
html_file = path + '/data/processed/berufsberatung_occupations_de.html'
with open(html_file, mode='r', encoding='utf-8') as fh:
    html_content = fh.read()


# Turn the page into one record per occupation and tabulate the records
occupation_data = extract_occupation_urls(html_content)
df = pd.DataFrame(occupation_data)

# Save data to CSV
# df.to_csv('../../data/processed/berufsberatung_occupations_de.csv', index=False)

# Show the first rows of the data frame
print("Occupation DataFrame:")
df.head()


Occupation DataFrame:


Unnamed: 0,id,name,url
0,9946,Abdichter/in EFZ,https://www.berufsberatung.ch/dyn/show/1900?la...
1,9951,Abdichtungspraktiker/in EBA,https://www.berufsberatung.ch/dyn/show/1900?la...
2,8198,Abklärer/in IV,https://www.berufsberatung.ch/dyn/show/1900?la...
3,11528,Abteilungsleiter/in (alle Branchen),https://www.berufsberatung.ch/dyn/show/1900?la...
4,11279,Active Sourcer / Social Recruiter,https://www.berufsberatung.ch/dyn/show/1900?la...


## Extract data from single occupation

In [3]:
# Read html content from url
url = "https://www.berufsberatung.ch/dyn/show/1900?lang=de&idx=10000&id=9946"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops on HTTP errors instead of parsing an error page
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract title and description from id="roof-top". Each element is looked up
# on its own so that a page lacking one of them yields 'Not found' instead of
# an AttributeError on None.
roof_top = soup.find('div', id='roof-top')
title_tag = roof_top.find('h1') if roof_top is not None else None
lead_tag = roof_top.find('p', class_='lead') if roof_top is not None else None
title = title_tag.get_text(strip=True) if title_tag is not None else 'Not found'
description = lead_tag.get_text(strip=True) if lead_tag is not None else 'Not found'

# Extract initial categories: the <dt>/<dd> pairs of the first definition list
# inside id="roof-bottom" become key/value entries
categories = {}
roof_bottom = soup.find('div', id='roof-bottom')
if roof_bottom:
    dl = roof_bottom.find('dl')
    if dl:
        for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
            categories[dt.get_text(strip=True)] = dd.get_text(separator=' ', strip=True)

# Helper that pulls the text of one titled section out of the parsed page
def extract_section(header_text):
    """Return the text of the content box that follows a given <h2> heading.

    The heading is searched in the module-level ``soup`` object.

    Args:
        header_text: Text the <h2> element has to match.

    Returns:
        The text of the 'boxContent' div that follows the heading's parent,
        with fragments separated by single spaces; an empty string when the
        heading exists but no such div follows; 'Not found' when no matching
        <h2> exists.
    """
    heading = soup.find('h2', string=header_text)
    if not heading:
        return 'Not found'

    box = heading.parent.find_next_sibling('div', class_='boxContent')
    if not box:
        return ''
    return box.get_text(separator=' ', strip=True)

# Headings of the page sections whose text is stored under the same key
additional_sections = ["Tätigkeiten", "Ausbildung", "Voraussetzungen", "Weiterbildung", "Berufsverhältnisse", "Weitere Informationen"]
for section in additional_sections:
    categories[section] = extract_section(section)

# Extract related occupations with their IDs
related_occupations = []
related_section = soup.find('ul', class_='arrow has-i')
if related_section:
    for li in related_section.find_all('li'):
        # Find the anchor tag inside the list item
        anchor = li.find('a')
        if anchor:
            # Extract the occupation name
            related_name = anchor.get_text(strip=True)

            # Extract the occupation ID from the href attribute using regex.
            # Fall back to '' so an anchor without href is simply skipped below
            # instead of making re.search() fail with a TypeError on None.
            href = anchor.get('href') or ''
            id_match = re.search(r'id=(\d+)', href)
            related_id = id_match.group(1) if id_match else None

            if related_id:
                # Store both name and ID
                related_occupations.append({
                    "id": related_id,
                    "name": related_name
                })

# Combine extracted data
extracted_data = {
    'URL': url,
    'Date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    'Title': title,
    'Description': description,
    **categories,
    'Related': related_occupations  # Structured data: list of {"id", "name"} dicts
}

# Print the extracted data, one key/value block per field
for key, value in extracted_data.items():
    print(f"{key}:\n{value}\n{'-'*50}")


URL:
https://www.berufsberatung.ch/dyn/show/1900?lang=de&idx=10000&id=9946
--------------------------------------------------
Date:
2025-04-15 06:18:27
--------------------------------------------------
Title:
Abdichter/in EFZ
--------------------------------------------------
Description:
Abdichterinnen und Abdichter isolieren Flachdächer, Vordächer, Terrassen und Keller und dichten sie ab. Sie bringen Verkleidungen an Neubauten oder renovierten Gebäuden an. Ausserdem reparieren sie kaputte Abdichtungen und halten sie instand.
--------------------------------------------------
Bildungstypen:
Grundbildung (Lehre)
--------------------------------------------------
Berufsfelder:
Bau
--------------------------------------------------
Branchen:
Bau - Planung, Hoch- und Tiefbau
--------------------------------------------------
Swissdoc:
0.430.45.0
--------------------------------------------------
Tätigkeiten:
Sie üben folgende Tätigkeiten aus: Arbeiten vorbereiten und Baustelle einrichten

## Extract data from all occupations

In [None]:
# Create a list to store all occupation data
all_occupation_data = []

# Set up progress tracking
total_urls = len(df)
print(f"Starting to process {total_urls} URLs ...")

# File to save all data
output_file = path + '/data/processed/berufsberatung_occupations_de.json'

# Resume from existing data if available
processed_urls = set()
if os.path.exists(output_file):
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            previous_records = json.load(f)

        # Records written by the error handler of the scraping loop carry an
        # 'Error' key. They are dropped here so that the affected URLs are
        # retried instead of being treated as done forever.
        all_occupation_data = [item for item in previous_records if 'Error' not in item]
        processed_urls = {item['URL'] for item in all_occupation_data}
        print(f"Loaded {len(all_occupation_data)} previously processed occupations")

        failed_count = len(previous_records) - len(all_occupation_data)
        if failed_count:
            print(f"Retrying {failed_count} previously failed occupations")
    except Exception as e:
        print(f"Error loading existing data: {e}")
        # Reset both containers together; keeping loaded records while the URL
        # set is empty would make the loop scrape and append everything again
        all_occupation_data = []
        processed_urls = set()

# Headings of the page sections that are scraped for every occupation
additional_sections = ["Tätigkeiten", "Ausbildung", "Voraussetzungen", "Weiterbildung", "Berufsverhältnisse", "Weitere Informationen"]


# Function to extract sections from h2 headers. It is defined once, outside the
# loop, and reads the module-level `soup`, which the loop rebinds for every page.
def extract_section(header_text):
    """Return the text of the 'boxContent' div that follows an <h2> heading.

    Args:
        header_text: Text the <h2> element has to match.

    Returns:
        The section text, '' when the heading has no following content box,
        or 'Not found' when the heading does not exist on the page.
    """
    header = soup.find('h2', string=header_text)
    if header:
        content = []
        sibling = header.parent.find_next_sibling('div', class_='boxContent')
        if sibling:
            content.append(sibling.get_text(separator=' ', strip=True))
        return ' '.join(content)
    return 'Not found'


# Loop through each occupation URL in df['url']
for index, row in df.iterrows():
    url = row['url']
    occupation_id = row['id']
    name = row['name']

    # Skip already processed URLs
    if url in processed_urls:
        print(f"Skipping already processed: {name}")
        continue

    # Print progress
    print(f"Processing {index+1}/{total_urls}: {name} (ID: {occupation_id})")

    try:
        # Read html content from url; the timeout prevents one stalled
        # connection from blocking the whole run
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # Raise exception for HTTP errors
        html_content = response.content

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract title and description from id="roof-top"
        roof_top = soup.find('div', id='roof-top')
        title = roof_top.find('h1').get_text(strip=True) if roof_top and roof_top.find('h1') else 'Not found'
        description = roof_top.find('p', class_='lead').get_text(strip=True) if roof_top and roof_top.find('p', class_='lead') else 'Not found'

        # Extract initial categories
        categories = {}
        roof_bottom = soup.find('div', id='roof-bottom')
        if roof_bottom:
            dl = roof_bottom.find('dl')
            if dl:
                for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
                    categories[dt.get_text(strip=True)] = dd.get_text(separator=' ', strip=True)

        for section in additional_sections:
            categories[section] = extract_section(section)

        # Extract related occupations with their IDs.
        # The per-link values use their own names (related_id / related_name):
        # reusing `occupation_id` here would overwrite the ID of the occupation
        # being processed before it is stored under 'ID' below.
        related_occupations = []
        related_section = soup.find('ul', class_='arrow has-i')
        if related_section:
            for li in related_section.find_all('li'):
                # Find the anchor tag inside the list item
                anchor = li.find('a')
                if anchor:
                    # Extract the occupation name
                    related_name = anchor.get_text(strip=True)

                    # Extract the occupation ID from the href attribute using
                    # regex; '' stands in for a missing href so the link is
                    # skipped rather than raising a TypeError
                    href = anchor.get('href') or ''
                    id_match = re.search(r'id=(\d+)', href)
                    related_id = id_match.group(1) if id_match else None

                    if related_id:
                        # Store both name and ID
                        related_occupations.append({
                            "id": related_id,
                            "name": related_name
                        })

        # Combine extracted data
        extracted_data = {
            'ID': occupation_id,
            'Name': name,
            'URL': url,
            'Date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'Title': title,
            'Description': description,
            **categories,
            'Related': related_occupations
        }

        # Add to our list of all occupation data
        all_occupation_data.append(extracted_data)

        # Save intermediate results every 10 occupations
        if len(all_occupation_data) % 10 == 0:
            with open(output_file, 'w', encoding='utf-8') as json_file:
                json.dump(all_occupation_data, json_file, ensure_ascii=False, indent=4)
            print(f"Saved intermediate results ({len(all_occupation_data)} occupations)")

        # Brief pause to avoid overwhelming the server
        # (requires `import time` in the notebook's import cell)
        time.sleep(1)

    except Exception as e:
        print(f"Error processing {name} (URL: {url}): {str(e)}")
        # Save error information
        error_data = {
            'ID': occupation_id,
            'Name': name,
            'URL': url,
            'Date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'Error': str(e)
        }
        all_occupation_data.append(error_data)

# Write the complete result list to the JSON file once the loop has finished
with open(output_file, mode='w', encoding='utf-8') as out_fh:
    json.dump(all_occupation_data, out_fh, ensure_ascii=False, indent=4)

# Report how many records were written and where
print(f"Completed processing {len(all_occupation_data)} occupations")
print(f"Results saved to {output_file}")

## Summary statistics

In [None]:
# Load the scraped occupation records from the JSON file into a DataFrame
json_path = path + '/data/processed/berufsberatung_occupations_de.json'
df = pd.read_json(json_path, encoding='utf-8')

# Print a summary of the DataFrame
df.info()

## Bildungstypen

In [None]:
# Count how often each distinct 'Bildungstypen' value occurs
bildungstyp_column = df.loc[:, 'Bildungstypen']
bildungstyp_counts = bildungstyp_column.value_counts()
bildungstyp_counts


## Berufsfelder

In [None]:
# Count how often each distinct 'Berufsfelder' value occurs
berufsfeld_counts = df['Berufsfelder'].value_counts()

# Print one "<Berufsfeld>: <count>" line per distinct value
for berufsfeld, count in zip(berufsfeld_counts.index, berufsfeld_counts):
    print(f"{berufsfeld}: {count}")


## Branchen

In [None]:
# Count how often each distinct 'Branchen' value occurs
branchen_counts = df['Branchen'].value_counts()

# Print one "<Branche>: <count>" line per distinct value
for branche, count in zip(branchen_counts.index, branchen_counts):
    print(f"{branche}: {count}")

## Related occupations

In [None]:
# Number of related occupations per row; anything that is not a list counts as 0
related_counts = df['Related'].apply(
    lambda related: len(related) if isinstance(related, list) else 0
)

# How many occupations have 0, 1, 2, ... related occupations
count_distribution = related_counts.value_counts().sort_index()

# Draw the distribution as a bar chart on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(7, 4))
count_distribution.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('Number of Occupations by Related Occupation Count')
ax.set_xlabel('Number of Related Occupations')
ax.set_ylabel('Count of Occupations')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.setp(ax.get_xticklabels(), rotation=0)  # Make x-axis labels horizontal
fig.tight_layout()
plt.show()

In [None]:
# Pair every occupation name with its number of related occupations and order
# the rows from most to fewest
occupation_related_counts = (
    pd.DataFrame({
        'Name': df['Name'],
        'Related_Count': df['Related'].apply(
            lambda related: len(related) if isinstance(related, list) else 0
        ),
    })
    .sort_values(by='Related_Count', ascending=False)
)

# Show the 20 occupations with the most related occupations
occupation_related_counts.head(20)
