In [25]:
import os
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from PIL import Image
from tqdm import tqdm

# Root of the scraped site; relative links found on its pages are appended to it.
base_url="https://www.lag-sb-rlp.de"
# CSV written by get_categories(): one row per gallery category (url, category).
category_csv = "../storage/category_urls.csv"
# CSV written by get_image_links(): one row per sample (category, title, url).
dataset_csv = "../storage/dataset.csv"
# Folder that receives one sub-folder per downloaded sample.
dataset_path = "../storage/dataset"
# Characters that clean_title() replaces with "_" before a title is used as a
# file or folder name. NOTE: this is the set reserved on Windows; on Linux only
# "/" (and the NUL byte) is actually forbidden in file names.
invalid_chars = '<>:"/\\|?*'

In [26]:
"""Fetch the url for each category from Leichte Sprache Bildergalerie"""
def get_categories():
    response = requests.get(base_url + "/projekte/bildergalerie-leichte-sprache")
    soup = BeautifulSoup(response.content, "html.parser")
    
    data = []
    category_links = {}
    categories_detail_div = soup.find("div", id="phocagallery-categories-detail")
    
    if categories_detail_div:
        for pg_field_div in categories_detail_div.find_all("div", class_="pg-legend"):
            a_tags = pg_field_div.find_all("a", href=True)        
            for a_tag in a_tags:
                category = a_tag.get_text(strip=True).split("<")[0]
                link = a_tag["href"]
                if link not in category_links:
                    category_links[link] = category
    
    for link, category in category_links.items():
        data.append({"url": base_url + link, "category": category})
    
    df = pd.DataFrame(data)
    df.to_csv(category_csv, index=False)
    
# Run the scrape now: this writes category_csv, which get_image_links() reads.
get_categories()

In [27]:
def scrape_category_links(csv_file):
    """Load the ``(url, category)`` pairs stored in a category CSV.

    Args:
        csv_file: Path of a CSV file that has the columns ``url`` and ``category``.

    Returns:
        A list with one ``(url, category)`` tuple per CSV row, in file order.
    """
    table = pd.read_csv(csv_file)
    url_column = table["url"]
    category_column = table["category"]
    return [(url, category) for url, category in zip(url_column, category_column)]

def scrape_elements_in_category(category_link, category):
    """Collect the title and detail-page URL of every sample in one category.

    The category is read page by page: the first request uses ``category_link``
    as is, later requests add a ``start`` query parameter that advances in
    steps of 50. Paging stops when a page has no ``#pg-msnr-container``, no
    item boxes, or fewer than 50 item boxes.

    Args:
        category_link: URL of the category's first page.
        category: Category name stored with every collected sample.

    Returns:
        A list of dicts with the keys ``category``, ``title`` and ``url``.

    Raises:
        requests.RequestException: On connection problems or a timeout.
    """
    page_size = 50  # step of the "start" parameter and the "full page" threshold
    # Do not produce a second "?" when the category URL already has a query string.
    separator = "&" if "?" in category_link else "?"

    elements = []
    page = 0
    while True:
        if page > 0:
            page_link = f"{category_link}{separator}start={page * page_size}"
        else:
            page_link = category_link
        response = requests.get(page_link, timeout=30)
        soup = BeautifulSoup(response.content, "html.parser")

        container = soup.find("div", id="pg-msnr-container")
        if not container:
            break

        items = container.find_all("div", class_="pg-cv-box item pg-grid-sizer")
        if not items:
            break

        for item in items:
            img_div = item.find("div", class_="pg-cv-box-img pg-box1")
            # Look up the caption in two steps so that a missing wrapper div
            # skips the item instead of raising AttributeError.
            bottom_div = item.find("div", class_="pg-box-img-bottom")
            title_div = bottom_div.find("div", class_="pg-cv-name") if bottom_div else None

            if not img_div or not title_div:
                continue

            # Tiles whose image is labelled "Zurück" (German for "back") are skipped;
            # presumably these are navigation tiles rather than samples.
            img_tag = img_div.find("img")
            if img_tag and img_tag.get("alt") == "Zurück":
                continue

            link_tag = img_div.find("a", href=True)
            if not link_tag:
                # Without a detail-page link there is nothing to download later.
                continue

            elements.append({
                "category": category,
                "title": title_div.text.strip(),
                "url": urljoin(base_url, link_tag["href"]),
            })

        # A page that is not full is the last one.
        if len(items) < page_size:
            break

        page += 1

    return elements

"""Get each sample from each category with title and url."""
def get_image_links():
    categories = scrape_category_links(category_csv)
    all_elements = []
    
    for link, category in categories:
        elements = scrape_elements_in_category(link, category)
        all_elements.extend(elements)
    
    df = pd.DataFrame(all_elements)
    df.to_csv(dataset_csv, index=False)

# Build dataset_csv from the categories listed in category_csv.
get_image_links()

In [28]:
def clean_title(title, chars=None, replacement="_"):
    """Make a title usable as a file or folder name.

    Args:
        title: Raw title string.
        chars: Characters to replace. Defaults to the module-level
            ``invalid_chars``.
        replacement: Text substituted for each replaced character.

    Returns:
        ``title`` with every character contained in ``chars`` replaced by
        ``replacement``; all other characters are kept unchanged.
    """
    if chars is None:
        chars = invalid_chars
    # One translation table handles all characters in a single pass.
    return title.translate(str.maketrans({char: replacement for char in chars}))

"""create .png file for each sample image and .txt file for each image description """
def download_images_and_update_csv():
    df = pd.read_csv(dataset_csv)
    if not os.path.exists(dataset_path):
        os.makedirs(dataset_path)
    
    total_elements = len(df)
    progress_bar = tqdm(total=total_elements, desc="articles scraped", unit="article")

    for index, row in df.iterrows():
        raw_title = row["title"]
        link = row["url"]        
        title = clean_title(raw_title)
        
        folder_name = os.path.join(dataset_path, title)
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
        
        response = requests.get(link)
        soup = BeautifulSoup(response.content, "html.parser")
        
        image_box = soup.find("div", id="phocaGalleryImageBox")
        if image_box:
            img_tag = image_box.find("img")
            if img_tag and "src" in img_tag.attrs:
                image_url = base_url + img_tag["src"]
                
                img_response = requests.get(image_url)
                img_path = os.path.join(folder_name, title + ".png")
                with open(img_path, "wb") as img_file:
                    img_file.write(img_response.content)
        
        text_box = soup.find("td", class_="pg-dv-desc no-popup")
        if text_box:
            p_tag = text_box.find("p")
            if p_tag:
                text_content = p_tag.get_text(strip=True)
                # clean image description and save it in csv
                text_path = os.path.join(folder_name, title + ".txt")
                with open(text_path, "w", encoding="utf-8") as text_file:
                    text_file.write(text_content.replace("Download", "").replace("\u00A0", " "))
        
        progress_bar.update(1)
    
    progress_bar.close()
    
    print(f"download completed")

# Download the image and description of every sample listed in dataset_csv into
# dataset_path (the recorded run below took about three minutes for 414 samples).
download_images_and_update_csv()

articles scraped: 100%|██████████| 414/414 [03:10<00:00,  2.17article/s]

download completed





In [29]:
"""For further investigation of the dataset, you can add the image resolution and description to dataset.csv """
def add_description_and_resolution():
    df = pd.read_csv(dataset_csv)    
    df["description"] = None
    df["resolution"] = None
    
    for i, row in df.iterrows():
        titel = clean_title(row["title"])
        txt_file_path = None
        img_file_path = None
        
        for root, dirs, files in os.walk(dataset_path):
            if titel in dirs:
                txt_file_path = os.path.join(root, titel, f"{titel}.txt")
                img_file_path = os.path.join(root, titel, f"{titel}.png")
                break
        
        if txt_file_path and os.path.isfile(txt_file_path):
            with open(txt_file_path, "r", encoding="utf-8") as file:
                description = file.read().strip()
                df.at[i, "description"] = description
        
        if img_file_path and os.path.isfile(img_file_path):
            try:
                with Image.open(img_file_path) as img:
                    resolution = f"{img.width}x{img.height}"
                    df.at[i, "resolution"] = resolution
            except Exception as e:
                print(f"could not read image resolution for {img_file_path}: {e}")
    
    df.to_csv(dataset_csv, index=False)

# Rewrite dataset_csv with the added "description" and "resolution" columns.
add_description_and_resolution()