In [1]:
import requests
from bs4 import BeautifulSoup
import json
import os
from tqdm import tqdm
import time
import re

In [2]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

In [3]:
def scrape_poem_links(url, site:int):
    
    # Set up Firefox options
    firefox_options = Options()
    firefox_options.add_argument("-headless")

    # Set up Firefox service
    webdriver_service = Service('geckodriver.exe')

    # Create a new instance of the Firefox driver
    driver = webdriver.Firefox(options=firefox_options)

    # Load the webpage
    driver.get(url)

    # Wait for the dynamic content to load (you may need to adjust the waiting time based on the page)
    driver.implicitly_wait(5)

    
    # Get the page source after the content is loaded
    for i in range(site):
        try:
            next_button = driver.find_element(By.XPATH, '//a//span[contains(text(), "nächste")]')
            print(next_button)
            next_button.click()
            time.sleep(5)  # Wait for additional content to load
        except:
            break
    page_source = driver.page_source

    # Close the browser
    driver.quit()

    # Process the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find and extract the desired elements using BeautifulSoup
    # Example: Get the titles of the poems
    poem_links = []
    link_elements = soup.find_all('a', class_='row')
    
    for link in link_elements:
        href = link.get('href')
        poem_links.append(href)
    
    return poem_links

In [4]:
def scrape_poem(url, title, gender):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    poem_div = soup.find('div', class_='gedicht-originaltext clearfix')
    poem_lines = [line.strip() for line in poem_div.stripped_strings]
    
    author_h1 = soup.find('h1', id='gedicht-autor')
    author_name = author_h1.find('a').text.strip() if author_h1 else ''

    categories_table = soup.find('table', class_='kat')
    categories_links = categories_table.find_all('a')
    categories = [link.text.strip() for link in categories_links]
    
    poem_data = {
        'title': title,
        'categories': categories,
        'gender': gender,
        'author': author_name,
        'poem': {}
    }
    
    for idx, line in enumerate(poem_lines, start=1):

        filtered_line = re.sub(r'[^\x00-\x7FäöüÄÖÜß]+', '', line)
        filtered_line = filtered_line.strip()

        poem_data['poem'][f'line.{idx}'] = {'text': filtered_line}
    
    return poem_data


def save_to_json(poem_data, file_name):
    with open(file_name, 'w', encoding='utf-8') as file:
        json.dump(poem_data, file, indent=4, ensure_ascii=False)


In [6]:
# Create a "corpus" folder if it doesn't exist
corpus_folder = f'corpus_finished'
os.makedirs(corpus_folder, exist_ok=True)

# Define the gender, category, and site combinations
combinations = [
    ('m', 66, 4),
    ('w', 66, 3),
    ('m', 29, 4),
    ('w', 29, 3)
]

# Scrape poem links and save poems for each combination
for gender, category, sites in combinations:
    url = f'https://www.lyrikline.org/de/gedichte?query=&onlynewoff=&lang[]=de&translatorname=999999&category[]={category}&geschlecht[]={gender}'
    poem_links =[]
    for s in range(sites):
        poem_links += scrape_poem_links(url, s)

    # Scrape and save the poems
    for link in tqdm(poem_links, desc=f'Scraping poems for {gender}, {category}', unit='poem'):
        title = link.split('/')[-1]
        poem_url = f'https://www.lyrikline.org{link}'
        poem_data = scrape_poem(poem_url, title, gender)
        file_name = os.path.join(corpus_folder, f'{title}.json')
        save_to_json(poem_data, file_name)


<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="d9794976-0277-4c4d-a260-98b6acbbd0c0", element="af46de62-682e-4cbc-908e-ab78c3b0603a")>
<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="d1dbedb9-cfce-416f-9718-6d6bc2da9411", element="3e329b9d-1f42-40f2-927c-5e39ee79ee23")>
<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="d1dbedb9-cfce-416f-9718-6d6bc2da9411", element="7ab545ed-b709-462d-a4cd-28afae35d20a")>


['/de/gedichte/maervent-oiosis-11952',
 '/de/gedichte/alles-was-wir-tun-ist-musik-john-cage-13426',
 '/de/gedichte/als-belgien-furchtbar-war-13241',
 '/de/gedichte/den-nachtblauen-falter-15511',
 '/de/gedichte/der-donau-2616',
 '/de/gedichte/anatomie-der-erinnerung-14148',
 '/de/gedichte/angeln-vor-oland-15602',
 '/de/gedichte/aus-den-notizen-zu-einem-selbstbildnis-13237',
 '/de/gedichte/bei-uns-zuhaus-679',
 '/de/gedichte/das-weltall-ist-ein-grosser-wald-dem-die-angst-keine-ohren-hat-13424',
 '/de/gedichte/der-pollenflug-15513',
 '/de/gedichte/der-schiffsbaumeister-10090',
 '/de/gedichte/der-stille-grund-11236',
 '/de/gedichte/der-zettel-11681',
 '/de/gedichte/die-horizontbaeume-752',
 '/de/gedichte/die-maulwuerfe-14378',
 '/de/gedichte/doch-deine-augen-halten-fest-11240',
 '/de/gedichte/drei-schwestern-10078',
 '/de/gedichte/du-bist-eine-lilie-15524',
 '/de/gedichte/echo-11237',
 '/de/gedichte/maervent-oiosis-11952',
 '/de/gedichte/alles-was-wir-tun-ist-musik-john-cage-13426',
 '/de/