In [1]:
import requests
from bs4 import BeautifulSoup
import json
import os
from tqdm import tqdm
import time

In [2]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

# Specify the URL for scraping
url = 'https://www.lyrikline.org/de/gedichte?query=&onlynewoff=&lang[]=de&translatorname=999999&category[]=66'

In [3]:
def scrape_poem_links(url, site:int):
    
    # Set up Firefox options
    firefox_options = Options()
    firefox_options.add_argument("-headless")

    # Set up Firefox service
    webdriver_service = Service('geckodriver.exe')

    # Create a new instance of the Firefox driver
    driver = webdriver.Firefox(service=webdriver_service, options=firefox_options)

    # Load the webpage
    driver.get(url)

    # Wait for the dynamic content to load (you may need to adjust the waiting time based on the page)
    driver.implicitly_wait(5)

    
    # Get the page source after the content is loaded
    for i in range(site):
        try:
            next_button = driver.find_element(By.XPATH, '//a//span[contains(text(), "nächste")]')
            print(next_button)
            next_button.click()
            time.sleep(5)  # Wait for additional content to load
        except:
            break
    page_source = driver.page_source

    # Close the browser
    driver.quit()

    # Process the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find and extract the desired elements using BeautifulSoup
    # Example: Get the titles of the poems
    poem_links = []
    link_elements = soup.find_all('a', class_='row')
    
    for link in link_elements:
        href = link.get('href')
        poem_links.append(href)
    
    return poem_links

In [4]:
def scrape_poem(url, title):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    poem_div = soup.find('div', class_='gedicht-originaltext clearfix')
    poem_lines = [line.strip() for line in poem_div.stripped_strings]
    
    author_h1 = soup.find('h1', id='gedicht-autor')
    author_name = author_h1.find('a').text.strip() if author_h1 else ''
    
    poem_data = {
        'title': title,
        'gender': '',
        'author': author_name,
        'poem': {}
    }
    
    for idx, line in enumerate(poem_lines, start=1):
        poem_data['poem'][f'line.{idx}'] = {'text': line.strip()}
    
    return poem_data


def save_to_json(poem_data, file_name):
    with open(file_name, 'w', encoding='utf-8') as file:
        json.dump(poem_data, file, indent=4, ensure_ascii=False)


In [5]:
# Scrape the poem links
url = 'https://www.lyrikline.org/de/gedichte?query=&onlynewoff=&lang[]=de&translatorname=999999&category[]=66'
poem_links = []
site_number = 6
for i in range(site_number):
    poem_links += scrape_poem_links(url, i)


<selenium.webdriver.remote.webelement.WebElement (session="956ae522-2be3-44f7-9093-ee5151fbdaf8", element="8cc9c1f6-3321-4e38-828b-b03926f6b547")>
<selenium.webdriver.remote.webelement.WebElement (session="73a715aa-af1d-40c4-8e8f-8de36a3af872", element="8abbecce-b79a-40a6-9361-35fa007a3cba")>
<selenium.webdriver.remote.webelement.WebElement (session="73a715aa-af1d-40c4-8e8f-8de36a3af872", element="24c9bcb5-a181-4057-bd8e-0235f059d891")>
<selenium.webdriver.remote.webelement.WebElement (session="1c08206a-c965-4295-8dcc-237fc048c42f", element="d4c5724d-3775-4800-b991-493f0a1c4512")>
<selenium.webdriver.remote.webelement.WebElement (session="1c08206a-c965-4295-8dcc-237fc048c42f", element="0970bc01-e829-4803-9113-ab0986b5e2a8")>
<selenium.webdriver.remote.webelement.WebElement (session="1c08206a-c965-4295-8dcc-237fc048c42f", element="684e0b37-2efe-49af-bc03-e9a8cb8e2e84")>
<selenium.webdriver.remote.webelement.WebElement (session="79bbc596-e2ce-4b88-9862-592eda91169f", element="eff06802-7d

In [6]:
poem_links

['/de/gedichte/maervent-oiosis-11952',
 '/de/gedichte/alemannische-mangos-15731',
 '/de/gedichte/alles-was-wir-tun-ist-musik-john-cage-13426',
 '/de/gedichte/als-belgien-furchtbar-war-13241',
 '/de/gedichte/am-fenster-abends',
 '/de/gedichte/den-nachtblauen-falter-15511',
 '/de/gedichte/der-donau-2616',
 '/de/gedichte/anatomie-der-erinnerung-14148',
 '/de/gedichte/anbruch-13884',
 '/de/gedichte/angeln-vor-oland-15602',
 '/de/gedichte/antarktika-15729',
 '/de/gedichte/aus-duengerkind-i-15421',
 '/de/gedichte/aus-duengerkind-ii-15422',
 '/de/gedichte/aus-den-notizen-zu-einem-selbstbildnis-13237',
 '/de/gedichte/bei-uns-zuhaus-679',
 '/de/gedichte/benanntes-gefilde-14636',
 '/de/gedichte/brief-im-april-15724',
 '/de/gedichte/bukowina-i-545',
 '/de/gedichte/das-weltall-ist-ein-grosser-wald-dem-die-angst-keine-ohren-hat-13424',
 '/de/gedichte/der-pollenflug-15513',
 '/de/gedichte/der-schaefer-strickte-13928',
 '/de/gedichte/der-schiffsbaumeister-10090',
 '/de/gedichte/der-stille-grund-11236

In [7]:
# Create a "corpus" folder if it doesn't exist
corpus_folder = 'corpus_selenium'
os.makedirs(corpus_folder, exist_ok=True)

In [8]:
# Scrape and save the poems
for link in tqdm(poem_links, desc='Scraping poems', unit='poem'):
    title = link.split('/')[-1]
    poem_url = f'https://www.lyrikline.org{link}'
    poem_data = scrape_poem(poem_url, title)
    file_name = os.path.join(corpus_folder, f'{title}.json')
    save_to_json(poem_data, file_name)

Scraping poems: 100%|██████████| 118/118 [02:54<00:00,  1.48s/poem]
