In [14]:
import requests
from bs4 import BeautifulSoup
import json
import os
from tqdm import tqdm
import time
import re

In [15]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

In [16]:
def scrape_poem_links(url, site:int):
    
    # Set up Firefox options
    firefox_options = Options()
    firefox_options.add_argument("-headless")

    # Set up Firefox service
    webdriver_service = Service('geckodriver.exe')

    # Create a new instance of the Firefox driver
    driver = webdriver.Firefox(options=firefox_options)

    # Load the webpage
    driver.get(url)

    # Wait for the dynamic content to load (you may need to adjust the waiting time based on the page)
    driver.implicitly_wait(5)

    
    # Get the page source after the content is loaded
    for i in range(site):
        try:
            next_button = driver.find_element(By.XPATH, '//a//span[contains(text(), "nächste")]')
            print(next_button)
            next_button.click()
            time.sleep(5)  # Wait for additional content to load
        except:
            break
    page_source = driver.page_source

    # Close the browser
    driver.quit()

    # Process the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find and extract the desired elements using BeautifulSoup
    # Example: Get the titles of the poems
    poem_links = []
    link_elements = soup.find_all('a', class_='row')
    
    for link in link_elements:
        href = link.get('href')
        poem_links.append(href)
    
    return poem_links

In [21]:
def scrape_poem(url, title, gender):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    poem_div = soup.find('div', class_='gedicht-originaltext clearfix')
    poem_lines = [line.strip() for line in poem_div.stripped_strings]
    
    author_h1 = soup.find('h1', id='gedicht-autor')
    author_name = author_h1.find('a').text.strip() if author_h1 else ''

    birth_soup = soup.find('span', class_='autor-daten')
    try:
        author_birth = re.search('(\d{2}\.\d{2}\.)(\d{4})', birth_soup.get_text()).group(2)
    except AttributeError:
        author_birth = '0'

    # published_soup = soup.find('div', id_='meta-info')
    # published = re.search('\b\d{4}\b', published_soup).group()

    poem_data = {
        'title': title,
        'gender': gender,
        'author': author_name,
        'author_birth': author_birth,
        # 'published': published,
        'poem': {}
    }
    
    for idx, line in enumerate(poem_lines, start=1):

        filtered_line = re.sub(r'[^\x00-\x7FäöüÄÖÜß]+', '', line)
        filtered_line = filtered_line.strip()

        poem_data['poem'][f'line.{idx}'] = {'text': filtered_line}
    
    return poem_data


def save_to_json(poem_data, file_name):
    with open(file_name, 'w', encoding='utf-8') as file:
        json.dump(poem_data, file, indent=4, ensure_ascii=False)


In [24]:
# Create a "corpus" folder if it doesn't exist
corpus_folder = f'corpus_finished'
os.makedirs(corpus_folder + '_m', exist_ok=True)
os.makedirs(corpus_folder + '_w', exist_ok=True)

# Define the gender, category, and site combinations
combinations = [
    ('m', 95),
    ('w', 52)
]

# Scrape poem links and save poems for each combination
for gender, sites in combinations:
    url = f'https://www.lyrikline.org/de/gedichte?query=&onlynewoff=&lang[]=de&geschlecht[]={gender}'
    poem_links = []
    for s in range(sites):
        poem_links += scrape_poem_links(url, s)
    # Scrape and save the poems
    for link in tqdm(poem_links, desc=f'Scraping poems for {gender}', unit='poem'):
        title = link.split('/')[-1]
        poem_url = f'https://www.lyrikline.org{link}'
        poem_data = scrape_poem(poem_url, title, gender)
        file_name = os.path.join(f'{corpus_folder}'+ f'_{gender}', f'{title}.json')
        save_to_json(poem_data, file_name)


<selenium.webdriver.remote.webelement.WebElement (session="8130f381-9335-4bd0-9587-8f607e0fe722", element="35a56485-8065-468b-a59d-d965e12ceede")>
<selenium.webdriver.remote.webelement.WebElement (session="7cba8014-094d-4582-b10e-e0f1ee833ea1", element="17317760-045d-4acf-a2ec-d83f3750c76c")>
<selenium.webdriver.remote.webelement.WebElement (session="7cba8014-094d-4582-b10e-e0f1ee833ea1", element="eb230be9-eb9e-4e49-b078-1b8df8ef0d4b")>
<selenium.webdriver.remote.webelement.WebElement (session="2996919d-d4fb-46af-bd65-3a3416971b61", element="3cf75dc9-cfc4-42fd-8da9-c33421b8f5b3")>
<selenium.webdriver.remote.webelement.WebElement (session="2996919d-d4fb-46af-bd65-3a3416971b61", element="8dbacea1-74cc-4dbd-9b69-0edf0dae3cf7")>
<selenium.webdriver.remote.webelement.WebElement (session="2996919d-d4fb-46af-bd65-3a3416971b61", element="256b4ac0-d02e-4d8d-9379-e87186e7258b")>
<selenium.webdriver.remote.webelement.WebElement (session="28ecc8c5-1ce7-47c3-8cb6-8b962e3a9bd5", element="f28d470a-bc

Scraping poems for m: 100%|██████████| 1883/1883 [51:47<00:00,  1.65s/poem] 


<selenium.webdriver.remote.webelement.WebElement (session="7fb10daf-502a-4c04-9fca-b000912885e4", element="7ee22320-3a25-414c-8332-7a4f1081df86")>
<selenium.webdriver.remote.webelement.WebElement (session="89553650-6faa-4a64-83bf-8ec2d1a01504", element="d572ee2b-a42f-4a2a-b0dd-c3e66c0e7825")>
<selenium.webdriver.remote.webelement.WebElement (session="89553650-6faa-4a64-83bf-8ec2d1a01504", element="5b2f6643-3bc0-4c87-a3a1-905758920899")>
<selenium.webdriver.remote.webelement.WebElement (session="6e8c971e-cf8e-43b5-923f-98e999deb3f5", element="847e0842-0a79-4c70-bd63-1523d96a944c")>
<selenium.webdriver.remote.webelement.WebElement (session="6e8c971e-cf8e-43b5-923f-98e999deb3f5", element="83a55117-7e3c-40d1-a8cf-ed1f51f43af0")>
<selenium.webdriver.remote.webelement.WebElement (session="6e8c971e-cf8e-43b5-923f-98e999deb3f5", element="e8deccfa-bd5a-468e-8d47-58c5822080e8")>
<selenium.webdriver.remote.webelement.WebElement (session="cab7307d-0247-4f80-a3df-c736df132e31", element="a207da0d-ce

Scraping poems for w: 100%|██████████| 1021/1021 [29:11<00:00,  1.72s/poem]


: 