In [1]:
import requests
from bs4 import BeautifulSoup
import json
import os
from tqdm import tqdm
import time

In [2]:
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service

# Start URL of the lyrikline.org poem listing that the scraper opens.
# The query string carries the search filters (`lang[]=de`, `category[]=66`, ...).
# NOTE(review): presumably `lang[]=de` limits results to German-language poems and
# `category[]=66` selects one listing category — confirm against the site.
url = 'https://www.lyrikline.org/de/gedichte?query=&onlynewoff=&lang[]=de&translatorname=999999&category[]=66'

In [3]:
def scrape_poem_links(url, site:int):
    """Collect the poem links shown on one page of a paginated listing.

    Opens ``url`` in a headless Firefox session, clicks the "nächste" (next)
    link up to ``site`` times, and then reads the ``href`` of every
    ``<a class="row">`` element on the page that is displayed at that point.

    Args:
        url: Address of the listing page to open.
        site: How many times to click the "next" link before reading the page
            (``0`` reads the page that ``url`` loads directly).

    Returns:
        list[str]: The ``href`` values in document order. Anchors without an
        ``href`` attribute are skipped so callers never receive ``None``.
    """
    # Set up a headless Firefox session
    firefox_options = Options()
    firefox_options.add_argument("-headless")

    # NOTE(review): the driver path is hard-coded; presumably geckodriver.exe
    # is expected next to the notebook — confirm for non-Windows setups.
    webdriver_service = Service('geckodriver.exe')

    driver = webdriver.Firefox(service=webdriver_service, options=firefox_options)
    try:
        # Load the webpage
        driver.get(url)

        # Implicit wait: this does not pause here, it makes each later
        # find_element call poll for up to 5 seconds before giving up.
        driver.implicitly_wait(5)

        # Page forward `site` times; stop early when there is no usable
        # "next" link (e.g. the last page has been reached).
        for _ in range(site):
            try:
                next_button = driver.find_element(By.XPATH, '//a[contains(text(), "nächste")]')
                next_button.click()
            except WebDriverException:
                # Only Selenium failures end the paging; anything else
                # (including KeyboardInterrupt) propagates to the caller.
                break
            time.sleep(5)  # Wait for additional content to load

        page_source = driver.page_source
    finally:
        # Always close the browser, even when loading or paging fails,
        # so no headless Firefox process is left behind.
        driver.quit()

    # Process the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Each poem entry is an <a class="row"> element; keep only real hrefs.
    link_elements = soup.find_all('a', class_='row')
    poem_links = [link.get('href') for link in link_elements if link.get('href')]

    return poem_links

In [4]:
def scrape_poem(url, title, timeout=30):
    """Download one poem page and convert it into the corpus dictionary.

    Args:
        url: Full address of the poem page.
        title: Value stored under the ``'title'`` key of the result.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        dict: ``{'title', 'gender', 'author', 'poem'}`` where ``'poem'`` maps
        ``'line.<n>'`` (1-based) to ``{'text': <line>}``. ``'gender'`` is
        always the empty string. ``'author'`` is ``''`` and ``'poem'`` is
        empty when the corresponding element is missing from the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a poem.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # The poem text lives in this div; stripped_strings already yields
    # whitespace-trimmed, non-empty text fragments.
    poem_div = soup.find('div', class_='gedicht-originaltext clearfix')
    poem_lines = list(poem_div.stripped_strings) if poem_div is not None else []

    # The author name is the link text inside the author heading; either
    # element may be absent, in which case the author is left empty.
    author_h1 = soup.find('h1', id='gedicht-autor')
    author_link = author_h1.find('a') if author_h1 is not None else None
    author_name = author_link.text.strip() if author_link is not None else ''

    poem_data = {
        'title': title,
        'gender': '',
        'author': author_name,
        'poem': {
            f'line.{idx}': {'text': line}
            for idx, line in enumerate(poem_lines, start=1)
        }
    }

    return poem_data


def save_to_json(poem_data, file_name):
    """Write ``poem_data`` to ``file_name`` as pretty-printed UTF-8 JSON.

    Args:
        poem_data: JSON-serializable object to store.
        file_name: Path of the file to create or overwrite.
    """
    # Four-space indentation; non-ASCII characters are written verbatim
    # rather than as \\uXXXX escapes.
    dump_options = {'indent': 4, 'ensure_ascii': False}
    with open(file_name, mode='w', encoding='utf-8') as handle:
        json.dump(poem_data, handle, **dump_options)


In [5]:
# Scrape the poem links
url = 'https://www.lyrikline.org/de/gedichte?query=&onlynewoff=&lang[]=de&translatorname=999999&category[]=66'
site_number = 6

# Collect the links of the first `site_number` listing pages. A link is kept
# only once (pages can repeat when paging stops early) and empty/None entries
# are dropped because they cannot be turned into a poem URL.
poem_links = []
seen_links = set()
for i in range(site_number):
    for link in scrape_poem_links(url, i):
        if link and link not in seen_links:
            seen_links.add(link)
            poem_links.append(link)

# Create a "corpus" folder if it doesn't exist
corpus_folder = 'corpus_selenium'
os.makedirs(corpus_folder, exist_ok=True)

# Scrape and save the poems. A single broken page must not abort the whole
# run, so failures are reported, remembered and skipped.
failed_links = []
for link in tqdm(poem_links, desc='Scraping poems', unit='poem'):
    title = link.split('/')[-1]
    poem_url = f'https://www.lyrikline.org{link}'
    try:
        poem_data = scrape_poem(poem_url, title)
    except (requests.RequestException, AttributeError) as error:
        # RequestException: network/HTTP problem; AttributeError: the page
        # did not contain the expected poem markup.
        failed_links.append(link)
        tqdm.write(f'Skipping {poem_url}: {error!r}')
        continue
    file_name = os.path.join(corpus_folder, f'{title}.json')
    save_to_json(poem_data, file_name)

if failed_links:
    print(f'{len(failed_links)} of {len(poem_links)} poems could not be scraped.')

Scraping poems:  24%|██▍       | 29/120 [00:41<03:19,  2.20s/poem]