In [1]:
import json
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
def scrape_poem_links(url, timeout=10):
    """Collect the poem links found on a listing page.

    Downloads ``url``, parses it as HTML and gathers the ``href`` of every
    ``<a>`` element matched by ``class_='row'``.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list: The ``href`` values, in the order ``find_all`` returns the
        anchors. Anchors that carry no ``href`` are skipped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    hrefs = (anchor.get('href') for anchor in soup.find_all('a', class_='row'))
    # .get() yields None for anchors without an href; callers expect strings.
    return [href for href in hrefs if href]

In [3]:
def scrape_poem(url, title, timeout=10):
    """Download one poem page and return its text and author as a dict.

    Args:
        url: Address of the poem page.
        title: Value stored verbatim under the ``'title'`` key.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict: ``{'title': title, 'author': str, 'poem': {...}}`` where
        ``'poem'`` maps ``'line.1'``, ``'line.2'``, ... to ``{'text': ...}``,
        one entry per non-empty text fragment inside the poem container.
        ``'author'`` is ``''`` when the page has no author heading or the
        heading contains no link.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no poem container.
    """
    # Without a timeout a stalled connection would block the kernel forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    poem_div = soup.find('div', class_='gedicht-originaltext clearfix')
    if poem_div is None:
        # Name the offending page instead of failing with an opaque
        # AttributeError on None.
        raise ValueError(f'No poem text found at {url}')
    # stripped_strings already yields whitespace-trimmed, non-empty strings.
    poem_lines = list(poem_div.stripped_strings)

    author_h1 = soup.find('h1', id='gedicht-autor')
    author_link = author_h1.find('a') if author_h1 is not None else None
    # The heading may exist without a link; treat that like a missing author.
    author_name = author_link.text.strip() if author_link is not None else ''

    return {
        'title': title,
        'author': author_name,
        'poem': {
            f'line.{idx}': {'text': line}
            for idx, line in enumerate(poem_lines, start=1)
        },
    }


def save_to_json(poem_data, file_name):
    """Write ``poem_data`` to ``file_name`` as UTF-8 encoded JSON.

    The output is indented by four spaces and non-ASCII characters are
    written as-is rather than as ``\\uXXXX`` escapes. An existing file at
    ``file_name`` is overwritten.

    Args:
        poem_data: JSON-serialisable object to store.
        file_name: Path of the file to create or overwrite.
    """
    dump_options = {'indent': 4, 'ensure_ascii': False}
    with open(file_name, mode='w', encoding='utf-8') as out_file:
        json.dump(poem_data, out_file, **dump_options)


In [4]:
# Scrape the poem links from the listing page
url = 'https://www.lyrikline.org/de/gedichte?query=&onlynewoff=&lang[]=de&translatorname=999999&category[]=66'
poem_links = scrape_poem_links(url)

# Create a "corpus" folder if it doesn't exist
corpus_folder = 'corpus'
os.makedirs(corpus_folder, exist_ok=True)

# Scrape and save the poems, one JSON file per poem
for link in tqdm(poem_links, desc='Scraping poems', unit='poem'):
    # An anchor without an href gives no URL to fetch and no name to save under.
    if not link:
        continue

    # The last path segment doubles as title and file name; drop a trailing
    # slash first so it never ends up empty.
    title = link.rstrip('/').split('/')[-1]
    if not title:
        continue

    # urljoin resolves root-relative, path-relative and absolute hrefs alike,
    # whereas plain concatenation only works for hrefs starting with "/".
    poem_url = urljoin(url, link)
    poem_data = scrape_poem(poem_url, title)
    file_name = os.path.join(corpus_folder, f'{title}.json')
    save_to_json(poem_data, file_name)

Scraping poems: 100%|████████████████████████████████████████████████████████████████| 20/20 [00:23<00:00,  1.16s/poem]
