In [1]:
import json
import random
import re
import time

import msgpack
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [9]:
def get_section_info(url, timeout=30):
    """Fetch a category page and return its heading and number of result pages.

    Parameters
    ----------
    url : str
        Address of the category ("section") page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    tuple
        ``(header, num_of_pages)``: the text of the ``<h2>`` inside
        ``div.front-matter`` and the page count read from the ``h5.metaline``
        text. The count is 1 when the metaline has no ``"Seite 1 von N"`` part.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or a 4xx/5xx answer.
    ValueError
        If the page does not contain the expected ``div.front-matter`` with
        an ``<h2>`` and an ``h5.metaline``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of trying to parse them.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    section_div = soup.find('div', {'class': 'front-matter'})
    heading = section_div.find('h2') if section_div is not None else None
    metaline = (
        section_div.find('h5', {'class': 'metaline'})
        if section_div is not None
        else None
    )
    if heading is None or metaline is None:
        raise ValueError(f"unexpected page layout, no section header found: {url}")

    # Look for the marker itself rather than relying on a ' - ' separator in
    # front of it; a metaline without that separator used to raise IndexError.
    match = re.search(r'Seite\s+1\s+von\s+(\d+)', metaline.text)
    num_of_pages = int(match.group(1)) if match else 1
    return heading.text, num_of_pages

def get_books_links(url, idx, timeout=30):
    """Return the absolute book links found on one result page of a category.

    Parameters
    ----------
    url : str
        Address of the category page.
    idx : int
        Page index, sent to the site as the ``p`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        De-duplicated links in the order they appear on the page. Each is the
        ``href`` of the first ``<a>`` inside an
        ``<article class="book teaser-block">``, prefixed with the site root.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or a 4xx/5xx answer.
    """
    link = f"{url}?p={idx}"
    response = requests.get(link, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # A dict keeps first-seen order, so the result is the same on every run
    # (a set of strings is ordered differently per interpreter start).
    page_books = {}
    for article in soup.find_all('article', {'class': 'book teaser-block'}):
        anchor = article.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href:  # skip teasers without a usable link instead of crashing
            page_books['https://www.perlentaucher.de' + href] = None
    return list(page_books)

def _split_imprint(line):
    """Split an imprint line such as ``"Arche Verlag, Hamburg 2011"``.

    Returns ``(publisher, place, year)``. The publisher is the text before the
    first comma; the year is a trailing four-digit number of the remainder and
    the place is whatever precedes it. Parts that are absent come back as None.
    """
    publisher, _, remainder = line.partition(',')
    publisher = publisher.strip() or None
    remainder = remainder.strip()
    if not remainder:
        return publisher, None, None
    match = re.fullmatch(r'(.*?)\s*(\d{4})', remainder)
    if match is None:
        # No trailing year: keep the text as place rather than slicing it.
        return publisher, remainder, None
    return publisher, match.group(1).strip() or None, match.group(2)


def get_book_meta(url, timeout=30):
    """Scrape the metadata of a single book page.

    Parameters
    ----------
    url : str
        Address of the book page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        Keys ``url``, ``author``, ``title``, ``subtitle``, ``publisher``,
        ``place``, ``year``, ``description`` and ``keywords``. ``keywords`` is
        a list of strings; every other value is a string, or None when the
        corresponding element is missing from the page.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or a 4xx/5xx answer.
    ValueError
        If the page has no ``div.bookdata`` element.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    bookdata = soup.find('div', {'class': 'bookdata'})
    if bookdata is None:
        raise ValueError(f"no div.bookdata element on page: {url}")

    def text_of(tag):
        return tag.text if tag is not None else None

    author = text_of(bookdata.find('h3', {'class': 'bookauthor'}))
    title = text_of(bookdata.find('h3', {'class': 'booktitle'}))
    # First div.smaller inside the book data block; exported as 'subtitle'.
    book_type = text_of(bookdata.find('div', {'class': 'smaller'}))

    # The first text line of the small grey block is read as
    # "<publisher>, <place> <year>".
    publisher = place = year = None
    bookdata_body = bookdata.select_one("div.bookdata_body div.tiny.gray")
    if bookdata_body is not None:
        first_line = bookdata_body.get_text(separator="\n", strip=True).split('\n')[0].strip()
        if first_line:
            publisher, place, year = _split_imprint(first_line)

    # The description is taken from the last div.smaller of the whole page.
    smaller_divs = soup.find_all('div', {'class': 'smaller'})
    description = smaller_divs[-1].text if smaller_divs else None
    keywords = [kw.text for kw in soup.find_all('a', {'class': 'kw'})]

    return {
        'url': url,
        'author': author,
        'title': title,
        'subtitle': book_type,
        'publisher': publisher,
        'place': place,
        'year': year,
        'description': description,
        'keywords': keywords,
    }

In [11]:
# Build the crawl plan: one entry per category page listed in links.txt.
# key:   category url (str)
# value: [name (str), num_of_pages (int)]
# Example entry:
#   'https://www.perlentaucher.de/buchKSL/amerikanische-politik-20-jahrhundert.html':
#       ['Amerikanische Politik, 20. Jahrhundert', 20]

with open('links.txt', 'r', encoding='utf-8') as txt:
    # strip() also removes '\r' from Windows line endings; blank lines are
    # dropped so an empty string is never handed to requests. dict.fromkeys
    # removes repeated links while keeping the file order.
    section_urls = list(dict.fromkeys(line.strip() for line in txt if line.strip()))

urls = {}
for url in tqdm(section_urls):
    urls[url] = list(get_section_info(url))


100%|██████████| 5/5 [00:02<00:00,  1.92it/s]


In [12]:
# Show the collected mapping: category url -> [category name, number of pages].
urls

{'https://www.perlentaucher.de/buchKSL/deutsche-biografien-20-jahrhundert.html': ['Deutsche Biografien, 20. Jahrhundert',
  129],
 'https://www.perlentaucher.de/buchKSL/deutsche-literatur-20-jahrhundert-briefe-tagebuecher.html': ['Deutsche Literatur, 20. Jahrhundert (Briefe/Tagebücher)',
  76],
 'https://www.perlentaucher.de/buchKSL/deutsche-literatur-20-jahrhundert-erinnerungen.html': ['Deutsche Literatur, 20. Jahrhundert (Erinnerungen)',
  98],
 'https://www.perlentaucher.de/buchKSL/deutsche-literatur-20-jahrhundert-essay.html': ['Deutsche Literatur, 20. Jahrhundert (Essay)',
  50],
 'https://www.perlentaucher.de/buchKSL/deutsche-literatur-20-jahrhundert-reisereportagen.html': ['Deutsche Literatur, 20. Jahrhundert (Reisereportagen)',
  8]}

In [13]:
# Collect the direct link of every book, category by category, and store the
# result both as readable JSON and as msgpack.
full_books_output = {}

for url, lst in urls.items():
    category, page_count = lst
    books_links = []
    # Result pages are requested with p = 0 .. page_count - 1.
    for idx in tqdm(range(page_count), desc=category):
        books_links.extend(get_books_links(url, idx))
    full_books_output[url] = {
        'category': category,
        # dict.fromkeys drops duplicates but keeps first-seen order, so the
        # written files are identical from run to run (a set is not ordered).
        'books_urls': list(dict.fromkeys(books_links)),
    }

with open('books_links.json', 'w', encoding='utf-8') as jfile:
    json.dump(full_books_output, jfile, indent=4, ensure_ascii=False)

with open("books_links.bin", "wb") as file:
    file.write(msgpack.packb(full_books_output))

100%|██████████| 129/129 [02:31<00:00,  1.18s/it]
100%|██████████| 76/76 [01:15<00:00,  1.00it/s]
100%|██████████| 98/98 [01:59<00:00,  1.22s/it]
100%|██████████| 50/50 [01:01<00:00,  1.24s/it]
100%|██████████| 8/8 [00:09<00:00,  1.24s/it]


In [14]:
# Download the metadata of every collected book and store it as JSON and msgpack.
with open("books_links.bin", "rb") as file:
    books_urls = msgpack.unpackb(file.read(), raw=False)

books_metadata = []
failed_books = []  # (url, error) pairs of pages that could not be scraped
for key, val in books_urls.items():
    category = val['category']
    books = val['books_urls']
    for book in tqdm(books, desc=category):
        # Random pause of up to one second between requests.
        time.sleep(round(random.uniform(0, 1), 2))
        try:
            book_meta = get_book_meta(book)
        except Exception as exc:
            # Best effort: one broken page must not abort the whole crawl, but
            # it is recorded. `Exception` (unlike a bare `except`) lets
            # KeyboardInterrupt through, so the loop can still be stopped.
            failed_books.append((book, repr(exc)))
            continue
        book_meta.update({'category': category})
        books_metadata.append(book_meta)

if failed_books:
    print(f"{len(failed_books)} book page(s) could not be scraped; see `failed_books`.")

with open('books_metadata.json', 'w', encoding='utf-8') as jfile:
    json.dump(books_metadata, jfile, indent=4, ensure_ascii=False)

with open("books_metadata.bin", "wb") as file:
    file.write(msgpack.packb(books_metadata))


100%|██████████| 1540/1540 [32:03<00:00,  1.25s/it]
100%|██████████| 909/909 [16:30<00:00,  1.09s/it]
100%|██████████| 1167/1167 [21:07<00:00,  1.09s/it]
100%|██████████| 597/597 [11:09<00:00,  1.12s/it]
100%|██████████| 95/95 [01:55<00:00,  1.22s/it]


In [15]:
# Convert the scraped records to a DataFrame and export it to Excel.
# Example record as stored in books_metadata.json:
# {
#         "url": "https://www.perlentaucher.de/buch/klaus-pohl/die-kinder-der-preussischen-wueste.html",
#         "author": "Klaus Pohl",
#         "title": "Die Kinder der Preußischen Wüste",
#         "subtitle": null,
#         "publisher": "Arche Verlag",
#         "place": "Hamburg",
#         "year": "2011",
#         "description": "Die Geschichte des Robert Papst ist die Geschichte des Schriftstellers Thomas Brasch. Das Buch, das Robert Papst nicht schreibt, das Thomas Brasch nie geschrieben hat - dieses Buch liegt jetzt vor. Die spannende Geschichte eines Sohnes, der stärker ist als sein Vater, der mit seinen Gedichten, Filmen, Theaterstücken weltberühmt wird, der auf den Filmfestspielen in Cannes gefeiert, als Lyriker zehntausendfach gelesen wird, von Liebesaffären umschwirrt - bis er an Drogen, Alkohol und Schulden viel zu früh stirbt. Zehn Jahre nach seinem Tod erzählt Braschs Freund und Weggefährte Klaus Pohl die berührende, aufregende, diese große Geschichte.",
#         "keywords": [
#             "Brasch, Thomas",
#             "Dichter",
#             "Schriftsteller"
#         ],
#         "category": "Deutsche Literatur, 20. Jahrhundert (Romane)"
# }

with open("books_metadata.bin", "rb") as file:
    books_meta = msgpack.unpackb(file.read(), raw=False)

columns = ['url', 'author', 'title', 'subtitle', 'publisher', 'place', 'year', 'description', 'keywords', 'category']

to_df = []
for elem in books_meta:
    row = []
    # Look every value up by column name so a record with a missing or
    # differently ordered key cannot shift values into the wrong column.
    for column in columns:
        value = elem.get(column)
        # List values (the keywords) are flattened into one ' | '-separated cell.
        row.append(' | '.join(value) if isinstance(value, list) else value)
    to_df.append(tuple(row))

df = pd.DataFrame(to_df, columns=columns)
df.to_excel('other.xlsx', index=False)