In [1]:
from pathlib import Path

import re
import requests
from dataclasses import dataclass
from bs4 import BeautifulSoup


@dataclass
class ChapterInfo:
    """Text scraped for a single chapter of a book."""

    # Chapter title as taken from the source page.
    name: str
    # Full retelling text of the chapter ("retail" is the spelling this file
    # uses for "retelling"; the field name is part of the public interface).
    retail: str
    # Short summary of the chapter.
    summary: str


@dataclass
class BookInfo:
    """A scraped book: its name plus the entries collected for it."""

    # Book title (some scrapers store it with spaces replaced by underscores).
    name: str
    # Declared as ChapterInfo items.
    # NOTE(review): get_parsed_retell_witch and get_parsed_retell_Checov append
    # a plain string (the whole retelling) here instead of a ChapterInfo.
    chapters: list[ChapterInfo]


In [2]:
def it_is_garry_p_chapter(tag):
    """Tell whether ``tag`` is a link to a chapter page.

    Used as a BeautifulSoup filter function. A tag is accepted when all of the
    following hold:

    * it is an ``<a>`` element;
    * it has no ``class`` attribute, or its classes include ``mw-redirect``;
    * none of its direct children equals one of the excluded labels
      (book / game / soundtrack / film links);
    * its ``title`` attribute, if present, does not mention the chapter
      template (``шаблон:главы``, compared case-insensitively).
    """
    if tag.name != 'a':
        return False

    # Links carrying a class are only acceptable when they are redirects.
    if tag.has_attr('class') and "mw-redirect" not in tag.attrs['class']:
        return False

    # Labels of navigation links that are not chapters; matched against the
    # tag's direct children by equality.
    excluded_labels = ("Книга", "Игра", "Саундтрек", "Фильм", "Саундтрек (часть 1)", "Саундтрек (часть 2)")
    if any(label in tag.contents for label in excluded_labels):
        return False

    # Skip the link that points at the template itself.
    if tag.has_attr('title') and "шаблон:главы" in tag.attrs['title'].lower():
        return False

    return True


def witcher_retail_tag(tag):
    """BeautifulSoup filter: accept ``<p>`` tags that have exactly one child node."""
    child_count = len(tag.contents)
    return tag.name == 'p' and child_count == 1


def Checkov_retell_tag(tag):
    """BeautifulSoup filter: accept the ``<p>`` tags that make up a retelling.

    A tag is accepted when it is a ``<p>`` element, its text does not contain
    the closing attribution note ``За основу пересказа``, and its parent has a
    ``class`` attribute whose first class is not ``poem``.

    Paragraphs whose parent has no ``class`` attribute at all are rejected.
    NOTE(review): presumably this limits matches to the main content
    container - confirm against the page markup.
    """
    # Check the tag name first: ``tag.text`` walks every descendant, so
    # evaluating it for every element of the document is needlessly expensive.
    if tag.name != 'p':
        return False
    if 'За основу пересказа' in tag.text:
        return False

    parent_tag = tag.parent
    if not parent_tag.has_attr('class'):
        return False
    # Slice instead of indexing so an empty class list (class="") is treated
    # as "not a poem" rather than raising IndexError.
    return parent_tag.attrs['class'][:1] != ['poem']


def Chekov_book_tag(tag):
    """BeautifulSoup filter: accept ``<a>`` tags whose ``title`` contains ``(Чехов)``."""
    if tag.name != 'a' or not tag.has_attr('title'):
        return False
    return '(Чехов)' in tag.attrs['title']


def get_parsed_retell_g_p(url: str) -> "list[BookInfo]":
    """Scrape chapter summaries and retellings linked from a chapter-category page.

    Every ``<table>`` on the page is treated as one book: the book name comes
    from the ``font-size:110%`` span inside the table's first header cell, and
    every link accepted by ``it_is_garry_p_chapter`` is fetched as a chapter
    page.

    Args:
        url: Address of the category page. Chapter URLs are built by replacing
            the ``Категория:Главы_книг`` segment of this address with the
            chapter title, so the address must contain that segment.

    Returns:
        One ``BookInfo`` per recognised table, in page order, each holding a
        ``ChapterInfo`` for every chapter page that yielded text.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    request_timeout = 30  # seconds

    collected_data: list[BookInfo] = []
    # HTTP status is not checked (unchanged behaviour): error pages are parsed
    # like any other page, and the export step filters the wiki's "no text"
    # placeholder out of the summaries.
    with requests.get(url, timeout=request_timeout) as response:
        # NOTE(review): no parser is named, so bs4 picks the best one installed
        # and warns about it; pin one once the intended parser is known.
        index_soup = BeautifulSoup(response.text)

    for book in index_soup.find_all('table'):
        # Walk table -> tbody -> tr -> th, tolerating tables that lack any of
        # these (tables on the page that are not chapter lists).
        header_cell = book
        for child_name in ('tbody', 'tr', 'th'):
            if header_cell is None:
                break
            header_cell = header_cell.find(child_name)
        title_span = None
        if header_cell is not None:
            title_span = header_cell.find('span', {'style': 'font-size:110%'})
        if title_span is None:
            continue

        book_name = title_span.text.replace(' (главы)', '')
        book_info = BookInfo(book_name, [])

        for chapter in book.find_all(it_is_garry_p_chapter):
            # The link filter does not require a title, so guard against
            # anchors that have none instead of raising KeyError.
            chapter_name = chapter.attrs.get('title')
            if not chapter_name:
                continue
            chapter_url = url.replace('Категория:Главы_книг',
                                      chapter_name.replace(' ', '_'))
            with requests.get(chapter_url, timeout=request_timeout) as response:
                chapter_soup = BeautifulSoup(response.text)

            # The first three paragraphs are skipped (presumably page
            # boilerplate - TODO confirm); the next one is the summary and the
            # rest form the retelling.
            paragraphs = [tag.text for tag in chapter_soup.find_all('p')[3:]]
            if not paragraphs:
                # Nothing usable on this page; previously this raised IndexError.
                continue
            summary: str = paragraphs[0]
            retelling: str = ''.join(paragraphs[1:])
            book_info.chapters.append(ChapterInfo(chapter_name, retelling, summary))
        collected_data.append(book_info)
    return collected_data


def get_parsed_retell_witch(url: str) -> "list[BookInfo]":
    """Scrape the retelling of every book linked from a series overview page.

    Book page names are taken from the ``title`` of the third child node of
    each ``<p class="read-more">`` element on the page at ``url``; each book
    page is then fetched from ``https://wiki.briefly.ru/`` and the text of all
    tags accepted by ``witcher_retail_tag`` is concatenated.

    Args:
        url: Address of the overview page.

    Returns:
        One ``BookInfo`` per linked book. ``name`` keeps underscores instead of
        spaces, and ``chapters`` holds a single plain string (the whole
        retelling), not a ``ChapterInfo``.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    request_timeout = 30  # seconds

    collected_data: list[BookInfo] = []
    with requests.get(url, timeout=request_timeout) as response:
        # NOTE(review): no parser is named, so bs4 picks the best one installed
        # and warns about it; pin one once the intended parser is known.
        index_soup = BeautifulSoup(response.text)

    # contents[2] is expected to be the link to the book page.
    # NOTE(review): this depends on the exact markup of the "read-more"
    # paragraphs - confirm if the site layout changes.
    books_names = [tag.contents[2].attrs['title'].replace(' ', '_') for tag in
                   index_soup.find_all('p', {"class": "read-more"})]

    base_url = "https://wiki.briefly.ru/"
    for book_name in books_names:
        book_info = BookInfo(book_name, [])
        with requests.get(base_url + book_name, timeout=request_timeout) as response:
            book_soup = BeautifulSoup(response.text)
        retail_str = ''.join(tag.text for tag in book_soup.find_all(witcher_retail_tag))
        # Stored as a bare string on purpose: the export cell reads
        # book.chapters[0] directly as text.
        book_info.chapters.append(retail_str)
        collected_data.append(book_info)
    return collected_data


def get_parsed_retell_Checov(url: str) -> "list[BookInfo]":
    """Scrape the retelling of every work linked from an author category page.

    Page names are taken from the first child node of each link accepted by
    ``Chekov_book_tag``; each page is then fetched from
    ``https://wiki.briefly.ru/`` and the text of all tags accepted by
    ``Checkov_retell_tag`` is concatenated.

    Args:
        url: Address of the category page.

    Returns:
        One ``BookInfo`` per linked work. ``name`` keeps underscores instead of
        spaces, and ``chapters`` holds a single plain string (the whole
        retelling), not a ``ChapterInfo``.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    request_timeout = 30  # seconds

    collected_data: list[BookInfo] = []
    with requests.get(url, timeout=request_timeout) as response:
        # NOTE(review): no parser is named, so bs4 picks the best one installed
        # and warns about it; pin one once the intended parser is known.
        index_soup = BeautifulSoup(response.text)

    # contents[0] is the link's first child, expected to be its text.
    books_names = [tag.contents[0].replace(' ', '_') for tag in index_soup.find_all(Chekov_book_tag)]

    base_url = "https://wiki.briefly.ru/"
    for book_name in books_names:
        book_info = BookInfo(book_name, [])
        with requests.get(base_url + book_name, timeout=request_timeout) as response:
            book_soup = BeautifulSoup(response.text)
        retail_str = ''.join(tag.text for tag in book_soup.find_all(Checkov_retell_tag))
        # Stored as a bare string on purpose: the export cell reads
        # book.chapters[0] directly as text.
        book_info.chapters.append(retail_str)
        collected_data.append(book_info)
    return collected_data


def get_parsed_retell_game_o_t(url: str) -> "list[BookInfo]":
    """Scrape per-chapter summaries and retellings for the books on a portal page.

    Book page names come from the ``title`` of the first child of the first
    five ``<b>`` elements on the page at ``url``. For every book, the chapter
    links are read from the second row of the
    ``toc plainlinks common-table`` table, and each chapter page is fetched
    from ``https://7kingdoms.ru/wiki/``.

    On a chapter page the first ``<p>`` is the summary (its
    ``Краткое содержание:`` prefix is removed) and the retelling is the second
    ``<p>`` plus every following sibling ``<p>`` up to the next ``<h2>``.

    Args:
        url: Address of the portal page.

    Returns:
        One ``BookInfo`` per book with a ``ChapterInfo`` per chapter page that
        yielded at least one paragraph. Names keep underscores instead of
        spaces.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    request_timeout = 30  # seconds

    collected_data: list[BookInfo] = []
    with requests.get(url, timeout=request_timeout) as response:
        # NOTE(review): no parser is named, so bs4 picks the best one installed
        # and warns about it; pin one once the intended parser is known.
        portal_soup = BeautifulSoup(response.text)

    # Only the first five bold elements are used (presumably the five book
    # links on the portal - TODO confirm).
    books_names = [tag.contents[0].attrs['title'].replace(' ', '_') for tag in portal_soup.find_all('b')[:5]]

    base_url = "https://7kingdoms.ru/wiki/"
    for book_name in books_names:
        book_info = BookInfo(book_name, [])
        with requests.get(base_url + book_name, timeout=request_timeout) as response:
            book_soup = BeautifulSoup(response.text)

        table = book_soup.find('table', {"class": "toc plainlinks common-table"})
        chapter_cell = table.find('tbody').find_all('tr')[1].find('td')
        # title=True keeps only anchors that carry a title, so the lookup
        # below cannot raise KeyError.
        for chapter in chapter_cell.find_all('a', title=True):
            chapter_name = chapter.attrs['title'].replace(' ', '_')
            with requests.get(base_url + chapter_name, timeout=request_timeout) as response:
                chapter_soup = BeautifulSoup(response.text)

            all_p = chapter_soup.find_all('p')
            if not all_p:
                # No text at all on this page; previously this raised IndexError.
                continue
            summary = all_p[0].text.replace('Краткое содержание:', '').strip()

            retell = []
            if len(all_p) > 1:
                retell.append(all_p[1].text)
                for elem in all_p[1].next_siblings:
                    if elem.name == 'h2':
                        # The next section heading ends the retelling.
                        break
                    if elem.name == 'p':
                        retell.append(elem.text)

            chapter_info = ChapterInfo(chapter_name, ''.join(retell), summary)
            book_info.chapters.append(chapter_info)
        collected_data.append(book_info)
    return collected_data

In [3]:
# Entry pages for the four scrapers.
# Category page "Категория:Главы_книг" (book chapters) on the Russian Harry Potter fandom wiki.
gp_url = 'https://harrypotter.fandom.com/ru/wiki/Категория:Главы_книг'
# Percent-encoded form of the page name "Ведьмак_(Сапковский)" on wiki.briefly.ru.
witcher_url = "https://wiki.briefly.ru/%D0%92%D0%B5%D0%B4%D1%8C%D0%BC%D0%B0%D0%BA_(%D0%A1%D0%B0%D0%BF%D0%BA%D0%BE%D0%B2%D1%81%D0%BA%D0%B8%D0%B9)"
# Category page for Anton Pavlovich Chekhov on wiki.briefly.ru.
chekof = "https://wiki.briefly.ru/Категория:Чехов,_Антон_Павлович"
# Portal page of the "Песнь Льда и Пламени" (A Song of Ice and Fire) cycle on 7kingdoms.ru.
game_o_t = "https://7kingdoms.ru/wiki/Портал:Цикл_Песнь_Льда_и_Пламени"

In [4]:
# Scrape the Harry Potter chapters; this issues one HTTP request per chapter link.
gp = get_parsed_retell_g_p(gp_url)

In [5]:
# Scrape the Witcher retellings; this issues one HTTP request per linked book.
witcher = get_parsed_retell_witch(witcher_url)

In [6]:
# Scrape the Chekhov retellings; this issues one HTTP request per linked work.
Checov = get_parsed_retell_Checov(chekof)

In [7]:
# Scrape the A Song of Ice and Fire chapters; this issues one HTTP request per book and per chapter.
game_o_trones = get_parsed_retell_game_o_t(game_o_t)

In [19]:
# Export the Harry Potter data: per book, one Markdown file with the chapter
# summaries and one with the chapter retellings.
gp_dir = Path('./parsed_data/gp')
# Create the output directory up front so the cell works on a fresh checkout.
gp_dir.mkdir(parents=True, exist_ok=True)
for book in gp:
    book_title = book.name.replace("_", " ")
    # The text is Cyrillic, so write UTF-8 explicitly instead of relying on
    # the platform's default encoding.
    with open(gp_dir / f'{book_title}_summary.md', 'w', encoding='utf-8') as file:
        for chapter in book.chapters:
            # Skip the wiki's "this page has no text" placeholder and
            # summaries that are just a newline.
            if 'В настоящее время на этой странице нет текста.' not in chapter.summary and chapter.summary != '\n':
                file.write(f"##{chapter.name}\n\n{chapter.summary}\n")
    with open(gp_dir / f'{book_title}_retell.md', 'w', encoding='utf-8') as file:
        for chapter in book.chapters:
            if len(chapter.retail) > 0:
                file.write(f"##{chapter.name}\n\n{chapter.retail}\n")

In [16]:
# Export the Witcher data: one Markdown file per book with its retelling.
witcher_dir = Path('./parsed_data/witcher')
# Create the output directory up front so the cell works on a fresh checkout.
witcher_dir.mkdir(parents=True, exist_ok=True)
for book in witcher:
    book_title = book.name.replace("_", " ")
    # The text is Cyrillic, so write UTF-8 explicitly instead of relying on
    # the platform's default encoding.
    with open(witcher_dir / f'{book_title}.md', 'w', encoding='utf-8') as file:
        # chapters[0] is the whole retelling stored as a plain string.
        file.write(f"##{book.name}\n\n{book.chapters[0]}\n")

In [17]:
# Export the Chekhov data: one Markdown file per work with its retelling.
chekhov_dir = Path('./parsed_data/Chekhov')
# Create the output directory up front so the cell works on a fresh checkout.
chekhov_dir.mkdir(parents=True, exist_ok=True)
for book in Checov:
    book_title = book.name.replace("_", " ")
    # The text is Cyrillic, so write UTF-8 explicitly instead of relying on
    # the platform's default encoding.
    with open(chekhov_dir / f'{book_title}.md', 'w', encoding='utf-8') as file:
        # chapters[0] is the whole retelling stored as a plain string.
        file.write(f"##{book.name}\n\n{book.chapters[0]}\n")

In [15]:
# Export the A Song of Ice and Fire data: per book, one Markdown file with the
# chapter summaries and one with the chapter retellings.
game_of_t_dir = Path('./parsed_data/game_of_t')
# Create the output directory up front so the cell works on a fresh checkout.
game_of_t_dir.mkdir(parents=True, exist_ok=True)
for book in game_o_trones:
    book_title = book.name.replace("_", " ")
    # The text is Cyrillic, so write UTF-8 explicitly instead of relying on
    # the platform's default encoding.
    with open(game_of_t_dir / f'{book_title}_summary.md', 'w', encoding='utf-8') as file:
        for chapter in book.chapters:
            # Drop spelling variants of the "summary" prefix that the scraper
            # did not already remove.
            summary = chapter.summary.replace('Краткое cодержание: ', '').replace('Краткое Содержание: ', '')
            file.write(f"##{chapter.name}\n\n{summary}\n\n")
    with open(game_of_t_dir / f'{book_title}_retell.md', 'w', encoding='utf-8') as file:
        for chapter in book.chapters:
            # Remove every span that opens with "(" or "[" and ends at the
            # nearest ")" or "]" on the same line. Raw string: the escapes are
            # regex escapes, not (invalid) Python string escapes.
            retell = re.sub(r"[\(\[].*?[\)\]]", "", chapter.retail)
            file.write(f"##{chapter.name}\n\n{retell}\n")