In [17]:
import os
import requests
from bs4 import BeautifulSoup
import re
import logging

# Logging setup: records go to LOG_FILE, which is truncated on every run
# because the handler opens it with mode="w".
LOG_FILE = "parser.log"
logging.basicConfig(
    handlers=[
        # Add logging.StreamHandler() to this list to also echo records
        # to the console.
        logging.FileHandler(LOG_FILE, mode="w", encoding="utf-8"),
    ],
    format="%(asctime)s [%(levelname)s] - %(message)s",
    level=logging.INFO,
)

class WikiVoyageParser:
    """Fetch selected sections of Wikivoyage articles and save them as text files.

    Articles are read through the MediaWiki ``action=parse`` API of the chosen
    language edition. For every processed place, the sections listed in
    ``TARGET_SECTIONS`` are converted to plain text and written to
    ``<save_path>/<country>/<place>.txt``.
    """

    # Section headings (compared verbatim with the API's "line" field) that
    # are exported to disk.
    TARGET_SECTIONS = ("–î–æ—Å—Ç–æ–ø—Ä–∏–º–µ—á–∞—Ç–µ–ª—å–Ω–æ—Å—Ç–∏", "–ß–µ–º –∑–∞–Ω—è—Ç—å—Å—è")

    # Seconds to wait for an API response; without a timeout ``requests``
    # may block forever on a stalled connection.
    timeout = 10

    def __init__(self, language="ru", save_path="data", timeout=None):
        """Configure the API endpoint and make sure the output folder exists.

        Args:
            language: Subdomain of wikivoyage.org to query (e.g. ``"ru"``).
            save_path: Root directory for the generated text files; created
                if it does not exist yet.
            timeout: Optional per-request timeout in seconds. ``None`` keeps
                the class-level default.
        """
        self.api_url = f"https://{language}.wikivoyage.org/w/api.php"
        self.save_path = save_path
        if timeout is not None:
            self.timeout = timeout
        os.makedirs(save_path, exist_ok=True)

    def _api_get(self, params):
        """Send a GET request to the API and return the decoded JSON body.

        Network errors, HTTP error statuses and non-JSON responses are logged
        and reported as an empty dict, so one failing page does not abort a
        whole batch run.
        """
        try:
            response = requests.get(self.api_url, params=params, timeout=self.timeout)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as err:
            # ValueError covers JSON decoding failures on older ``requests``
            # versions, where the decode error is not a RequestException.
            logging.error(
                "Request to %s failed (params=%s): %s", self.api_url, params, err
            )
            return {}

    def get_page_sections(self, title):
        """Return the list of section descriptors of the article ``title``.

        Returns:
            The ``parse.sections`` list from the API response, or an empty
            list when the response has no such entry or the request failed.
        """
        params = {
            "action": "parse",
            "format": "json",
            "page": title,
            "prop": "sections"
        }
        data = self._api_get(params)

        sections = data.get("parse", {}).get("sections", [])
        if sections:
            logging.info(f"üìë –ù–∞–π–¥–µ–Ω—ã —Ä–∞–∑–¥–µ–ª—ã –¥–ª—è {title}!")
        else:
            logging.warning(f"‚ö†Ô∏è –†–∞–∑–¥–µ–ª—ã –¥–ª—è {title} –Ω–µ –Ω–∞–π–¥–µ–Ω—ã!")

        return sections

    def get_section_content(self, title, section_id, section_name):
        """Return the plain text of one section of the article ``title``.

        Args:
            title: Article title.
            section_id: Section index passed to the API as ``section``.
            section_name: Section heading; a leading copy of it is removed
                from the resulting text.

        Returns:
            The cleaned section text; empty when the response carries no
            HTML for the section or the request failed.
        """
        params = {
            "action": "parse",
            "format": "json",
            "page": title,
            "prop": "text",
            "section": section_id
        }
        data = self._api_get(params)
        section_html = data.get("parse", {}).get("text", {}).get("*", "")

        clean_text = self.clean_html(section_html)
        return self.remove_section_redundancy(clean_text, section_name)

    def clean_html(self, html_text):
        """Convert a section's HTML into plain text, one paragraph per block.

        Tables, superscripts, styles and scripts are dropped. The text of
        every ``<p>``, ``<li>``, ``<h3>`` and ``<h2>`` element becomes one
        block, and blocks are separated by a blank line.
        """
        soup = BeautifulSoup(html_text, "html.parser")

        for tag in soup.find_all(["table", "sup", "style", "script"]):
            tag.decompose()

        # NOTE: find_all() also returns matching elements nested inside other
        # matches (e.g. a <p> inside an <li>), so such text is emitted once
        # per matching element.
        blocks = []
        for el in soup.find_all(["p", "li", "h3", "h2"]):
            txt = el.get_text(" ", strip=True)
            if txt:
                blocks.append(txt)

        text = "\n\n".join(blocks)

        # Drop every square-bracketed fragment, such as the "[edit]" links
        # rendered next to headings.
        text = re.sub(r'\[\s*[^]]*\]', '', text)

        return text.strip()

    def remove_section_redundancy(self, text, section_name):
        """Strip a leading, case-insensitive copy of ``section_name`` from ``text``.

        The heading is matched literally: it is escaped before being placed
        in the pattern, so titles containing regex metacharacters (``?``,
        ``(``, ``+`` ...) neither raise ``re.error`` nor match unintended text.
        """
        pattern = rf'^{re.escape(section_name)}\s*'
        return re.sub(pattern, '', text, count=1, flags=re.IGNORECASE).strip()

    def save_to_txt(self, country, place, sections):
        """Write the target sections of ``place`` to ``<save_path>/<country>/<place>.txt``.

        The file always starts with the upper-cased place name. Each section
        whose heading is in ``TARGET_SECTIONS`` and whose text is non-empty
        follows under a ``### <heading>`` line.

        Args:
            country: Name of the sub-directory the file is stored in.
            place: Article title; also used as the file name.
            sections: Section descriptors with ``"line"`` (heading) and
                ``"index"`` (section id) keys.
        """
        place_path = os.path.join(self.save_path, country)
        os.makedirs(place_path, exist_ok=True)

        file_path = os.path.join(place_path, f"{place}.txt")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(f"{place.upper()}\n\n")  # file header

            for sec in sections:
                section_title = sec["line"]
                section_id = sec["index"]

                if section_title not in self.TARGET_SECTIONS:
                    continue

                content = self.get_section_content(place, section_id, section_title)
                if content.strip():
                    f.write(f"### {section_title}\n\n")
                    f.write(content + "\n\n")

        logging.info(f"‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: {file_path}")

    def process_place(self, country, place):
        """Fetch the section list of ``place`` and save its target sections.

        Nothing is written when no sections could be obtained; an error is
        logged instead.
        """
        logging.info(f"üîç –û–±—Ä–∞–±–∞—Ç—ã–≤–∞—é: {place} ({country})")
        sections = self.get_page_sections(place)

        if not sections:
            logging.error(f"‚ùå –ù–µ —É–¥–∞–ª–æ—Å—å –ø–æ–ª—É—á–∏—Ç—å —Ä–∞–∑–¥–µ–ª—ã –¥–ª—è {place}")
            return

        self.save_to_txt(country, place, sections)
        logging.info(f"‚úÖ {place} ({country}) —Å–æ—Ö—Ä–∞–Ω–µ–Ω!")

    def process_all(self, locations):
        """Process every country and each of its cities.

        Args:
            locations: Mapping of country name to an iterable of city names.
                The country's own article is processed before its cities.
        """
        for country, cities in locations.items():
            logging.info(f"\nüåç –û–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ–º —Å—Ç—Ä–∞–Ω—É: {country}")

            # The country article itself is stored next to its cities.
            self.process_place(country, country)

            for city in cities:
                self.process_place(country, city)

        logging.info("üéâ –í—Å–µ —Å—Ç—Ä–∞–Ω—ã –∏ –≥–æ—Ä–æ–¥–∞ –æ–±—Ä–∞–±–æ—Ç–∞–Ω—ã –∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã!")

# Countries mapped to the cities that are scraped for each of them.
locations_to_parse = {
    "–†–æ—Å—Å–∏—è": ["–ú–æ—Å–∫–≤–∞", "–°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥", "–ö–∞–∑–∞–Ω—å"],
    "–§—Ä–∞–Ω—Ü–∏—è": ["–ü–∞—Ä–∏–∂", "–õ–∏–æ–Ω"],
    "–ì–µ—Ä–º–∞–Ω–∏—è": ["–ë–µ—Ä–ª–∏–Ω", "–ú—é–Ω—Ö–µ–Ω"],
    "–ò—Ç–∞–ª–∏—è": ["–†–∏–º", "–í–µ–Ω–µ—Ü–∏—è", "–§–ª–æ—Ä–µ–Ω—Ü–∏—è"],
}

# Create a parser for the "ru" Wikivoyage edition and run it over every
# country and city listed above.
parser = WikiVoyageParser(language="ru")
parser.process_all(locations_to_parse)

In [2]:
import shutil

def recurcive_deletion(folder_name):
    """Delete the directory ``folder_name`` together with everything inside it.

    Errors raised by ``shutil.rmtree`` (for example when the directory does
    not exist) propagate to the caller.
    """
    shutil.rmtree(path=folder_name)

# Remove the ./data directory and all files stored in it.
recurcive_deletion(folder_name="./data")