In [6]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Base URL of the paginated article listing for this category; the page
# number is appended directly to it to form each listing-page URL.
base_url = "https://www.krugosvet.ru/enc/medicina/page/"

def fetch_article_text(article_url, timeout=10):
    """
    Fetch an article page and return the text of its paragraphs.

    Args:
    - article_url: The full URL of the article.
    - timeout: Seconds to wait for the server before giving up
      (passed to requests.get). Defaults to 10.

    Returns:
    - A string with the text of every <p> element on the page joined by
      newlines, or None if the response status is not 200 or any
      exception is raised while fetching or parsing.
    """
    try:
        # Without an explicit timeout requests waits indefinitely, so a
        # single unresponsive server would stall the whole scraping run.
        response = requests.get(article_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {article_url}. Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        # Every <p> on the page is collected; this assumes the article body
        # lives in <p> tags, so any other paragraphs on the page come along.
        return '\n'.join(para.get_text() for para in soup.find_all('p'))
    except Exception as e:
        # Best-effort: report the problem and let the caller move on.
        print(f"Error occurred while fetching {article_url}: {str(e)}")
        return None

def save_article(title, content, folder):
    """
    Save the article content to a .txt file named after the title.

    Args:
    - title: The title of the article; used to build the file name.
    - content: The article content.
    - folder: The folder where the article should be saved. It is not
      created here, so it must already exist.

    Characters that are not allowed in Windows file names
    (< > : " / \\ | ? * and control characters) are replaced with "_".
    A title that is empty after cleaning falls back to "Untitled".
    An existing file with the same name is overwritten.
    """
    import re  # local import: the notebook's import cell does not include re

    # Clean the title for safe file naming on every platform, not just for
    # path separators and colons.
    safe_title = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", title).strip() or "Untitled"
    file_path = os.path.join(folder, f"{safe_title}.txt")

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"Saved '{title}' to {file_path}")

def scrape_articles_from_page(page_url, folder, timeout=10):
    """
    Scrape article links from a listing page and save each article's text.

    Args:
    - page_url: The URL of the page listing articles.
    - folder: Folder to save the articles; created if it does not exist.
    - timeout: Seconds to wait for the listing page before giving up
      (passed to requests.get). Defaults to 10.

    Failures are printed rather than raised. A failure on one article
    does not stop the remaining articles on the page from being processed.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)

    try:
        # An explicit timeout keeps one unresponsive page from hanging the run.
        response = requests.get(page_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to access {page_url}. Status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all divs with class 'article-teaser'
        articles = soup.find_all('div', class_='article-teaser')
    except Exception as e:
        print(f"Error occurred while scraping {page_url}: {str(e)}")
        return

    for article in articles:
        # Each article is handled in its own try block so that one bad
        # teaser (unwritable file name, malformed markup, ...) is reported
        # and skipped instead of aborting the rest of the page.
        try:
            # href=True skips anchors that carry no href attribute, which
            # would otherwise raise KeyError on the lookup below.
            link_tag = article.find('a', href=True)
            if not link_tag:
                continue

            article_url = urljoin(page_url, link_tag['href'])
            title_tag = article.find('div', class_='at-title')
            title = title_tag.get_text(strip=True) if title_tag else 'Untitled'

            print(f"Fetching article: {title}")

            content = fetch_article_text(article_url)
            if content:
                save_article(title, content, folder)
        except Exception as e:
            print(f"Error occurred while processing an article on {page_url}: {str(e)}")

# Main script execution
if __name__ == "__main__":
    # Folder to save the scraped articles
    output_folder = "krugosvet_articles"

    # Listing pages to scrape, inclusive on both ends (pages 1 to 49).
    FIRST_PAGE = 1
    LAST_PAGE = 49

    for page_num in range(FIRST_PAGE, LAST_PAGE + 1):
        # A listing-page URL is the base URL followed by the page number.
        page_url = f"{base_url}{page_num}"
        print(f"Scraping articles from {page_url}")
        scrape_articles_from_page(page_url, output_folder)


Scraping articles from https://www.krugosvet.ru/enc/medicina/page/1
Fetching article: АЛЬЦГЕЙМЕРА БОЛЕЗНЬ
Saved 'АЛЬЦГЕЙМЕРА БОЛЕЗНЬ' to krugosvet_articles\АЛЬЦГЕЙМЕРА БОЛЕЗНЬ.txt
Fetching article: АМНЕЗИЯ
Saved 'АМНЕЗИЯ' to krugosvet_articles\АМНЕЗИЯ.txt
Fetching article: АНАТОМИЯ СРАВНИТЕЛЬНАЯ
Saved 'АНАТОМИЯ СРАВНИТЕЛЬНАЯ' to krugosvet_articles\АНАТОМИЯ СРАВНИТЕЛЬНАЯ.txt
Fetching article: АНАТОМИЯ ЧЕЛОВЕКА
Saved 'АНАТОМИЯ ЧЕЛОВЕКА' to krugosvet_articles\АНАТОМИЯ ЧЕЛОВЕКА.txt
Fetching article: АНГИНА
Saved 'АНГИНА' to krugosvet_articles\АНГИНА.txt
Fetching article: АНГИОМА
Saved 'АНГИОМА' to krugosvet_articles\АНГИОМА.txt
Fetching article: АНЕВРИЗМА
Saved 'АНЕВРИЗМА' to krugosvet_articles\АНЕВРИЗМА.txt
Fetching article: АНЕМИЯ
Saved 'АНЕМИЯ' to krugosvet_articles\АНЕМИЯ.txt
Fetching article: АНЕМИЯ СЕРПОВИДНОКЛЕТОЧНАЯ
Saved 'АНЕМИЯ СЕРПОВИДНОКЛЕТОЧНАЯ' to krugosvet_articles\АНЕМИЯ СЕРПОВИДНОКЛЕТОЧНАЯ.txt
Fetching article: АНЕСТЕЗИЯ
Saved 'АНЕСТЕЗИЯ' to krugosvet_articles\АНЕСТЕЗИЯ.tx

In [3]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Base URL of the paginated article listing for this category; the page
# number is appended directly to it to form each listing-page URL.
base_url = "https://www.krugosvet.ru/enc/sociologiya/page/"

def fetch_article_text(article_url, timeout=10):
    """
    Fetch an article page and return the text of its paragraphs.

    Args:
    - article_url: The full URL of the article.
    - timeout: Seconds to wait for the server before giving up
      (passed to requests.get). Defaults to 10.

    Returns:
    - A string with the text of every <p> element on the page joined by
      newlines, or None if the response status is not 200 or any
      exception is raised while fetching or parsing.
    """
    try:
        # Without an explicit timeout requests waits indefinitely, so a
        # single unresponsive server would stall the whole scraping run.
        response = requests.get(article_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {article_url}. Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        # Every <p> on the page is collected; this assumes the article body
        # lives in <p> tags, so any other paragraphs on the page come along.
        return '\n'.join(para.get_text() for para in soup.find_all('p'))
    except Exception as e:
        # Best-effort: report the problem and let the caller move on.
        print(f"Error occurred while fetching {article_url}: {str(e)}")
        return None

def save_article(title, content, folder):
    """
    Save the article content to a .txt file named after the title.

    Args:
    - title: The title of the article; used to build the file name.
    - content: The article content.
    - folder: The folder where the article should be saved. It is not
      created here, so it must already exist.

    Characters that are not allowed in Windows file names
    (< > : " / \\ | ? * and control characters) are replaced with "_".
    A title that is empty after cleaning falls back to "Untitled".
    An existing file with the same name is overwritten.
    """
    import re  # local import: the notebook's import cell does not include re

    # Clean the title for safe file naming on every platform, not just for
    # path separators and colons.
    safe_title = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", title).strip() or "Untitled"
    file_path = os.path.join(folder, f"{safe_title}.txt")

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"Saved '{title}' to {file_path}")

def scrape_articles_from_page(page_url, folder, timeout=10):
    """
    Scrape article links from a listing page and save each article's text.

    Args:
    - page_url: The URL of the page listing articles.
    - folder: Folder to save the articles; created if it does not exist.
    - timeout: Seconds to wait for the listing page before giving up
      (passed to requests.get). Defaults to 10.

    Failures are printed rather than raised. A failure on one article
    does not stop the remaining articles on the page from being processed.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)

    try:
        # An explicit timeout keeps one unresponsive page from hanging the run.
        response = requests.get(page_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to access {page_url}. Status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all divs with class 'article-teaser'
        articles = soup.find_all('div', class_='article-teaser')
    except Exception as e:
        print(f"Error occurred while scraping {page_url}: {str(e)}")
        return

    for article in articles:
        # Each article is handled in its own try block so that one bad
        # teaser (unwritable file name, malformed markup, ...) is reported
        # and skipped instead of aborting the rest of the page.
        try:
            # href=True skips anchors that carry no href attribute, which
            # would otherwise raise KeyError on the lookup below.
            link_tag = article.find('a', href=True)
            if not link_tag:
                continue

            article_url = urljoin(page_url, link_tag['href'])
            title_tag = article.find('div', class_='at-title')
            title = title_tag.get_text(strip=True) if title_tag else 'Untitled'

            print(f"Fetching article: {title}")

            content = fetch_article_text(article_url)
            if content:
                save_article(title, content, folder)
        except Exception as e:
            print(f"Error occurred while processing an article on {page_url}: {str(e)}")

# Main script execution
if __name__ == "__main__":
    # Folder to save the scraped articles
    output_folder = "krugosvet_articles"

    # Listing pages to scrape, inclusive on both ends (pages 1 to 15).
    FIRST_PAGE = 1
    LAST_PAGE = 15

    for page_num in range(FIRST_PAGE, LAST_PAGE + 1):
        # A listing-page URL is the base URL followed by the page number.
        page_url = f"{base_url}{page_num}"
        print(f"Scraping articles from {page_url}")
        scrape_articles_from_page(page_url, output_folder)


Scraping articles from https://www.krugosvet.ru/enc/sociologiya/page/1
Fetching article: БОДРИЙЯР, ЖАН
Saved 'БОДРИЙЯР, ЖАН' to krugosvet_articles\БОДРИЙЯР, ЖАН.txt
Fetching article: БОЛЧ, ЭМИЛИ ГРИН
Saved 'БОЛЧ, ЭМИЛИ ГРИН' to krugosvet_articles\БОЛЧ, ЭМИЛИ ГРИН.txt
Fetching article: БУРДЬЕ, ПЬЕР
Saved 'БУРДЬЕ, ПЬЕР' to krugosvet_articles\БУРДЬЕ, ПЬЕР.txt
Fetching article: БУТ, ЧАРЛЗ
Saved 'БУТ, ЧАРЛЗ' to krugosvet_articles\БУТ, ЧАРЛЗ.txt
Fetching article: БЮРОКРАТИЯ
Saved 'БЮРОКРАТИЯ' to krugosvet_articles\БЮРОКРАТИЯ.txt
Fetching article: ВАЛЛЕРСТАЙН, ИММАНУИЛ
Saved 'ВАЛЛЕРСТАЙН, ИММАНУИЛ' to krugosvet_articles\ВАЛЛЕРСТАЙН, ИММАНУИЛ.txt
Fetching article: ВЕБЕР, МАКС
Saved 'ВЕБЕР, МАКС' to krugosvet_articles\ВЕБЕР, МАКС.txt
Fetching article: ВЕСТЕРМАРК, ЭДВАРД АЛЕКСАНДР
Saved 'ВЕСТЕРМАРК, ЭДВАРД АЛЕКСАНДР' to krugosvet_articles\ВЕСТЕРМАРК, ЭДВАРД АЛЕКСАНДР.txt
Fetching article: ВОЗРАСТ
Saved 'ВОЗРАСТ' to krugosvet_articles\ВОЗРАСТ.txt
Fetching article: ВОРОНЦОВ, ВАСИЛИЙ ПАВЛОВИЧ
Saved

In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Base URL of the paginated article listing for this category; the page
# number is appended directly to it to form each listing-page URL.
base_url = "https://www.krugosvet.ru/enc/filosofiya/page/"

def fetch_article_text(article_url, timeout=10):
    """
    Fetch an article page and return the text of its paragraphs.

    Args:
    - article_url: The full URL of the article.
    - timeout: Seconds to wait for the server before giving up
      (passed to requests.get). Defaults to 10.

    Returns:
    - A string with the text of every <p> element on the page joined by
      newlines, or None if the response status is not 200 or any
      exception is raised while fetching or parsing.
    """
    try:
        # Without an explicit timeout requests waits indefinitely, so a
        # single unresponsive server would stall the whole scraping run.
        response = requests.get(article_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {article_url}. Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        # Every <p> on the page is collected; this assumes the article body
        # lives in <p> tags, so any other paragraphs on the page come along.
        return '\n'.join(para.get_text() for para in soup.find_all('p'))
    except Exception as e:
        # Best-effort: report the problem and let the caller move on.
        print(f"Error occurred while fetching {article_url}: {str(e)}")
        return None

def save_article(title, content, folder):
    """
    Save the article content to a .txt file named after the title.

    Args:
    - title: The title of the article; used to build the file name.
    - content: The article content.
    - folder: The folder where the article should be saved. It is not
      created here, so it must already exist.

    Characters that are not allowed in Windows file names
    (< > : " / \\ | ? * and control characters) are replaced with "_".
    A title that is empty after cleaning falls back to "Untitled".
    An existing file with the same name is overwritten.
    """
    import re  # local import: the notebook's import cell does not include re

    # Clean the title for safe file naming on every platform, not just for
    # path separators and colons.
    safe_title = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", title).strip() or "Untitled"
    file_path = os.path.join(folder, f"{safe_title}.txt")

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"Saved '{title}' to {file_path}")

def scrape_articles_from_page(page_url, folder, timeout=10):
    """
    Scrape article links from a listing page and save each article's text.

    Args:
    - page_url: The URL of the page listing articles.
    - folder: Folder to save the articles; created if it does not exist.
    - timeout: Seconds to wait for the listing page before giving up
      (passed to requests.get). Defaults to 10.

    Failures are printed rather than raised. A failure on one article
    does not stop the remaining articles on the page from being processed.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)

    try:
        # An explicit timeout keeps one unresponsive page from hanging the run.
        response = requests.get(page_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to access {page_url}. Status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all divs with class 'article-teaser'
        articles = soup.find_all('div', class_='article-teaser')
    except Exception as e:
        print(f"Error occurred while scraping {page_url}: {str(e)}")
        return

    for article in articles:
        # Each article is handled in its own try block so that one bad
        # teaser (unwritable file name, malformed markup, ...) is reported
        # and skipped instead of aborting the rest of the page.
        try:
            # href=True skips anchors that carry no href attribute, which
            # would otherwise raise KeyError on the lookup below.
            link_tag = article.find('a', href=True)
            if not link_tag:
                continue

            article_url = urljoin(page_url, link_tag['href'])
            title_tag = article.find('div', class_='at-title')
            title = title_tag.get_text(strip=True) if title_tag else 'Untitled'

            print(f"Fetching article: {title}")

            content = fetch_article_text(article_url)
            if content:
                save_article(title, content, folder)
        except Exception as e:
            print(f"Error occurred while processing an article on {page_url}: {str(e)}")

# Main script execution
if __name__ == "__main__":
    # Folder to save the scraped articles
    output_folder = "krugosvet_articles"

    # Listing pages to scrape, inclusive on both ends (pages 17 to 93).
    FIRST_PAGE = 17
    LAST_PAGE = 93

    for page_num in range(FIRST_PAGE, LAST_PAGE + 1):
        # A listing-page URL is the base URL followed by the page number.
        page_url = f"{base_url}{page_num}"
        print(f"Scraping articles from {page_url}")
        scrape_articles_from_page(page_url, output_folder)


Scraping articles from https://www.krugosvet.ru/enc/filosofiya/page/17
Fetching article: ВАН ЦЗИ
Saved 'ВАН ЦЗИ' to krugosvet_articles\ВАН ЦЗИ.txt
Fetching article: ВАН ЧЖУН
Saved 'ВАН ЧЖУН' to krugosvet_articles\ВАН ЧЖУН.txt
Fetching article: ВАН ЧУАНЬШАНЬ
Saved 'ВАН ЧУАНЬШАНЬ' to krugosvet_articles\ВАН ЧУАНЬШАНЬ.txt
Fetching article: ВАН ЧУН
Saved 'ВАН ЧУН' to krugosvet_articles\ВАН ЧУН.txt
Fetching article: ВАН ЯНМИН
Saved 'ВАН ЯНМИН' to krugosvet_articles\ВАН ЯНМИН.txt
Fetching article: ВАН ЯНМИНА ШКОЛА
Saved 'ВАН ЯНМИНА ШКОЛА' to krugosvet_articles\ВАН ЯНМИНА ШКОЛА.txt
Fetching article: ВАРНЫ
Saved 'ВАРНЫ' to krugosvet_articles\ВАРНЫ.txt
Fetching article: ВАСУБАНДХУ
Saved 'ВАСУБАНДХУ' to krugosvet_articles\ВАСУБАНДХУ.txt
Fetching article: ВАТСИПУТРИЯ
Saved 'ВАТСИПУТРИЯ' to krugosvet_articles\ВАТСИПУТРИЯ.txt
Fetching article: ВАЧАСПАТИ МИШРА
Saved 'ВАЧАСПАТИ МИШРА' to krugosvet_articles\ВАЧАСПАТИ МИШРА.txt
Scraping articles from https://www.krugosvet.ru/enc/filosofiya/page/18
Fetch

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Base URL of the paginated article listing for this category; the page
# number is appended directly to it to form each listing-page URL.
base_url = "https://www.krugosvet.ru/enc/istoriya/page/"

def fetch_article_text(article_url, timeout=10):
    """
    Fetch an article page and return the text of its paragraphs.

    Args:
    - article_url: The full URL of the article.
    - timeout: Seconds to wait for the server before giving up
      (passed to requests.get). Defaults to 10.

    Returns:
    - A string with the text of every <p> element on the page joined by
      newlines, or None if the response status is not 200 or any
      exception is raised while fetching or parsing.
    """
    try:
        # Without an explicit timeout requests waits indefinitely, so a
        # single unresponsive server would stall the whole scraping run.
        response = requests.get(article_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {article_url}. Status code: {response.status_code}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')
        # Every <p> on the page is collected; this assumes the article body
        # lives in <p> tags, so any other paragraphs on the page come along.
        return '\n'.join(para.get_text() for para in soup.find_all('p'))
    except Exception as e:
        # Best-effort: report the problem and let the caller move on.
        print(f"Error occurred while fetching {article_url}: {str(e)}")
        return None

def save_article(title, content, folder):
    """
    Save the article content to a .txt file named after the title.

    Args:
    - title: The title of the article; used to build the file name.
    - content: The article content.
    - folder: The folder where the article should be saved. It is not
      created here, so it must already exist.

    Characters that are not allowed in Windows file names
    (< > : " / \\ | ? * and control characters) are replaced with "_".
    A title that is empty after cleaning falls back to "Untitled".
    An existing file with the same name is overwritten.
    """
    import re  # local import: the notebook's import cell does not include re

    # Clean the title for safe file naming on every platform, not just for
    # path separators and colons.
    safe_title = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", title).strip() or "Untitled"
    file_path = os.path.join(folder, f"{safe_title}.txt")

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

    print(f"Saved '{title}' to {file_path}")

def scrape_articles_from_page(page_url, folder, timeout=10):
    """
    Scrape article links from a listing page and save each article's text.

    Args:
    - page_url: The URL of the page listing articles.
    - folder: Folder to save the articles; created if it does not exist.
    - timeout: Seconds to wait for the listing page before giving up
      (passed to requests.get). Defaults to 10.

    Failures are printed rather than raised. A failure on one article
    does not stop the remaining articles on the page from being processed.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder, exist_ok=True)

    try:
        # An explicit timeout keeps one unresponsive page from hanging the run.
        response = requests.get(page_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to access {page_url}. Status code: {response.status_code}")
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        # Find all divs with class 'article-teaser'
        articles = soup.find_all('div', class_='article-teaser')
    except Exception as e:
        print(f"Error occurred while scraping {page_url}: {str(e)}")
        return

    for article in articles:
        # Each article is handled in its own try block so that one bad
        # teaser (unwritable file name, malformed markup, ...) is reported
        # and skipped instead of aborting the rest of the page.
        try:
            # href=True skips anchors that carry no href attribute, which
            # would otherwise raise KeyError on the lookup below.
            link_tag = article.find('a', href=True)
            if not link_tag:
                continue

            article_url = urljoin(page_url, link_tag['href'])
            title_tag = article.find('div', class_='at-title')
            title = title_tag.get_text(strip=True) if title_tag else 'Untitled'

            print(f"Fetching article: {title}")

            content = fetch_article_text(article_url)
            if content:
                save_article(title, content, folder)
        except Exception as e:
            print(f"Error occurred while processing an article on {page_url}: {str(e)}")

# Main script execution
if __name__ == "__main__":
    # Folder to save the scraped articles
    output_folder = "krugosvet_articles"

    # Listing pages to scrape, inclusive on both ends (pages 1 to 179).
    FIRST_PAGE = 1
    LAST_PAGE = 179

    for page_num in range(FIRST_PAGE, LAST_PAGE + 1):
        # A listing-page URL is the base URL followed by the page number.
        page_url = f"{base_url}{page_num}"
        print(f"Scraping articles from {page_url}")
        scrape_articles_from_page(page_url, output_folder)

Scraping articles from https://www.krugosvet.ru/enc/istoriya/page/1
Fetching article: АБДУРРАХМАН
Saved 'АБДУРРАХМАН' to krugosvet_articles\АБДУРРАХМАН.txt
Fetching article: АБСОЛЮТИЗМ
Saved 'АБСОЛЮТИЗМ' to krugosvet_articles\АБСОЛЮТИЗМ.txt
Fetching article: АБУ АЛЬ-ФИДА
Saved 'АБУ АЛЬ-ФИДА' to krugosvet_articles\АБУ АЛЬ-ФИДА.txt
Fetching article: АВВАКУМ
Saved 'АВВАКУМ' to krugosvet_articles\АВВАКУМ.txt
Fetching article: АВГУСТ
Saved 'АВГУСТ' to krugosvet_articles\АВГУСТ.txt
Fetching article: АВЕСТИЙСКИЙ ЯЗЫК
Saved 'АВЕСТИЙСКИЙ ЯЗЫК' to krugosvet_articles\АВЕСТИЙСКИЙ ЯЗЫК.txt
Fetching article: АВИЛА КАМАЧО, МАНУЭЛЬ
Saved 'АВИЛА КАМАЧО, МАНУЭЛЬ' to krugosvet_articles\АВИЛА КАМАЧО, МАНУЭЛЬ.txt
Fetching article: АВРЕЛИАН, ЛУЦИЙ ДОМИЦИЙ
Saved 'АВРЕЛИАН, ЛУЦИЙ ДОМИЦИЙ' to krugosvet_articles\АВРЕЛИАН, ЛУЦИЙ ДОМИЦИЙ.txt
Fetching article: АВСТРО-ПРУССКАЯ И АВСТРО-ИТАЛЬЯНСКАЯ ВОЙНЫ
Saved 'АВСТРО-ПРУССКАЯ И АВСТРО-ИТАЛЬЯНСКАЯ ВОЙНЫ' to krugosvet_articles\АВСТРО-ПРУССКАЯ И АВСТРО-ИТАЛЬЯНСКАЯ ВОЙ