In [5]:
import os
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import re

# Define the base URL and headers
base_url = "https://lenta.ru/rubrics/culture/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Function to sanitize filenames
def sanitize_filename(title):
    # Remove any invalid characters
    return re.sub(r'[<>:"/\\|?*]', '_', title)

# Function to get articles from a specific date range
def get_articles_from_date(start_date, end_date, output_folder):
    current_date = start_date

    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    while current_date >= end_date:
        # Generate the URL for the current date
        url = f"{base_url}{current_date.strftime('%Y/%m/%d/')}"
        response = requests.get(url, headers=headers)
        
        # Check if the response is successful
        if response.status_code != 200:
            print(f"Failed to retrieve data for {current_date.strftime('%Y-%m-%d')}")
            current_date -= timedelta(days=1)
            continue
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all news items
        news_items = soup.find_all('li', class_='archive-page__item _news')

        # Prepare to write all articles for the day to a single file
        daily_content = []

        for item in news_items:
            link = item.find('a', class_='card-full-news _archive')['href']
            title = item.find('h3', class_='card-full-news__title').text.strip()
            article_url = "https://lenta.ru" + link

            # Get the content of the article
            article_response = requests.get(article_url, headers=headers)
            article_soup = BeautifulSoup(article_response.text, 'html.parser')

            # Find all <p> tags
            paragraphs = article_soup.find_all('p', class_='topic-body__content-text')
            content = '\n'.join([p.text.strip() for p in paragraphs])

            # Add the article's title and content to the daily content list
            daily_content.append(f"Title: {title}\nContent:\n{content}\n")

        # Create a single filename for the day's articles
        daily_filename = f"{current_date.strftime('%Y%m%d')}_articles.txt"
        file_path = os.path.join(output_folder, daily_filename)

        # Write all articles for the day to the file
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(daily_content))

        print(f"Saved: {file_path}")

        # Move to the previous day

        current_date -= timedelta(days=1)

# Define date range
start_date = datetime(2022,10,14)
end_date = start_date - timedelta(days=365)

# Specify output folder
output_folder = "culture_scraped_articles_22-21"

# Call the function
get_articles_from_date(start_date, end_date, output_folder)

Saved: culture_scraped_articles_22-21\20221014_articles.txt
Saved: culture_scraped_articles_22-21\20221013_articles.txt
Saved: culture_scraped_articles_22-21\20221012_articles.txt
Saved: culture_scraped_articles_22-21\20221011_articles.txt
Saved: culture_scraped_articles_22-21\20221010_articles.txt
Saved: culture_scraped_articles_22-21\20221009_articles.txt
Saved: culture_scraped_articles_22-21\20221008_articles.txt
Saved: culture_scraped_articles_22-21\20221007_articles.txt
Saved: culture_scraped_articles_22-21\20221006_articles.txt
Saved: culture_scraped_articles_22-21\20221005_articles.txt
Saved: culture_scraped_articles_22-21\20221004_articles.txt
Saved: culture_scraped_articles_22-21\20221003_articles.txt
Saved: culture_scraped_articles_22-21\20221002_articles.txt
Saved: culture_scraped_articles_22-21\20221001_articles.txt
Saved: culture_scraped_articles_22-21\20220930_articles.txt
Saved: culture_scraped_articles_22-21\20220929_articles.txt
Saved: culture_scraped_articles_22-21\20

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))