In [30]:
from bs4 import BeautifulSoup
import requests
import csv
import os

In [31]:
# Maps each scraped article field to the CSS class that get_article_data()
# looks up in the article page. Iteration order is the order fields are filled.
# NOTE(review): 'art_paragraph' uses an underscore while the other class names
# use hyphens — confirm against the page markup.
html_parser = dict(
    title='art-title',
    department='art-product-name',
    author='art-author',
    date='art-datetime',
    text='art_paragraph',
)

def get_articles(url, timeout=30):
    """Download a search-results page and return its article link elements.

    Args:
        url: Address of the results page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises. Without it a single stalled connection would block
            the whole crawl indefinitely.

    Returns:
        The ``find_all`` result for elements with class ``result-item-link``
        that carry an ``href`` attribute; empty when the page has none.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    return soup.find_all(class_='result-item-link', href=True)

def get_article_data(link, timeout=30):
    """Download one article page and extract the fields listed in ``html_parser``.

    Args:
        link: URL of the article page.
        timeout: Seconds to wait for the server before ``requests`` gives up
            and raises, so a stalled connection cannot hang the crawl.

    Returns:
        A dict with keys ``title``, ``department``, ``author``, ``date``,
        ``text`` and ``link``. A single-valued field whose element is absent
        from the page stays ``None`` and a message is printed. ``text`` is
        the space-joined content of all matching paragraphs (an empty string
        when there are none).
    """
    data = {
        'title': None,
        'department': None,
        'author': None,
        'date': None,
        'text': None,
        'link': link
    }
    response = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    for key, css_class in html_parser.items():
        if key == 'text':
            # The body is spread over several elements; merge them into one string.
            paragraphs = soup.find_all(class_=css_class)
            merged = " ".join(paragraph.text for paragraph in paragraphs)
            data[key] = merged.replace('\n', "").strip()
            continue

        element = soup.find(class_=css_class)
        if element is None:
            # Explicit check instead of a bare `except:` so that interrupts and
            # unrelated errors are not swallowed and misreported as a missing key.
            print(f"Key: {key} not found in {link}")
            continue
        data[key] = element.text.replace('\n', "").strip()
    return data

In [32]:
# Create the output CSV with its header row, but only when it does not exist
# yet, so previously scraped rows are never overwritten.
csv_file = 'wyborcza.csv'
if not os.path.exists(csv_file):
    column_names = ['Title', 'Department', 'Author', 'Date', 'Text', 'Link']
    # newline='' lets the csv module control line endings (otherwise rows end
    # in '\r\r\n' on Windows); it also matches how the file is opened for
    # appending later. The encoding is pinned so it does not depend on locale.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(column_names)

In [36]:
# Walk the archive search results page by page and append one CSV row per
# article until a results page contains no article links.
# NOTE: the file is opened in append mode and `page` starts at 0, so running
# this cell again appends the same articles a second time.
archive_root = 'https://classic.wyborcza.pl/archiwumGW/'
# Column order of each row; matches the header: Title, Department, Author, Date, Text, Link.
row_fields = ('title', 'department', 'author', 'date', 'text', 'link')

page = 0
# UTF-8 is pinned so non-ASCII article text is written the same way on every
# platform instead of depending on the locale's default encoding.
with open('wyborcza.csv', 'a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    while True:
        print(f"Page: {page}")
        link_page = f'https://classic.wyborcza.pl/archiwumGW/0,160510.html?searchForm=&datePeriod=0&initDate=2017-01-01&endDate=2017-05-15&publicationsString=1%3B5&author=&page={page}&sort=OLDEST'

        articles = get_articles(link_page)
        if len(articles) == 0:
            print("No articles found")
            break
        for article in articles:
            link = archive_root + article['href']
            data = get_article_data(link)
            # Write each row as soon as it is fetched: if a later request on
            # this page fails, the articles already downloaded are not lost.
            writer.writerow([data[field] for field in row_fields])
        # Push the finished page to disk so an interrupted run keeps its progress.
        file.flush()
        page += 1

Page: 688
Page: 689
Page: 690
Page: 691
Page: 692
Key: author not found in https://classic.wyborcza.pl/archiwumGW/8102707/LICZBA-TYGODNIA
Page: 693
Page: 694
Key: author not found in https://classic.wyborcza.pl/archiwumGW/8103533/Marcin-Wicha-rysuje-Swietego-Oburza
Page: 695
Key: author not found in https://classic.wyborcza.pl/archiwumGW/8103530/DOBRA-NOWINA
Key: author not found in https://classic.wyborcza.pl/archiwumGW/8103910/Z-jakiego-sortu-jestes--kolego--Gorszego-
Key: author not found in https://classic.wyborcza.pl/archiwumGW/8103909/SUCHAR-WCIAZ-POZYWNY
Page: 696
Page: 697
Page: 698
Page: 699
Page: 700
Key: author not found in https://classic.wyborcza.pl/archiwumGW/8103924/KOD-ZNOW-NA-ULICACH
Page: 701
Page: 702
Page: 703
Page: 704
Key: author not found in https://classic.wyborcza.pl/archiwumGW/8104645/SOKOL-WYLADOWAL
Page: 705
Page: 706
Page: 707
Page: 708
Page: 709
Page: 710
Key: author not found in https://classic.wyborcza.pl/archiwumGW/8105756/NUMERY-LOTTO
Page: 711
Key: au