In [16]:
import typing as tp
import requests
import os
from bs4 import BeautifulSoup

from tqdm import tqdm
import numpy as np
import pandas as pd

In [40]:
def _repair_mojibake(text):
    """Reverse UTF-8 text that was mis-decoded as cp1252; return None if that fails."""
    if "Å" not in text:
        return text
    try:
        return text.encode('cp1252').decode()
    except UnicodeError:
        # Covers both UnicodeEncodeError (e.g. U+FFFD replacement characters,
        # which cp1252 cannot represent) and UnicodeDecodeError.
        return None


def _split_into_verses(poem, lines_in_verse, max_verse_chars):
    """Group the multi-word lines of `poem` into newline-terminated verses."""
    # Strip first and count words afterwards, so whitespace-only lines and
    # indented single words are not mistaken for multi-word lines.
    stripped_lines = (raw.strip() for raw in poem.split('\n'))
    lines = [line for line in stripped_lines if len(line.split()) > 1]
    if len(lines) % lines_in_verse:
        # The poem does not divide evenly into verses; reject it entirely.
        return []

    verses = []
    for start in range(0, len(lines), lines_in_verse):
        verse = '\n'.join(lines[start:start + lines_in_verse]) + '\n'
        if len(verse) < max_verse_chars:
            verses.append(verse)
    return verses


def mine(url, author, *, output_dir="data/poems/pl", timeout=10.0,
         lines_in_verse=4, max_verse_chars=200):
    """Scrape the poems linked from an author index page into a CSV of verses.

    The index page at `url` is fetched, every link returned by `get_links` is
    downloaded, and each poem is split into verses of `lines_in_verse` lines.
    Poems whose usable line count is not a multiple of `lines_in_verse` are
    dropped, as are verses of `max_verse_chars` characters or more. The result
    is written to `<output_dir>/<author>.csv`, one verse per row, without
    index or header.

    Args:
        url: Address of the author's index page; also the prefix that poem
            links must start with.
        author: Base name of the output CSV file.
        output_dir: Directory that receives the CSV; created if missing.
        timeout: Timeout in seconds passed to every HTTP request.
        lines_in_verse: Number of lines joined into one verse.
        max_verse_chars: Exclusive upper bound on verse length, counted with
            the trailing newline.

    Returns:
        None. Nothing is written when the index page does not answer with
        HTTP 200.

    Raises:
        requests.exceptions.RequestException: If the index page itself cannot
            be fetched. Failures on individual poem pages are reported and
            skipped instead.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Skipping {author}: {url} returned HTTP {response.status_code}.")
        return

    soup = BeautifulSoup(response.content, 'html.parser')
    links = get_links(soup, url)

    # Make sure the directory the CSV is written to exists before spending
    # time on scraping (an empty output_dir means the current directory).
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    content = []
    for link in tqdm(links):
        try:
            page = requests.get(link, timeout=timeout)
        except requests.exceptions.RequestException:
            # Includes timeouts, which are not ConnectionError subclasses.
            print("Failed to connect with %s." % link)
            continue
        if page.status_code != 200:
            print(f"Skipping {link}: HTTP {page.status_code}.")
            continue

        poem_soup = BeautifulSoup(page.content, 'html.parser')
        try:
            title, poem = get_poem(poem_soup)
        except AttributeError:
            # get_poem dereferences the <title> and itemprop="text" nodes;
            # a page lacking either one is not a poem page.
            print(f"Skipping {link}: no poem text found.")
            continue

        poem = _repair_mojibake(poem)
        if poem is None:
            print(f"Skipping {link}: could not repair text encoding.")
            continue

        # Titles carrying these markers are skipped; presumably they flag
        # non-Polish versions of a poem (TODO confirm against the site).
        if "english" in title or "esperanto" in title or "[en]" in title:
            continue

        content.extend(_split_into_verses(poem, lines_in_verse, max_verse_chars))

    dataframe = pd.DataFrame(content)
    dataframe.to_csv(os.path.join(output_dir, f"{author}.csv"),
                     encoding='UTF-8', index=False, header=False)
    # The count is the number of verses (CSV rows) collected.
    print(f"Saved {len(content)} poems")


def get_links(soup, url):
    """Return the hrefs of all anchors in `soup` that point below `url`.

    An href is kept when it starts with `url` but not with `url` followed by
    '#', so links to fragments of the index page itself are left out. Anchors
    without an href attribute are ignored.

    Args:
        soup: Parsed document exposing `find_all('a')`.
        url: Prefix that accepted links must start with.

    Returns:
        List of href strings in document order (duplicates are preserved).
    """
    fragment_prefix = f"{url}#"
    links = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # <a> tags without an href return None here; calling .startswith on
        # that would raise AttributeError.
        if href is None:
            continue
        if href.startswith(url) and not href.startswith(fragment_prefix):
            links.append(href)
    return links


def get_poem(soup):
    """Extract the page title and the poem body from a parsed poem page.

    The title is the text of the document's <title> element; the body is the
    text of the first element whose `itemprop` attribute equals "text".

    Returns:
        A `(title, poem)` tuple of strings.

    Raises:
        AttributeError: If the page has no <title> or no matching element.
    """
    page_title = soup.title.text
    text_node = soup.find(attrs={"itemprop": "text"})
    return page_title, text_node.text

# Scrape one author per run: the first argument is the author's index page on
# poezja.org, the second is the base name of the CSV that mine() writes.
# Calls for authors that are not scraped in this run are left commented out.
# mine('https://poezja.org/wz/Miron_Bialoszewski/', 'bialoszewski')
# mine('https://poezja.org/wz/Krzysztof_Kamil_Baczynski/', 'baczynski')
# mine('https://poezja.org/wz/Zbigniew_Herbert/', 'herbert')
# mine('https://poezja.org/wz/Bolesław_Lesmian/', 'lesmian')
# mine('https://poezja.org/wz/Wislawa_Szymborska/', 'szymborska')
# mine('https://poezja.org/wz/Adam_Mickiewicz/', 'mickiewicz')
# mine('https://poezja.org/wz/Jan_Kochanowski/', 'kochanowski')
mine('https://poezja.org/wz/Juliusz_Slowacki/', 'slowacki')

  5%|▍         | 11/236 [00:01<00:39,  5.64it/s]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
  9%|▉         | 21/236 [00:03<00:34,  6.18it/s]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
 50%|█████     | 118/236 [00:20<00:29,  4.01it/s]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
 73%|███████▎  | 172/236 [00:30<00:10,  6.29it/s]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
 77%|███████▋  | 182/236 [00:32<00:14,  3.61it/s]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
100%|██████████| 236/236 [00:41<00:00,  5.63it/s]

Saved 713 poems





In [None]:
# Plain-text source that is loaded into TEXT, and the CSV file the grouped
# verses are written to.
INPUT_FILE = "data/poems/pl/treny.txt"
OUTPUT_FILE = "data/poems/pl/treny.csv"

def load_data(path: str, excluded_start: tp.List[str]) -> tp.List[str]:
    """Read a text file and drop every line that starts with an excluded prefix.

    The file is decoded as UTF-8; a leading byte-order mark, if present, is
    removed so that it neither ends up in the first returned line nor hides a
    prefix on that line.

    Args:
        path: Path of the text file to read.
        excluded_start: Prefixes; a line starting with any of them is skipped.

    Returns:
        The remaining lines in file order, each still carrying its line ending.
    """
    prefixes = tuple(excluded_start)
    # "utf-8-sig" behaves like UTF-8 but silently consumes a leading BOM.
    with open(path, encoding="utf-8-sig") as f:
        return [line for line in f if not line.startswith(prefixes)]

# Skip comment lines ('#', also when preceded by a byte-order mark), blank
# lines and lines that start with a space. The BOM is written as an escape
# because the raw character is invisible and easily lost when editing.
TEXT = load_data(INPUT_FILE, ['#', '\ufeff#', '\n', ' '])
# Every line keeps its own newline, so print without the default space
# separator to avoid a stray leading space on each following line.
print(*TEXT[:20], sep='')

In [3]:
# Number of consecutive lines joined into one verse.
LINES_IN_VERSE = 4
# Trailing lines that do not fill a complete verse and are therefore dropped.
CUT_LINES = len(TEXT) % LINES_IN_VERSE
# Use an explicit end index instead of TEXT[:-CUT_LINES]: with CUT_LINES == 0
# that slice is TEXT[:0], which would discard every line.
GROUPED = [
    ''.join(TEXT[start:start + LINES_IN_VERSE])
    for start in range(0, len(TEXT) - CUT_LINES, LINES_IN_VERSE)
]


In [4]:
# Write the grouped verses to OUTPUT_FILE, one verse per row, with neither an
# index column nor a header line.
dataframe = pd.DataFrame(data=GROUPED)
dataframe.to_csv(
    OUTPUT_FILE,
    header=False,
    index=False,
    encoding='utf-8',
)
