In [1]:
import ebooklib
import warnings
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", DeprecationWarning)

from ebooklib import epub
from bs4 import BeautifulSoup
import regex as re
import pandas as pd

# Tag names checked by chap2text: a text node whose parent tag is listed here
# is left out of the extracted text.
blacklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
]

def epub2thtml(epub_path):
    """Read an EPUB file and collect the content of its document items.

    Parameters
    ----------
    epub_path
        Path handed unchanged to ``epub.read_epub``.

    Returns
    -------
    list
        ``item.get_content()`` for every item whose type equals
        ``ebooklib.ITEM_DOCUMENT``, in the order ``book.get_items()``
        yields them.
    """
    book = epub.read_epub(epub_path)
    return [
        item.get_content()
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    ]

def chap2text(chap):
    """Flatten one HTML chapter into a single string of its text nodes.

    The markup is parsed with BeautifulSoup's ``html.parser``. Every text
    node whose parent tag name is not in ``blacklist`` is kept, and each kept
    node is followed by one space in the result.

    Parameters
    ----------
    chap
        Markup of one chapter, as accepted by ``BeautifulSoup``.

    Returns
    -------
    str
        The kept text nodes, concatenated in document order.
    """
    soup = BeautifulSoup(chap, 'html.parser')
    kept_nodes = (
        '{} '.format(node)
        for node in soup.find_all(text=True)
        if node.parent.name not in blacklist
    )
    return ''.join(kept_nodes)

def thtml2ttext(thtml):
    """Convert each HTML chapter in ``thtml`` to text with ``chap2text``.

    Parameters
    ----------
    thtml
        Iterable of chapter markup.

    Returns
    -------
    list
        One extracted string per input chapter, in the same order.
    """
    return [chap2text(markup) for markup in thtml]

def epub2text(epub_path):
    """Return the text of every document item in the EPUB at ``epub_path``.

    Chains ``epub2thtml`` (EPUB -> list of chapter markup) and
    ``thtml2ttext`` (markup -> text), giving one string per chapter.
    """
    return thtml2ttext(epub2thtml(epub_path))

## Extracting

In [2]:
def link_getter(i, year=2021, index_offset=405):
    """Build the archive URL for issue number ``i``.

    Parameters
    ----------
    i : int
        Issue number within the year; it appears in the ``dorzeczy-{i}-{year}``
        part of the URL.
    year : int, optional
        Year placed in the URL. Defaults to 2021, the value that used to be
        hard-coded.
    index_offset : int, optional
        Value added to ``i`` to obtain the archive index in the URL path.
        Defaults to 405, the value that used to be hard-coded.

    Returns
    -------
    str
        ``https://tygodnik.dorzeczy.pl/archiwum/{index_offset + i}/dorzeczy-{i}-{year}.html``
    """
    index = index_offset + i
    return f"https://tygodnik.dorzeczy.pl/archiwum/{index}/dorzeczy-{i}-{year}.html"

def titles_extractor(titles_raw):
    """Split a numbered listing into its individual titles.

    An entry starts with a newline, one whitespace character, a number, a
    dot and two whitespace characters. Everything before the first such
    marker is discarded, and each remaining piece is stripped.

    As a progress diagnostic the last entry number found is printed, followed
    by a space and no newline; ``0`` is printed when the input has no numbered
    entry.

    Parameters
    ----------
    titles_raw : str
        Text containing the numbered entries.

    Returns
    -------
    list of str
        The stripped titles in order of appearance; empty when no entry
        marker is present.
    """
    # Raw strings keep the regex escapes (\s, \d, \.) away from Python's own
    # string-escape processing, which treats them as invalid escapes.
    numbers = re.findall(r"(?<=\n\s)\d+(?=\.\s\s)", titles_raw)
    # Guard the [-1] lookup: without numbered entries there is no last number.
    print(numbers[-1] if numbers else 0, end=' ')
    entries = re.split(r"\n\s\d+\.\s\s", titles_raw)[1:]
    return [entry.strip() for entry in entries]

def pipeline(file):
    """Turn one EPUB file into a DataFrame of (title, article) rows.

    The EPUB is converted to a list of text chunks with ``epub2text``. The
    chunk at index 1 is parsed into titles by ``titles_extractor`` and the
    chunks from index 2 onward are used as the articles.

    The number of titles is printed. When it differs from the number of
    articles, the file name is printed too and ``zip`` pairs only as many rows
    as the shorter list provides.

    Parameters
    ----------
    file
        Path of the EPUB, passed to ``epub2text``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``article``.
    """
    chunks = epub2text(file)
    titles = titles_extractor(chunks[1])
    articles = chunks[2:]

    print(len(titles))
    if len(titles) != len(articles):
        print(file)

    rows = list(zip(titles, articles))
    return pd.DataFrame(rows, columns=['title', 'article'])

def author_extractor(article):
    """Extract the author line that follows the title inside an article.

    The article text is searched for the title on its own line
    (``"\\n <title> \\n \\n "``), followed by a run of word characters,
    spaces, dots and commas that ends right before ``" \\n \\n"``. That run
    is returned as the author.

    Parameters
    ----------
    article
        Mapping-like row (e.g. a DataFrame row) with string values under the
        keys ``'title'`` and ``'article'``.

    Returns
    -------
    str or None
        The matched author text, or ``None`` when the pattern is not found.
    """
    # Normalise non-breaking spaces on both sides; the text is normalised, so
    # a title still containing '\xa0' could never be found in it.
    title = article['title'].replace('\xa0', ' ')
    txt = article['article'].replace('\xa0', ' ')

    # The title is data, not a pattern: escape every regex metacharacter
    # (not only '*') so titles such as "Musimy być surowi?" match literally.
    pattern = (
        r"(?<=\n " + re.escape(title) + r" \n \n )"
        r"[\w .,]+"
        r"(?= \n \n)"
    )
    try:
        matched = re.search(pattern, txt)
    except re.error:
        # Should not happen with an escaped title; keep the diagnostic print.
        print(title)
        return None

    return matched[0] if matched is not None else None
    
def text_preprocessor(article):
    """Return the body of an article as a single line of text.

    The article is split on a run of seven ``" \\n"`` pairs and the segment
    after the first such run is taken as the body. Newlines and non-breaking
    spaces in the body are then replaced by ordinary spaces.

    When the separator is absent, the article is printed as a diagnostic and
    the whole article is used as the body, instead of failing on an unbound
    variable.

    Parameters
    ----------
    article : str
        Raw article text.

    Returns
    -------
    str
        The cleaned body text.
    """
    # The separator is a fixed literal, so plain str.split is sufficient.
    separator = " \n \n \n \n \n \n \n"
    parts = article.split(separator)
    if len(parts) > 1:
        text = parts[1]
    else:
        # No separator found: report the article and fall back to all of it.
        print(article)
        text = article
    return text.replace("\n", " ").replace("\xa0", " ")

In [3]:
# Build one DataFrame per EPUB issue (file numbers 6..49) and collect them.
# Each frame gets the extracted author, the cleaned text, the magazine number
# (405 + issue number) and the archive link for that issue.
df_list = []
for i in range(6, 50):
    df = pipeline(f"Epubs/dorzeczy-{i}-2021-.epub").assign(
        author=lambda frame: frame.apply(author_extractor, axis=1),
        text=lambda frame: frame['article'].apply(text_preprocessor),
        magazine_nr=405 + i,
        link=link_getter(i),
    )
    df_list.append(df)


56 56
59 59
52 52
55 55
60 60
52 52
54 54
65 65
54 54
52 52
56 56
56 56
55 55
61 61
55 55
58 58
52 52
57 57
49 49
60 60
54 54
53 53
50 50
50 50
48 48
53 53
52 52
50 50
51 51
49 49
49 49
50 50
60 60
56 56
54 54
48 48
53 53
49 49
50 50
54 54
49 49
54 54
50 50
51 51


In [4]:
# Stack the per-issue frames, tag every row with the year, drop the raw
# 'article' column and write the result to CSV; the cell displays the row count.
df_all = (
    pd.concat(df_list)
    .assign(year=2021)
    .drop(columns=['article'])
)
df_all.to_csv("from_epub.csv")
len(df_all)

2355

In [5]:
# Show the combined frame (a bare last expression is rendered as the cell output).
df_all

Unnamed: 0,title,author,text,magazine_nr,link,year
0,Młodzi wykształceni i z wielkich ośrodków,,"Kochani, rząd postanowił dokonać zamachu ...",411,https://tygodnik.dorzeczy.pl/archiwum/411/dorz...,2021
1,Tylko dla mimów,Łukasz Warzecha,"Podsłuchane | Adam, o co chodzi z tymi mimami...",411,https://tygodnik.dorzeczy.pl/archiwum/411/dorz...,2021
2,Dwaj Panowie G,"Piotr Gociek, Cezary Gmyz",Z ostatniej chwili: nikt nic nie wie! Czy ...,411,https://tygodnik.dorzeczy.pl/archiwum/411/dorz...,2021
3,Musimy być surowi?,,"Papież Franciszek przyzwyczaił wiernych do, d...",411,https://tygodnik.dorzeczy.pl/archiwum/411/dorz...,2021
4,A to Polska właśnie,Krzysztof Masłoń,"W ramach redefiniowania polskości, a co za ty...",411,https://tygodnik.dorzeczy.pl/archiwum/411/dorz...,2021
...,...,...,...,...,...,...
46,Marian Kołodziej,Jan Pospieszalski,#WARTO II W poniedziałek 6 grudnia 2021...,454,https://tygodnik.dorzeczy.pl/archiwum/454/dorz...,2021
47,Pani sprzątaczka wymiata,Sławomir Jastrzębowski,ĆWIERKOT II Miłość społeczną w minionym...,454,https://tygodnik.dorzeczy.pl/archiwum/454/dorz...,2021
48,Ambasada w śpiączce,Małgorzata Wołczyk,PIÓRKIEM I PAZURKIEM II Małgośka – mówi...,454,https://tygodnik.dorzeczy.pl/archiwum/454/dorz...,2021
49,Proszę nie lekceważyć mionów,Wojciech Roszkowski,Z INNEJ PERSPEKTYWY II Historyk niemal ...,454,https://tygodnik.dorzeczy.pl/archiwum/454/dorz...,2021
