In [None]:
import re
import urllib
import urllib.request
from datetime import datetime

import pandas as pd

# Epub and Mobi

In [None]:
# Scrape (issue number, cover date) string pairs from the eprasa.pl
# "Do Rzeczy" listing pages.
dates = set()

# Listing pages 2-7 plus the un-numbered first page.
links = [f'https://eprasa.pl/news/do-rzeczy?page={i}' for i in range(2, 8)]
links.append("https://eprasa.pl/news/do-rzeczy")

# Captures the number and the dotted date from text such as
# "Do Rzeczy 17 (08.05.2022)". A raw string keeps the backslashes literal
# without triggering Python's invalid-escape-sequence warning.
issue_pattern = re.compile(r"(?<=Do Rzeczy )(\d+) \(([\d\.]+)\)")

for link in links:
    with urllib.request.urlopen(link) as response:
        print(link)
        # The response body is the page's HTML, decoded as UTF-8.
        html = response.read().decode('utf-8')
    dates.update(issue_pattern.findall(html))

# Hand-added pairs.
# NOTE(review): presumably these are missing from the scraped pages - confirm.
dates.update([('17', '08.05.2022'), ('17', '07.05.2023')])

In [None]:
# Lookup table: magazine_nr -> cover date, used for the epub/mobi rows.
# sorted() pins the row order; iterating a set of strings directly gives a
# different order from one kernel session to the next.
df_dates = pd.DataFrame(sorted(dates), columns=['magazine_nr', 'date'])
df_dates['magazine_nr'] = df_dates['magazine_nr'].astype(int)
df_dates['date'] = pd.to_datetime(df_dates['date'], format='%d.%m.%Y')

# The dates are not modified below, so the year can be computed once.
issue_year = df_dates['date'].dt.year

# Shift some scraped numbers down by one: 18-20 in 2023, everything above 17
# in 2022.
# NOTE(review): presumably compensates for a double issue - confirm.
df_dates.loc[
    (issue_year == 2023) & (df_dates['magazine_nr'] > 17) & (df_dates['magazine_nr'] < 21),
    'magazine_nr',
] -= 1
df_dates.loc[(issue_year == 2022) & (df_dates['magazine_nr'] > 17), 'magazine_nr'] -= 1

# Per-year offset added to the (adjusted) scraped number.
# NOTE(review): presumably converts the within-year number into the running
# number stored in the epub/mobi CSVs - confirm.
YEAR_OFFSETS = {2022: 456, 2021: 405, 2023: 507}
for offset_year, offset in YEAR_OFFSETS.items():
    df_dates.loc[issue_year == offset_year, 'magazine_nr'] += offset

# Hand-set numbers for two specific cover dates.
# NOTE(review): 11.04.2022 gets 573 although the later date 27.12.2022 gets
# 508 - confirm that 573 is not a typo.
df_dates.loc[df_dates['date'] == datetime(2022, 12, 27), "magazine_nr"] = 508
df_dates.loc[df_dates['date'] == datetime(2022, 4, 11), "magazine_nr"] = 573


In [None]:
# Rows extracted from each e-book format. "Unnamed: 0" is dropped from both
# files (presumably a saved index column - TODO confirm) and every row is
# tagged with the format it came from.
df_epub = (
    pd.read_csv("Epub/from_epub.csv")
    .drop(columns="Unnamed: 0")
    .assign(file='epub')
)
df_mobi = (
    pd.read_csv("Mobi/from_mobi.csv")
    .drop(columns="Unnamed: 0")
    .assign(file='mobi')
)

# Stack both sources, then look up each row's date through its magazine_nr.
df_all = pd.concat([df_epub, df_mobi]).merge(df_dates, how='left', on='magazine_nr')

In [None]:
def missing_values(data):
    """Return the percentage of missing values per column, highest first.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``data``, with a single column
        ``'Missing ratio %'`` holding ``null count / row count * 100``,
        sorted in descending order.
    """
    # Vectorised over all columns at once; also avoids naming a local `dict`,
    # which would shadow the builtin.
    missing_pct = data.isnull().sum() / len(data) * 100
    return missing_pct.to_frame('Missing ratio %').sort_values('Missing ratio %', ascending=False)

# Turn the literal string 'None' into a real missing value. The {old: new}
# mapping form is used because pandas releases before 1.4 ignored an explicit
# positional value of None and forward-filled the matches instead.
df_all = df_all.replace({'None': None})
# Keep only the rows whose 'text' is present.
df_all = df_all[df_all['text'].notna()]
missing_values(df_all)

# PDF

In [None]:
# Scrape (issue label, cover date) string pairs from the eprasa.pl
# "Do Rzeczy" listing pages; labels may contain hyphens (e.g. "13-14").
dates = set()

# Listing pages 2-24 plus the un-numbered first page.
links = [f'https://eprasa.pl/news/do-rzeczy?page={i}' for i in range(2, 25)]
links.append("https://eprasa.pl/news/do-rzeczy")

# Captures the label and the dotted date from text such as
# "Do Rzeczy 13-14 (05.04.2020)". A raw string keeps the backslashes literal
# without triggering Python's invalid-escape-sequence warning.
issue_pattern = re.compile(r"(?<=Do Rzeczy )([\d-]+) \(([\d\.]+)\)")

for link in links:
    with urllib.request.urlopen(link) as response:
        print(link)
        # The response body is the page's HTML, decoded as UTF-8.
        html = response.read().decode('utf-8')
    dates.update(issue_pattern.findall(html))

# Hand-added pairs.
# NOTE(review): '13I' is kept exactly as written - confirm it is intentional
# and not a typo for '13'.
dates.update([("13-14", "05.04.2020"), ('13I', '29.03.2020')])

In [None]:
# Two more hand-written (issue label, cover date) pairs from March/April 2020.
dates.update({("13-14", "05.04.2020"), ('13', '29.03.2020')})

In [None]:
# One row per (issue label, cover date) pair. sorted() pins the row order;
# iterating a set of strings directly gives a different order from one kernel
# session to the next, which would also reorder any file written from this
# frame.
df_dates = pd.DataFrame(sorted(dates), columns=['magazine_nr_1', 'date'])
df_dates['date'] = pd.to_datetime(df_dates['date'], format='%d.%m.%Y')
# Calendar year of the cover date.
df_dates['year'] = df_dates['date'].dt.year

In [None]:
# File the issue dated 2020-12-31 under 2021 instead of 2020.
# NOTE(review): presumably this is the first issue of the 2021 volume - confirm.
is_last_day_2020 = df_dates['date'] == datetime(2020, 12, 31)
df_dates.loc[(df_dates['year'] == 2020) & is_last_day_2020, "year"] = 2021

In [None]:
# Persist the issue/date lookup. index=False is not passed, so the DataFrame
# index is written as the first column of the file.
df_dates.to_csv("Final/dates_extracted.csv")

# Merge with csv files

In [None]:
def date_merge(year, df_dates=df_dates):
    """Attach cover dates to the PDF rows of one year.

    Reads ``Pdf/dorzeczy_{year}.csv``, normalises its ``magazine_nr_1`` column
    (cast to ``str``, en dash replaced by a hyphen, one leading zero removed)
    and left-joins the rows of ``df_dates`` whose ``year`` equals ``year`` on
    ``magazine_nr_1``.

    Parameters
    ----------
    year : int
        Year used both in the CSV file name and to filter ``df_dates``.
    df_dates : pandas.DataFrame
        Lookup with at least the columns ``magazine_nr_1`` and ``year``, plus
        the ``date`` column that is checked after the merge. The default is
        bound once, when this ``def`` is executed.

    Returns
    -------
    pandas.DataFrame
        The CSV rows with the matching ``df_dates`` columns added; rows
        without a match keep missing values in those columns.

    Side effects
    ------------
    Prints ``"<year>: <number of rows whose date is missing>"``.
    """
    # errors="ignore" drops the column only when it exists, instead of hiding
    # every possible failure behind a bare ``except``.
    df = pd.read_csv(f"Pdf/dorzeczy_{year}.csv").drop(columns="Unnamed: 0", errors="ignore")
    df['magazine_nr_1'] = (
        df['magazine_nr_1']
        .astype(str)
        .str.replace("–", "-", regex=False)
        .str.replace("^0", "", regex=True)
    )
    df_merged = df.merge(df_dates[df_dates['year'] == year], on="magazine_nr_1", how="left")
    unmatched = int(df_merged['date'].isna().sum())
    print(f"{year}: {unmatched}")
    return df_merged

In [None]:
# Run date_merge for every year from 2015 through 2023, one frame per year.
years = list(range(2015, 2024))
df_list = [date_merge(year) for year in years]

In [None]:
def link_extractor(row):
    """Build the tygodnik.dorzeczy.pl archive URL for one row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'year'``, ``'magazine_nr_1'`` and ``'magazine_nr_2'``.

    Returns
    -------
    str
        ``https://tygodnik.dorzeczy.pl/archiwum/<magazine_nr_2>/dorzeczy-<magazine_nr_1>-<year>.html``
    """
    year = row['year']
    # A year column that contains missing values is stored as floats, which
    # would render as e.g. "2020.0" in the URL; whole-number floats are
    # therefore converted back to int. NaN is left untouched.
    if isinstance(year, float) and year.is_integer():
        year = int(year)
    magazine_nr_1 = row['magazine_nr_1']
    magazine_nr_2 = row['magazine_nr_2']
    return f'https://tygodnik.dorzeczy.pl/archiwum/{magazine_nr_2}/dorzeczy-{magazine_nr_1}-{year}.html'

In [None]:
# Stack the per-year frames into one.
df_pdf = pd.concat(df_list)

# Missing magazine_nr_2 values become 0 so the column can be cast to int.
issue_ids = df_pdf['magazine_nr_2'].fillna("0").astype(int)
df_pdf = df_pdf.assign(magazine_nr_2=issue_ids)

# One archive URL per row.
df_pdf = df_pdf.assign(link=df_pdf.apply(link_extractor, axis=1))

In [None]:
# Remove the columns that are no longer needed and rename the remaining ones
# to the names used by the epub/mobi frame (magazine_nr, text).
df_pdf = (
    df_pdf
    .drop(columns=['file', 'page', 'magazine_nr_1', 'year'])
    .rename(columns={"magazine_nr_2": "magazine_nr", "content": "text"})
)

In [None]:
# Percentage of missing values per column of the PDF frame (cell output).
missing_values(df_pdf)

# Concat

In [None]:
# Every column name that occurs in either frame, shown as the cell output.
cols = set(df_pdf.columns.values).union(df_all.columns.values)
cols

In [None]:
# Stack the epub/mobi rows on top of the PDF rows. Columns present in only one
# frame are filled with missing values for the other; the original index
# labels of both frames are kept as they are (ignore_index is not set).
df_final = pd.concat([df_all, df_pdf])

In [None]:
# Give every row a unique, consecutive id in the first column. The index
# inherited from the concatenated frames can contain repeated labels, so it is
# discarded instead of being turned into the id.
df_final = df_final.reset_index(drop=True)
df_final.insert(0, "id", df_final.index)

In [None]:
# Total number of rows in the combined frame (cell output).
len(df_final)

In [None]:
# Write the combined frame to dorzeczy.csv without the DataFrame index.
df_final.to_csv("dorzeczy.csv", index = False)