In [None]:
import fitz
import pandas as pd
from pipeline import rzepa_pipeline
import regex as re

## File conversions

In [None]:
def pdf_to_txt(path_in, path_out):
    """Dump the plain text of every page of a PDF into one UTF-8 text file.

    Each page's text is stripped of surrounding whitespace and written,
    immediately followed by a ``_Page N_`` marker line (N is 1-based). Because
    of the strip, the marker is appended directly after the page's last
    character rather than on a line of its own.

    Args:
        path_in: Path of the PDF to read (passed to ``fitz.open``).
        path_out: Path of the text file to create or overwrite.
    """
    # Context managers guarantee that both the PDF and the output file are
    # closed even when text extraction fails part-way through.
    with fitz.open(path_in) as doc, open(path_out, "wb") as out:
        for page_no, page in enumerate(doc, start=1):
            out.write(page.get_text().strip().encode("utf8"))
            # Textual page delimiter (not a form feed).
            out.write(f"_Page {page_no}_\n".encode("utf-8"))

In [None]:
def author_correction(df , author_names):
    """Normalise the ``author`` column and repair bylines split across columns.

    Steps, in order:
      1. Missing authors become the string ``"None"``.
      2. ``author_len`` stores the author length (measured before stripping).
      3. ``author`` and ``title`` are stripped of surrounding whitespace.
      4. Everything from the first ``" Z "`` (Z followed by a space or NBSP)
         onwards is removed from ``author``.
      5. For every ``"FIRSTNAME SURNAME"`` entry of ``author_names``: rows whose
         author equals FIRSTNAME exactly get a leading SURNAME removed from
         ``title`` (case-insensitively) and their author set to the full entry.
      6. ``author`` is upper-cased.

    Args:
        df: DataFrame with ``author`` and ``title`` columns. It is modified in
            place and also returned.
        author_names: Iterable of ``"FIRSTNAME SURNAME"`` strings; only the
            first two space-separated tokens are used.

    Returns:
        The same ``df`` object, with the columns updated as described above.

    Raises:
        IndexError: If an entry of ``author_names`` contains no space.
    """
    df['author'] = df['author'].fillna("None")
    df["author_len"] = df['author'].apply(len)
    df['author'] = df['author'].str.strip()
    df['title'] = df['title'].str.strip()
    df['author'] = df['author'].apply(lambda x: re.sub(" Z( |\xa0).+", "", x))

    for name_surname in author_names:
        parts = name_surname.split(" ")
        name, surname = parts[0], parts[1]
        # Escape the surname so it is matched literally, and pass IGNORECASE as
        # a flag: a mid-pattern "(?i)" is rejected by the standard `re` module
        # and only worked because this file imports `regex` under that name.
        leading_surname = re.compile("^" + re.escape(surname), flags=re.IGNORECASE)
        first_name_only = df["author"] == name
        df.loc[first_name_only, 'title'] = df.loc[first_name_only, 'title'].apply(
            lambda x: leading_surname.sub("", x)
        )
        df.loc[first_name_only, 'author'] = name_surname

    df['author'] = df["author"].apply(lambda x: x.upper() if x is not None else 'None')
    return df

# 2015

In [None]:
# 2015 issues passed to rzepa_pipeline as its special_files argument.
# NOTE(review): the original list named dorzeczy_7_2015.pdf twice; the repeat
# is dropped here — confirm it was not a typo for a different issue.
special_files = [
    "dorzeczy_51_2015.pdf",
    "dorzeczy_20_2015.pdf",
    "dorzeczy_21_2015.pdf",
    "dorzeczy_24_2015.pdf",
    "dorzeczy_33_2015.pdf",
    "dorzeczy_45_2015.pdf",
    "dorzeczy_7_2015.pdf",
    "dorzeczy_25_2015.pdf",
    "dorzeczy_29_2015.pdf",
    "dorzeczy_5_2015.pdf",
    "dorzeczy_19_2015.pdf",
    "dorzeczy_18_2015.pdf",
    "dorzeczy_27_2015.pdf",
]
folder_name_2015 = "DoRzeczy/DoRzeczy2015/"

In [None]:
# Use folder_name_2015 (same path) instead of repeating the literal, so the
# 2015 run is configured the same way as the other years.
df_2015, problem_files = rzepa_pipeline(folder_name_2015, special_files, 2015)

In [None]:
# Work on a copy so the pipeline result df_2015 itself is left untouched.
df = df_2015.copy()

# Count how often each problem label was reported for the 2015 files.
df_problem = pd.DataFrame(data=problem_files, columns=["filename", "problem"])
df_problem["problem"].value_counts()

Table of Contents    4 <br>
Not Chronological    2 <br>
Page number          2 <br>

In [None]:
# "FIRSTNAME SURNAME" entries handed to author_correction for 2015.
author_names = [
    "JOANNA BOJAŃCZYK",
    "PIOTR ZYCHOWICZ",
    "WITOLD REPETOWICZ",
    "ŁUKASZ MAJCHRZYK",
    "ZYGMUNT BERDYCHOWSKI",
    "SŁAWOMIR CIENCKIEWICZ",
    "GRZEGORZ KRYCHOWIAK",
    "KRZYSZTOF MASŁOŃ",
    "KATARZYNA PINKOSZ",
    "STEFAN SĘKOWSKI",
    "ANDRZEJ HORUBAŁA",
    "WALDEMAR ŁYSIAK",
]

df = author_correction(df, author_names)

In [None]:
# Remove the 'error', 'error_count' and 'author_len' columns, tag the rows
# with the year, and write the 2015 table to CSV.
df = df.drop(columns=['error', 'error_count', 'author_len'])
df['year'] = 2015
df.to_csv("dorzeczy_2015.csv", index=False)

# 2016

In [None]:
# Input folder and the special_files list used for the 2016 pipeline run.
folder_name_2016 = "DoRzeczy/DoRzeczy2016/"
special_files = [
    "dorzeczy_29_2016.pdf",
    "dorzeczy_19_2016.pdf",
    "dorzeczy_13_2016.pdf",
    "dorzeczy_42_2016.pdf",
    "dorzeczy_46_2016.pdf",
]

In [None]:
# Run the pipeline over the 2016 folder; the second result is later turned
# into a (filename, problem) table.
df_2016, problem_files = rzepa_pipeline(
    folder_name_2016,
    special_files,
    2016,
)

In [None]:
# Work on a copy so the pipeline result df_2016 itself is left untouched.
df = df_2016.copy()

# Count how often each problem label was reported for the 2016 files.
df_problem = pd.DataFrame(data=problem_files, columns=["filename", "problem"])
df_problem["problem"].value_counts()

Table of Contents    7 <br>
Page number          5 <br>
Not Chronological    2 <br>

In [None]:
# "FIRSTNAME SURNAME" entries handed to author_correction for 2016.
author_names = [
    "JOANNA BOJAŃCZYK",
    "JACEK Przybylski",
    "STEFAN Sękowski",
    "ANDRZEJ Horubała",
    "WALDEMAR Łysiak",
    "PRZEMYSŁAW Kawalec",
]
df = author_correction(df, author_names)

# A bare "ŁUKASZ" is resolved by the page the row sits on.
for page_no, full_name in {90: "ŁUKASZ Majchrzyk", 100: "ŁUKASZ Zboralski"}.items():
    df.loc[(df["author"] == "ŁUKASZ") & (df["page"] == page_no), "author"] = full_name

df.loc[df["author"] == "TOMASZ P.", "author"] = "TOMASZ P. Terlikowski"

# Upper-case again because the manual overrides above are mixed case.
df["author"] = df["author"].map(str.upper)


In [None]:
# Remove the 'error', 'error_count' and 'author_len' columns and write the
# 2016 table to CSV.
# NOTE(review): unlike the 2015 export, no `year` column is assigned here —
# confirm rzepa_pipeline already provides it.
df = df.drop(columns=['error', 'error_count', 'author_len'])
df.to_csv("dorzeczy_2016.csv", index=False)

# 2017

In [None]:
# Input folder and the special_files list used for the 2017 pipeline run.
folder_name_2017 = "DoRzeczy/DoRzeczy2017/"
special_files = [
    "dorzeczy_28_2017.pdf",
    "dorzeczy_26_2017.pdf",
]

In [None]:
# Run the pipeline over the 2017 folder; the second result is later turned
# into a (filename, problem) table.
df_2017, problem_files = rzepa_pipeline(
    folder_name_2017,
    special_files,
    2017,
)

In [None]:
# Work on a copy so the pipeline result df_2017 itself is left untouched.
df = df_2017.copy()

# Count how often each problem label was reported for the 2017 files.
df_problem = pd.DataFrame(data=problem_files, columns=["filename", "problem"])
df_problem["problem"].value_counts()

Table of Contents    6 <br>
Not Chronological    5 <br>
Page number          2 <br>
Many found           1 <br>

In [None]:
# "FIRSTNAME SURNAME" entries handed to author_correction for 2017.
author_names = ["MARIA Rutowska", "MARTA Marcinkiewicz", "JOANNA Bojańczyk", "STEFAN Sękowski", "ŁUKASZ Zboralski",
                   "GRZEGORZ Brzozowicz","TOMASZ Kwaśnicki", "JACEK Przybylski", "AGNIESZKA Niewińska", "ANDRZEJ Horubała", "WOJCIECH Wybranowski", "MAŁGORZATA Wołczyk"]

# The per-file Terlikowski override has to run BEFORE author_correction:
# "TOMASZ Kwaśnicki" is in author_names, so author_correction rewrites every
# bare "TOMASZ" and a mask applied afterwards would never match.
# The author column is stripped for the comparison because author_correction
# has not normalised whitespace yet.
terlikowski_files = ["dorzeczy_38_2017.pdf", 'dorzeczy_14_2017.pdf']
is_bare_tomasz = df["author"].str.strip() == "TOMASZ"
df.loc[is_bare_tomasz & df['file'].isin(terlikowski_files), 'author'] = "TOMASZ P. Terlikowski"

df = author_correction(df, author_names)
df['author'] = df["author"].apply(lambda x: x.upper())


In [None]:
# Remove the 'error', 'error_count' and 'author_len' columns and write the
# 2017 table to CSV.
# NOTE(review): unlike the 2015 export, no `year` column is assigned here —
# confirm rzepa_pipeline already provides it.
df = df.drop(columns=['error', 'error_count', 'author_len'])
df.to_csv("dorzeczy_2017.csv", index=False)

# 2018

In [None]:
# Input folder and the special_files list used for the 2018 pipeline run.
folder_name_2018 = "DoRzeczy/DoRzeczy2018/"
special_files = [
    "dorzeczy_13_2018.pdf",
    "dorzeczy_8_2018.pdf",
]

In [None]:
# Run the pipeline over the 2018 folder; the second result is later turned
# into a (filename, problem) table.
df_2018, problem_files = rzepa_pipeline(
    folder_name_2018,
    special_files,
    2018,
)

In [None]:
# Work on a copy so the pipeline result df_2018 itself is left untouched.
df = df_2018.copy()

# Count how often each problem label was reported for the 2018 files.
df_problem = pd.DataFrame(data=problem_files, columns=["filename", "problem"])
df_problem["problem"].value_counts()

Not Chronological    4<br>
Table of Contents    4<br>
Few Found            1<br>
Page number          1<br>

In [None]:
# "FIRSTNAME SURNAME" entries handed to author_correction for 2018.
author_names = [
    "JOANNA Bojańczyk",
    "ŁUKASZ Majchrzyk",
    "JACEK Przybylski",
    "MARCIN Makowski",
    "TOMASZ cukiernik",
    "WALDEMAR Żyszkiewicz",
    "KATARZYNA Pinkosz",
]

# Expand the abbreviated byline first, then run the general correction.
df.loc[df["author"] == "TOMASZ P.", "author"] = "TOMASZ P. Terlikowski"
df = author_correction(df, author_names)
df["author"] = df["author"].map(str.upper)

In [None]:
# Remove the 'error', 'error_count' and 'author_len' columns and write the
# 2018 table to CSV.
# NOTE(review): unlike the 2015 export, no `year` column is assigned here —
# confirm rzepa_pipeline already provides it.
df = df.drop(columns=['error', 'error_count', 'author_len'])
df.to_csv("dorzeczy_2018.csv", index=False)

# 2019

In [None]:
# Input folder and special_files list for 2019, followed by the pipeline run.
folder_name_2019 = "DoRzeczy/DoRzeczy2019/"
special_files = [
    "dorzeczy_50_2019.pdf",
    "dorzeczy_49_2019.pdf",
]
df_2019, problem_files = rzepa_pipeline(
    folder_name_2019,
    special_files,
    2019,
)

In [None]:
# Work on a copy so the pipeline result df_2019 itself is left untouched.
df = df_2019.copy()

# Count how often each problem label was reported for the 2019 files.
df_problem = pd.DataFrame(data=problem_files, columns=["filename", "problem"])
df_problem["problem"].value_counts()

Table of Contents    10<br>
Not Chronological     3<br>

In [None]:
# "FIRSTNAME SURNAME" entries handed to author_correction for 2019.
# "WITOLD Repetowicz" was misspelled "Reptowicz" (the 2015 list has the correct
# spelling); with the typo the surname was never removed from the title and
# the misspelled name ended up in the exported author column.
author_names = ["JOANNA Bojańczyk", "PIOTR Zychowicz", "WITOLD Repetowicz", "JACEK Przybylski", "RADOSŁAW Wojtas", "KATARZYNA Pinkosz"]


df = author_correction(df, author_names)

# A bare "TOMASZ" is resolved per issue file; none of the names above starts
# with TOMASZ, so these masks still match after author_correction.
tomasz_by_file = {
    "dorzeczy_20_2019.pdf": "TOMASZ Rowiński",
    "dorzeczy_30_2019.pdf": "TOMASZ Kaźmierowski",
    "dorzeczy_6_2019.pdf": "TOMASZ Lenczewski",
    "dorzeczy_19_2019.pdf": "TOMASZ Cukiernik",
}
for file_name, full_name in tomasz_by_file.items():
    df.loc[(df["author"]=="TOMASZ") & (df['file'] == file_name), 'author'] = full_name

# Upper-case again because the manual overrides above are mixed case.
df['author'] = df["author"].apply(lambda x: x.upper())

In [None]:
# Remove the 'error', 'error_count' and 'author_len' columns and write the
# 2019 table to CSV.
# NOTE(review): unlike the 2015 export, no `year` column is assigned here —
# confirm rzepa_pipeline already provides it.
df = df.drop(columns=['error', 'error_count', 'author_len'])
df.to_csv("dorzeczy_2019.csv", index=False)

# 2020

In [None]:
# Input folder and special_files list for 2020, followed by the pipeline run.
folder_name_2020 = "DoRzeczy/DoRzeczy2020/"
special_files = [
    "dorzeczy_42_2020.pdf",
    "dorzeczy_13_2020.pdf",
]
df_2020, problem_files = rzepa_pipeline(
    folder_name_2020,
    special_files,
    2020,
)

In [None]:
# Work on a copy so the pipeline result df_2020 itself is left untouched.
df = df_2020.copy()

# Count how often each problem label was reported for the 2020 files.
df_problem = pd.DataFrame(data=problem_files, columns=["filename", "problem"])
df_problem["problem"].value_counts()

Table of Contents    15<br>
Not Chronological     4<br>
Few Found             1<br>

In [None]:
# "FIRSTNAME SURNAME" entries handed to author_correction for 2020.
author_names = [
    "JOANNA Bojańczyk",
    "WOJCIECH cejrowski",
    "KATARZYNA Pinkosz",
    "JERZY MIZIOŁEK",
    "ŁUKASZ MAJCHRZYK",
    "GRZEGORZ KONDRASIUK",
    "MARCIN MAKOWSKI",
    "TOMASZ Rowiński",
]

df = author_correction(df, author_names)
df["author"] = df["author"].map(str.upper)

In [None]:
# Remove the 'error', 'error_count' and 'author_len' columns and write the
# 2020 table to CSV.
# NOTE(review): unlike the 2015 export, no `year` column is assigned here —
# confirm rzepa_pipeline already provides it.
df = df.drop(columns=['error', 'error_count', 'author_len'])
df.to_csv("dorzeczy_2020.csv", index=False)

# 2021

In [None]:
# Input folder and special_files list for 2021, followed by the pipeline run.
folder_name_2021 = "DoRzeczy/DoRzeczy2021/"
# NOTE(review): both entries name 2020/2019 issues although this is the 2021
# run — looks like a copy-paste leftover; confirm the intended 2021 files.
special_files = [
    "dorzeczy_42_2020.pdf",
    "dorzeczy_49_2019.pdf",
]
df_2021, problem_files = rzepa_pipeline(
    folder_name_2021,
    special_files,
    2021,
)

In [None]:
# Work on a copy so the pipeline result df_2021 itself is left untouched.
df = df_2021.copy()

# Count how often each problem label was reported for the 2021 files.
df_problem = pd.DataFrame(data=problem_files, columns=["filename", "problem"])
df_problem["problem"].value_counts()

In [None]:
# "FIRSTNAME SURNAME" entries handed to author_correction for 2021.
# NOTE(review): this list is identical to the 2020 one — confirm it was
# reviewed against the 2021 data.
author_names = [
    "JOANNA Bojańczyk",
    "WOJCIECH cejrowski",
    "KATARZYNA Pinkosz",
    "JERZY MIZIOŁEK",
    "ŁUKASZ MAJCHRZYK",
    "GRZEGORZ KONDRASIUK",
    "MARCIN MAKOWSKI",
    "TOMASZ Rowiński",
]

df = author_correction(df, author_names)
df["author"] = df["author"].map(str.upper)

In [None]:
# Remove the 'error', 'error_count' and 'author_len' columns and write the
# 2021 table to CSV.
# NOTE(review): unlike the 2015 export, no `year` column is assigned here —
# confirm rzepa_pipeline already provides it.
df = df.drop(columns=['error', 'error_count', 'author_len'])
df.to_csv("dorzeczy_2021.csv", index=False)

# 2022

In [None]:
# Input folder and special_files list for 2022, followed by the pipeline run.
folder_name_2022 = "DoRzeczy/DoRzeczy2022/"
special_files = [
    "dorzeczy_4_2022.pdf",
]
df_2022, problem_files = rzepa_pipeline(
    folder_name_2022,
    special_files,
    2022,
)

In [None]:
# Work on a copy so the pipeline result df_2022 itself is left untouched.
df = df_2022.copy()

# Count how often each problem label was reported for the 2022 files.
df_problem = pd.DataFrame(data=problem_files, columns=["filename", "problem"])
df_problem["problem"].value_counts()

Table of Contents    19<br>
Not Chronological     4<br>
Page number           2<br>

In [None]:
# "FIRSTNAME SURNAME" entries handed to author_correction for 2022.
author_names = [
    "MACIEJ Pieczyński",
    "ŁUKASZ Zboralski",
]

df = author_correction(df, author_names)
df["author"] = df["author"].map(str.upper)

In [None]:
# Remove the 'error', 'error_count' and 'author_len' columns and write the
# 2022 table to CSV.
# NOTE(review): unlike the 2015 export, no `year` column is assigned here —
# confirm rzepa_pipeline already provides it.
df = df.drop(columns=['error', 'error_count', 'author_len'])
df.to_csv("dorzeczy_2022.csv", index=False)

# 2023

In [None]:
# Input folder for 2023 (no special files listed), followed by the pipeline run.
folder_name_2023 = "DoRzeczy/DoRzeczy2023/"
special_files = list()
df_2023, problem_files = rzepa_pipeline(
    folder_name_2023,
    special_files,
    2023,
)

In [None]:
# Work on a copy so the pipeline result df_2023 itself is left untouched.
df = df_2023.copy()

# Count how often each problem label was reported for the 2023 files.
df_problem = pd.DataFrame(data=problem_files, columns=["filename", "problem"])
df_problem["problem"].value_counts()

In [None]:
# No per-name fixes for 2023: author_correction only applies its general
# clean-up steps when the list is empty.
author_names = list()

df = author_correction(df, author_names)
df["author"] = df["author"].map(str.upper)

In [None]:
# Remove the 'error', 'error_count' and 'author_len' columns and write the
# 2023 table to CSV.
# NOTE(review): unlike the 2015 export, no `year` column is assigned here —
# confirm rzepa_pipeline already provides it.
df = df.drop(columns=['error', 'error_count', 'author_len'])
df.to_csv("dorzeczy_2023.csv", index=False)