# Generate
------
This file will generate the datasets of the project. It requires a lot of manual input, so it'll be a bit messy. This is however necessary, as no dataset exists that'll cater to my exact needs. The dataset will be generated in the following way:
1. Open all the files in the `data/test/raw` & `data/train/raw` folder
2. Process the pdfs to extract the text, and save that processed data, and it's information already once.
3. Process the text to extract the relevant information, and save that processed data, and it's information already once.
4. Go through the data and manually select whether the data is a heading or not. This will be saved in the `data/test/processed` & `data/train/processed` folder.

## Imports

In [4]:
import os
import dotenv
import pandas as pd
import numpy as np
from werkzeug.datastructures import FileStorage      
import tempfile
import fitz
import html

import re

## Load environment variables

In [5]:
dotenv.load_dotenv()
TEST_PATH = os.getenv("TEST_PATH")
TRAIN_PATH = os.getenv("TRAIN_PATH")

TEST_PATH_RAW, TRAIN_PATH_RAW = (os.path.join(p, "raw") for p in (TEST_PATH, TRAIN_PATH))

TEST_PATH_PROCESSED, TRAIN_PATH_PROCESSED = (os.path.join(p, "processed") for p in (TEST_PATH, TRAIN_PATH))

TEST_PATH, TRAIN_PATH

('C:\\Users\\Maarten Boon\\Documents\\projects\\Title-detection-using-ml\\data\\test',
 'C:\\Users\\Maarten Boon\\Documents\\projects\\Title-detection-using-ml\\data\\train')

## Load data

In [6]:
def filenames(path) -> list:
    return [os.path.join(path, f) for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]

len(filenames(TEST_PATH_RAW)), len(filenames(TRAIN_PATH_RAW))

(1, 17)

In [7]:
# noinspection PyUnresolvedReferences
def open_pdf(document: FileStorage) -> str:
    with tempfile.NamedTemporaryFile(delete=False) as temp_pdf:
        temp_pdf.write(document.read())
        temp_pdf_path = temp_pdf.name

    # Create a PDF document object
    pdf_document = fitz.open(temp_pdf_path, filetype="pdf")

    # Create an HTML representation from the PDF document
    html_output: str = ""
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        html_output += page.get_text("html")

    # Close the PDF document
    pdf_document.close()
    # Remove the temporary PDF file
    os.remove(temp_pdf_path)
    return html.unescape(html_output)


def load_data(path) -> pd.DataFrame:
    data = {
        "filename": [],
        "text": [],
        "position": [],
        "color": [],
        "font": [],
        "size": [],
        "length": [],
        "fileLength": []
    }
    for filename in filenames(path):
        for k, v in load_singular(filename).items():
            data[k] = data[k] + v
    
    return pd.DataFrame(data)

def load_singular(filename) -> dict:
    # Load the file as pdf
    data = {
        "filename": [],
        "text": [],
        "position": [],
        "color": [],
        "font": [],
        "size": [],
        "length": [],
        "fileLength": []
    }
    # Load the pdf
    filestream = open(filename, "rb")
    file = FileStorage(
        filestream, filename=filestream.name, content_type="application/pdf"
    )
    html_output = open_pdf(file)
    # Now, we have the html output, we can start processing it
    split_html = html_output.split("\n")
    prev_value = {
        "font-size": None,
        "text": "",
        "font-family": None,
        "color": None,
        "position": None,
        "length": 0,
        "fileLength": 0
    }
    for position, line in enumerate(split_html):
        font_size = re.search("font-size:([0-9]+)", line)
        # Get the font size in pixels as an int
        font_size = int(font_size.group(1)) if font_size else -1

        # Get the font family
        font_family = re.search("font-family:([a-zA-Z0-9]+)", line)
        font_family = font_family.group(1) if font_family else None

        # Get the color
        color = re.search(r"color:([a-zA-Z0-9]+|#[a-fA-F0-9]{6})", line)
        color = color.group(1) if color else None

        # Remove all tags
        line = re.sub("<.*?>", "", line, flags=re.DOTALL)

        if line and 0 < len(line.strip()) and font_size > 0:
            line_value = {
                "font-size": font_size,
                "text": line.strip(),
                "font-family": font_family,
                "color": color,
                "position": position,
                "length": len(line.strip()),
            }
            # Check if the current line is the same as the previous line
            if (
                    prev_value["font-size"] == line_value["font-size"]
                    and prev_value["font-family"] == line_value["font-family"]
                    and prev_value["color"] == line_value["color"]
            ):
                prev_value["text"] += " " + line_value["text"]
                prev_value["length"] += line_value["length"]
            else:
                if prev_value["text"] and 2 < prev_value["length"] :
                    # Add the previous value to the data
                    data["filename"].append(filename)
                    data["text"].append(prev_value["text"])
                    data["position"].append(prev_value["position"])
                    data["color"].append(prev_value["color"])
                    data["font"].append(prev_value["font-family"])
                    data["size"].append(prev_value["font-size"])
                    data["length"].append(prev_value["length"])
                prev_value = line_value
                
    file_length = sum(data["length"])
    data["fileLength"] = [file_length for _ in range(len(data['size']))]
    return data

train_data = load_data(TRAIN_PATH_RAW)
train_data

Unnamed: 0,filename,text,position,color,font,size,length,fileLength
0,C:\Users\Maarten Boon\Documents\projects\Title...,Database APIs,1,,Montserrat,43,13,29623
1,C:\Users\Maarten Boon\Documents\projects\Title...,Database System Architectures,5,,Calibri,18,29,29623
2,C:\Users\Maarten Boon\Documents\projects\Title...,Database APIs,7,,Calibri,18,13,29623
3,C:\Users\Maarten Boon\Documents\projects\Title...,Object Persistence and Object Relational Mappers,9,,Calibri,18,47,29623
4,C:\Users\Maarten Boon\Documents\projects\Title...,Exercise,12,,Calibri,18,8,29623
...,...,...,...,...,...,...,...,...
3761,C:\Users\Maarten Boon\Documents\projects\Title...,Werk voor het converteren wordt verdeeld over ...,8647,,Montserrat,12,80,20133
3762,C:\Users\Maarten Boon\Documents\projects\Title...,Bronnen,9409,,Montserrat,19,7,20133
3763,C:\Users\Maarten Boon\Documents\projects\Title...,Andrew S. Tanenbaum and Herbert Bos. 2014. Mod...,9411,,Montserrat,7,103,20133
3764,C:\Users\Maarten Boon\Documents\projects\Title...,William Stallings. 2018. Operating Systems: In...,9414,,Montserrat,7,144,20133


In [8]:
# Do the same for test
test_data = load_data(TEST_PATH_RAW)
test_data

Unnamed: 0,filename,text,position,color,font,size,length,fileLength
0,C:\Users\Maarten Boon\Documents\projects\Title...,Inhoudsopgave,1,,Montserrat,11,13,148759
1,C:\Users\Maarten Boon\Documents\projects\Title...,INHOUDSOPGAVE,6,#ffffff,Montserrat,11,13,148759
2,C:\Users\Maarten Boon\Documents\projects\Title...,Inhoudsopgave ...................................,7,,Montserrat,11,4737,148759
3,C:\Users\Maarten Boon\Documents\projects\Title...,VOORWOORD,2239,#ffffff,Montserrat,11,9,148759
4,C:\Users\Maarten Boon\Documents\projects\Title...,Beste lezer Ondernemerschap vertaalt zich in v...,2240,,Montserrat,10,1649,148759
...,...,...,...,...,...,...,...,...
589,C:\Users\Maarten Boon\Documents\projects\Title...,LIJST MET GEBRUIKTE FIGUREN,84285,,Montserrat,10,27,148759
590,C:\Users\Maarten Boon\Documents\projects\Title...,Figuur 1 - model balans .........................,84286,,Montserrat,11,2783,148759
591,C:\Users\Maarten Boon\Documents\projects\Title...,GEBRUIKTE AFBEELDINGEN VAN UNSPLASH Dank aan v...,84305,,Montserrat,10,290,148759
592,C:\Users\Maarten Boon\Documents\projects\Title...,Bibliografie Pagina 106 van 107,84314,,Montserrat,11,30,148759


## Add as a csv

In [9]:
STEP1_NAME = "step1.csv"
TEST_STEP1, TRAIN_STEP1 = (
    os.path.join(TEST_PATH_PROCESSED, STEP1_NAME),
    os.path.join(TRAIN_PATH_PROCESSED, STEP1_NAME)
)
for d, p in zip(
    [
        test_data,
        train_data
    ],
    [
        TEST_STEP1,
        TRAIN_STEP1
    ]
):
    d.to_csv(
        p,
        header=True,
        index=False,
    )

## Step 2: Label the data, and save the information every so often.

### Reload the data

In [10]:
test_data = pd.read_csv(TEST_STEP1)
test_data

Unnamed: 0,filename,text,position,color,font,size,length,fileLength
0,C:\Users\Maarten Boon\Documents\projects\Title...,Inhoudsopgave,1,,Montserrat,11,13,148759
1,C:\Users\Maarten Boon\Documents\projects\Title...,INHOUDSOPGAVE,6,#ffffff,Montserrat,11,13,148759
2,C:\Users\Maarten Boon\Documents\projects\Title...,Inhoudsopgave ...................................,7,,Montserrat,11,4737,148759
3,C:\Users\Maarten Boon\Documents\projects\Title...,VOORWOORD,2239,#ffffff,Montserrat,11,9,148759
4,C:\Users\Maarten Boon\Documents\projects\Title...,Beste lezer Ondernemerschap vertaalt zich in v...,2240,,Montserrat,10,1649,148759
...,...,...,...,...,...,...,...,...
589,C:\Users\Maarten Boon\Documents\projects\Title...,LIJST MET GEBRUIKTE FIGUREN,84285,,Montserrat,10,27,148759
590,C:\Users\Maarten Boon\Documents\projects\Title...,Figuur 1 - model balans .........................,84286,,Montserrat,11,2783,148759
591,C:\Users\Maarten Boon\Documents\projects\Title...,GEBRUIKTE AFBEELDINGEN VAN UNSPLASH Dank aan v...,84305,,Montserrat,10,290,148759
592,C:\Users\Maarten Boon\Documents\projects\Title...,Bibliografie Pagina 106 van 107,84314,,Montserrat,11,30,148759


In [11]:
train_data = pd.read_csv(TRAIN_STEP1)
train_data

Unnamed: 0,filename,text,position,color,font,size,length,fileLength
0,C:\Users\Maarten Boon\Documents\projects\Title...,Database APIs,1,,Montserrat,43,13,29623
1,C:\Users\Maarten Boon\Documents\projects\Title...,Database System Architectures,5,,Calibri,18,29,29623
2,C:\Users\Maarten Boon\Documents\projects\Title...,Database APIs,7,,Calibri,18,13,29623
3,C:\Users\Maarten Boon\Documents\projects\Title...,Object Persistence and Object Relational Mappers,9,,Calibri,18,47,29623
4,C:\Users\Maarten Boon\Documents\projects\Title...,Exercise,12,,Calibri,18,8,29623
...,...,...,...,...,...,...,...,...
3761,C:\Users\Maarten Boon\Documents\projects\Title...,Werk voor het converteren wordt verdeeld over ...,8647,,Montserrat,12,80,20133
3762,C:\Users\Maarten Boon\Documents\projects\Title...,Bronnen,9409,,Montserrat,19,7,20133
3763,C:\Users\Maarten Boon\Documents\projects\Title...,Andrew S. Tanenbaum and Herbert Bos. 2014. Mod...,9411,,Montserrat,7,103,20133
3764,C:\Users\Maarten Boon\Documents\projects\Title...,William Stallings. 2018. Operating Systems: In...,9414,,Montserrat,7,144,20133


### Run through the data

In [12]:
def generate_code(row):
    return f"{row['font']}_{row['size']}_{row['color']}"

for df in [train_data, test_data]:
    # Create a function that generates a unique code for each style
    df["code"] = df.apply(generate_code, axis=1)

train_data

Unnamed: 0,filename,text,position,color,font,size,length,fileLength,code
0,C:\Users\Maarten Boon\Documents\projects\Title...,Database APIs,1,,Montserrat,43,13,29623,Montserrat_43_nan
1,C:\Users\Maarten Boon\Documents\projects\Title...,Database System Architectures,5,,Calibri,18,29,29623,Calibri_18_nan
2,C:\Users\Maarten Boon\Documents\projects\Title...,Database APIs,7,,Calibri,18,13,29623,Calibri_18_nan
3,C:\Users\Maarten Boon\Documents\projects\Title...,Object Persistence and Object Relational Mappers,9,,Calibri,18,47,29623,Calibri_18_nan
4,C:\Users\Maarten Boon\Documents\projects\Title...,Exercise,12,,Calibri,18,8,29623,Calibri_18_nan
...,...,...,...,...,...,...,...,...,...
3761,C:\Users\Maarten Boon\Documents\projects\Title...,Werk voor het converteren wordt verdeeld over ...,8647,,Montserrat,12,80,20133,Montserrat_12_nan
3762,C:\Users\Maarten Boon\Documents\projects\Title...,Bronnen,9409,,Montserrat,19,7,20133,Montserrat_19_nan
3763,C:\Users\Maarten Boon\Documents\projects\Title...,Andrew S. Tanenbaum and Herbert Bos. 2014. Mod...,9411,,Montserrat,7,103,20133,Montserrat_7_nan
3764,C:\Users\Maarten Boon\Documents\projects\Title...,William Stallings. 2018. Operating Systems: In...,9414,,Montserrat,7,144,20133,Montserrat_7_nan
