In [2]:
# auto reload
%load_ext autoreload
%autoreload 2

import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import os
import scipy
import random
from glob import glob
from skimage import transform
from tqdm import tqdm

# Best-effort switch of the working directory so that the relative paths used
# in the following cells ("./datasets/...", "./assets/...") resolve.
try:
    print(os.getcwd())
    os.chdir("../../rug-hr/")
    print(os.getcwd())
except OSError as err:
    # Only filesystem errors are tolerated (e.g. the target directory does not
    # exist from the current location). Report the failure instead of hiding
    # it, and keep going from the current directory.
    print("Could not change working directory:", err)

In [2]:
# Load the corpus text from ./datasets/lob_corpus.txt
with open("./datasets/lob_corpus.txt", "r") as corpus_file:
    raw_text = corpus_file.read()

# Keep only the space character and the range '!'..'~', i.e. code points 32-126.
kept_chars = [ch for ch in raw_text if 32 <= ord(ch) <= 126]
# Collapse every run of spaces into a single space (this also trims both ends).
corpus = " ".join("".join(kept_chars).split())

len(corpus.split(" ")), "tokens in corpus"

(928728, 'tokens in corpus')

In [3]:
from modules.scripturize import sample_manuscript_from_corpus
from modules.scripturize.utils.font import load_font
from modules.scripturize.utils.plot import show_labelled_manuscript
from modules.scripturize.utils.transforms import center_text
from modules.scripturize.text_writer import generate_image_from_token, time2token

# Font object passed as `loaded_font=` to generate_image_from_token in the cells below.
# NOTE(review): the second argument (64) is presumably the font size - confirm against load_font.
loaded_font = load_font("./assets/fonts_cursive/Wolf in the City Light.ttf", 64)

Loaded images from directory: ./assets/scripture_prototypes/.
Loaded images from directory: ./assets/scripture_corruption_prototypes/.


In [4]:
# Cut a random window of `text_length` characters out of the corpus.
text_length = 1000
page = np.random.randint(0, len(corpus) - text_length)
sample_text = corpus[page : page + text_length]


def render_token(token, f):
    """Forward to generate_image_from_token with the notebook-level `loaded_font`."""
    return generate_image_from_token(token, f, loaded_font=loaded_font)


# Keyword options handed to sample_manuscript_from_corpus.
sampling_options = dict(
    margin=4,
    size=(256, 2048),
    allow_intersections=True,
    p_prototype=0.0,
    p_corrupt=0.0,
    direction=1,
    font_size_range=(32, 128),
    char_line_shift_range=(2, 10),
    char_spacing_range=(2, 16),
    line_spacing_range=(4, 32),
    time2token=time2token,
    token2image=render_token,
    spacing_token=" ",
)
manuscript, params = sample_manuscript_from_corpus(sample_text, **sampling_options)

output = center_text(manuscript["raw"])
# Number of labels spanned by the characters between " " and "~".
label_count = ord("~") - ord(" ")
show_labelled_manuscript(
    output,
    params=params,
    figsize=(15, 15),
    random_crop_size=None,
    interactive=True,
    nlabels=label_count,
)
print("range for output:", np.min(output), np.max(output))
sample_text


(90, 1864) 20 labels are present
range for output: 0 89


'med tohell fire in the next world. It inevitably followed that they werenot likely to be treated with much consideration by Christians in thisone. The intolerance was not, of course, only on one side. TheChristian crusade had its counterpart in the Muslim jihad, orholy war against the unbeliever. The orthodox Muslim regarded withhorror all those who would "give associates to God"; and this wasjust what the Christians did with their Trinity, their Virgin Mary,and (to some extent) with their saints. Medieval Europe was a harsh and rugged school, and the softergraces of civilization were not more widely cultivated in Portugalthan they were elsewhere. A turbulent and treacherous nobility andgentry; an ignorant and lax clergy; doltish, if hard-working, peasantsand fishermen; and a town rabble of artisans and day-labourers, likethe Lisbon mob described by Ec?6a de Queiroz five centuries later,"fanatical, filthy, and ferocious"- these constituted the socialclasses from which the pioneer disc

In [None]:
from modules.scripturize.dataset import generate_manuscript_dataset

# Every .ttf / .otf file found in the handwritten-fonts folder.
available_fonts = glob("./datasets/fonts_handwritten/*.ttf") + glob("./datasets/fonts_handwritten/*.otf")
# When `additive` is set, a random numeric prefix is prepended to each file alias
# (presumably so a new run does not collide with files from earlier runs - TODO confirm).
additive = True
alias = ""
if additive:
    alias = str(np.random.randint(0, 100000)) + "_"
print("Number of fonts:", len(available_fonts))

for i, font in enumerate(available_fonts):
    try:
        loaded_font = load_font(font, 64)
        font_name = loaded_font.getname()[0]
        # Keep only the space character and '!'..'~' (code points 32-126) in the alias.
        font_name = alias + "".join([c for c in font_name if 32 <= ord(c) <= 126])
        print(f"({i+1}/{len(available_fonts)}) Generating dataset for font:", font_name)
        dataset = generate_manuscript_dataset(corpus, 
            margin=4,
            size=(256, 2048),
            allow_intersections=True,
            p_prototype=0.0,
            p_corrupt=1,
            direction=1,
            font_size_range=(32, 128),
            char_line_shift_range=(2, 10),
            char_spacing_range=(2, 16),
            line_spacing_range=(4, 32),
            time2token=time2token,
            # Bind this iteration's font as a default argument so the callable
            # never picks up a font assigned by a later loop iteration.
            token2image=lambda token, f, _font=loaded_font: generate_image_from_token(token, f, loaded_font=_font),
            spacing_token=" ",
            # Generator params
            dataset_size=128, 
            out_dir="./datasets/handwitten_manuscripts/", 
            file_alias=font_name,
        )
    except Exception as e:
        # Best-effort: a broken font must not abort the whole run, but name the
        # offending file so the failure can be traced afterwards.
        print(f"error while processing {font!r}:", e)

**Note**: Some of the generated images are blank because of errors in the generator.

In [74]:
from concurrent.futures import ThreadPoolExecutor


# Number of labels spanned by the characters between " " and "~".
nclasses = ord("~")-ord(" ")
print("Number of classes:", nclasses)
dataset_folder = "datasets/handwitten_manuscripts/"
# glob() returns paths in a filesystem-dependent order. The two lists are paired
# positionally later on, so sort both to make the pairing deterministic.
# NOTE(review): this assumes Raw/ and Mutated/ contain matching file names - confirm.
raw_images = sorted(glob(dataset_folder + "Raw/*.png"))
mut_images = sorted(glob(dataset_folder + "Mutated/*.png"))
if len(raw_images) != len(mut_images):
    print(f"Warning: {len(raw_images)} raw vs {len(mut_images)} mutated images; pairs may be misaligned.")

# Problem category -> list of {"pair": (raw_path, mut_path)} records.
errors = {
    "empty": [],
    "invalid_label": [],
    "insignificant": [],
}


def _max_label(img, png_scale=255):
    """Return the largest pixel value of `img` on the integer label scale.

    plt.imread returns PNG data as floats rescaled to [0, 1], so the raw
    maximum of such an array can never exceed a class count. Float data is
    therefore scaled back before it is compared against `nclasses`.
    NOTE(review): `png_scale=255` assumes 8-bit PNG files - confirm.
    """
    peak = np.max(img)
    if np.issubdtype(img.dtype, np.floating):
        peak = np.rint(peak * png_scale)
    return peak


def check_pair(raw_path, mut_path, min_roi=0.01):
    """Read one raw/mutated image pair and record any problems in `errors`.

    Args:
        raw_path: path of the raw image.
        mut_path: path of the mutated image.
        min_roi: minimum fraction of the raw image's height * width that must
            be non-zero in each image.

    A pair can be recorded under more than one category. Nothing is returned;
    the module-level `errors` dict is appended to.
    """
    raw = plt.imread(raw_path)
    mut = plt.imread(mut_path)
    record = {"pair": (raw_path, mut_path)}

    total_area = raw.shape[0] * raw.shape[1]
    min_roi_area = total_area * min_roi
    compute_roi_area = lambda img: np.sum(img > 0)

    # No non-zero pixel at all in either image.
    if np.max(raw) == 0 or np.max(mut) == 0:
        errors["empty"].append(dict(record))

    # Largest label exceeds the number of classes (compared on the integer scale).
    if _max_label(raw) > nclasses or _max_label(mut) > nclasses:
        errors["invalid_label"].append(dict(record))

    # Too few non-zero pixels relative to the raw image area.
    if compute_roi_area(raw) < min_roi_area or compute_roi_area(mut) < min_roi_area:
        errors["insignificant"].append(dict(record))


def _check_job(paths):
    """Unpack one (raw_path, mut_path) tuple into a check_pair call."""
    return check_pair(*paths)


# Pair the two path lists positionally and check the pairs on 8 worker threads.
jobs = zip(raw_images, mut_images)
with ThreadPoolExecutor(max_workers=8) as pool:
    results = pool.map(_check_job, jobs)
    # Drain the iterator so every job finishes while tqdm shows progress.
    for _ in tqdm(results, total=len(raw_images)):
        pass


Number of classes: 94


100%|██████████| 26425/26425 [00:33<00:00, 783.92it/s]


In [67]:
# summary % of errors
total_pairs = len(raw_images)
for k, v in errors.items():
    # Guard against an empty dataset folder (would divide by zero).
    share = len(v) / total_pairs * 100 if total_pairs else 0.0
    print(f"{k}: {share:.2f}%")

# A pair can be listed under several categories (with the default min_roi an
# empty image is also "insignificant"), so count each faulty pair only once.
bad_pairs = {e["pair"] for v in errors.values() for e in v}

print("New dataset after removing errors:")
print(f"Raw: {total_pairs - len(bad_pairs)}")

confirmation = input("Do you want to delete the erroneous images? (y/n): ")
if confirmation == "y":
    for k, v in errors.items():
        print(f"Fixing {k} errors...")
        for e in tqdm(v):
            for path in e["pair"]:
                try:
                    os.remove(path)
                except FileNotFoundError:
                    # Already deleted while handling an earlier category.
                    pass
    print("Done.")    

empty: 0.00%
invalid_label: 0.00%
insignificant: 0.00%
New dataset after removing errors:
Raw: 26425
