In [1]:
from PIL import Image
from pdf2image import convert_from_path, convert_from_bytes
import fitz
import pandas as pd
import easyocr
import cv2
import os
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from io import BytesIO
from shapely.plotting import plot_polygon
from collections import defaultdict
import sys
sys.path.insert(1, 'src')
from text_coverage import compute_area, compute_area_ocr, compute_area_pdf_fonts, easy_ocr, preprocess_text, findwords, decontracted, preprocessed_word, find_ner
from nltk.corpus import words
from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4') 
import math

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/charlottepanuskova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/charlottepanuskova/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Bold: 'On the Farm (Read and Learn)', 'A Big Book of the Dark'
Italics: 'Colour in Nature'

#### PDF method

The method extracts the text from each PDF page and identifies the bounding boxes that contain it.
Boxes that share the same font are then merged, and their text is combined.
It also computes the percentage of the page area that is covered by text.

In [None]:
# PDF method: for every page of `book`
#   1. draw the bounding box of each text span in red on the rendered page,
#   2. report the share of the page area covered by text (compute_area),
#   3. draw the geometries merged per font in green (compute_area_pdf_fonts),
#   4. record the text of every merged group in `text_page`.
book = 'How Animals Sleep'
# Other books this cell has been run on: 'How Flamingo Lost His Colour',
# 'I Am in Love With My Bicycle', "Our Tom's Day", 'Atlas of Cats',
# 'Shapes and Patterns in Nature'
pdf_file = f'PDFs/{book}.pdf'
doc = fitz.open(pdf_file)
text_page = {'Book Name': [], 'page': [], 'text': []}


def _page_axes(page_image, page_rect):
    """Open a new figure that shows `page_image` with axes spanning `page_rect`.

    Returns the (fig, ax) pair so that the caller can draw overlays and close
    the figure once it has been shown.
    """
    fig, ax = plt.subplots(figsize=(10, 14))
    ax.imshow(page_image)
    ax.set_xlim(0, page_rect.width)
    ax.set_ylim(0, page_rect.height)
    # set_ylim(0, height) puts y=0 at the bottom edge; flip the axis so that y
    # grows downwards again, matching the image rows and the span boxes drawn
    # at (x0, y0) below.
    ax.invert_yaxis()
    return fig, ax


for i in range(doc.page_count):
    page = doc[i]

    # Page dimensions, used as the denominator of the coverage percentage
    page_rect = page.rect
    page_area = page_rect.width * page_rect.height

    # Blocks of the page; blocks without a "lines" entry are skipped below
    text_instances = page.get_text("dict")["blocks"]

    # Render the page once and reuse the decoded image for both figures.
    # NOTE: overlays are drawn in page coordinates, which line up with the
    # image pixels only while zoom == 1 (1=72 dpi, 2=144 dpi, etc.).
    zoom = 1
    mat = fitz.Matrix(zoom, zoom)
    pix = page.get_pixmap(matrix=mat, alpha=False)
    img = Image.open(BytesIO(pix.tobytes("png")))

    # --- Figure 1: one red rectangle per text span ---
    fig, ax = _page_axes(img, page_rect)

    number_of_boxes = 0
    rectangles = []
    font_rectangles = defaultdict(list)
    text_previous = ''
    for block in text_instances:
        if "lines" not in block:
            continue
        for line in block["lines"]:
            for span in line["spans"]:
                text = span["text"]
                # Skip a span whose text repeats the span kept just before it
                if text_previous == text:
                    continue
                text_previous = text

                rect = fitz.Rect(span["bbox"])
                rectangles.append(rect)
                number_of_boxes += 1

                # Group spans by the font name before the first hyphen
                font = span["font"].partition('-')[0]
                font_rectangles[font].append(span)

                r = patches.Rectangle(
                    (rect.x0, rect.y0),
                    rect.width,
                    rect.height,
                    linewidth=1,
                    edgecolor='red',
                    facecolor='none'
                )
                ax.add_patch(r)

    merged, text_area, geoms, geoms_area = compute_area(rectangles)

    # Text coverage of the page in percent
    coverage = (text_area / page_area) * 100
    ax.set_title(f"Page {i} — Coverage: {coverage:.2f}%, Boxes: {number_of_boxes}, Geoms: {geoms}")
    ax.axis('off')
    plt.show()
    plt.close(fig)  # two figures are created per page; release them as we go

    # --- Figure 2: geometries merged per font ---
    merged_all, text_area, geoms, geoms_area, geometries_texts = compute_area_pdf_fonts(font_rectangles)
    coverage = (text_area / page_area) * 100
    if geoms > 0:
        fig, ax = _page_axes(img, page_rect)
        for m in merged_all:
            plot_polygon(polygon=m, ax=ax, add_points=False, color="green")
        ax.set_title(f"Page {i} — Coverage: {coverage:.2f}%, Boxes: {geoms}")
        ax.axis('off')
        plt.show()
        plt.close(fig)

        for font, spans in geometries_texts.items():
            print(f"Font: {font}")
            text = preprocess_text(' '.join([span['text'] for span in spans]))
            text = text.strip()
            # Drop an exact doubling: when the second half repeats the first
            # half character for character, keep a single copy.
            middle = len(text) // 2
            if text[0:middle] == text[middle:]:
                text = text[0:middle]
            print(text)
            text_page['Book Name'].append(book)
            text_page['page'].append(i)
            text_page['text'].append(text)


#### Find whether words are English or names (mostly Czech)

In [20]:
import requests
import time

time_sleep = 0.3  # seconds to pause after every NameTag request

# Base query parameters for the NameTag request. The 'data' entry is only a
# placeholder: find_ner sends a per-call copy with the real text.
params = {'data' : 'data',
        'model': 'nametag3-czech-cnec2.0-240830',
        'input': 'vertical',
        'output': 'vertical'}


def find_ner(text):
    """Return the first named entity the NameTag service finds in `text`.

    Every whitespace-separated word of `text` is capitalised before it is
    sent (presumably because the callers pass lower-cased words — confirm).
    The service answers in "vertical" format; each row is read as
    tab-separated fields with the entity type in the second field and the
    entity text in the third.

    NOTE(review): this definition shadows the `find_ner` imported from
    `text_coverage` in the first cell.

    Parameters
    ----------
    text : str
        Word or phrase to check.

    Returns
    -------
    str or None
        The lower-cased text of the first entity, or None when the service
        reports no entity, answers 204, or the request fails. Failures are
        printed, not raised.
    """
    try:
        capitalised = ' '.join(word.capitalize() for word in text.split())
        # Build a per-call copy so that the shared `params` dict is never mutated
        request_params = {**params, 'data': capitalised}
        response = requests.get(
            url="http://lindat.mff.cuni.cz/services/nametag/api/recognize",
            params=request_params,
            timeout=30,  # do not hang the notebook on an unresponsive service
        )
        response.raise_for_status()  # raises exception when not a 2xx response
        if response.status_code == 204:
            return None
        for row in response.json()['result'].split('\n'):
            fields = row.split('\t')
            if len(fields) >= 3:
                print(f'name: {fields[2]}, type: {fields[1]}')
                return fields[2].lower()
    except Exception as e:
        # Best effort: report the problem and treat the word as "not a name"
        print("Error:", e)
    finally:
        # Pause after every call, including the ones that return an entity
        time.sleep(time_sleep)

    return None

# Gather, across all extracted texts, the words that the English spell checker
# does not know and that find_ner does not report as a name either.
glued_words = []
spell = SpellChecker(language='en')
lemmatizer = nltk.stem.WordNetLemmatizer()
for word_list in text_page['text']:
    # Replace typographic apostrophes first so contractions can be expanded
    expanded = decontracted(word_list.replace('’', "'"))
    candidates = [
        preprocessed_word(lemmatizer, token)
        for token in expanded.split()
        if token.isalpha() and len(token) > 1
    ]
    misspelled = spell.unknown(candidates)
    if not misspelled:
        continue

    print(f"Misspelled words: {misspelled}")
    # Look every unknown word up before removing anything from the set
    recognised = [hit for hit in map(find_ner, misspelled) if hit is not None]
    misspelled.difference_update(recognised)
    glued_words.extend(misspelled)


Misspelled words: {'katarína', 'bartíková', 'petra', 'macurová'}
name: Katarína, type: gu
name: Bartíková, type: ps
name: Petra, type: pf
name: Macurová, type: ps
Misspelled words: {'katarína', 'bartíková', 'petra', 'macurová'}
name: Katarína, type: gu
name: Bartíková, type: ps
name: Petra, type: pf
name: Macurová, type: ps
Misspelled words: {'te'}
Misspelled words: {'th', 'falle', 'tr'}
Misspelled words: {'ofs'}
Misspelled words: {'pil', 'ows'}
Misspelled words: {'sc', 'ina', 'ng'}
Misspelled words: {'placessomething'}
Misspelled words: {'smal', 'mamals'}
Misspelled words: {'el'}
Misspelled words: {'asian'}
name: Asian, type: pc
Misspelled words: {'te', 'ag'}
Misspelled words: {'gerbi'}
Misspelled words: {'slele'}
Misspelled words: {'erbil'}
name: Erbil, type: pf
Misspelled words: {'nothin', 'havin', 'ood'}
name: Havin, type: ps
Misspelled words: {'ng'}
Misspelled words: {'iraffe'}
name: Iraffe, type: p_
Misspelled words: {'lon'}
name: Lon, type: pf
Misspelled words: {'che'}
Misspelle

#### Are there words glued together in the string? 

In [None]:
# Probe every leftover word with findwords, one exact length at a time
# (lengths 3 to 9, capped at the length of the word itself).
for glued_word in glued_words:
    print(f"Checking glued word: {glued_word}")
    longest = min(9, len(glued_word))
    for size in range(3, longest + 1):
        findwords(spell, glued_word, minlength=size, maxlength=size)

Checking glued word: te
Checking glued word: th
Checking glued word: falle
0: 'all' at position 1 (3)
0: 'fall' at position 0 (4)
Checking glued word: tr
Checking glued word: ofs
Checking glued word: pil
Checking glued word: ows
Checking glued word: sc
Checking glued word: ina
Checking glued word: ng
Checking glued word: placessomething
0: 'lac' at position 1 (3)
1: 'ace' at position 2 (3)
2: 'som' at position 6 (3)
3: 'met' at position 8 (3)
4: 'hin' at position 11 (3)
0: 'lace' at position 1 (4)
1: 'aces' at position 2 (4)
2: 'cess' at position 3 (4)
3: 'some' at position 6 (4)
4: 'thin' at position 10 (4)
0: 'place' at position 0 (5)
1: 'laces' at position 1 (5)
2: 'thing' at position 10 (5)
0: 'places' at position 0 (6)
0: 'something' at position 6 (9)
Checking glued word: smal
Checking glued word: mamals
0: 'mam' at position 0 (3)
0: 'mama' at position 0 (4)
Checking glued word: el
Checking glued word: te
Checking glued word: ag
Checking glued word: gerbi
Checking glued word: slel

In [None]:
# Report every token that is not in the NLTK `words` corpus.
# The corpus is loaded once into a set: calling words.words() for every token
# rebuilds the full list each time and scans it linearly.
# NOTE(review): the first cell only downloads 'wordnet' and 'omw-1.4'; the
# 'words' corpus must already be installed for this cell to run.
english_vocabulary = set(words.words())
for word_list in text_page['text']:
    for word in word_list.split():
        # Strip surrounding punctuation and compare in lower case
        word = word.strip('.,!?()[]{}"\'').lower()
        if word not in english_vocabulary:
            print(f"Non-english word: {word}")


In [64]:
# Save the collected texts as an Excel sheet.
# The target folder is created first so the export does not fail on a fresh checkout.
output_dir = 'excel tables'
os.makedirs(output_dir, exist_ok=True)

# NOTE: `text_page` is rebound from the dict of lists to a DataFrame here.
text_page = pd.DataFrame(text_page)
text_page.to_excel(f'{output_dir}/Classifier_text.xlsx', index=False)

#### OCR method
This method is used as a fallback when the PDF method is not possible.

In [None]:
# OCR method: run EasyOCR on every page image of `book`, measure how much of
# each page is covered by confidently recognised text, and plot the boxes.
reader = easyocr.Reader(['en'], gpu=True)

conf = 0.40  # minimum OCR confidence for a box to be counted and drawn

# Running totals over the whole book (read by the statistics cell below)
book_image_surface = 0
book_text_surface = 0
book = 'This Is Our Piggy'

# Count only the page images: any other directory entry (e.g. .DS_Store)
# would make the loop ask for a page file that does not exist.
number_of_pages = sum(
    1 for entry in os.listdir(f'JPGs/{book}/') if entry.lower().endswith('.jpg')
)
for i in range(0, number_of_pages):

    test_img = f'JPGs/{book}/{book}_{i}.jpg'
    raw_text, results = easy_ocr(reader, test_img)

    # Surface of each confident box; rows below the threshold stay NaN.
    # Assumes bbox[0] and bbox[2] are opposite corners of the box — TODO confirm.
    is_confident = results['conf'] > conf
    results['surface'] = results[is_confident].bbox.apply(
        lambda x: (x[2][0] - x[0][0]) * (x[2][1] - x[0][1])
    )
    confident_results = results[is_confident]

    # Read image with OpenCV (for size); imread returns None instead of raising
    im = cv2.imread(test_img)
    if im is None:
        raise IOError(f"Could not read image: {test_img}")
    h, w, _ = im.shape
    image_surface = w * h
    book_image_surface += image_surface

    if len(confident_results) > 0:
        text_area, number_of_boxes, geoms_area = compute_area_ocr(confident_results)
    else:
        text_area, number_of_boxes, geoms_area = 0, 0, [0]
    # Accumulate the text surface so the book-level percentage is meaningful
    book_text_surface += text_area

    coverage = text_area / image_surface * 100
    print(f"Page {i} coveres {coverage:.2f}% of the page and has {number_of_boxes} boxes.")

    # Plot the page with one red rectangle per confident box
    fig, ax = plt.subplots(figsize=(10, 14))
    im_rgb = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
    ax.imshow(im_rgb)

    for box in confident_results['bbox']:
        rect_x = box[0][0]
        rect_y = box[0][1]
        rect_w = box[2][0] - box[0][0]
        rect_h = box[2][1] - box[0][1]

        rect = patches.Rectangle(
            (rect_x, rect_y),
            rect_w,
            rect_h,
            linewidth=1,
            edgecolor='red',
            facecolor='none'
        )
        ax.add_patch(rect)

    ax.set_title(f"Page {i} — Coverage: {coverage:.2f}%, Boxes: {number_of_boxes}")
    ax.axis('off')
    plt.show()
    plt.close(fig)  # one figure per page; release it once shown


In [47]:
# Book-level statistics. Relies on `book`, `book_text_surface`,
# `book_image_surface` and `number_of_pages` from the OCR cell above.
book_percentage = book_text_surface/book_image_surface*100
print(f'Percentage of text in {book}: {book_percentage} %')

# text from PDF
book_text_path = f'Plain text/{book}.txt'
# Use a context manager so the file handle is closed after reading
with open(book_text_path) as f:
    text = f.read()
number_of_words = len(text.split())
print(f'Number of words: {number_of_words}')
average_number_of_words = number_of_words/number_of_pages
print(f'Average number of words per page: {average_number_of_words}')

Percentage of text in Shapes and Patterns in Nature: 9.9135730381641 %
Number of words: 2882
Average number of words per page: 137.23809523809524


In [4]:
import pymupdf
from pypdf import PdfReader

### From: https://stackoverflow.com/questions/52346942/how-to-replace-delete-text-from-a-pdf-using-python
# Remove the text from the first pages of a book and save the result as a new PDF.
# pypdf supplies the text lines of each page; PyMuPDF locates them on the page
# and redacts the matching rectangles.

book = 'I Am in Love With My Bicycle'
file_path = f"PDFs/{book}.pdf"
doc = pymupdf.open(file_path) # open a document

reader = PdfReader(file_path)

# Only the first four pages are processed (fewer if the document is shorter)
for i in range(0, min(4, doc.page_count)):
    page = doc.load_page(i)
    print(f'Page {i}')

    lines = reader.pages[i].extract_text().split('\n')
    print(lines)
    for line in lines:
        draft = page.search_for(line)
        try:
            for rect in draft:
                page.add_redact_annot(rect)
        except Exception:
            print('No Text')

    # Apply all redactions of the page in a single call that leaves images
    # untouched. Applying them with the default image handling first would
    # consume the annotations, and this call would have nothing left to do.
    page.apply_redactions(images=pymupdf.PDF_REDACT_IMAGE_NONE)

doc.save(f"{book}_without_text.pdf", garbage=3, deflate=True)

Page 0
['I AM IN LOVE WITH MY', 'mAgda gArgulákOvá & MArie urbánkOVá', 'mAgda gArgulákOvá & MArie urbánkOVáI AM IN LOVE WITH MY ', 'IF YOU HAVE A BIKE, GET ON! If you haven’t – we’ll lend you one! ', 'Then let us go out on the lanes and roads, through the mud and puddles. ', 'A bike takes you anywhere you fancy. On a bike, you’re free and unstoppable. ', 'Just pedal… and understand what’s going on around you. Whether you’re an ', 'avid cyclist or just starting out, it’s good to know a few things about bikes – such ', 'as which is the right one for you, how high to set your saddle, and what you ', 'should never, ever do on a bike. So, why not read all about it – in this book! ', 'It may be small, but it’s packed with fun and information. You’ll learn why your ', 'helmet should be a good fit, as well as lots of other things about bikes and ', 'cycling. You’ll take a ride in a fun bicycle maze and get to know for yourself why ', 'a bike sometimes needs mudguards. But enough of this talk! 

In [27]:
from pypdf import PdfReader

# Open the book with pypdf and show what get_fields() reports for it
# (the value of the last expression is the cell output).
shapes_pdf_path = "PDFs/Shapes and Patterns in Nature.pdf"
doc = PdfReader(shapes_pdf_path)
doc.get_fields()


{}

In [38]:
# Show the `page_layout` attribute of `doc` as the cell output.
# NOTE(review): relies on `doc` still being the pypdf PdfReader created in the
# previous cell; other cells rebind `doc` to a PyMuPDF document — confirm the
# execution order.
doc.page_layout

In [None]:
# Group the text of one page by (font, size), starting a new line in a group
# whenever the font or size differs from the previous span.
book = 'Atlas of Cats'
pdf_file = f'PDFs/{book}.pdf'
doc = fitz.open(pdf_file)

page = doc[5]
# Read the blocks of *this* page rather than relying on a `text_instances`
# variable left over from another cell.
page_blocks = page.get_text("dict")["blocks"]

text_dict = defaultdict(list)
last_font = None  # (font, size) of the previously visited span

for block in page_blocks:
    # Blocks without a "lines" entry contribute no spans
    for line in block.get("lines", ()):
        for span in line["spans"]:
            font = span["font"]
            size = span["size"]
            font_size = (font, size)
            if font_size != last_font:
                # Font or size changed: mark the switch with a leading newline
                text_dict[font_size].append('\n' + span["text"])
                last_font = font_size
            else:
                text_dict[font_size].append(span["text"])

In [25]:
# Print every (font, size) key that was collected, then show the joined text
# of one of the groups as the cell output.
font_key = ('MrDodo-LightRounded', 12.0)
print(text_dict.keys())
' '.join(text_dict[font_key])

dict_keys([('MrsWhite', 14.0), ('MrsWhite', 15.0), ('BiscuitChicken', 15.0), ('MrDodo-LightRounded', 12.0), ('BiscuitChicken', 10.0), ('BiscuitChicken', 20.0), ('MrDodo-RegularRounded', 20.0), ('BiscuitChicken', 20.124610900878906), ('BiscuitChicken', 18.0), ('BiscuitChicken', 48.0), ('MrsWhite', 18.0), ('MrDodo-RegularRounded', 12.0)])


"\nRomper \nYou may pet me whenever \n I feel like it. \nAny household cat that has the appearance and temperament \n described above can be called a European shorthair, though it's  true that such a cat is pretty hard to find in village squares \n or animal shelters. \nJust like wild animals defend their territory, so can I fight for  the right to rule my home. Over time, I may come to accept a  new cat or dog friend, but it usually takes me a while. After all,  I must make sure that my human pack still loves me since our  house is so crowded all of a sudden!  \nEach one of us is unique. That's because we are not  usually bred, and unlike other breeds don’t have to meet  strict rules on personality and temperament. We tend to  be smart (naturally), playful, and full of energy, but not  crazy hotheads. An afternoon snooze on a window sill?  Yes, please. What we really love, though, is hunting — our \n hunting instincts are very strong. \n1. Chinese Li Hua 2. Australian Mist 3. Desert L