In [1]:
#!pip uninstall fitz PyMuPDF --yes
#!pip install PyMuPDF
#!pip install PyMuPDF==1.19.1  ## install a release later than 1.19.0
#!pip install PyMuPDF==1.18.19  ## don't install 1.19.0 because of Wrong x0/y0 values for bboxes in get_text("words"), see: https://github.com/pymupdf/PyMuPDF/issues/1328
# ## Releases of PyMuPDF as of Oct 18, 2021: 1.11.2, 1.12.5, 1.13.20, 1.14.19.post2, 1.14.20, 1.14.21, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.16.4, 1.16.5, 1.16.6, 1.16.7, 1.16.8, 1.16.9, 1.16.10, 1.16.11, 1.16.12, 1.16.13, 1.16.14, 1.16.15, 1.16.16, 1.16.17, 1.16.18, 1.17.0, 1.17.1, 1.17.2, 1.17.3, 1.17.4, 1.17.5, 1.17.6, 1.17.7, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 1.18.5, 1.18.6, 1.18.7, 1.18.8, 1.18.9, 1.18.10, 1.18.11, 1.18.12, 1.18.13, 1.18.14, 1.18.15, 1.18.16, 1.18.17, 1.18.18, 1.18.19, 1.19.0


In [2]:
!open -a Preview book-1.pdf book-2.pdf book-3.pdf

In [3]:
config = [
   {"filename" : "NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 1.pdf",
    "name" : "book-1",
    "page_count" : 642,
    "sections" : {
      "preface" : [1, 78],
      "toponymique" : [49, 76],
      "abbrev" : [77, 78],
      "content" : [79, 607],
      "sample" : [79, 79+10],
      "index" : [617, 639]},
    },
   {"filename" : "NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 2.pdf",
    "name" : "book-2",
    "page_count" : 725,
    "sections" : {
      "preface" : [1, 79],
      "abbrev" : [6, 7],
      "content" : [8, 700],
      "sample" : [8, 8+10],
      "index" : [704, 725]},
    },
   {"filename" : "NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 3.pdf",
    "name" : "book-3",
    "page_count" : 588,
    "sections" : {
      "preface" : [1, 7],
      "abbrev" : [6, 7],
      "content" : [8, 554],
      "sample" : [8, 8+10],
      "index" : [556, 583],
      "familyidx" : [584, 585]},
    },
]


In [4]:
import sys, fitz, re
from pprint import pprint

## Purpose: this cell is a pre-processing step that will create a symlink (e.g. book-1.pdf)
##          that points to the original file, and then will create new PDF documents
##          that are excerpts of subsets of the whole book
##          as named sections such as book-1-index.pdf or book-1-abbrev.pdf
##          The file book-X-sample.pdf contains the first 10 pages of the book-1-content.pdf
##          and the sample is used to speed up development/testing later on

## https://pymupdf.readthedocs.io/en/latest/document.html#Document.delete_pages
##    In general, the performance of this method is dependent on the number of remaining pages
##    NOT on the number of deleted pages
## So, keeping fewer pages is faster, and deleting more pages at once is also faster 
def excerpt(infile, outfile, start, end):
  doc = fitz.open(infile)  # open document
  last_page = doc.page_count
  print(start, end, last_page, outfile)
  deleteme = []
  if start - 1 > 0:
    deleteme.extend(list(range(0, start - 1)))       
  if end < last_page:
    deleteme.extend(list(range(end, last_page)))
  # print(deleteme)
  doc.delete_pages(deleteme)
  doc.save(outfile, garbage=4, clean=True)

import os, os.path
for book in config:
   symlink = book["name"] + ".pdf"
   fname = book["filename"]
   if os.path.exists(book["filename"]) and not os.path.exists(symlink):
      print("Creating symlink: ", symlink, " ->", fname)
      os.symlink(book["filename"], symlink)
   for section in book["sections"]:
      #print(s)
      start, end = book["sections"][section];
      out_fname = book["name"] + "-" + section + ".pdf"
      excerpt(book["filename"], out_fname, start, end)

#!open -a Preview book-[1,2,3]-*.pdf

## Runtime of this cell is ~11m38s

1 78 642 book-1-preface.pdf
49 76 642 book-1-toponymique.pdf
77 78 642 book-1-abbrev.pdf
79 607 642 book-1-content.pdf
79 89 642 book-1-sample.pdf
617 639 642 book-1-index.pdf
1 79 725 book-2-preface.pdf
6 7 725 book-2-abbrev.pdf
8 700 725 book-2-content.pdf
8 18 725 book-2-sample.pdf
704 725 725 book-2-index.pdf
1 7 588 book-3-preface.pdf
6 7 588 book-3-abbrev.pdf
8 554 588 book-3-content.pdf
8 18 588 book-3-sample.pdf
556 583 588 book-3-index.pdf
584 585 588 book-3-familyidx.pdf


In [5]:
# !open -a Preview book-[1,2,3]-*.pdf


In [6]:
# !open -a Preview book-[1,2,3]-index.pdf


In [37]:
# https://pymupdf.readthedocs.io/en/latest/faq/#how-to-analyze-font-characteristics

import fitz
import pprint, re

def flags_decomposer(flags):
    """Make font flags human readable."""
    l = []
    if flags & 2 ** 0:
        l.append("superscript")
    if flags & 2 ** 1:
        l.append("italic")
    if flags & 2 ** 2:
        l.append("serifed")
    else:
        l.append("sans")
    if flags & 2 ** 3:
        l.append("monospaced")
    else:
        l.append("proportional")
    if flags & 2 ** 4:
        l.append("bold")
    return ", ".join(l)

class Face:
    def __init__(self, font, size, color, flags=0):
        self.font = font
        self.size = size
        self.color = color
        self.flags = flags
        self.style = flags_decomposer(self.flags)
    def __str__(self):
        return f"Font: '{self.font}' ({self.style}), size {self.size:g}, color " + ("#%06x" % self.color)
print(Face("Times", 10.929, 0, 4))


Font: 'Times' (serifed, proportional), size 10.929, color #000000


In [38]:
import sys, fitz, re
from pprint import pprint
# fname = "book-1-sample.pdf"
fname = "book-3.pdf"
doc = fitz.open(fname)  # open document

from fitz.utils import getColor
#blue = getColor("aliceblue")

# txtpgoffset = 77
txtpgoffset = 0
# start_page = 1 + txtpgoffset
start_page = 1
#end_page = start_page + 11
#end_page = start_page + 150
#end_page = start_page + 530
end_page = doc.page_count
items = []
mode = "searching"
print(mode)
for i in range(start_page, end_page):
    page = doc[i]
    # read page text as a dictionary, suppressing extra spaces in CJK fonts
    flags = 0 | fitz.TEXT_INHIBIT_SPACES | fitz.TEXT_DEHYPHENATE
    blocks = page.get_text("dict", flags=flags)["blocks"]
    for x,b in enumerate(blocks):  # iterate through the text blocks
        br = fitz.Rect(b["bbox"])
        #annot = page.add_rect_annot(br)
        for y,l in enumerate(b["lines"]):  # iterate through the text lines
            lr = fitz.Rect(l["bbox"])
            for z,s in enumerate(l["spans"]):  # iterate through the text spans
                if z==0 and mode == "found":
                    mode = "spanning"
                #s["style"] = flags_decomposer(s["flags"])  # readable font flags
                face = Face(s["font"], s["size"], s["color"], s["flags"])
                t = s["text"]
                ## FIXME: [83, 'N  Cystopteris  filix-fragilis  (L.) Borb. —  Polypodiumfilix-fragile  L.,  Cyst,  fragilis']
                ## FIXMECLASS: OCR Error; a spurious mark in the text preceding the genus is interpreted as a letter 'N'
                if z==0 and mode != "found" and re.search(r"Times-Bold\b", str(face)) and s["size"] > 10.0:
                        r_rect = fitz.Rect(lr)
                        r_high = fitz.Rect(s["bbox"])
                        joined = " ".join(list(map(lambda x: x["text"].strip(), l["spans"])))  ## FIXMEDESC: ideally this simplistic code would work if not for OCR Errors
                        item = joined
                        # found = re.match(r"(?P<genus>([A-Z]\w+) (?P<species>\w+)", joined)
                        # d = found.groupdict()
                        matches = re.match(r"([A-Z]\w+) (\w+)", joined)
                        warning = "Possible OCR Error"
                        if matches and len(matches.groups()) == 2:
                            found = matches.groups()
                            warning = "Normal"
                            mode = "found"
                            # item = f"""${d['genus']} ${d['species']}"""
                            item = " ".join(found)
                        else:
                            warning = "Possible OCR Error"

                        print([i, x, y, z, item, warning])
                        annot_high = page.add_highlight_annot(r_high)
                        items.append([t, r_rect, r_high, i, z,s, y,l, x,b, warning])
print(len(items))
#print(i, items[0][:3])

searching
[6, 8, 5, 0, 'JL', 'Possible OCR Error']
[7, 6, 0, 0, 'Rhododendron ponticum', 'Normal']
[8, 4, 0, 0, 'Arbutus unedo', 'Normal']
[8, 8, 0, 0, 'Arbutus andrachne', 'Normal']
[9, 1, 0, 0, 'Erica manipuliflora', 'Normal']
[9, 13, 0, 0, 'Pentapera sicula', 'Normal']
[10, 7, 0, 0, 'Samolus valerandi', 'Normal']
[10, 16, 0, 0, 'Ânagallis arvensis L. — A. phoenicea Scop., A. caerulea L., A. latifolia L.', 'Possible OCR Error']
[11, 18, 0, 0, 'Lysimachia dubia', 'Normal']
[11, 25, 0, 0, 'Asterolinon linum', 'Normal']
[12, 12, 0, 0, 'Cyclamen coum', 'Normal']
[12, 21, 0, 0, 'Cyclamen libanoticum', 'Normal']
[13, 9, 0, 0, 'Cyclamen persicum', 'Normal']
[14, 7, 0, 0, 'Androsace villosa', 'Normal']
[14, 13, 0, 0, 'Androsace multiscapa', 'Normal']
[14, 18, 0, 0, 'Androsace maxima', 'Normal']
[15, 3, 0, 0, 'Primula vulgaris', 'Normal']
[16, 19, 0, 0, 'Acantholimon damassanum', 'Normal']
[17, 14, 0, 0, 'Acantholimon acerosum', 'Normal']
[18, 1, 0, 0, 'Acantholimon antilibanoticum', 'Normal'

In [39]:
## This cell requires ~20 mins to calculate results from the first book

import math
doc = fitz.open(fname)  # open document
#subset = items[:6]
subset = items
boxes = []
for it, item in enumerate(subset):
    mode = "searching"
    if (it >= len(subset) - 1):
        break
    next_item = subset[it + 1]
    [t0, rr0, rh0, i0, z0,s0, y0,l0, x0,b0, warning0] = item
    [t, rr, rh, i, z,s, y,l, x,b, warning] = [t0, rr0, rh0, i0, z0,s0, y0,l0, x0,b0, warning0]
    [t1, rr1, rh1, i1, z1,s1, y1,l1, x1,b1, warning1] = next_item
    page = doc[i0]
    flags = 0 | fitz.TEXT_INHIBIT_SPACES | fitz.TEXT_DEHYPHENATE
    blocks = page.get_text("dict", flags=flags)["blocks"]
    blocks = blocks[x:]
    r_rect = rr0
    
    #print(t0)
    # if t0 == "Isoetes olympica":
    #print([int(it), len(subset), rr0, t0, "\n", t1])
    
    annot_rect = page.add_rect_annot(rr0)
    annot_rect.set_border(width=1, dashes=[1,2])
    annot_rect.update()

    
    for x,b in enumerate(blocks):  # iterate through the text blocks
        br = fitz.Rect(b["bbox"])
        #annot = page.add_rect_annot(br)
        for y,l in enumerate(b["lines"][y0:]):  # iterate through the text lines
            lr = fitz.Rect(l["bbox"])
            s = l["spans"][0]
            #for z,s in enumerate(l["spans"]):  # iterate through the text spans
            this_line = " ".join(list(map(lambda x: x["text"], l["spans"])))
            if mode != "found":
                if re.search("Aire g", this_line):
                    mode = s["font"] + " " + str(s["size"])
                    r_rect = r_rect.include_rect(lr)
                    #print(str((x, y, s["size"], s["font"])) + "FOUND: " + this_line)
                elif lr.intersects(rr1):                    
                    mode = "found"
                    #print(str((x, y, s["size"], s["font"])) + "FOUND: " + this_line)
                else:
                    if mode != "searching":
                        (font, size) = mode.split(" ")
                        isize = round(math.ceil(float(size)))
                        if s["size"] >= float(isize):
                            mode = "found"
                    if mode != "found":
                        r_rect = r_rect.include_rect(lr)
                        # if y >= y0:
                        #     r_rect = r_rect.include_rect(lr)
                    
    r_high = rh0
    annot_rect = page.add_rect_annot(r_rect)
    annot_high = page.add_highlight_annot(r_high)
    # boxes.append([t0, start_page + it, r_rect, item]) . ## FIXME: thinking it should be i0 not start_page + it
    boxes.append([t0, i0, r_rect, item])


# print(boxes[0])
print(len(boxes))

#doc.delete_pages(end_page, doc.page_count - 1)
#doc.delete_pages(0, start_page - 1)
marked_epithet_fname = "marked-pages-" + doc.name
doc.save(marked_epithet_fname, garbage=4, clean=True)
#!open -a Preview marked-pages-book-1-sample.pdf
#!open -a Preview marked-pages-book-1.pdf


1144


In [40]:
import pprint
pp = pprint.PrettyPrinter(compact=True)

import math
doc = fitz.open(fname)  # open document
results = {}
for it, thing in enumerate(boxes):
    mode = "searching"
    #print(thing)
    [name, pageno, rrr, item] = thing
    page = doc[pageno] 
    [t0, rr0, rh0, i0, z0,s0, y0,l0, x0,b0, warning0] = item
    flags = 0 | fitz.TEXT_INHIBIT_SPACES | fitz.TEXT_DEHYPHENATE
    blocks = page.get_text("dict", clip=rr0, flags=flags)["blocks"]    


    print(pageno - txtpgoffset, i0, t0)
    results[t0] = {}
    results[t0]["name"] = t0
    results[t0]["name_rects"] = [rr0]
    results[t0]["pdf_page"] = pageno
    results[t0]["book_page"] = pageno - txtpgoffset
    key = "Extra"
    mode = "searching"
    margins = [[(math.ceil(l["bbox"][0]), l["spans"][0]["text"].split(" ")[0]) for l in filter(lambda bl: bl.get("spans"), b["lines"])] for b in blocks]
    margins = [m for n in margins for m in n]
    targets = list(map(lambda n: n[0], filter(lambda m: m[1] in ["L.", "S.", "Aire"] , margins))) or [blocks[0]["lines"][0]["bbox"][0]]
    margins_max = max(targets or margins)
    margins_min = min(list(map(lambda n: n[0], margins)))

    for x,b in enumerate(blocks):  # iterate through the text blocks
        br = fitz.Rect(b["bbox"])
        for y,l in enumerate(b["lines"][y0:]):  # iterate through the text lines
            lr = fitz.Rect(l["bbox"])
            xloc = min(margins_max, math.ceil(lr.x0))
            this_line = re.sub(r"\s+", " ", " ".join(list(map(lambda x: x["text"].strip(), l["spans"])))).strip()
            # matches = re.match("^(?P<label>L\.|S\.|Aire g\w+\.|Not\w+\.|Fleurs|Fructif\.|Fructific|Floraison\w+)\s", this_line)
            #### FIXME: Phyllitis scolopendrium (L.) Newm. —Asplenium scolopendrium L., Scolopendrium
            #### FIXME: Juniper
            matches = re.match("^(?P<label>" + t0 + "|.*?)(\s|$)", this_line)
            if matches:
                d = matches.groupdict()
                if margins_max - xloc < 5:
                    key = d["label"]
                    if ' ' in key:
                        key = "Description"
                    elif key not in ["Description", "L.", "S.", "Aire"]:
                        key = "Extra"
                    elif key in ["Description", "L.", "S.", "Aire"]:
                        pass
                    else:
                        print("UNHANDLED: " + key)
                        key = "Extra"
                frag = results[t0].get(key) or ""
                results[t0][key] = " ".join([frag.strip(), this_line.strip()]).strip()
                key_rects = key + "_rects"
                if not results[t0].get(key_rects):
                    results[t0][key_rects] = []    
                results[t0][key_rects].append(lr)
                #print([x, y, xloc, mode, this_line])
            else:
                print(["ERROR:", x, y, xloc, mode, this_line])
    # pp.pprint(results[t0])
    print()
    #break
                    
    # r_high = rh0
    # annot_rect = page.add_rect_annot(r_rect)
    # annot_high = page.add_highlight_annot(r_high)
    # boxes.append([t0, r_rect, item])




6 6 JL

7 7 Rhododendron

8 8 Arbutus unedo

8 8 Arbutus andrachne

9 9 Erica

9 9 Pentapera sicula

10 10 Samolus

10 10 Ânagallis

11 11 Lysimachia

11 11 Asterolinon

12 12 Cyclamen

12 12 Cyclamen libanoticum

13 13 Cyclamen persicum

14 14 Androsace villosa

14 14 Androsace

14 14 Androsace

15 15 Primula vulgaris

16 16 Acantholimon

17 17 Acantholimon acerosum

18 18 Acantholimon

18 18 Acantholimon libanoticum

19 19 Acantholimon ulicinum

20 20 Goniolimon collinum

20 20 Limonium

21 21 Limonium

21 21 Limonium angustifolium

21 21 Limonium sieberi

22 22 Limonium

22 22 Limonium graecum

22 22 Limonium

23 23 Limoniumglobuliferum

23 23 Psylliostachys spicatus

23 23 Armeria

24 24 Plumbago europaea

25 25 Styrax

25 25 Olea europaea

26 26 Phillyrea

26 26 Fontanesia

27 27 Fraxinus ornus

27 27 Fraxinus excelsior

27 27 Fraxinus syriaca

28 28 Jasminum fruticans

29 29 Vinca

29 29 Vinca

29 29 Vinca libanotica

29 29 Trachomitum venetum

30 30 Nerium oleander

31 31 Blacks

In [41]:
from tqdm import tqdm
import math
# marked_fname = "marked-pages-book-1-sample.pdf"
marked_fname = "marked-pages-book-3.pdf"
#doc = fitz.open(fname)  # open document
doc = fitz.open(marked_fname)  # open document
print(len(results))

from fitz.utils import getColorList
cl = getColorList()
from fitz.utils import getColor

pink = getColor("lightpink")
green = getColor("aquamarine")
blue = getColor("lightskyblue")
gray = getColor("whitesmoke")
yellowish = getColor("antiquewhite")
fills= [pink, green, blue, gray, yellowish]

fc = {"L.":pink, "S.":blue, "Aire":green, "Description":gray, "Extra":yellowish}
for it, name in enumerate(tqdm(results.keys())):
    item = results[name]
    #print(item)
    #print(item["Description"])
    book_page = item["book_page"]
    pdf_page = item["pdf_page"]
    print([it, name, book_page, pdf_page])
    page = doc[book_page]
    for k in filter(lambda x: not re.search(r"_page|_rect", x), item.keys()):
        rects = item[k + "_rects"]
        for r in rects:
            #print(k, r)
            
            if not page.rect.intersects(r):
                print(["ERROR: ", k, r])
            else:
                # #annot = page.add_highlight_annot(fitz.Rect(r))
                # annot_rect = page.add_rect_annot(fitz.Rect(r))
                # #annot_rect.set_colors(stroke=(0,1,0))
                # annot_rect.set_border(width=1, dashes=[1,2])
                # annot_rect.update()

                if k in ["L.", "S.", "Aire", "Description", "Extra"]:
                    #print(["HIGHL: ", fc[k], k, r])
                    annot = page.add_highlight_annot(fitz.Rect(r))
                    annot.set_colors(stroke=fc[k])
                    annot.update()
                    #annot_rect = page.add_rect_annot(fitz.Rect(r))
                    #annot_rect.set_colors(stroke=green)
                    #annot_rect.set_border() #(width=1, dashes=[1,2])
                    #annot_rect.update()
                    #annot_rect.update(fill_color=red)


# doc.delete_pages(end_page, doc.page_count - 1)
# doc.delete_pages(0, start_page - 1)
doc_fname = "results-" + doc.name
doc.save(doc_fname, garbage=4, clean=True)
print(doc_fname)
# !open -a Preview results-marked-pages-book-1-sample.pdf
#!open -a Preview results-marked-pages-book-1.pdf



896


  0%|          | 0/896 [00:00<?, ?it/s]

[0, 'JL', 6, 6]
[1, 'Rhododendron', 7, 7]
[2, 'Arbutus unedo', 8, 8]
[3, 'Arbutus andrachne', 8, 8]
[4, 'Erica', 9, 9]
[5, 'Pentapera sicula', 9, 9]
[6, 'Samolus', 10, 10]
[7, 'Ânagallis', 10, 10]
[8, 'Lysimachia', 11, 11]
[9, 'Asterolinon', 11, 11]
[10, 'Cyclamen', 12, 12]
[11, 'Cyclamen libanoticum', 12, 12]
[12, 'Cyclamen persicum', 13, 13]
[13, 'Androsace villosa', 14, 14]
[14, 'Androsace', 14, 14]
[15, 'Primula vulgaris', 15, 15]
[16, 'Acantholimon', 18, 18]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[17, 'Acantholimon acerosum', 17, 17]
[18, 'Acantholimon libanoticum', 18, 18]
[19, 'Acantholimon ulicinum', 19, 19]
[20, 'Goniolimon collinum', 20, 20]
[21, 'Limonium', 22, 22]
[22, 'Limonium angustifolium', 21, 21]
[23, 'Limonium sieberi', 21, 21]
[24, 'Limonium graecum', 22, 22]
[25, 'Limoniumglobuliferum', 23, 23]
[26, 'Psylliostachys spicatus', 23, 23]
[27, 'Armeria', 23, 23]
[28, 'Plumbago europaea', 24, 24]
[29, 'Styrax', 25, 25]
[30, '

  9%|▉         | 82/896 [00:00<00:01, 809.29it/s]

[81, 'Cuscuta approximata', 51, 51]
[82, 'Cuscuta balansae', 52, 52]
[83, 'Cuscuta brevistyla', 53, 53]
[84, 'Cuscuta europaea', 53, 53]
[85, 'Cuscuta babylonica', 54, 54]
[86, 'Cuscuta pedicellata', 54, 54]
[87, 'Cuscuta campestris', 54, 54]
[88, 'Heliotropium supinum', 56, 56]
[89, 'Heliotropium myosotoides', 56, 56]
[90, 'Heliotropium lasiocarpum', 57, 57]
[91, 'Heliotropium dolosum', 57, 57]
[92, 'Heliotropium', 59, 59]
[93, 'Heliotropium bovei', 58, 58]
[94, 'Heliotropium ramosissimum', 59, 59]
[95, 'Asperugo procumbens', 60, 60]
[96, 'Caccinia', 60, 60]
[97, 'Lappula spinocarpos', 61, 61]
[98, 'Lappula barbata', 61, 61]
[99, 'Lappula sinaica', 62, 62]
[100, 'Lappula', 62, 62]
[101, 'Heterocaryum szovitsianum', 62, 62]
[102, 'Heterocaryum subsessile', 63, 63]
[103, 'Rindera', 63, 63]
[104, 'Paracaryum rugulosum', 64, 64]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[105, 'Mattiastrum lithospermifolium', 64, 64]
[106, 'Mattiastrum lamprocarpu

 19%|█▉        | 168/896 [00:00<00:00, 838.67it/s]

[168, 'Borago', 99, 99]
[169, 'Cordia', 100, 100]
[170, 'Verbena', 100, 100]
[171, 'Verbena supina', 101, 101]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[172, 'Vitex', 102, 102]
[173, 'Ajuga orientalis', 103, 103]
[174, 'Ajuga laevigata', 103, 103]
[175, 'Ajuga chia', 104, 104]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[176, 'Ajuga tridactylites', 104, 104]
[177, 'Ajuga', 106, 106]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[178, 'Teucrium creticum', 108, 108]
[179, 'Teucrium multicaule', 108, 108]
[180, 'Teucrium pruinosum', 108, 108]
[181, 'Teucrium', 115, 115]
[182, 'Teucrium orientale', 109, 109]
[183, 'Teucrium procerum', 109, 109]
[184, 'Teucrium oliverianum', 110, 110]
[185, 'Teucrium parviflorum', 110, 110]
[186, 'Teucrium scordioides', 111, 111]
[187, 'Teucrium spinosum', 111, 111]
[188, 'Teucrium montbretii', 112, 112]
[189, 'Teucrium haradjianii', 1

 29%|██▉       | 261/896 [00:00<00:00, 879.55it/s]

[226, 'Prunella orientalis', 134, 134]
[227, 'Prunella laciniata', 135, 135]
[228, 'Eremostachys', 135, 135]
[229, 'Eremostachys macrophylla', 136, 136]
[230, 'Phlomis', 141, 141]
[231, 'Phlomis kurdica', 138, 138]
[232, 'Phlomis brachyodon', 138, 138]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[233, 'Phlomis chrysophylla', 140, 140]
[234, 'Phlomis viscosa', 140, 140]
[235, 'Phlomis bailanica', 140, 140]
[236, 'Phlomis rigida', 141, 141]
[237, 'Phlomis pungens', 141, 141]
[238, 'Lamium', 146, 146]
[239, 'Lamium veronicifolium', 143, 143]
[240, 'Lamium aleppicum', 144, 144]
[241, 'Lamium ehrenbergii', 144, 144]
[242, 'Lamium adoxifolium', 145, 145]
[243, 'Lamium truncatum', 145, 145]
[244, 'Lamium moschatum', 145, 145]
[245, 'Wiedemannia', 146, 146]
[246, 'Moluccella laevis', 147, 147]
[247, 'Moluccella spinosa', 147, 147]
[248, 'Ballota nigra', 148, 148]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[249, 

 39%|███▉      | 349/896 [00:00<00:00, 879.02it/s]

[349, 'Datura xnetel', 206, 206]
[350, 'Nicotiana glauca', 206, 206]
[351, 'Hyoscyamus muticus', 207, 207]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[352, 'Hyoscyamus desertorum', 207, 207]
[353, 'Hyoscyamus pusillus', 208, 208]
[354, 'Hyoscyamus niger', 208, 208]
[355, 'Hyoscyamus reticulatus', 208, 208]
[356, 'Hyoscyamus', 209, 209]
[357, 'Hyoscyamus aureus', 209, 209]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[358, 'Verbascum', 228, 228]
[359, 'Verbascum agrimoniifolium', 213, 213]
[360, 'Verbascum antilibanoticum', 214, 214]
[361, 'Verbascum levanticum', 214, 214]
[362, 'Verbascum orientale', 214, 214]
[363, 'Verbascum blancheanum', 215, 215]
[364, 'Verbascum tiberiadis', 216, 216]
[365, 'Verbascum alepense', 228, 228]
[366, 'Verbascum eremobium', 218, 218]
[367, 'Verbascum cedreti', 219, 219]
[368, 'Verbascum tropidocarpum', 220, 220]
[369, 'Verbascum sinuatum', 221, 221]
[370, 'Verbascum caesare

 49%|████▉     | 437/896 [00:00<00:00, 846.78it/s]

[429, 'Digitalis ferruginea', 271, 271]
[430, 'Siphonostegia syriaca', 272, 272]
[431, 'Bungea', 272, 272]
[432, 'Parentucellia latifolia', 272, 272]
[433, "Parentucellia viscosa'", 273, 273]
[434, 'Bellardia', 273, 273]
[435, 'Odontites aucheri', 274, 274]
[436, 'Odontites lutea', 274, 274]
[437, 'Pedicularis', 275, 275]
[438, 'Globularia', 275, 275]
[439, 'Utricularia australia', 276, 276]
[440, 'Sesamum indicum', 276, 276]
[441, 'Acanthus mollis', 277, 277]
[442, 'Acanthus dioscoridis', 277, 277]
[443, 'Acanthus syriacus', 278, 278]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[444, 'Cistanche phelipaea', 279, 279]
[445, 'Cistanche tubulosa', 279, 279]
[446, 'Cistanche salsa', 280, 280]
[447, 'Orobanche ramosa', 281, 281]
[448, 'Orobanche mutelii', 282, 282]
[449, 'Orobanche nana', 282, 282]
[450, 'Orobanche lavandulacea', 282, 282]
[451, 'Orobanche', 287, 287]
[452, 'Orobanche astragali', 283, 283]
[453, 'Orobanche libanotica', 284, 284]
[454

 59%|█████▊    | 525/896 [00:00<00:00, 857.68it/s]

[515, 'Valantia muralis', 323, 323]
[516, 'Valantia hispida', 323, 323]
[517, 'Mericarpaea vaillantioides', 323, 323]
[518, 'Callipeltis factorovskyi', 324, 324]
[519, 'Callipeltis cucullaris', 324, 324]
[520, 'Sambucus', 326, 326]
[521, 'Viburnum tinus', 326, 326]
[522, 'Lonicera etrusca', 326, 326]
[523, 'Lonicera', 327, 327]
[524, 'Lonicera orientalis', 327, 327]
[525, 'Valeriana dioscoridis', 328, 328]
[526, 'Centranthus longiflorus', 328, 328]
[527, 'Valerianella tuberculata', 330, 330]
[528, 'Valerianella dactylophylla', 330, 330]
[529, 'Valerianella oxyrhyncha', 331, 331]
[530, 'Valerianella cymbicarpa', 331, 331]
[531, 'Valerianella szovitsiana', 331, 331]
[532, 'Valerianella', 337, 337]
[533, 'Valerianella soyeri', 333, 333]
[534, 'Valerianella locusta', 333, 333]
[535, 'Valerianella dentata', 334, 334]
[536, 'Valerianella muricata', 334, 334]
[537, 'Valerianella carinata', 334, 334]
[538, '.Valerianella', 335, 335]
[539, 'Valerianella pumila', 335, 335]
[540, 'Valerianella an

 68%|██████▊   | 611/896 [00:00<00:00, 845.48it/s]

[611, 'Asteriscus pygmaeus', 377, 377]
[612, 'Asteriscus graveolens', 377, 377]
[613, 'Pallenis spinosa', 378, 378]
[614, 'Postia', 379, 379]
[615, 'Inula', 381, 381]
[616, 'Inula vulgaris', 380, 380]
[617, 'Inula crithmoides', 380, 380]
[618, 'Inula heterolepis', 380, 380]
[619, 'Inula graveolens', 381, 381]
[620, 'Pentanema divaricata', 381, 381]
[621, 'Pulicaria dysenterica', 382, 382]
[622, 'Pulicaria', 383, 383]
[623, 'Pulicaria auranitica', 383, 383]
[624, 'Pulicaria arabica', 383, 383]
[625, 'Pulicaria laniceps', 384, 384]
[626, 'Francœuria', 384, 384]
[627, 'Varthemia iphionoides', 385, 385]
[628, 'Phagnalon', 386, 386]
[629, 'Phagnalon kotschyi', 386, 386]
[630, 'Lasiopogon', 386, 386]
[631, 'Gnaphalium', 387, 387]
[632, 'Gnaphalium uliginosum', 387, 387]
[633, 'Helichrysum', 391, 391]
[634, 'Helichrysum armenium', 389, 389]
[635, 'Helichrysum pallasii', 389, 389]
[636, 'Helichrysum plicatum', 390, 390]
[637, 'Helichrysum sanguineum', 391, 391]
[638, 'Leysera leyseroides', 392

 79%|███████▉  | 706/896 [00:00<00:00, 873.82it/s]

[706, 'Senecio exilis', 436, 436]
[707, 'Senecio gallicus', 436, 436]
[708, 'Senecio aquaticus', 436, 436]
[709, 'Senecio mouterdei', 437, 437]
[710, 'Calendula officinalis', 439, 439]
[711, 'Calendula sinuata', 439, 439]
[712, 'Calendula bicolor', 440, 440]
[713, 'Calendula arvensis', 440, 440]
[714, 'Calendula aegyptiaca', 440, 440]
[715, 'Calendula palaestina', 441, 441]
[716, 'Calendula tripterocarpa', 441, 441]
[717, 'Dipterocome pusilla', 441, 441]
[718, 'Gundelia', 442, 442]
[719, 'Echinops', 444, 444]
[720, 'Echinops gaillardotii', 443, 443]
[721, 'Acantholepis', 445, 445]
[722, 'Cardopatium corymbosum', 445, 445]
[723, 'Xeranthemum', 447, 447]
[724, 'Xeranthemum inapertum', 446, 446]
[725, 'Xeranthemum cylindraceum', 447, 447]
[726, 'Chardinia', 448, 448]
[727, 'Siebera', 448, 448]
[728, 'Carlina', 451, 451]
[729, 'Garlina', 450, 450]
[730, 'Carlina hispanica', 450, 450]
[731, 'Carlina kurdica', 450, 450]
[732, 'Atractylis carduus', 451, 451]
[733, 'Atractylis cancellata', 452

 89%|████████▉ | 799/896 [00:00<00:00, 887.36it/s]

[799, 'Centaurea iberica', 493, 493]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[800, 'Centaurea pallescens', 494, 494]
[801, 'Centaurea procurrens', 495, 495]
[802, 'Centaurea bruguieriana', 495, 495]
[803, 'Centaurea eryngioides', 496, 496]
[804, 'Centaurea mouterdei', 496, 496]
[805, 'Centaurea carduiformis', 497, 497]
[806, 'Centaurea trachonitica', 498, 498]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[807, 'Centaurea heterocarpa', 498, 498]
[808, 'Centaurea triumfettii', 499, 499]
[809, 'Centaurea cyanus', 500, 500]
[810, 'Cnicus benedictus', 500, 500]
[811, 'Carthamus', 503, 503]
[812, 'Carthamus persicus', 501, 501]
[813, 'Carthamus dentatus', 504, 504]
[814, 'Carthamus glaucus', 504, 504]
[815, 'Carduncellus', 505, 505]
[816, 'Scolymus maculatus', 505, 505]
[817, 'Scolymus hispanicus', 505, 505]
[818, 'Catananche lutea', 506, 506]
[819, 'Cichorium intybus', 506, 506]
[820, 'Cichorium pumilum', 5

100%|██████████| 896/896 [00:01<00:00, 864.74it/s]

[886, 'Launaea nudicaulis', 541, 541]
[887, 'Aetheorhiza', 542, 542]
[888, 'Crépis', 549, 549]
[889, 'Crépis pulchra', 546, 546]
[890, 'Crépis pterothecoides', 546, 546]
[891, 'Crépis kotschyana', 547, 547]
[892, 'Andryala integrifolia', 550, 550]
[893, 'Hieracium bauhinii', 551, 551]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[894, 'n.', 551, 551]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[895, 'Hieracium', 552, 552]
results-marked-pages-book-3.pdf





In [26]:
doc.name

'marked-pages-book-1.pdf'

In [18]:
import pandas as pd

df = pd.DataFrame.from_dict(results.values()).fillna('')
df = df[df.columns.drop(list(df.filter(regex='_rects')))]
df.to_csv("book-1.csv", index=False)
df

Unnamed: 0,name,pdf_page,book_page,Extra,Description,L.,Aire,S.
0,J,59,59,J Lettre alternant dans la transcription de mo...,,,,
1,JL,77,77,Nahal Nik Np Ol P Pb Pr Reese Russ Sam Schw Th...,,,,
2,Lycopodium cernuum,78,78,Fructification de novembre à mars. Sur grès tr...,Lycopodium cernuum L. var. capillaceum Willd (...,"L. Mi. Entre Nahr es Safa et 'Aïn Qa'a, 1942 e...",Aire géogr. —- Régions tropicales et subtropic...,
3,Selaginella denticulata,79,79,Végétation active de novembre à avril. Fructif...,Selaginella denticulata (L.) Link — Lycopodium...,"L. Ct. et ML, Ce. Saïda (Bl), Beyrouth et envi...",Aire géogr. —• Tour de la Méditerranée. Madère...,"S. Non signalée. Présence presque certaine, Ct..."
4,Isoetes hystrix,79,79,"Sur terrains très humides, émergée. Spores au ...","Isoetes hystrix Dur., forma subinermis Dur. (P...",L. 'Akkar. Prairies humides au nord de la rout...,"Aire géogr. — Tour de la Méditerranée, Côte de...",S. Présence non constatée mais presque certain...
...,...,...,...,...,...,...,...,...
856,Dianthus strictus,599,599,Floraison: mai-décembre. CC. tous terrains. Le...,Dianthus strictus Banks et Sol. (non Sibth. et...,,,
857,Dianthus judaicus,603,603,Var. auraniticus (Post.) n. comb. — Calice 3 c...,Dianthus judaicus Boiss. — D. pattens Sibth. e...,L. Sy. Baalbeck (Wall). St. Qamou'at Hermel (P...,"Aire géogr. — Turquie sud, Syrie, Liban, Pales...","S. A.L. Ouadi-el-Qarn (Sam, Mt, Pb), Zebdani, ..."
858,Dianthus pachypetalus,603,603,,Dianthus pachypetalus Stapf— D. floribundus Bo...,,,
859,Dianthus crinitus,604,604,Floraison: mai-juin.,"Dianthus crinitus Smith (PI. CLXXXV, n. 4). — ...",,"Aire géogr. — Turquie, Géorgie, Iran, Bélouchi...",S. St. Zélaf (Pb).


In [19]:
!open -a Preview results-marked-pages-book-1.pdf