In [1]:
#!pip uninstall fitz PyMuPDF --yes
#!pip install PyMuPDF
#!pip install PyMuPDF==1.19.1  ## install a release later than 1.19.0
#!pip install PyMuPDF==1.18.19  ## don't install 1.19.0 because of Wrong x0/y0 values for bboxes in get_text("words"), see: https://github.com/pymupdf/PyMuPDF/issues/1328
# ## Releases of PyMuPDF as of Oct 18, 2021: 1.11.2, 1.12.5, 1.13.20, 1.14.19.post2, 1.14.20, 1.14.21, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.16.4, 1.16.5, 1.16.6, 1.16.7, 1.16.8, 1.16.9, 1.16.10, 1.16.11, 1.16.12, 1.16.13, 1.16.14, 1.16.15, 1.16.16, 1.16.17, 1.16.18, 1.17.0, 1.17.1, 1.17.2, 1.17.3, 1.17.4, 1.17.5, 1.17.6, 1.17.7, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 1.18.5, 1.18.6, 1.18.7, 1.18.8, 1.18.9, 1.18.10, 1.18.11, 1.18.12, 1.18.13, 1.18.14, 1.18.15, 1.18.16, 1.18.17, 1.18.18, 1.18.19, 1.19.0


In [2]:
!open -a Preview book-1.pdf book-2.pdf book-3.pdf

In [3]:
config = [
   {"filename" : "NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 1.pdf",
    "name" : "book-1",
    "page_count" : 642,
    "sections" : {
      "preface" : [1, 78],
      "toponymique" : [49, 76],
      "abbrev" : [77, 78],
      "content" : [79, 607],
      "sample" : [79, 79+10],
      "index" : [617, 639]},
    },
   {"filename" : "NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 2.pdf",
    "name" : "book-2",
    "page_count" : 725,
    "sections" : {
      "preface" : [1, 79],
      "abbrev" : [6, 7],
      "content" : [8, 700],
      "sample" : [8, 8+10],
      "index" : [704, 725]},
    },
   {"filename" : "NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 3.pdf",
    "name" : "book-3",
    "page_count" : 588,
    "sections" : {
      "preface" : [1, 7],
      "abbrev" : [6, 7],
      "content" : [8, 554],
      "sample" : [8, 8+10],
      "index" : [556, 583],
      "familyidx" : [584, 585]},
    },
]


In [4]:
import sys, fitz, re
from pprint import pprint

## Purpose: this cell is a pre-processing step that will create a symlink (e.g. book-1.pdf)
##          that points to the original file, and then will create new PDF documents
##          that are excerpts of subsets of the whole book
##          as named sections such as book-1-index.pdf or book-1-abbrev.pdf
##          The file book-X-sample.pdf contains the first 10 pages of the book-1-content.pdf
##          and the sample is used to speed up development/testing later on

## https://pymupdf.readthedocs.io/en/latest/document.html#Document.delete_pages
##    In general, the performance of this method is dependent on the number of remaining pages
##    NOT on the number of deleted pages
## So, keeping fewer pages is faster, and deleting more pages at once is also faster 
def excerpt(infile, outfile, start, end):
  doc = fitz.open(infile)  # open document
  last_page = doc.page_count
  print(start, end, last_page, outfile)
  deleteme = []
  if start - 1 > 0:
    deleteme.extend(list(range(0, start - 1)))       
  if end < last_page:
    deleteme.extend(list(range(end, last_page)))
  # print(deleteme)
  doc.delete_pages(deleteme)
  doc.save(outfile, garbage=4, clean=True)

import os, os.path
for book in config:
   symlink = book["name"] + ".pdf"
   fname = book["filename"]
   if os.path.exists(book["filename"]) and not os.path.exists(symlink):
      print("Creating symlink: ", symlink, " ->", fname)
      os.symlink(book["filename"], symlink)
   for section in book["sections"]:
      #print(s)
      start, end = book["sections"][section];
      out_fname = book["name"] + "-" + section + ".pdf"
      excerpt(book["filename"], out_fname, start, end)

#!open -a Preview book-[1,2,3]-*.pdf

## Runtime of this cell is ~11m38s

1 78 642 book-1-preface.pdf
49 76 642 book-1-toponymique.pdf
77 78 642 book-1-abbrev.pdf
79 607 642 book-1-content.pdf
79 89 642 book-1-sample.pdf
617 639 642 book-1-index.pdf
1 79 725 book-2-preface.pdf
6 7 725 book-2-abbrev.pdf
8 700 725 book-2-content.pdf
8 18 725 book-2-sample.pdf
704 725 725 book-2-index.pdf
1 7 588 book-3-preface.pdf
6 7 588 book-3-abbrev.pdf
8 554 588 book-3-content.pdf
8 18 588 book-3-sample.pdf
556 583 588 book-3-index.pdf
584 585 588 book-3-familyidx.pdf


In [5]:
# !open -a Preview book-[1,2,3]-*.pdf


In [6]:
# !open -a Preview book-[1,2,3]-index.pdf


In [7]:
# https://pymupdf.readthedocs.io/en/latest/faq/#how-to-analyze-font-characteristics

import fitz
import pprint, re

def flags_decomposer(flags):
    """Make font flags human readable."""
    l = []
    if flags & 2 ** 0:
        l.append("superscript")
    if flags & 2 ** 1:
        l.append("italic")
    if flags & 2 ** 2:
        l.append("serifed")
    else:
        l.append("sans")
    if flags & 2 ** 3:
        l.append("monospaced")
    else:
        l.append("proportional")
    if flags & 2 ** 4:
        l.append("bold")
    return ", ".join(l)

class Face:
    def __init__(self, font, size, color, flags=0):
        self.font = font
        self.size = size
        self.color = color
        self.flags = flags
        self.style = flags_decomposer(self.flags)
    def __str__(self):
        return f"Font: '{self.font}' ({self.style}), size {self.size:g}, color " + ("#%06x" % self.color)
print(Face("Times", 10.929, 0, 4))


Font: 'Times' (serifed, proportional), size 10.929, color #000000


In [20]:
import sys, fitz, re
from pprint import pprint
# fname = "book-1-sample.pdf"
fname = "book-1.pdf"
doc = fitz.open(fname)  # open document

from fitz.utils import getColor
#blue = getColor("aliceblue")

# txtpgoffset = 77
txtpgoffset = 0
# start_page = 1 + txtpgoffset
start_page = 1
#end_page = start_page + 11
#end_page = start_page + 150
#end_page = start_page + 530
end_page = doc.page_count
items = []
mode = "searching"
print(mode)
for i in range(start_page, end_page):
    page = doc[i]
    # read page text as a dictionary, suppressing extra spaces in CJK fonts
    flags = 0 | fitz.TEXT_INHIBIT_SPACES | fitz.TEXT_DEHYPHENATE
    blocks = page.get_text("dict", flags=flags)["blocks"]
    for x,b in enumerate(blocks):  # iterate through the text blocks
        br = fitz.Rect(b["bbox"])
        #annot = page.add_rect_annot(br)
        for y,l in enumerate(b["lines"]):  # iterate through the text lines
            lr = fitz.Rect(l["bbox"])
            for z,s in enumerate(l["spans"]):  # iterate through the text spans
                if z==0 and mode == "found":
                    mode = "spanning"
                #s["style"] = flags_decomposer(s["flags"])  # readable font flags
                face = Face(s["font"], s["size"], s["color"], s["flags"])
                t = s["text"]
                ## FIXME: [83, 'N  Cystopteris  filix-fragilis  (L.) Borb. —  Polypodiumfilix-fragile  L.,  Cyst,  fragilis']
                ## FIXMECLASS: OCR Error; a spurious mark in the text preceding the genus is interpreted as a letter 'N'
                if z==0 and mode != "found" and re.search(r"Times-Bold\b", str(face)) and s["size"] > 10.0:
                        r_rect = fitz.Rect(lr)
                        r_high = fitz.Rect(s["bbox"])
                        joined = " ".join(list(map(lambda x: x["text"].strip(), l["spans"])))  ## FIXMEDESC: ideally this simplistic code would work if not for OCR Errors
                        item = joined
                        # found = re.match(r"(?P<genus>([A-Z]\w+) (?P<species>\w+)", joined)
                        # d = found.groupdict()
                        matches = re.match(r"([A-Z]\w+) (\w+)", joined)
                        warning = "Possible OCR Error"
                        if matches and len(matches.groups()) == 2:
                            found = matches.groups()
                            warning = "Normal"
                            mode = "found"
                            # item = f"""${d['genus']} ${d['species']}"""
                            item = " ".join(found)
                        else:
                            warning = "Possible OCR Error"

                        print([i, x, y, z, item, warning])
                        annot_high = page.add_highlight_annot(r_high)
                        items.append([t, r_rect, r_high, i, z,s, y,l, x,b, warning])
print(len(items))
#print(i, items[0][:3])

searching
[59, 9, 0, 0, 'J', 'Possible OCR Error']
[77, 8, 5, 0, 'JL', 'Possible OCR Error']
[78, 8, 0, 0, 'Lycopodium cernuum', 'Normal']
[79, 1, 0, 0, 'Selaginella denticulata', 'Normal']
[79, 16, 0, 0, 'Isoetes hystrix', 'Normal']
[79, 24, 1, 0, 'Isoetes olympica', 'Normal']
[80, 13, 0, 0, 'Equisetum maximum', 'Normal']
[80, 18, 0, 0, 'Equisetum palustre', 'Normal']
[80, 25, 0, 0, 'Equisetum ramosissimum', 'Normal']
[81, 9, 0, 0, 'Ophioglossum vulgatum', 'Normal']
[81, 11, 3, 0, 'Ophioglossum lusitanicum', 'Normal']
[81, 21, 0, 0, 'Osmunda regalis', 'Normal']
[82, 9, 0, 0, 'Gymnogramma leptophylla', 'Normal']
[82, 17, 0, 0, 'Cheilanthes pteridioides', 'Normal']
[83, 28, 0, 0, 'Âdiantum capillus-veneris I (PL III, n. 4). — 2|. Rhizome rampant à écailles', 'Possible OCR Error']
[84, 10, 0, 0, 'Pteris longifolia', 'Normal']
[84, 16, 0, 0, 'Pteridium aquilmum', 'Normal']
[85, 1, 0, 0, 'Athyrium filix', 'Normal']
[85, 12, 0, 0, 'Dryopterîs aculeata', 'Normal']
[85, 18, 0, 0, 'Dryopteris 

In [21]:
## This cell requires ~20 mins to calculate results from the first book

import math
doc = fitz.open(fname)  # open document
#subset = items[:6]
subset = items
boxes = []
for it, item in enumerate(subset):
    mode = "searching"
    if (it >= len(subset) - 1):
        break
    next_item = subset[it + 1]
    [t0, rr0, rh0, i0, z0,s0, y0,l0, x0,b0, warning0] = item
    [t, rr, rh, i, z,s, y,l, x,b, warning] = [t0, rr0, rh0, i0, z0,s0, y0,l0, x0,b0, warning0]
    [t1, rr1, rh1, i1, z1,s1, y1,l1, x1,b1, warning1] = next_item
    page = doc[i0]
    flags = 0 | fitz.TEXT_INHIBIT_SPACES | fitz.TEXT_DEHYPHENATE
    blocks = page.get_text("dict", flags=flags)["blocks"]
    blocks = blocks[x:]
    r_rect = rr0
    
    #print(t0)
    # if t0 == "Isoetes olympica":
    #print([int(it), len(subset), rr0, t0, "\n", t1])
    
    annot_rect = page.add_rect_annot(rr0)
    annot_rect.set_border(width=1, dashes=[1,2])
    annot_rect.update()

    
    for x,b in enumerate(blocks):  # iterate through the text blocks
        br = fitz.Rect(b["bbox"])
        #annot = page.add_rect_annot(br)
        for y,l in enumerate(b["lines"][y0:]):  # iterate through the text lines
            lr = fitz.Rect(l["bbox"])
            s = l["spans"][0]
            #for z,s in enumerate(l["spans"]):  # iterate through the text spans
            this_line = " ".join(list(map(lambda x: x["text"], l["spans"])))
            if mode != "found":
                if re.search("Aire g", this_line):
                    mode = s["font"] + " " + str(s["size"])
                    r_rect = r_rect.include_rect(lr)
                    #print(str((x, y, s["size"], s["font"])) + "FOUND: " + this_line)
                elif lr.intersects(rr1):                    
                    mode = "found"
                    #print(str((x, y, s["size"], s["font"])) + "FOUND: " + this_line)
                else:
                    if mode != "searching":
                        (font, size) = mode.split(" ")
                        isize = round(math.ceil(float(size)))
                        if s["size"] >= float(isize):
                            mode = "found"
                    if mode != "found":
                        r_rect = r_rect.include_rect(lr)
                        # if y >= y0:
                        #     r_rect = r_rect.include_rect(lr)
                    
    r_high = rh0
    annot_rect = page.add_rect_annot(r_rect)
    annot_high = page.add_highlight_annot(r_high)
    # boxes.append([t0, start_page + it, r_rect, item]) . ## FIXME: thinking it should be i0 not start_page + it
    boxes.append([t0, i0, r_rect, item])


# print(boxes[0])
print(len(boxes))

#doc.delete_pages(end_page, doc.page_count - 1)
#doc.delete_pages(0, start_page - 1)
marked_epithet_fname = "marked-pages-" + doc.name
doc.save(marked_epithet_fname, garbage=4, clean=True)
#!open -a Preview marked-pages-book-1-sample.pdf
#!open -a Preview marked-pages-book-1.pdf



1063


In [24]:
import pprint
pp = pprint.PrettyPrinter(compact=True)

import math
doc = fitz.open(fname)  # open document
results = {}
for it, thing in enumerate(boxes):
    mode = "searching"
    #print(thing)
    [name, pageno, rrr, item] = thing
    page = doc[pageno] 
    [t0, rr0, rh0, i0, z0,s0, y0,l0, x0,b0, warning0] = item
    flags = 0 | fitz.TEXT_INHIBIT_SPACES | fitz.TEXT_DEHYPHENATE
    blocks = page.get_text("dict", clip=rr0, flags=flags)["blocks"]    


    print(pageno - txtpgoffset, i0, t0)
    results[t0] = {}
    results[t0]["name"] = t0
    results[t0]["name_rects"] = [rr0]
    results[t0]["pdf_page"] = pageno
    results[t0]["book_page"] = pageno - txtpgoffset
    key = "Extra"
    mode = "searching"
    margins = [[(math.ceil(l["bbox"][0]), l["spans"][0]["text"].split(" ")[0]) for l in filter(lambda bl: bl.get("spans"), b["lines"])] for b in blocks]
    margins = [m for n in margins for m in n]
    targets = list(map(lambda n: n[0], filter(lambda m: m[1] in ["L.", "S.", "Aire"] , margins))) or [blocks[0]["lines"][0]["bbox"][0]]
    margins_max = max(targets or margins)
    margins_min = min(list(map(lambda n: n[0], margins)))

    for x,b in enumerate(blocks):  # iterate through the text blocks
        br = fitz.Rect(b["bbox"])
        for y,l in enumerate(b["lines"][y0:]):  # iterate through the text lines
            lr = fitz.Rect(l["bbox"])
            xloc = min(margins_max, math.ceil(lr.x0))
            this_line = re.sub(r"\s+", " ", " ".join(list(map(lambda x: x["text"].strip(), l["spans"])))).strip()
            # matches = re.match("^(?P<label>L\.|S\.|Aire g\w+\.|Not\w+\.|Fleurs|Fructif\.|Fructific|Floraison\w+)\s", this_line)
            #### FIXME: Phyllitis scolopendrium (L.) Newm. —Asplenium scolopendrium L., Scolopendrium
            #### FIXME: Juniper
            matches = re.match("^(?P<label>" + t0 + "|.*?)(\s|$)", this_line)
            if matches:
                d = matches.groupdict()
                if margins_max - xloc < 5:
                    key = d["label"]
                    if ' ' in key:
                        key = "Description"
                    elif key not in ["Description", "L.", "S.", "Aire"]:
                        key = "Extra"
                    elif key in ["Description", "L.", "S.", "Aire"]:
                        pass
                    else:
                        print("UNHANDLED: " + key)
                        key = "Extra"
                frag = results[t0].get(key) or ""
                results[t0][key] = " ".join([frag.strip(), this_line.strip()]).strip()
                key_rects = key + "_rects"
                if not results[t0].get(key_rects):
                    results[t0][key_rects] = []    
                results[t0][key_rects].append(lr)
                #print([x, y, xloc, mode, this_line])
            else:
                print(["ERROR:", x, y, xloc, mode, this_line])
    # pp.pprint(results[t0])
    print()
    #break
                    
    # r_high = rh0
    # annot_rect = page.add_rect_annot(r_rect)
    # annot_high = page.add_highlight_annot(r_high)
    # boxes.append([t0, r_rect, item])




59 59 J

77 77 JL

78 78 Lycopodium cernuum

79 79 Selaginella denticulata

79 79 Isoetes hystrix

79 79 Isoetes olympica

80 80 Equisetum maximum

80 80 Equisetum palustre

80 80 Equisetum ramosissimum

81 81 Ophioglossum vulgatum

81 81 Ophioglossum lusitanicum

81 81 Osmunda regalis

82 82 Gymnogramma leptophylla

82 82 Cheilanthes

83 83 Âdiantum capillus-veneris

84 84 Pteris longifolia

84 84 Pteridium aquilmum

85 85 Athyrium

85 85 Dryopterîs aculeata

85 85 Dryopteris

86 86 Blechnum

87 87 Ceterack officinarum

88 88 Asplenium bourgaei

88 88 Asplenium

88 88 Asplenium ruta-muraria

88 88 Asplenium

88 88 Asplenium

89 89 Phyllitis scolopendrium

89 89 Phyllitis hemionitis

90 90 Polypodium vulgare L.

90 90 Salvinia natans

92 92 Abies cilicica

92 92 Cedrus libani

93 93 Pinus pinea

93 93 Pina

93 93 Pinus halepensis

94 94 Cupressus sempervirens

95 95 Juniperus oxycedrus

96 96 Juniper

96 96 Juniperus

96 96 Arceuthos drupacea

97 97 Ephedra campylopoda

97 97 Ephedra a

In [16]:
from tqdm import tqdm
import math
# marked_fname = "marked-pages-book-1-sample.pdf"
marked_fname = "marked-pages-book-1.pdf"
#doc = fitz.open(fname)  # open document
doc = fitz.open(marked_fname)  # open document
print(len(results))

from fitz.utils import getColorList
cl = getColorList()
from fitz.utils import getColor

pink = getColor("lightpink")
green = getColor("aquamarine")
blue = getColor("lightskyblue")
gray = getColor("whitesmoke")
yellowish = getColor("antiquewhite")
fills= [pink, green, blue, gray, yellowish]

fc = {"L.":pink, "S.":blue, "Aire":green, "Description":gray, "Extra":yellowish}
for it, name in enumerate(tqdm(results.keys())):
    item = results[name]
    #print(item)
    #print(item["Description"])
    book_page = item["book_page"]
    pdf_page = item["pdf_page"]
    print([it, name, book_page, pdf_page])
    page = doc[book_page]
    for k in filter(lambda x: not re.search(r"_page|_rect", x), item.keys()):
        rects = item[k + "_rects"]
        for r in rects:
            #print(k, r)
            
            if not page.rect.intersects(r):
                print(["ERROR: ", k, r])
            else:
                # #annot = page.add_highlight_annot(fitz.Rect(r))
                # annot_rect = page.add_rect_annot(fitz.Rect(r))
                # #annot_rect.set_colors(stroke=(0,1,0))
                # annot_rect.set_border(width=1, dashes=[1,2])
                # annot_rect.update()

                if k in ["L.", "S.", "Aire", "Description", "Extra"]:
                    #print(["HIGHL: ", fc[k], k, r])
                    annot = page.add_highlight_annot(fitz.Rect(r))
                    annot.set_colors(stroke=fc[k])
                    annot.update()
                    #annot_rect = page.add_rect_annot(fitz.Rect(r))
                    #annot_rect.set_colors(stroke=green)
                    #annot_rect.set_border() #(width=1, dashes=[1,2])
                    #annot_rect.update()
                    #annot_rect.update(fill_color=red)


# doc.delete_pages(end_page, doc.page_count - 1)
# doc.delete_pages(0, start_page - 1)
doc_fname = "results-" + doc.name
doc.save(doc_fname, garbage=4, clean=True)
print(doc_fname)
# !open -a Preview results-marked-pages-book-1-sample.pdf
!open -a Preview results-marked-pages-book-1.pdf



861


  8%|▊         | 65/861 [00:00<00:01, 643.20it/s]

[0, 'J', 59, 59]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[1, 'JL', 77, 77]
[2, 'Lycopodium cernuum', 78, 78]
[3, 'Selaginella denticulata', 79, 79]
[4, 'Isoetes hystrix', 79, 79]
[5, 'Isoetes olympica', 79, 79]
[6, 'Equisetum maximum', 80, 80]
[7, 'Equisetum palustre', 80, 80]
[8, 'Equisetum ramosissimum', 80, 80]
[9, 'Ophioglossum vulgatum', 81, 81]
[10, 'Ophioglossum lusitanicum', 81, 81]
[11, 'Osmunda regalis', 81, 81]
[12, 'Gymnogramma leptophylla', 82, 82]
[13, 'Cheilanthes', 82, 82]
[14, 'Âdiantum capillus-veneris', 83, 83]
[15, 'Pteris longifolia', 84, 84]
[16, 'Pteridium aquilmum', 84, 84]
[17, 'Athyrium', 85, 85]
[18, 'Dryopterîs aculeata', 85, 85]
[19, 'Dryopteris', 85, 85]
[20, 'Blechnum', 86, 86]
[21, 'Ceterack officinarum', 87, 87]
[22, 'Asplenium bourgaei', 88, 88]
[23, 'Asplenium', 88, 88]
[24, 'Asplenium ruta-muraria', 88, 88]
[25, 'Phyllitis

 15%|█▌        | 131/861 [00:00<00:01, 637.10it/s]

[131, 'Apera', 144, 144]
[132, 'Gastridium', 144, 144]
[133, 'Gastridium scabrum', 144, 144]
[134, 'Polypogonmonspeliensis', 145, 145]
[135, 'Polypogon maritimus', 145, 145]
[136, 'Lagurus ovatus', 146, 146]
[137, 'Calamagrostis epigeios', 146, 146]
[138, 'Calamagrostis pseudophragmites', 146, 146]
[139, 'Ammophila', 147, 147]
[140, 'Sporobolus arenarius', 147, 147]
[141, 'Eragrostis tatarica', 148, 148]
[142, 'Eragrostis cilianensis', 148, 148]
[143, 'Eragrostis', 149, 149]
[144, 'Eragrostis pilosa', 149, 149]
[145, 'Dactyloctenium aegyptium', 149, 149]
[146, 'Cynodon dactylon', 150, 150]
[147, 'Arundo plinii', 151, 151]
[148, 'Arundo donax', 151, 151]
[149, 'Phragmites', 152, 152]
[150, 'Boissiera pumilio', 152, 152]
[151, 'Holçus', 153, 153]
[152, 'Holcus lanatus', 154, 154]
[153, 'Corynephorus articula˛ii s', 154, 154]
[154, 'Corynephorus', 155, 155]
[155, 'Pilgerochloablanche!', 155, 155]
[156, 'Gaudinopsis', 155, 155]
[157, 'Trisetariacavanillesii', 156, 156]
[158, 'Trisetaria li

 23%|██▎       | 195/861 [00:00<00:01, 599.68it/s]

[165, 'A', 160, 160]
[166, 'Avena convoluta', 161, 161]
[167, 'Arrenatherum elatius', 161, 161]
[168, 'Arrhenatherum palaestinum', 162, 162]
[169, 'Arrhenatherum kotschyi', 162, 162]
[170, 'Lycochloa avenacea', 162, 162]
[171, 'Gaudinia fragilis', 163, 163]
[172, 'Koeleria nitidula', 163, 163]
[173, 'Koeleria phleoides', 164, 164]
[174, 'Antinoria insulari s', 165, 165]
[175, 'Aira elegans', 166, 166]
[176, 'Schismus', 166, 166]
[177, 'Sesleria anatolica', 167, 167]
[178, 'Ammochloa', 167, 167]
[179, 'Echinaria', 168, 168]
[180, 'Melica uniflora Retz', 169, 169]
[181, 'Melica angustifolia', 169, 169]
[182, 'Melica cupani', 169, 169]
[183, 'Melica inaequiglumis', 170, 170]
[184, 'Melica pannosa', 170, 170]
[185, 'Melica ciliata L.,', 170, 170]
[186, 'Molinia', 171, 171]
[187, 'Catabrosa aquatica', 171, 171]
[188, 'Colpodium humile', 172, 172]
[189, 'Sphenopus', 172, 172]
[190, 'Cutandia philistaea', 173, 173]
[191, 'Cutandia maritima', 173, 173]
[192, 'Cutandia memphitica', 173, 173]
[1

 30%|██▉       | 256/861 [00:00<00:01, 561.75it/s]

[236, 'Bromus bikfayensis', 197, 197]
[237, 'Bromus tomentellus', 198, 198]
[238, 'Bromus', 205, 205]
[239, 'Bromus madritensis', 199, 199]
[240, 'Bromus rigidus', 200, 200]
[241, 'Bromus flabellatus', 200, 200]
[242, 'Bromus rubens', 201, 201]
[243, 'Bromus fasciculatus', 201, 201]
[244, 'Bromus alopecuros', 202, 202]
[245, 'Brotnus lanceolatus', 203, 203]
[246, 'Bromus danthoniae', 204, 204]
[247, 'Bromus squarrosus', 204, 204]
[248, 'Bromus intermedius', 205, 205]
[249, 'Bromus brachystachys', 206, 206]
[250, 'Bromus arvensis', 206, 206]
[251, 'BromushordeaceusL.', 206, 206]
[252, 'Bromus japonicus', 206, 206]
[253, 'Trachynia distachya', 207, 207]
[254, 'Brachypodium sylvaticum', 208, 208]
[255, 'Brachypodium pinnatum', 208, 208]
[256, 'Brachypodium ramosum', 209, 209]
[257, 'Lolium', 211, 211]
[258, 'Lolium rigidum', 210, 210]
[259, 'Lolium loliaceum', 211, 211]
[260, 'Lolium persicum', 211, 211]
[261, 'Psilurus incurvus', 212, 212]
[262, 'Monerma cylindrica', 213, 213]
[263, 'Par

 37%|███▋      | 315/861 [00:00<00:00, 569.36it/s]

[315, 'Heleocharis macrantha', 246, 246]
[316, 'Fimbristylis ferruginea', 247, 247]
[317, 'Fimbristylis bis-umbellata', 247, 247]
[318, 'III,', 247, 247]
[319, 'Rhynchospora', 248, 248]
[320, 'Schoenus', 248, 248]
[321, 'Cladium mariscus', 249, 249]
[322, 'Garex stenophylla', 251, 251]
[323, 'Carex divisa', 251, 251]
[324, 'Carex otrubae', 252, 252]
[325, 'Carexdivulsa', 252, 252]
[326, 'Carex pairaei', 253, 253]
[327, 'Carex stellulata', 253, 253]
[328, 'Garex', 256, 256]
[329, 'Carex', 257, 257]
[330, 'Carex pallescens', 254, 254]
[331, 'Carex flacca', 255, 255]
[332, 'Carex buekii', 256, 256]
[333, 'Carex extensa', 257, 257]
[334, 'Carex distans', 258, 258]
[335, 'Cartex pseudocyperus', 258, 258]
[336, 'Carex acutiformis', 258, 258]
[337, 'Carex riparia', 259, 259]
[338, 'Arum', 261, 261]
[339, 'Arum palaestinum', 262, 262]
[340, 'Arum hygrophilum', 264, 264]
[341, 'Arum conophalloides', 264, 264]
[342, 'Arum elongatum', 265, 265]
[343, 'Arum gratum', 266, 266]
[344, 'Arum italicum'

 44%|████▎     | 375/861 [00:00<00:00, 578.57it/s]

[375, 'Colchicum tunicatum', 284, 284]
[376, 'Colchicum tauri', 284, 284]
[377, 'Colchicum libanoticum', 285, 285]
[378, 'Colchicum brachyphyllum', 285, 285]
[379, 'Colchicum steveni', 286, 286]
[380, 'Merendera', 287, 287]
[381, 'Androcymbium palaestinum', 288, 288]
[382, 'Asphodelus microcarpus', 289, 289]
[383, 'Asphodelus fistulosus', 289, 289]
[384, 'Asphodeline lutea', 290, 290]
[385, 'Asphodeline liburnica', 290, 290]
[386, 'Asphodeline brevicaulis', 291, 291]
[387, 'Asphodeline edumea', 291, 291]
[388, 'Asphodeline', 293, 293]
[389, 'Asphodeline isthmocarpa', 293, 293]
[390, 'Asphodeline damascena', 293, 293]
[391, 'Eremurus', 294, 294]
[392, 'Eremurus inderiensis', 294, 294]
[393, 'Aloe vera', 295, 295]
[394, 'Lloydia', 296, 296]
[395, 'Gagea', 303, 303]
[396, 'Gagea arvensis', 297, 297]
[397, 'Gagea peduncularis', 298, 298]
[398, 'Gagea dubia', 298, 298]
[399, 'Gagea minima', 300, 300]
[400, 'Gagea bohemica', 300, 300]
[401, 'Gagea reticulata', 300, 300]
[402, 'Gagea procera'

 51%|█████     | 438/861 [00:00<00:00, 593.59it/s]

[438, 'Bellevalia', 329, 329]
[439, 'Bellevalia nivalis', 328, 328]


 58%|█████▊    | 500/861 [00:00<00:00, 601.64it/s]

[440, 'Bellevalia flexuosa', 328, 328]
[441, 'Muscari', 330, 330]
[442, 'Muscari pinardi', 330, 330]
[443, 'Muscari longipes', 331, 331]
[444, 'Muscari maritimum', 332, 332]
[445, 'Muscari racemosum', 332, 332]
[446, 'Muscari neglectum', 333, 333]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[447, 'Muscari commutatum', 333, 333]
[448, 'Muscari parviflorum', 334, 334]
[449, 'Allium sativum', 339, 339]
[450, 'Allium ampeloprasum', 340, 340]
[451, 'Allium', 362, 362]
[452, 'Allium pseudocalyptratum', 341, 341]
[453, 'Allium calyptratum', 342, 342]
[454, 'Allium drusorum', 342, 342]
[455, 'Allium rotundum', 342, 342]
[456, 'Allium vineale', 343, 343]
[457, 'Allium phanerantherum', 344, 344]
[458, 'Allium affine', 344, 344]
[459, 'Allium arvense', 345, 345]
[460, 'Allium descendens', 345, 345]
[461, 'Allium curtum', 346, 346]
[462, 'Allium emarginatum', 346, 346]
[463, 'Allium artemisietorum', 348, 348]
[464, 'Allium myrianthum', 350, 350]
[465, 'Alli

 65%|██████▌   | 562/861 [00:00<00:00, 606.97it/s]

[530, 'Romulea nivalis', 398, 398]
[531, 'Romulea phoenicia', 398, 398]
[532, 'Romulea', 399, 399]
[533, 'Spiranthes autumnalis', 400, 400]
[534, 'Limodorum', 400, 400]
[535, 'Cephalanthera', 402, 402]
[536, 'Cephalanthera rubra', 401, 401]
[537, 'Epipactis consimilis', 402, 402]
[538, 'Epipactis latifolia', 403, 403]
[539, 'Platanthera', 403, 403]
[540, 'Neotinea intacta', 404, 404]
[541, 'Ophrys', 408, 408]
[542, 'Ophrys iricolor', 406, 406]
[543, 'Ophrys speculum', 407, 407]
[544, 'Ophrys attica', 408, 408]
[545, 'Ophrys scolopax', 409, 409]
[546, 'Ophrys apifera', 410, 410]
[547, 'Ophrys argolica', 410, 410]
[548, 'Ophrys sintenesii', 411, 411]
[549, 'Orchis comperiana', 412, 412]
[550, 'Orchis papilionacea', 413, 413]
[551, 'Orchis morio L.', 413, 413]
[552, 'Orchis coriophora L.', 414, 414]
[553, 'Orchis sancta', 414, 414]
[554, 'Orchis tridentata', 415, 415]
[555, 'Orchis sixnia', 415, 415]
[556, 'Orchis punctulata', 417, 417]
[557, 'Orchis italica', 417, 417]
[558, 'Orchis iber

 73%|███████▎  | 628/861 [00:01<00:00, 621.88it/s]

[607, 'Urtica fragilis', 451, 451]
[608, 'Parietaria judaica', 452, 452]
[609, 'Parietaria eretica', 453, 453]
[610, 'Parietaria lusitanica', 453, 453]
[611, 'Parietaria', 453, 453]
[612, 'Cannabis sativa', 454, 454]
[613, 'Humulus', 454, 454]
[614, 'Osyris alba', 455, 455]
[615, 'Thesium arvense', 456, 456]
[616, 'Thesium', 456, 456]
[617, 'Thesium heterophyllum', 457, 457]
[618, 'Thesium humile', 457, 457]
[619, 'Thesium bergeri', 457, 457]
[620, 'Cynomorium', 458, 458]
[621, 'Cytinus hypocistis', 458, 458]
[622, 'Loranthus europaeus', 459, 459]
[623, 'Viscum', 459, 459]
[624, 'Arceuthobium', 460, 460]
[625, 'Aristolochia altissima', 461, 461]
[626, 'Aristolochia', 464, 464]
[627, 'Aristolochia poecilantha', 463, 463]
[628, 'Rumex', 472, 472]
[629, 'Rumex crispus', 467, 467]
[630, 'Rumex conglomerates', 467, 467]
[631, 'Rumex pulcher', 468, 468]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[632, 'Rumex vesicarius', 472, 472]
[633, 'Rumex cypriu

 81%|████████  | 694/861 [00:01<00:00, 631.19it/s]

[694, 'Suaeda prostrata', 501, 501]
[695, 'Schanginia', 502, 502]
[696, 'Haloxylon', 503, 503]
[697, 'Haloxylon salicornicum', 503, 503]
[698, 'Traganum nudatum', 503, 503]
[699, 'Seidlitzia rosmarinus', 504, 504]
[700, 'Seidlitzia florida', 504, 504]
[701, 'Salsola kali', 506, 506]
[702, 'Salsola volkensii', 506, 506]
[703, 'Salsola inermis', 506, 506]
[704, 'Salsola jordanicola', 507, 507]
[705, 'Salsola incanescens', 507, 507]
[706, 'Salsola autrani', 507, 507]
[707, 'Salsola crassa', 507, 507]
[708, 'Salsola subcrassa', 508, 508]
[709, 'Salsola zenobiae', 508, 508]
[710, 'Salsola tetrandra', 508, 508]
[711, 'Salsola postii', 509, 509]
[712, 'Salsola heliaramiae', 509, 509]
[713, 'Salsola azaurena', 510, 510]
[714, 'Salsola canescens', 510, 510]
[715, 'Salsola villosa', 510, 510]
[716, 'Aellenia glauca', 511, 511]
[717, 'Aellenia', 511, 511]
[718, 'Noaea mucronata', 512, 512]
[719, 'Girgensohnia', 512, 512]
[720, 'Petrosimonia', 513, 513]
[721, 'Anabasis', 513, 513]
[722, 'Anabasis 

 88%|████████▊ | 758/861 [00:01<00:00, 614.75it/s]

[732, 'Amaranthus deflexus', 521, 521]
[733, 'Achyranthes', 521, 521]
[734, 'Alternanthera', 521, 521]
[735, 'Thelygonum cynocrambe', 522, 522]
[736, 'Phytolacca americana', 523, 523]
[737, 'Phytolacca', 523, 523]
[738, 'Boerhavia', 523, 523]
[739, 'Mesembryanthemum', 525, 525]
[740, 'Mesembryanthemum nodiflorum', 525, 525]
[741, 'Aizoon hispanicum', 525, 525]
[742, 'Glinus lotoides', 526, 526]
[743, 'Glinus dictamnoides', 526, 526]
[744, 'Portulacca', 527, 527]
[745, 'Montia fontana L.', 527, 527]
[746, 'Pteranthus', 528, 528]
[747, 'Scleranthus', 529, 529]
[748, 'Habrosia spinuliflora', 529, 529]
[749, 'Gorrigiola', 529, 529]
[750, 'Telephium', 530, 530]
[751, 'Herniaria hirsuta', 530, 530]
[752, 'Herniaria cinerea', 530, 530]
[753, 'Herniaria glabra L.', 531, 531]
[754, 'Herniaria incana', 531, 531]
[755, 'Herniaria arabica', 531, 531]
[756, 'Herniaria hemistemon', 532, 532]
[757, 'Paronychia chionaea', 532, 532]
[758, 'Paronychia kurdica', 533, 533]
[759, 'Paronychia', 533, 533]
[7

100%|██████████| 861/861 [00:01<00:00, 614.98it/s]

[830, 'Saponaria pulvinaris', 585, 585]
[831, 'Saponaria tridentata', 585, 585]
[832, 'Gypsophila', 593, 593]
[833, 'Gypsophila arabica', 588, 588]
[834, 'Gypsophila aucheri', 589, 589]
[835, 'Gypsophila pallida', 589, 589]
[836, 'Gypsophila damascena', 590, 590]
[837, 'Gypsophila ruscifolia', 590, 590]
[838, 'Gypsophila linearifolia', 590, 590]
[839, 'Gypsophila antari', 590, 590]
[840, 'Gypsophila viscosa', 591, 591]
['ERROR: ', 'Extra', Rect(2147483520.0, 2147483520.0, -2147483648.0, -2147483648.0)]
[841, 'Gypsophila mollis', 592, 592]
[842, 'Gypsophila polygonoides', 592, 592]
[843, 'Gypsophila pilosa', 593, 593]
[844, 'Ankyropetalum', 594, 594]
[845, 'Ankyropetalum coelesyriacum', 594, 594]
[846, 'Ankyropetalum arsusianum', 594, 594]
[847, 'Acanthophyllum kurdicum', 595, 595]
[848, 'Kohlrauschia velutina', 595, 595]
[849, 'Tunica syriaca', 596, 596]
[850, 'Tunica pachygona', 596, 596]
[851, 'Tunica arabica', 596, 596]
[852, 'Velezia rigida', 597, 597]
[853, 'Velezia fasciculata', 




In [17]:
doc.name

'marked-pages-book-1.pdf'

In [18]:
import pandas as pd

df = pd.DataFrame.from_dict(results.values()).fillna('')
df = df[df.columns.drop(list(df.filter(regex='_rects')))]
df.to_csv("book-1.csv", index=False)
df

Unnamed: 0,name,pdf_page,book_page,Extra,Description,L.,Aire,S.
0,J,59,59,J Lettre alternant dans la transcription de mo...,,,,
1,JL,77,77,Nahal Nik Np Ol P Pb Pr Reese Russ Sam Schw Th...,,,,
2,Lycopodium cernuum,78,78,Fructification de novembre à mars. Sur grès tr...,Lycopodium cernuum L. var. capillaceum Willd (...,"L. Mi. Entre Nahr es Safa et 'Aïn Qa'a, 1942 e...",Aire géogr. —- Régions tropicales et subtropic...,
3,Selaginella denticulata,79,79,Végétation active de novembre à avril. Fructif...,Selaginella denticulata (L.) Link — Lycopodium...,"L. Ct. et ML, Ce. Saïda (Bl), Beyrouth et envi...",Aire géogr. —• Tour de la Méditerranée. Madère...,"S. Non signalée. Présence presque certaine, Ct..."
4,Isoetes hystrix,79,79,"Sur terrains très humides, émergée. Spores au ...","Isoetes hystrix Dur., forma subinermis Dur. (P...",L. 'Akkar. Prairies humides au nord de la rout...,"Aire géogr. — Tour de la Méditerranée, Côte de...",S. Présence non constatée mais presque certain...
...,...,...,...,...,...,...,...,...
856,Dianthus strictus,599,599,Floraison: mai-décembre. CC. tous terrains. Le...,Dianthus strictus Banks et Sol. (non Sibth. et...,,,
857,Dianthus judaicus,603,603,Var. auraniticus (Post.) n. comb. — Calice 3 c...,Dianthus judaicus Boiss. — D. pattens Sibth. e...,L. Sy. Baalbeck (Wall). St. Qamou'at Hermel (P...,"Aire géogr. — Turquie sud, Syrie, Liban, Pales...","S. A.L. Ouadi-el-Qarn (Sam, Mt, Pb), Zebdani, ..."
858,Dianthus pachypetalus,603,603,,Dianthus pachypetalus Stapf— D. floribundus Bo...,,,
859,Dianthus crinitus,604,604,Floraison: mai-juin.,"Dianthus crinitus Smith (PI. CLXXXV, n. 4). — ...",,"Aire géogr. — Turquie, Géorgie, Iran, Bélouchi...",S. St. Zélaf (Pb).


In [19]:
!open -a Preview results-marked-pages-book-1.pdf