Assumptions:
- Two column page formats
- Indentation differences are known in advance and passed to preprocessing (as `indent_err`)
- <span style="color:#872657"> [NEED TO FIX] </span> Authors only span one line
- If the genus is set in italics, its epithets are italicized too

# Imports

In [14]:
import io
import fitz
import re
from PIL import Image, ImageDraw, ImageFont, ImageColor
import operator
import pandas as pd 
import numpy as np
from datetime import datetime
import math
from tqdm import tqdm

# Plot functions

In [15]:
def plot_genus_blocks(page_df, draw, color = '#6c899e', w = 3):
    """
    Outline each genus group on the page with one bounding rectangle.

    page_df : DataFrame holding a 'draw_genus' label per row plus the
              x0 / y0 / x1 / y1 box coordinates of that row
    draw    : object exposing rectangle(...) (presumably a PIL ImageDraw -- confirm)
    color   : color string handed to ImageColor.getrgb for the outline
    w       : outline width

    Returns None; does nothing when the 'draw_genus' column is missing.
    """
    try:
        # NaN labels are dropped: comparing against NaN never matches a row,
        # so such a label would only produce an empty selection and a NaN box.
        genus_labels = page_df['draw_genus'].dropna().unique()
    except KeyError:
        #print("no GENUS found")
        return

    for label in genus_labels:
        members = page_df[page_df['draw_genus'] == label]
        # bounding box enclosing every row that carries this label
        box = (members['x0'].min(), members['y0'].min(),
               members['x1'].max(), members['y1'].max())
        draw.rectangle(box, fill=None, outline=ImageColor.getrgb(color), width = w)
        
def plot_epithet_blocks(page_df, draw, color = '#660066', w = 3):
    """
    Outline each epithet group on the page with one bounding rectangle.

    page_df : DataFrame holding a 'draw_epithet' label per row plus the
              x0 / y0 / x1 / y1 box coordinates of that row
    draw    : object exposing rectangle(...) (presumably a PIL ImageDraw -- confirm)
    color   : color string handed to ImageColor.getrgb for the outline
    w       : outline width

    Returns None; prints a notice and does nothing when the 'draw_epithet'
    column is missing.
    """
    try:
        # NaN labels are dropped: comparing against NaN never matches a row,
        # so such a label would only produce an empty selection and a NaN box.
        epithet_labels = page_df['draw_epithet'].dropna().unique()
    except KeyError:
        print("no EPITHET found")
        return

    for label in epithet_labels:
        members = page_df[page_df['draw_epithet'] == label]
        # bounding box enclosing every row that carries this label
        box = (members['x0'].min(), members['y0'].min(),
               members['x1'].max(), members['y1'].max())
        draw.rectangle(box, fill=None, outline=ImageColor.getrgb(color), width = w)

def plot_author_blocks(page_df, draw, color = '#a3a3a3', w = 1):
    """
    Outline each author group on the page with one bounding rectangle.

    page_df : DataFrame holding a 'draw_author' label per row plus the
              x0 / y0 / x1 / y1 box coordinates of that row
    draw    : object exposing rectangle(...) (presumably a PIL ImageDraw -- confirm)
    color   : color string handed to ImageColor.getrgb for the outline
    w       : outline width

    Returns None; prints a notice and does nothing when the 'draw_author'
    column is missing.
    """
    try:
        # NaN labels are dropped: comparing against NaN never matches a row,
        # so such a label would only produce an empty selection and a NaN box.
        author_labels = page_df['draw_author'].dropna().unique()
    except KeyError:
        print("no AUTHOR found")
        return

    for label in author_labels:
        members = page_df[page_df['draw_author'] == label]
        # bounding box enclosing every row that carries this label
        box = (members['x0'].min(), members['y0'].min(),
               members['x1'].max(), members['y1'].max())
        draw.rectangle(box, fill=None, outline=ImageColor.getrgb(color), width = w)

def plot_infra_blocks(page_df, draw, color = '#ff6289', w = 1):
    """
    Outline each infraspecific group on the page with one bounding rectangle.

    page_df : DataFrame holding a 'draw_infra' label per row plus the
              x0 / y0 / x1 / y1 box coordinates of that row
    draw    : object exposing rectangle(...) (presumably a PIL ImageDraw -- confirm)
    color   : color string handed to ImageColor.getrgb for the outline
    w       : outline width

    Returns None; prints a notice and does nothing when the 'draw_infra'
    column is missing.
    """
    try:
        # NaN labels are dropped: comparing against NaN never matches a row,
        # so such a label would only produce an empty selection and a NaN box.
        infra_labels = page_df['draw_infra'].dropna().unique()
    except KeyError:
        print("no INFRA Spp. found")
        return

    for label in infra_labels:
        members = page_df[page_df['draw_infra'] == label]
        # bounding box enclosing every row that carries this label
        box = (members['x0'].min(), members['y0'].min(),
               members['x1'].max(), members['y1'].max())
        draw.rectangle(box, fill=None, outline=ImageColor.getrgb(color), width = w)

def plot_valid_words(page_df, draw, color = '#660044', w = 2):
    """
    Outline every row of page_df with its own rectangle.

    page_df : DataFrame with x0 / y0 / x1 / y1 box coordinates per row
    draw    : object exposing rectangle(...) (presumably a PIL ImageDraw -- confirm)
    color   : color string handed to ImageColor.getrgb for the outline
    w       : outline width

    Returns None.
    """
    # Only the four coordinate columns are needed; block / line / word numbers
    # are not consulted, so they are no longer required to be present.
    boxes = page_df[['x0', 'y0', 'x1', 'y1']]
    for x0, y0, x1, y1 in boxes.itertuples(index=False, name=None):
        draw.rectangle((x0, y0, x1, y1), fill=None, outline=ImageColor.getrgb(color), width = w)

# Filters

In [16]:
def valid(word):
    """
    Tell whether a word should be kept.

    A word is valid when:
    - it is a bare hybrid marker: 'x', 'X' or the multiplication sign '×'
    or when all of the following hold:
    - it contains no digit
    - it is not one of the header words NOUVELLE / FLORE / INDEX / SPECIERUM
    - it is at least 2 characters long
    - it contains at least one alphabetic character

    Returns a bool.
    """
    # The hybrid marker is accepted up front: '×' (U+00D7) is not alphabetic,
    # so it would otherwise be rejected by the alphabetic-character check below.
    if word in ('x', 'X', '×'):
        return True
    if re.search(r"[0-9]+[,.]?", word):
        return False
    if word in ('NOUVELLE', 'FLORE', 'INDEX', 'SPECIERUM'):
        return False
    return len(word) > 1 and any(ch.isalpha() for ch in word)
    
def is_genus(word):
    """
    Check whether a word could be a genus name.

    The pattern looked for is:
    - one upper-case letter of the French alphabet (or the multiplication sign U+00D7)
    - followed by lower-case French letters, with at most one hyphen that is
      neither the first nor the last character
    The search is deliberately not anchored to the start and end of the word
    because the extracted text is noisy; the matched part is at least
    2 characters long.

    Returns the match object when the pattern is found, otherwise None.
    """
    genus_pattern = re.compile(
        r"[A-ZÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ\u00D7]{1}[a-zàâäèéêëîïôœùûüÿç]*[-]?[a-zàâäèéêëîïôœùûüÿç]+"
    )
    return genus_pattern.search(word)
    

def is_epithet(word):
    """
    Check whether a word could be an epithet.

    The pattern looked for is a run of lower-case letters of the French
    alphabet (the multiplication sign U+00D7 is also accepted in the leading
    part), with at most one hyphen that is neither the first nor the last
    character. The search is deliberately not anchored to the start and end
    of the word because the extracted text is noisy; the matched part is at
    least 2 characters long.

    Returns the match object when the pattern is found, otherwise None.
    """
    epithet_pattern = re.compile(
        r"[a-zàâäèéêëîïôœùûüÿç\u00D7]+[-]?[a-zàâäèéêëîïôœùûüÿç]+"
    )
    return epithet_pattern.search(word)
    
def is_hybrid(word):
    """
    Check whether a word is a hybrid marker.

    The whole word must be 'x', 'X' or the multiplication sign U+00D7,
    optionally followed by a single period.

    Returns the match object when it is, otherwise None.
    """
    hybrid_pattern = re.compile(r"^(([Xx\u00D7])|([Xx\u00D7]\.))$")
    return hybrid_pattern.search(word)

def is_infra(word):
    """
    Check whether a word starts with an infraspecific rank marker.

    Recognised markers are 'var.' and 'subsp.', both only at the start of
    the word.

    Returns the match object (group 1 is set for 'var.', group 2 for
    'subsp.') when a marker is found, otherwise None.
    """
    # The alternation is wrapped in a non-capturing group so that '^' anchors
    # BOTH alternatives; without it only 'var.' is anchored and 'subsp.' would
    # match anywhere inside the word. Capturing groups 1 and 2 are kept as-is.
    regex = r"^(?:(var\.)|(subsp\.))"
    return re.search(regex, word)

def first_word_upper(x):
    """
    Tell whether x is a string whose first character is upper case.

    Returns False for anything that is not a string and for the empty string.
    """
    # x[:1] instead of x[0] so that "" yields False rather than IndexError
    return isinstance(x, str) and x[:1].isupper()

In [None]:
def preprocessing(pages, page_num, indent_err, TARGET_DPI):
    """
    Build the per-word DataFrame of one page and group its lines by indentation.

    pages      : indexable collection whose items provide get_text_words()
                 (presumably PyMuPDF pages -- confirm against caller)
    page_num   : position of the page in `pages`; also stored in 'page_num'
    indent_err : tolerance around the mean x0 of an indent group within which
                 a line is considered to belong to that group
    TARGET_DPI : resolution the coordinates are rescaled to (input / 72 * TARGET_DPI)

    Returns a tuple (page_df, pruned_words_df, indent_groups):
    - page_df         : words that pass valid(), with rescaled coordinates,
                        renumbered 'word_no', 'col_no' and 'indent_group'
    - pruned_words_df : words rejected by valid() (kept for error checking)
    - indent_groups   : list of lists of line-start x0 values, one per group

    NOTE(review): reads a module-level name `index` that is not defined in
    this function -- presumably a sequence of page numbers; confirm.
    """

    #initiate dataframe
    page_df = pd.DataFrame(pages[page_num].get_text_words(), columns =['in_x0', 'in_y0', 'in_x1', 'in_y1', 'word', 'block_no', 'line_no', 'word_no'])
    n_rows = page_df.shape[0]

    #add page number to dataframe
    page_df['page_num'] = np.array([page_num]*n_rows)
    #initiate all columns that will be added
    #(np.nan, not np.NaN: the NaN alias was removed in NumPy 2.0)
    placeholder_cols = ('genus', 'draw_genus', 'epithet', 'draw_epithet',
                        'author', 'draw_author', 'infra', 'draw_infra',
                        'taxon rank', 'error_check')
    for col in placeholder_cols:
        page_df[col] = np.array([np.nan]*n_rows)
    #updating coordinates to represent target DPI
    for coord in ('x0', 'y0', 'x1', 'y1'):
        page_df[coord] = page_df['in_' + coord]*TARGET_DPI/ 72
    #get x corner coordinates
    x_min = page_df['x0'].min()
    x_max = page_df['x1'].max()

    y_max = page_df['y1'].max()

    #Remove the extra flore - 18 at page 545
    if page_num == index[4]: #change this with actual page number lol?
        page_df = page_df[~((page_df["word"] == 'Flore') & (page_df['y1'] == y_max))]
    #invalid words dataframe -- for error checking
    pruned_words_df = page_df[~page_df["word"].apply(valid)].reset_index()
    #prune out invalid words (based on function valid)
    page_df = page_df[page_df["word"].apply(valid)].reset_index()

    #horizontal midpoint of the text area; a line starting right of it is column 1
    x_mid = (x_min + x_max) / 2

    indent_groups = []
    blocks = page_df['block_no'].unique()
    for b in blocks:
        lines = page_df[page_df['block_no'] == b]['line_no'].unique()
        for l in lines:
            #reset word_no values (useful for cases where word that was originally at 0th index was pruned out)
            cond = (page_df['line_no'] == l) & (page_df['block_no'] == b)
            num_words = len(page_df[cond]['word_no'])
            page_df.loc[cond, 'word_no'] = np.arange(num_words).astype(int) #this is slowww
            #set column number (0 or 1)
            x_0 = page_df[cond]['x0'].min()
            if not np.isnan(x_0):
                page_df.loc[cond, 'col_no'] = np.array([int(x_0 > x_mid)]*num_words).astype(int)

                #initiate indent groups -- every word of the line gets the line's indent_group value
                #NOTE(review): the scan does not stop at the first matching group, so a line
                #can be appended to several groups and keeps the index of the last match
                new_group = True
                for g_i, g in enumerate(indent_groups):
                    g_mean = np.mean(np.array(g))
                    if g_mean - indent_err <= x_0 <= g_mean + indent_err:
                        g.append(x_0)
                        new_group = False
                        page_df.loc[cond, 'indent_group'] = np.array([g_i]*num_words).astype(int)
                if new_group:
                    indent_groups.append([x_0])
                    g_i = len(indent_groups) - 1
                    page_df.loc[cond, 'indent_group'] = np.array([g_i]*num_words).astype(int)

    #print("indent groups:", indent_groups)
    #return updated page_df, pruned_words_df, indent groups
    return page_df.reset_index(), pruned_words_df, indent_groups

#https://stackoverflow.com/questions/53468558/adding-image-to-pandas-dataframe

<span style="color:#872657"> [NEED TO FIX] </span> We should probably write a separate preprocessing function for each volume to handle volume-specific details.

# Input and output info / config

In [23]:
# Per-volume settings: input PDF, output directory, page counts, indentation
# tolerance and the [first, last] page bounds of each section.
# TODO: revisit these values; consider generating output_dir automatically.
config = [
    {
        "filename": "../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 1.pdf",
        "output_dir": "../output/Aaron's_code/",
        "name": "book-1",
        "page_count": 642,
        "indent_err": 30,
        "index": {
            "range": range(617, 639)
            # TODO: add info about the number of rows, whether an author
            # should be looked for after the genus name, and whether
            # infraspecific entries exist
        },
        "sections": {
            "preface": [1, 78],
            "toponymique": [49, 76],
            "abbrev": [77, 78],
            "content": [79, 607],
            "sample": [79, 79 + 10],
            "index": [617, 639],
        },
    },
    {
        "filename": "../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 2.pdf",
        "output_dir": "../output/Aaron's_code/",
        "name": "book-2",
        "page_count": 725,
        "indent_err": 30,
        "sections": {
            # NOTE(review): preface ends at 79 although content starts at 8
            # and abbrev is [6, 7] -- possibly meant [1, 7]; confirm.
            "preface": [1, 79],
            "abbrev": [6, 7],
            "content": [8, 700],
            "sample": [8, 8 + 10],
            "index": [704, 725],
        },
    },
    {
        "filename": "../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 3.pdf",
        "output_dir": "../output/Aaron's_code/",
        "name": "book-3",
        "page_count": 588,
        "indent_err": 30,
        "sections": {
            "preface": [1, 7],
            "abbrev": [6, 7],
            "content": [8, 554],
            "sample": [8, 8 + 10],
            "index": [556, 583],
            "familyidx": [584, 585],
        },
    },
]
