In [1]:
import io
import re
from PIL import Image, ImageDraw, ImageFont, ImageColor
import operator
import pandas as pd 
import numpy as np
from datetime import datetime
import math
from tqdm import tqdm
import fitz
pd.options.mode.chained_assignment = None  # default='warn'

# Plotting functions

In [2]:
def _plot_label_blocks(page_df, draw, label_col, color, w, missing_msg=None):
    """Outline the bounding box of every labelled group of words.

    Rows of `page_df` sharing the same value in `label_col` are treated as one
    group; a rectangle spanning min(x0), min(y0), max(x1), max(y1) of the
    group is drawn on `draw`.

    Parameters
    ----------
    page_df : pandas.DataFrame
        Word table with 'x0', 'y0', 'x1', 'y1' and (optionally) `label_col`.
    draw : object exposing `rectangle(xy, fill, outline, width)`
        Presumably a PIL `ImageDraw.Draw` instance -- confirm against callers.
    label_col : str
        Column holding the group labels.
    color : str
        Colour specification understood by `ImageColor.getrgb`.
    w : int
        Outline width passed to `draw.rectangle`.
    missing_msg : str or None
        Message printed when `label_col` is absent; None keeps it silent.
    """
    try:
        page_df[label_col]
    except (KeyError, TypeError):
        # Column absent (or no frame at all): nothing to outline on this page.
        if missing_msg is not None:
            print(missing_msg)
        return

    # Parse the colour once instead of once per rectangle.
    outline = ImageColor.getrgb(color)

    # groupby skips NaN labels, so unlabelled rows no longer produce an empty
    # selection with NaN coordinates; sort=False keeps first-appearance order.
    for _, rows in page_df.groupby(label_col, sort=False):
        box = (rows['x0'].min(), rows['y0'].min(), rows['x1'].max(), rows['y1'].max())
        draw.rectangle(box, fill=None, outline=outline, width=w)


def plot_genus_blocks(page_df, draw, color = '#6c899e', w = 3):
    """Outline each group of rows sharing a 'draw_genus' label (silent if the column is missing)."""
    _plot_label_blocks(page_df, draw, 'draw_genus', color, w)


def plot_epithet_blocks(page_df, draw, color = '#660066', w = 3):
    """Outline each group of rows sharing a 'draw_epithet' label."""
    _plot_label_blocks(page_df, draw, 'draw_epithet', color, w, "no EPITHET found")


def plot_author_blocks(page_df, draw, color = '#a3a3a3', w = 1):
    """Outline each group of rows sharing a 'draw_author' label."""
    _plot_label_blocks(page_df, draw, 'draw_author', color, w, "no AUTHOR found")


def plot_infra_blocks(page_df, draw, color = '#ff6289', w = 1):
    """Outline each group of rows sharing a 'draw_infra' label."""
    _plot_label_blocks(page_df, draw, 'draw_infra', color, w, "no INFRA Spp. found")

def plot_valid_words(page_df, draw, color = '#660044', w = 2):
    """Outline the bounding box of every row (word) in `page_df`.

    Parameters
    ----------
    page_df : pandas.DataFrame
        Word table; only the 'x0', 'y0', 'x1', 'y1' columns are read.
    draw : object exposing `rectangle(xy, fill, outline, width)`
        Presumably a PIL `ImageDraw.Draw` instance -- confirm against callers.
    color : str
        Colour specification understood by `ImageColor.getrgb`.
    w : int
        Outline width passed to `draw.rectangle`.
    """
    # Parse the colour once rather than for every word.
    outline = ImageColor.getrgb(color)

    # The former `page_df['block_no'].unique()` lookup was never used and only
    # made the function fail on frames without that column, so it is gone.
    corners = zip(page_df['x0'], page_df['y0'], page_df['x1'], page_df['y1'])
    for x0, y0, x1, y1 in corners:
        draw.rectangle((x0, y0, x1, y1), fill=None, outline=outline, width=w)

# Import Vol1 Index

In [3]:
# --- Input document ---------------------------------------------------------
# Path of the scanned flora volume (despite the name, `pdf_dir` is a file path)
# and the page range holding its species index (0-based, end exclusive).
pdf_dir = "../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 1.pdf"
index = range(616, 639)
# Alternative input kept for reference (volume 3):
#   pdf_dir = "../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 3.pdf"
#   index = list(range(555, 583))

doc = fitz.open(pdf_dir)
# `page_count` replaces the older `pageCount` attribute, which no longer works.
pages = [doc[i] for i in range(doc.page_count)]

# --- Rendering --------------------------------------------------------------
# PDF coordinates are in points (72 per inch); `mat` scales them to TARGET_DPI.
TARGET_DPI = 300
mat = fitz.Matrix(TARGET_DPI/ 72, TARGET_DPI/ 72)

# --- Indentation grouping defaults -----------------------------------------
indent_groups = []
indent_err = 15

# GET page_df

In [163]:
def get_page_df(page_num):
    """Return a one-row-per-word table for page `page_num` of the global `pages`.

    The page is read with `get_text("dict")`, whose result nests
    blocks -> lines -> spans. Every span's text is split on whitespace and
    each resulting word becomes one row.

    Columns of the returned DataFrame
    ---------------------------------
    block_no : the block's 'number' entry
    line_no  : position of the line inside its block (0-based)
    word_no  : position of the word inside its line, counted across spans
    text     : the word itself
    flags, font : copied from the span the word came from
    x0, y0, x1, y1 : bounding box of the *span* in PDF units -- every word of
        a span therefore shares the same box (the dict output has no per-word
        geometry).

    Notes
    -----
    Empty or whitespace-only spans contribute no rows (previously anything
    other than a single space produced a NaN word). The frame is built from
    plain records instead of the `df[["_", col]] = [name, Series]` idiom,
    which created a ragged array and triggered NumPy deprecation warnings.
    """
    columns = ['block_no', 'line_no', 'word_no', 'text', 'flags', 'font',
               'x0', 'y0', 'x1', 'y1']
    records = []

    for block in pages[page_num].get_text("dict")['blocks']:
        # Blocks without a 'lines' entry (images) contribute no words.
        for line_no, line in enumerate(block.get('lines') or []):
            word_no = 0
            for span in line.get('spans') or []:
                x0, y0, x1, y1 = span['bbox']
                for word in span['text'].split():
                    records.append((block['number'], line_no, word_no, word,
                                    span['flags'], span['font'],
                                    x0, y0, x1, y1))
                    word_no += 1

    # Passing `columns` keeps the schema stable even for a page with no words.
    return pd.DataFrame(records, columns=columns)


# regex based boolean functions

In [164]:
def valid(word):
    """Tell whether `word` is worth keeping as an index token.

    A word is valid when all of the following hold:
    - it is a string (NaN / None placeholders are rejected instead of raising);
    - it contains no ASCII digit (page numbers, "12," etc.);
    - it is not one of the running-header words 'NOUVELLE' / 'FLORE';
    - it is either a hybrid marker ('x', 'X', '×', '×.') or at least two
      characters long with at least one alphabetic character.

    The multiplication sign is checked explicitly because it is not
    alphabetic: the previous letter test rejected '×' even though the length
    test was written to let it through, and the `r'\\u00D7'` comparison could
    never match since a raw string keeps the backslash escape literally.

    Returns
    -------
    bool
    """
    if not isinstance(word, str):
        return False
    if re.search(r"[0-9]", word):
        return False
    if word in ('NOUVELLE', 'FLORE'):
        return False
    # Hybrid markers are exempt from the length and the letter requirement.
    if word in ('x', 'X', '\u00D7', '\u00D7.'):
        return True
    if len(word) <= 1:
        return False
    return any(ch.isalpha() for ch in word)
    
def is_genus(word):
    """
    Look for a genus-like token inside `word`.

    A genus-like token is one capital letter (French alphabet, or the hybrid
    sign ×) followed by lowercase French letters, with at most one inner
    hyphen; the lowercase run after the optional hyphen must be non-empty, so
    a match is always at least two characters long.

    The pattern is deliberately not anchored to the start or end of the word
    because the scanned text is noisy; the token may sit anywhere in `word`.

    Returns the `re.Match` of the first such token, or None when there is none.
    """
    genus_pattern = re.compile(
        r"[A-ZÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ\u00D7]{1}[a-zàâäèéêëîïôœùûüÿç]*[-]?[a-zàâäèéêëîïôœùûüÿç]+"
    )
    return genus_pattern.search(word)
    

def is_epithet(word):
    """
    Look for an epithet-like token inside `word`.

    An epithet-like token consists of lowercase French letters only (the
    hybrid sign × is also accepted in the leading run), with at most one inner
    hyphen; both runs around the optional hyphen are non-empty, so a match is
    always at least two characters long.

    The pattern is deliberately not anchored to the start or end of the word
    because the scanned text is noisy; the token may sit anywhere in `word`.

    Returns the `re.Match` of the first such token, or None when there is none.
    """
    epithet_pattern = re.compile(
        r"[a-zàâäèéêëîïôœùûüÿç\u00D7]+[-]?[a-zàâäèéêëîïôœùûüÿç]+"
    )
    return epithet_pattern.search(word)
    
def is_hybrid(word):
    """
    Match a word that is exactly a hybrid marker: 'x', 'X' or '×', optionally
    followed by a single dot.

    Returns the `re.Match` on success, or None otherwise.
    """
    hybrid_pattern = re.compile(r"^(([Xx\u00D7])|([Xx\u00D7]\.))$")
    return hybrid_pattern.search(word)

def is_infra(word):
    """
    Match a word that starts with an infraspecific rank marker, 'var.' or
    'subsp.'.

    The previous pattern `^(var\\.)|(subsp\\.)` anchored only the first
    alternative, because `|` binds more loosely than `^`; 'subsp.' was
    therefore accepted anywhere inside the word. Both alternatives are now
    anchored at the start. Group numbering is unchanged: group 1 is 'var.',
    group 2 is 'subsp.'.

    Returns the `re.Match` on success, or None otherwise.
    """
    regex = r"^(?:(var\.)|(subsp\.))"
    return re.search(regex, word)

# pre-processing func

In [5]:
def _assign_indent_group(x_0, indent_groups, indent_err):
    """Return the indent-group number for a line starting at `x_0`.

    `indent_groups` is a list of lists of x-coordinates and is updated in
    place. `x_0` joins every group whose current mean lies within
    `indent_err` of it; when none does, a new group is appended.

    NOTE(review): as in the original loop, a value close to several groups is
    appended to each of them and the highest matching group number is
    returned -- confirm whether "first" or "nearest" was intended.
    """
    matched = None
    for g_i, group in enumerate(indent_groups):
        centre = np.mean(np.array(group))
        if x_0 <= centre + indent_err and x_0 >= centre - indent_err:
            group.append(x_0)
            matched = g_i
    if matched is None:
        indent_groups.append([x_0])
        matched = len(indent_groups) - 1
    return matched


def preprocessing(page_num, indent_err = 30):
    """Prepare the word table of index page `page_num` for taxon parsing.

    Steps
    -----
    1. Fetch the word table from `get_page_df(page_num)`.
    2. Add the (empty) annotation columns filled in by later stages.
    3. Drop every line that contains a word whose span flags equal 6.
    4. Scale the coordinates from PDF points to pixels at `TARGET_DPI`.
    5. Renumber `word_no` per line and assign `col_no` (0 = left half,
       1 = right half of the text area) and `indent_group` per line.

    `get_page_df` returns 'text' and 'x0'..'y1' in PDF units, whereas this
    function was written against an older schema ('word', 'in_x0'..'in_y1',
    'b_l_tuple') and raised KeyError. The old names are now derived here: the
    unscaled coordinates are kept as 'in_*', 'word' mirrors 'text', and
    'x0'..'y1' hold the scaled values.

    Parameters
    ----------
    page_num : int
        Index into the global `pages` list.
    indent_err : number
        Maximum distance (pixels at TARGET_DPI) between a line start and a
        group's mean for the line to join that indent group.

    Returns
    -------
    (page_df, pruned_words_df, indent_groups)
        page_df : the processed table after `reset_index()`.
        pruned_words_df : rows whose word fails `valid` (for error checking);
            they are NOT removed from page_df.
        indent_groups : list of lists with the line-start x values per group.
    """
    page_df = get_page_df(page_num).copy()

    # Bridge the get_page_df schema to the column names used below.
    if 'in_x0' not in page_df.columns:
        page_df = page_df.rename(columns={'x0': 'in_x0', 'y0': 'in_y0',
                                          'x1': 'in_x1', 'y1': 'in_y1'})
    if 'word' not in page_df.columns:
        page_df['word'] = page_df['text']
    if 'b_l_tuple' not in page_df.columns:
        page_df['b_l_tuple'] = list(zip(page_df['block_no'], page_df['line_no']))

    # Columns filled in by later stages (np.nan: np.NaN is gone in NumPy 2).
    page_df['page_num'] = page_num
    for col in ['genus', 'draw_genus', 'epithet', 'draw_epithet',
                'author', 'draw_author', 'infra', 'draw_infra',
                'taxon rank', 'error_check']:
        page_df[col] = np.nan

    # Remove italics: drop whole lines containing a word with flags == 6.
    line_key = [page_df['block_no'], page_df['line_no']]
    has_italic = (page_df['flags'] == 6).groupby(line_key).transform('any')
    page_df = page_df[~has_italic.astype(bool)].copy()

    # Updating coordinates to represent target DPI.
    scale = TARGET_DPI / 72
    for axis in ['x0', 'y0', 'x1', 'y1']:
        page_df[axis] = page_df['in_' + axis] * scale

    # Horizontal extent of the text area and lowest text edge.
    x_min = page_df['x0'].min()
    x_max = page_df['x1'].max()
    y_max = page_df['y1'].max()

    # Remove the stray "Flore" at the very bottom of the fifth index page.
    if page_num == index[4]:
        page_df = page_df[~((page_df["word"] == 'Flore') & (page_df['y1'] == y_max))].copy()

    # Invalid words dataframe -- for error checking only.
    is_valid = page_df["word"].apply(valid).astype(bool)
    pruned_words_df = page_df[~is_valid].reset_index()

    page_df['col_no'] = np.nan
    page_df['indent_group'] = np.nan

    indent_groups = []
    x_mid = (x_min + x_max) / 2
    for b in page_df['block_no'].unique():
        in_block = page_df['block_no'] == b
        for l in page_df.loc[in_block, 'line_no'].unique():
            cond = in_block & (page_df['line_no'] == l)
            num_words = int(cond.sum())
            # Renumber words so each line starts at 0 again after pruning.
            page_df.loc[cond, 'word_no'] = np.arange(num_words)

            x_0 = page_df.loc[cond, 'x0'].min()
            if np.isnan(x_0):
                continue
            # Column number: 0 when the line starts left of the middle.
            page_df.loc[cond, 'col_no'] = int(x_0 > x_mid)
            page_df.loc[cond, 'indent_group'] = _assign_indent_group(
                x_0, indent_groups, indent_err)

    return page_df.reset_index(), pruned_words_df, indent_groups


In [33]:
# Scratch cell: explore the raw extraction for the first index page.
# Relies on `index` and `pages` from the configuration cell.
page_num = index[0] #tqdm(index)
# Word-level extraction: one row per word with its block/line/word numbers.
words_df = pd.DataFrame(pages[page_num].get_text_words(), columns =['in_x0', 'in_y0', 'in_x1', 'in_y1', 'word', 'block_no', 'line_no', 'word_no'])
words_df['b_l_tuple'] = tuple(zip(words_df['block_no'], words_df['line_no']))
#using get_text to extract 'size', 'flags', 'font'
#NOTE: not sure if this is the best way to go about this ...
content_blocks_df = pd.DataFrame(pages[page_num].get_text("dict")['blocks'])
# Keep only blocks that have a 'lines' entry and put each line on its own row.
line_dicts = content_blocks_df[~content_blocks_df['lines'].isnull()].explode('lines', ignore_index=False)
line_dicts

#block_no is number
#type is image vs text
#NEW GOAL : for each exploded item --> get sub indexing for that region and use it as the 
#                                       line_no
#                                       word_no

# The next expression only builds a preview; its result is discarded.
line_dicts.set_index(['number',line_dicts.groupby('number').cumcount()]).rename_axis(['block_no','line_no']).tail(10)
# Adds a throwaway "_" column (filled with the string 'number') and a
# "line_no" column holding each line's running position inside its block.
line_dicts[["_", "line_no"]]=['number',line_dicts.groupby('number').cumcount()]
#df = df.explode('foo')


# rename() is not in place: this only displays a renamed copy, `line_dicts`
# itself keeps the 'number' column.
line_dicts.rename(columns={"number": "block_no"})

  return asarray(a).ndim
  result = asarray(a).shape


Unnamed: 0,block_no,type,bbox,lines,width,height,ext,colorspace,xres,yres,bpc,transform,size,image,_,line_no
0,0,0,"(146.63999938964844, 82.23709106445312, 281.47...","{'spans': [{'size': 13.100000381469727, 'flags...",,,,,,,,,,,number,0
1,1,0,"(115.44000244140625, 131.1444091796875, 121.17...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,0
3,3,0,"(42.47999954223633, 153.70440673828125, 114.12...","{'spans': [{'size': 8.549408912658691, 'flags'...",,,,,,,,,,,number,0
5,5,0,"(28.559999465942383, 163.30438232421875, 93.46...","{'spans': [{'size': 8.48757553100586, 'flags':...",,,,,,,,,,,number,0
7,7,0,"(42.2400016784668, 172.90438842773438, 151.875...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,149,0,"(398.6400146484375, 535.7843627929688, 411.588...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,2
150,150,0,"(403.20001220703125, 565.5443725585938, 411.82...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,0
150,150,0,"(403.20001220703125, 565.5443725585938, 411.82...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,1
150,150,0,"(403.20001220703125, 565.5443725585938, 411.82...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,2


In [133]:
# Scratch cell: expand each line dict into columns and give every line a
# running number `idx`, later used as the merge key with the word table.
# Depends on `line_dicts` left in the kernel by an earlier cell.
span_of_lines = pd.DataFrame(list(line_dicts['lines']))

line_dicts['idx'] = np.arange(line_dicts.shape[0])
#pd.concat([line_dicts[["number", "type", "line_no", 'idx']].reset_index(), span_of_lines], axis=1)
#line_dicts[["number", "type", "line_no"]]
line_dicts #match by idx to get block no from no line no and then span no 
#still not totally sure about how to get word no .... 

Unnamed: 0,number,type,bbox,lines,width,height,ext,colorspace,xres,yres,bpc,transform,size,image,_,line_no,idx
0,0,0,"(146.63999938964844, 82.23709106445312, 281.47...","{'spans': [{'size': 13.100000381469727, 'flags...",,,,,,,,,,,number,0,0
1,1,0,"(115.44000244140625, 131.1444091796875, 121.17...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,0,1
3,3,0,"(42.47999954223633, 153.70440673828125, 114.12...","{'spans': [{'size': 8.549408912658691, 'flags'...",,,,,,,,,,,number,0,2
5,5,0,"(28.559999465942383, 163.30438232421875, 93.46...","{'spans': [{'size': 8.48757553100586, 'flags':...",,,,,,,,,,,number,0,3
7,7,0,"(42.2400016784668, 172.90438842773438, 151.875...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,149,0,"(398.6400146484375, 535.7843627929688, 411.588...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,2,188
150,150,0,"(403.20001220703125, 565.5443725585938, 411.82...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,0,189
150,150,0,"(403.20001220703125, 565.5443725585938, 411.82...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,1,190
150,150,0,"(403.20001220703125, 565.5443725585938, 411.82...","{'spans': [{'size': 8.399999618530273, 'flags'...",,,,,,,,,,,number,2,191


In [97]:
# Scratch cell: number of extracted lines on the page.
line_dicts['lines'].shape

(193,)

In [134]:
# Scratch cell: turn the per-line dicts into a one-row-per-word table
# (`pretty_content_df`). Depends on `line_dicts` from an earlier cell.
# Several bare expressions below only build previews whose result is discarded.
#THE MAIN ONE NOW
pd.DataFrame(list(line_dicts['lines']))
#content_df = pd.DataFrame(list((pd.DataFrame(list(line_dicts['lines']))['spans'].explode('spans'))))
span_of_lines = pd.DataFrame(list(line_dicts['lines']))
content_df_lines = pd.concat([line_dicts[["number", "type", "line_no"]].reset_index(), span_of_lines], axis=1)
content_df = content_df_lines.explode('spans')

"""
content_df = content_df[content_df['text'] != ' ']
content_df['text'] = content_df['text']
content_df['text'] = content_df['text'].apply(lambda x : list(x.split()))
content_df = content_df.explode('text')
split_bbox_df = pd.DataFrame(content_df['bbox'].tolist(), columns=['x0', 'y0', 'x1', 'y1'])
pretty_content_df = pd.concat([content_df.reset_index(), split_bbox_df], axis=1)
line_dicts.explode('lines')


#https://stackoverflow.com/questions/38231591/split-explode-a-column-of-dictionaries-into-separate-columns-with-pandas
"""
#pd.DataFrame(list((pd.DataFrame(list(line_dicts['lines']))['spans'].explode('spans'))))
pd.DataFrame(list((pd.DataFrame(list(line_dicts['lines']))['spans'].explode('spans'))))
(pd.DataFrame(list(line_dicts['lines']))['spans'].explode('spans'))
"""
df = pd.DataFrame(list(line_dicts['lines']))['spans']
df['span_no'] = np.arange(line_dicts['lines'].shape[0])
pd.DataFrame(list(df.explode('spans')))""";
pd.DataFrame(list((pd.DataFrame(list(line_dicts['lines']))['spans'].explode('spans')))).head(20)
# One row per span; the index still holds the running line number.
df = pd.DataFrame(list(line_dicts['lines'])).explode('spans')

#df.set_index([df.index,line_dicts.groupby(df.index).cumcount()]).rename_axis(['line_no','span_no']).tail(10)
df['idx_val'] = df.index
df[["_", "span_no"]]=['idx_val',df.groupby('idx_val').cumcount()]
#df = df.explode('foo')
# rename() is not in place and its result is discarded, so this is a no-op.
df.rename(columns={"number": "line_no"})
#df[df['span_no'] > 0]
# Expand each span dict into columns, indexed by the running line number.
df = pd.DataFrame(list(df['spans']), index= df['idx_val'])
df['idx'] = df.index
df[["_", "span_no"]]=['idx',df.groupby('idx').cumcount()]
content_df = df 
# Drop spans that are a single space, then split the rest into words.
content_df = content_df[content_df['text'] != ' ']
content_df['text'] = content_df['text']
content_df['text'] = content_df['text'].apply(lambda x : list(x.split()))
content_df = content_df.explode('text')
# Every word inherits the bbox of the span it came from.
split_bbox_df = pd.DataFrame(content_df['bbox'].tolist(), columns=['x0', 'y0', 'x1', 'y1'])
pretty_content_df = pd.concat([content_df.reset_index(), split_bbox_df], axis=1)

  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape


In [135]:
# Scratch cell: display the one-row-per-word table.
pretty_content_df

Unnamed: 0,idx_val,size,flags,font,color,ascender,descender,text,origin,bbox,idx,_,span_no,x0,y0,x1,y1
0,0,13.100000,4,Times-Roman,0,0.959,-0.322,INDEX,"(146.63999938964844, 94.79998779296875)","(146.63999938964844, 82.23709106445312, 281.47...",0,idx,0,146.639999,82.237091,281.473114,99.018188
1,0,13.100000,4,Times-Roman,0,0.959,-0.322,SPEGIERUM,"(146.63999938964844, 94.79998779296875)","(146.63999938964844, 82.23709106445312, 281.47...",0,idx,0,146.639999,82.237091,281.473114,99.018188
2,1,8.400000,4,Times-Roman,0,0.959,-0.322,A,"(115.44000244140625, 139.20001220703125)","(115.44000244140625, 131.1444091796875, 121.17...",1,idx,0,115.440002,131.144409,121.177200,141.904816
3,2,8.549409,4,Times-Roman,0,0.959,-0.322,cilicica,"(42.47999954223633, 161.760009765625)","(42.47999954223633, 153.70440673828125, 99.590...",2,idx,0,42.480000,153.704407,99.590271,164.464813
4,2,8.549409,4,Times-Roman,0,0.959,-0.322,Ant.,"(42.47999954223633, 161.760009765625)","(42.47999954223633, 153.70440673828125, 99.590...",2,idx,0,42.480000,153.704407,99.590271,164.464813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,188,8.400000,4,Times-Roman,0,0.959,-0.322,448,"(398.6400146484375, 563.0399780273438)","(398.6400146484375, 554.984375, 411.4140014648...",188,idx,0,398.640015,554.984375,411.414001,565.744751
337,189,8.400000,4,Times-Roman,0,0.959,-0.322,28,"(403.44000244140625, 573.5999755859375)","(403.44000244140625, 565.5443725585938, 411.82...",189,idx,0,403.440002,565.544373,411.824219,576.304749
338,190,8.400000,4,Times-Roman,0,0.959,-0.322,29,"(403.20001220703125, 582.719970703125)","(403.20001220703125, 574.6643676757812, 411.74...",190,idx,0,403.200012,574.664368,411.741821,585.424744
339,191,8.400000,4,Times-Roman,0,0.959,-0.322,30,"(403.44000244140625, 591.8400268554688)","(403.44000244140625, 583.784423828125, 411.736...",191,idx,0,403.440002,583.784424,411.736206,594.544800


In [136]:
# Scratch cell: attach block/line information to every word via the running
# line number `idx` present in both frames.
mergedRes = pd.merge(line_dicts, pretty_content_df, on ='idx')

In [147]:
# Scratch cell: dump the merged table to the working directory for inspection.
mergedRes.to_csv("merged_syns.csv")

In [156]:
# Scratch cell: keep only the columns of interest and add a (block, line) key.
pruned = mergedRes[['number', 'line_no', 'idx', 'text', 'flags','x0', 'y0', 'x1', 'y1']]
# Preview only; the result is discarded because it is not the last expression.
pruned.head(30)
pruned['b_l_tuple'] = tuple(zip(pruned['number'], pruned['line_no']))
pruned

Unnamed: 0,number,line_no,idx,text,flags,x0,y0,x1,y1,b_l_tuple
0,0,0,0,INDEX,4,146.639999,82.237091,281.473114,99.018188,"(0, 0)"
1,0,0,0,SPEGIERUM,4,146.639999,82.237091,281.473114,99.018188,"(0, 0)"
2,1,0,1,A,4,115.440002,131.144409,121.177200,141.904816,"(1, 0)"
3,3,0,2,cilicica,4,42.480000,153.704407,99.590271,164.464813,"(3, 0)"
4,3,0,2,Ant.,4,42.480000,153.704407,99.590271,164.464813,"(3, 0)"
...,...,...,...,...,...,...,...,...,...,...
336,149,2,188,448,4,398.640015,554.984375,411.414001,565.744751,"(149, 2)"
337,150,0,189,28,4,403.440002,565.544373,411.824219,576.304749,"(150, 0)"
338,150,1,190,29,4,403.200012,574.664368,411.741821,585.424744,"(150, 1)"
339,150,2,191,30,4,403.440002,583.784424,411.736206,594.544800,"(150, 2)"


In [157]:
# Scratch cell: number the words inside each (block, line) pair.
#line_dicts.set_index(['b_l_tuple',line_dicts.groupby('b_l_tuple').cumcount()]).rename_axis(['block_no','line_no']).tail(10)
# Adds a throwaway "_" column (filled with the string 'b_l_tuple') and the
# per-line running word number "word_no".
pruned[["_", "word_no"]]=['b_l_tuple',pruned.groupby('b_l_tuple').cumcount()]
#df = df.explode('foo')


#line_dicts.rename(columns={"number": "block_no"})
pruned

  return asarray(a).ndim
  result = asarray(a).shape


Unnamed: 0,number,line_no,idx,text,flags,x0,y0,x1,y1,b_l_tuple,_,word_no
0,0,0,0,INDEX,4,146.639999,82.237091,281.473114,99.018188,"(0, 0)",b_l_tuple,0
1,0,0,0,SPEGIERUM,4,146.639999,82.237091,281.473114,99.018188,"(0, 0)",b_l_tuple,1
2,1,0,1,A,4,115.440002,131.144409,121.177200,141.904816,"(1, 0)",b_l_tuple,0
3,3,0,2,cilicica,4,42.480000,153.704407,99.590271,164.464813,"(3, 0)",b_l_tuple,0
4,3,0,2,Ant.,4,42.480000,153.704407,99.590271,164.464813,"(3, 0)",b_l_tuple,1
...,...,...,...,...,...,...,...,...,...,...,...,...
336,149,2,188,448,4,398.640015,554.984375,411.414001,565.744751,"(149, 2)",b_l_tuple,0
337,150,0,189,28,4,403.440002,565.544373,411.824219,576.304749,"(150, 0)",b_l_tuple,0
338,150,1,190,29,4,403.200012,574.664368,411.741821,585.424744,"(150, 1)",b_l_tuple,0
339,150,2,191,30,4,403.440002,583.784424,411.736206,594.544800,"(150, 2)",b_l_tuple,0


In [162]:
#select page index
page_num = index[0] #tqdm(index)

#getting lines from block dicts
"""***START BLOCK SECTION***"""
#words_df = pd.DataFrame(pages[page_num].get_text_words(), columns =['in_x0', 'in_y0', 'in_x1', 'in_y1', 'word', 'block_no', 'line_no', 'word_no'])
#words_df['b_l_tuple'] = tuple(zip(words_df['block_no'], words_df['line_no']))
#using get_text to extract 'size', 'flags', 'font'
#NOTE: not sure if this is the best way to go about this ...
content_blocks_df = pd.DataFrame(pages[page_num].get_text("dict")['blocks'])
line_dicts = content_blocks_df[~content_blocks_df['lines'].isnull()].explode('lines', ignore_index=False)
line_dicts

#block_no is number
#type is image vs text
#NEW GOAL : for each exploded item --> get sub indexing for that region and use it as the 
#                                       line_no
#                                       word_no

line_dicts.set_index(['number',line_dicts.groupby('number').cumcount()]).rename_axis(['block_no','line_no']).tail(10)
line_dicts[["_", "line_no"]]=['number',line_dicts.groupby('number').cumcount()]
line_dicts['idx'] = np.arange(line_dicts.shape[0])

#df = df.explode('foo')
#line_dicts.rename(columns={"number": "block_no"}) -- oh wasn't in place that's the problem lol
"""***END BLOCK SECTION***"""

#from lines to single words per row
"""***START WORD SECTION***"""
#THE MAIN ONE NOW
pd.DataFrame(list(line_dicts['lines']))
#content_df = pd.DataFrame(list((pd.DataFrame(list(line_dicts['lines']))['spans'].explode('spans'))))
span_of_lines = pd.DataFrame(list(line_dicts['lines']))
content_df_lines = pd.concat([line_dicts[["number", "type", "line_no"]].reset_index(), span_of_lines], axis=1)
content_df = content_df_lines.explode('spans')

"""
content_df = content_df[content_df['text'] != ' ']
content_df['text'] = content_df['text']
content_df['text'] = content_df['text'].apply(lambda x : list(x.split()))
content_df = content_df.explode('text')
split_bbox_df = pd.DataFrame(content_df['bbox'].tolist(), columns=['x0', 'y0', 'x1', 'y1'])
pretty_content_df = pd.concat([content_df.reset_index(), split_bbox_df], axis=1)
line_dicts.explode('lines')


#https://stackoverflow.com/questions/38231591/split-explode-a-column-of-dictionaries-into-separate-columns-with-pandas
"""
#pd.DataFrame(list((pd.DataFrame(list(line_dicts['lines']))['spans'].explode('spans'))))
pd.DataFrame(list((pd.DataFrame(list(line_dicts['lines']))['spans'].explode('spans'))))
(pd.DataFrame(list(line_dicts['lines']))['spans'].explode('spans'))
"""
df = pd.DataFrame(list(line_dicts['lines']))['spans']
df['span_no'] = np.arange(line_dicts['lines'].shape[0])
pd.DataFrame(list(df.explode('spans')))""";
pd.DataFrame(list((pd.DataFrame(list(line_dicts['lines']))['spans'].explode('spans')))).head(20)
df = pd.DataFrame(list(line_dicts['lines'])).explode('spans')

#df.set_index([df.index,line_dicts.groupby(df.index).cumcount()]).rename_axis(['line_no','span_no']).tail(10)
df['idx_val'] = df.index
df[["_", "span_no"]]=['idx_val',df.groupby('idx_val').cumcount()]
#df = df.explode('foo')
df.rename(columns={"number": "line_no"})
#df[df['span_no'] > 0]
df = pd.DataFrame(list(df['spans']), index= df['idx_val'])
df['idx'] = df.index
df[["_", "span_no"]]=['idx',df.groupby('idx').cumcount()]
content_df = df 
content_df = content_df[content_df['text'] != ' ']
content_df['text'] = content_df['text']
content_df['text'] = content_df['text'].apply(lambda x : list(x.split()))
content_df = content_df.explode('text')
split_bbox_df = pd.DataFrame(content_df['bbox'].tolist(), columns=['x0', 'y0', 'x1', 'y1'])
pretty_content_df = pd.concat([content_df.reset_index(), split_bbox_df], axis=1)
"""***END WORD SECTION***"""

#merging the lines df and the words df and pruning the extra content
mergedRes = pd.merge(line_dicts, pretty_content_df, on ='idx')
pruned = mergedRes[['number', 'line_no', 'idx', 'text', 'flags','x0', 'y0', 'x1', 'y1']]

#creating tuples of block_no, line_no -- used to retrive word number
pruned['b_l_tuple'] = tuple(zip(pruned['number'], pruned['line_no']))
#getting word_no
pruned[["_", "word_no"]]=['b_l_tuple',pruned.groupby('b_l_tuple').cumcount()]

#changing number to block_no
pruned.rename(columns = {'number':'block_no'}, inplace = True)
pruned[['block_no', 'line_no', 'word_no', 'text', 'flags', 'x0', 'y0', 'x1', 'y1']]

  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape
  return asarray(a).ndim
  result = asarray(a).shape


Unnamed: 0,block_no,line_no,word_no,text,flags,x0,y0,x1,y1
0,0,0,0,INDEX,4,146.639999,82.237091,281.473114,99.018188
1,0,0,1,SPEGIERUM,4,146.639999,82.237091,281.473114,99.018188
2,1,0,0,A,4,115.440002,131.144409,121.177200,141.904816
3,3,0,0,cilicica,4,42.480000,153.704407,99.590271,164.464813
4,3,0,1,Ant.,4,42.480000,153.704407,99.590271,164.464813
...,...,...,...,...,...,...,...,...,...
336,149,2,0,448,4,398.640015,554.984375,411.414001,565.744751
337,150,0,0,28,4,403.440002,565.544373,411.824219,576.304749
338,150,1,0,29,4,403.200012,574.664368,411.741821,585.424744
339,150,2,0,30,4,403.440002,583.784424,411.736206,594.544800


In [None]:
# Give the block identifier column its final name, in place. Running this on a
# frame whose column is already called 'block_no' changes nothing, because
# rename() ignores labels that are not present unless told otherwise.
pruned.rename(inplace = True, columns = {'number':'block_no'})

# Display the per-word table: position within the page first, then the word,
# its font flags and the corners of its bounding box.
pruned.loc[:, [
    'block_no', 'line_no', 'word_no',
    'text', 'flags',
    'x0', 'y0', 'x1', 'y1',
]]

In [96]:
# One row per text span: take each line's 'spans' list, flatten the lists, and
# expand every span dict into columns (size, flags, font, text, origin, bbox, ...).
# Series.explode() has no column argument; the 'spans' that used to be passed was
# interpreted as a truthy `ignore_index`, which is irrelevant here because the
# exploded values are turned into a plain list before the frame is built.
pd.DataFrame(list(pd.DataFrame(list(line_dicts['lines']))['spans'].explode()))

Unnamed: 0,size,flags,font,color,ascender,descender,text,origin,bbox
0,13.100000,4,Times-Roman,0,0.959,-0.322,INDEX SPEGIERUM,"(146.63999938964844, 94.79998779296875)","(146.63999938964844, 82.23709106445312, 281.47..."
1,8.400000,4,Times-Roman,0,0.959,-0.322,A,"(115.44000244140625, 139.20001220703125)","(115.44000244140625, 131.1444091796875, 121.17..."
2,8.549409,4,Times-Roman,0,0.959,-0.322,cilicica Ant. et,"(42.47999954223633, 161.760009765625)","(42.47999954223633, 153.70440673828125, 99.590..."
3,8.565570,4,Times-Roman,0,0.959,-0.322,Ky,"(99.59027099609375, 161.760009765625)","(99.59027099609375, 153.70440673828125, 114.12..."
4,8.487576,4,Times-Roman,0,0.959,-0.322,Acanthophyllum,"(28.559999465942383, 171.3599853515625)","(28.559999465942383, 163.30438232421875, 93.46..."
...,...,...,...,...,...,...,...,...,...
255,8.400000,4,Times-Roman,0,0.959,-0.322,448,"(398.6400146484375, 563.0399780273438)","(398.6400146484375, 554.984375, 411.4140014648..."
256,8.400000,4,Times-Roman,0,0.959,-0.322,28,"(403.44000244140625, 573.5999755859375)","(403.44000244140625, 565.5443725585938, 411.82..."
257,8.400000,4,Times-Roman,0,0.959,-0.322,29,"(403.20001220703125, 582.719970703125)","(403.20001220703125, 574.6643676757812, 411.74..."
258,8.400000,4,Times-Roman,0,0.959,-0.322,30,"(403.44000244140625, 591.8400268554688)","(403.44000244140625, 583.784423828125, 411.736..."


In [48]:
# Start from one row per text span: each line's 'spans' list is flattened and
# the span dicts are expanded into columns. Series.explode() takes no column
# argument, so it is called bare (the 'spans' formerly passed was only read as
# a truthy `ignore_index`).
content_df = pd.DataFrame(list(pd.DataFrame(list(line_dicts['lines']))['spans'].explode()))

# Drop spans whose text is exactly one blank. The explicit copy keeps the
# assignment below from writing into a slice of the unfiltered frame.
content_df = content_df[content_df['text'] != ' '].copy()

# Split every span's text on whitespace and give each word its own row. The
# words of one span keep that span's index label and metadata, so they all
# share the span's bbox.
content_df['text'] = content_df['text'].apply(lambda x : list(x.split()))
content_df = content_df.explode('text')
content_df

Unnamed: 0,size,flags,font,color,ascender,descender,text,origin,bbox
0,13.100000,4,Times-Roman,0,0.959,-0.322,INDEX,"(146.63999938964844, 94.79998779296875)","(146.63999938964844, 82.23709106445312, 281.47..."
0,13.100000,4,Times-Roman,0,0.959,-0.322,SPEGIERUM,"(146.63999938964844, 94.79998779296875)","(146.63999938964844, 82.23709106445312, 281.47..."
1,8.400000,4,Times-Roman,0,0.959,-0.322,A,"(115.44000244140625, 139.20001220703125)","(115.44000244140625, 131.1444091796875, 121.17..."
2,8.549409,4,Times-Roman,0,0.959,-0.322,cilicica,"(42.47999954223633, 161.760009765625)","(42.47999954223633, 153.70440673828125, 99.590..."
2,8.549409,4,Times-Roman,0,0.959,-0.322,Ant.,"(42.47999954223633, 161.760009765625)","(42.47999954223633, 153.70440673828125, 99.590..."
...,...,...,...,...,...,...,...,...,...
255,8.400000,4,Times-Roman,0,0.959,-0.322,448,"(398.6400146484375, 563.0399780273438)","(398.6400146484375, 554.984375, 411.4140014648..."
256,8.400000,4,Times-Roman,0,0.959,-0.322,28,"(403.44000244140625, 573.5999755859375)","(403.44000244140625, 565.5443725585938, 411.82..."
257,8.400000,4,Times-Roman,0,0.959,-0.322,29,"(403.20001220703125, 582.719970703125)","(403.20001220703125, 574.6643676757812, 411.74..."
258,8.400000,4,Times-Roman,0,0.959,-0.322,30,"(403.44000244140625, 591.8400268554688)","(403.44000244140625, 583.784423828125, 411.736..."


# GET page_df

In [None]:
def get_page_df(page_num):
    """Return one row per word of a page, built from PyMuPDF's "dict" text extraction.

    The page object is taken from the notebook-level ``pages`` collection
    (``pages[page_num]``) and queried with ``get_text("dict")``.

    Parameters
    ----------
    page_num
        Key/index of the page inside ``pages``.

    Returns
    -------
    pandas.DataFrame
        Columns ``block_no, line_no, word_no, text, flags, x0, y0, x1, y1``:

        * ``block_no`` - the block's ``number`` as reported by PyMuPDF.
        * ``line_no``  - running count of the lines inside that block.
        * ``word_no``  - running count of the words inside that (block, line).
        * ``text``     - one whitespace-separated token of a span's text.
        * ``flags``    - the ``flags`` value of the span the word came from.
        * ``x0..y1``   - the bounding box of that *span*; words cut from the
          same span therefore share identical coordinates.

    Notes
    -----
    Blocks whose ``lines`` entry is null are skipped (the original author's note
    says ``type`` separates image blocks from text blocks). Spans whose text is
    exactly ``' '`` are dropped before splitting.
    """
    content_blocks_df = pd.DataFrame(pages[page_num].get_text("dict")['blocks'])

    # ---- blocks -> lines ----------------------------------------------------
    # One row per line; the exploded rows keep the index label of their block.
    line_dicts = content_blocks_df[~content_blocks_df['lines'].isnull()].explode('lines', ignore_index=False)
    # Plain column assignment. The former `df[["_", "line_no"]] = ['number', series]`
    # built a ragged list, which NumPy first warned about and now rejects.
    line_dicts['line_no'] = line_dicts.groupby('number').cumcount()
    # Page-wide running line id; it is the join key between lines and words.
    line_dicts['idx'] = np.arange(line_dicts.shape[0])

    # ---- lines -> spans -----------------------------------------------------
    # The frame built from the line dicts gets a fresh 0..n-1 index, which is
    # exactly `idx`; exploding 'spans' repeats that label for every span.
    spans_by_line = pd.DataFrame(list(line_dicts['lines'])).explode('spans')
    content_df = pd.DataFrame(list(spans_by_line['spans']), index=spans_by_line.index)
    content_df['idx'] = content_df.index

    # ---- spans -> words -----------------------------------------------------
    content_df = content_df[content_df['text'] != ' '].copy()
    content_df['text'] = content_df['text'].apply(lambda x : list(x.split()))
    content_df = content_df.explode('text')
    split_bbox_df = pd.DataFrame(content_df['bbox'].tolist(), columns=['x0', 'y0', 'x1', 'y1'])
    # Both pieces are positionally aligned, so the index is reset before gluing.
    pretty_content_df = pd.concat([content_df.reset_index(drop=True), split_bbox_df], axis=1)

    # ---- attach block / line numbers to every word --------------------------
    # Only the three columns that are needed go into the merge, so block-level
    # keys can never collide with span-level keys of the same name.
    merged = pd.merge(line_dicts[['number', 'line_no', 'idx']], pretty_content_df, on='idx')
    pruned = merged[['number', 'line_no', 'idx', 'text', 'flags', 'x0', 'y0', 'x1', 'y1']].copy()

    # Position of the word inside its (block, line).
    pruned['word_no'] = pruned.groupby(['number', 'line_no']).cumcount()

    pruned = pruned.rename(columns={'number': 'block_no'})
    return pruned[['block_no', 'line_no', 'word_no', 'text', 'flags', 'x0', 'y0', 'x1', 'y1']]


# Finding indentations associated with genus, epithet, infra

In [8]:
types = ['genus', 'epithet', 'infra', 'author', 'misc.']
def n_leftmost_indent(df, n):
    """Summarise the indent groups of ``df`` and return the ``n`` leftmost ones.

    Each indent group is described by a tuple ``(indent_group, mean_x0, count)``
    where ``mean_x0`` and ``count`` are taken over the group's rows that have
    ``word_no == 0``. The tuples are ordered by ascending ``mean_x0`` and at
    most ``n`` of them are returned.
    """
    is_first_word = df['word_no'] == 0

    summaries = []
    for group in df['indent_group'].unique():
        first_word_x0 = df[(df['indent_group'] == group) & is_first_word]['x0']
        summaries.append((group, first_word_x0.mean(), len(first_word_x0)))

    return sorted(summaries, key = lambda summary : summary[1])[:n]

In [9]:
def get_genusEpithetInfra_indent(col_df):
    """Decide which indent groups of a column hold genus, epithet and infra entries.

    The two leftmost indent groups are requested from ``n_leftmost_indent``
    (for vol1 only 2 indentations are given) and mapped to roles by how many
    candidates came back:

    * three candidates in a list - genus, epithet, infra, left to right;
    * two candidates - epithet + infra when any word of the second group
      satisfies ``is_infra``, otherwise genus + epithet;
    * one candidate (or a bare tuple, which is wrapped in a list) - epithet only;
    * anything else - no role is assigned.

    Returns
    -------
    tuple
        ``(genus_indent, epithet_indent, infra_indent, candidates)``; a role
        that was not assigned is reported as ``-1``.
    """
    candidates = n_leftmost_indent(col_df, 2)
    how_many = len(candidates)

    if how_many == 3 and type(candidates) is list:
        print("leftmost 3:", candidates)
        genus_indent, epithet_indent, infra_indent = [entry[0] for entry in candidates]
        return genus_indent, epithet_indent, infra_indent, candidates

    if how_many == 2:
        second_group_words = col_df.loc[col_df['indent_group'] == candidates[1][0], 'word']
        if second_group_words.apply(is_infra).any():
            epithet_indent, infra_indent = [entry[0] for entry in candidates]
            return -1, epithet_indent, infra_indent, candidates
        genus_indent, epithet_indent = [entry[0] for entry in candidates]
        return genus_indent, epithet_indent, -1, candidates

    if how_many == 1 or type(candidates) is tuple:
        if type(candidates) is tuple:
            candidates = [candidates]
        return -1, candidates[0][0], -1, candidates

    return -1, -1, -1, candidates

# Processing column dataframes


In [10]:
def process_col(col_df, genus, epithet, draw_genus, draw_epithet, draw_infra = np.nan):
    """Label the words of one index column as genus, epithet or author text.

    The rows of ``col_df`` are walked in order. The carry-over arguments hold
    the state reached at the end of the previous column, and the updated state
    is handed back so the caller can pass it to the next one.

    Rules applied per word (``word_no`` is the position inside its line):

    * First word of a line (``word_no == 0``) in the genus indent group and
      not written entirely in capitals: starts a new genus and clears the
      current epithet.
    * First word of a line in the epithet indent group and not written
      entirely in capitals: starts a new epithet under the current genus.
    * Second word of a line (``word_no == 1``) while the current epithet is
      ``''`` (i.e. right after a genus): taken as the epithet of that genus.
    * Any other non-first word, as long as a non-empty genus or epithet string
      is known: appended, followed by a blank, to the ``author`` cell of the
      most recent "start" word (the line's first word, or the epithet promoted
      from position 1).

    Words failing ``is_genus`` / ``is_epithet`` get ``error_check = True``.

    Parameters
    ----------
    col_df : pandas.DataFrame
        Needs the columns ``block_no``, ``line_no``, ``word_no``, ``word`` and
        ``indent_group``. It is copied; the caller's frame is not modified.
    genus, epithet : str or NaN
        Current genus / epithet carried over from the previous column.
    draw_genus, draw_epithet : str or NaN
        Keys written to the ``draw_genus`` / ``draw_epithet`` columns, which
        the plotting helpers use to group words into one rectangle.
    draw_infra : optional
        Accepted for interface compatibility; the body does not use it. The
        default is spelled ``np.nan`` because the ``np.NaN`` alias no longer
        exists in NumPy 2.0.

    Returns
    -------
    tuple
        ``(labelled_col_df, genus, epithet, draw_genus, draw_epithet)``.
    """
    genus_indent, epithet_indent, _infra_indent, _leftmost = get_genusEpithetInfra_indent(col_df)

    col_df = col_df.copy()

    def _all_caps(token):
        # True when the alphabetic characters of the token are all upper case.
        return ''.join(ch for ch in token if ch.isalpha()).isupper()

    def _tag_species(cond, genus, epithet, draw_genus, b, l):
        # Write the species-level labels for the word selected by `cond` and
        # return the drawing key built for it.
        col_df.loc[cond, 'genus'] = genus
        col_df.loc[cond, 'epithet'] = epithet
        col_df.loc[cond, 'taxon rank'] = 'species'
        if not is_epithet(epithet):
            col_df.loc[cond, 'error_check'] = True
        # Block and line number make the key unique per occurrence.
        key = str(genus) + '_' + str(epithet) +'_' + str(b) + '_' + str(l)
        col_df.loc[cond, 'draw_genus'] = draw_genus
        col_df.loc[cond, 'draw_epithet'] = key
        col_df.loc[cond, 'author'] = ''
        return key

    # Mask of the word that collects the author text; -1 until a start word is seen.
    start_word_cond = -1

    for _, row in col_df.iterrows():
        b, l, w = row['block_no'], row['line_no'], row['word_no']
        word, indent_group = row['word'], row['indent_group']
        row_cond = (col_df['line_no'] == l) & (col_df['block_no'] == b) & (col_df['word_no'] == w)

        if w == 0:
            start_word_cond = row_cond
            if indent_group == genus_indent and not _all_caps(word):
                genus = word
                draw_genus = genus
                # '' (not NaN) marks "genus seen, epithet still expected".
                epithet = ''
                draw_epithet = ''
                col_df.loc[row_cond, 'genus'] = genus
                col_df.loc[row_cond, 'taxon rank'] = 'genus'
                if not is_genus(word):
                    col_df.loc[row_cond, 'error_check'] = True
                col_df.loc[row_cond, 'draw_genus'] = draw_genus
                col_df.loc[row_cond, 'author'] = ''
            elif indent_group == epithet_indent and not _all_caps(word):
                epithet = word
                draw_epithet = _tag_species(row_cond, genus, epithet, draw_genus, b, l)

        elif w == 1 and epithet == '':
            # Word right after a genus on the same line: it is the epithet, and
            # from here on the author text belongs to it.
            epithet = word
            start_word_cond = row_cond
            draw_epithet = _tag_species(row_cond, genus, epithet, draw_genus, b, l)

        elif (isinstance(genus, str) and genus != '') or (isinstance(epithet, str) and epithet != ''):
            col_df.loc[start_word_cond, 'author'] += word + ' '
            col_df.loc[row_cond, 'draw_author'] = 'author_'+str(b)+'_'+str(l)
            col_df.loc[row_cond, 'draw_genus'] = draw_genus

    return col_df, genus, epithet, draw_genus, draw_epithet


# Run PreProcessing

In [11]:
#preprocessing
# np.nan is spelled in lower case throughout: the np.NaN alias was removed in NumPy 2.0.
genus = np.nan
df_dict = {}      # page_num -> pre-processed page frame
pruned_dict = {}  # page_num -> pruned frame returned alongside it

# Pass 1: pre-process every index page once and keep both frames for the later cells.
for page_num in tqdm(index):
    page_df, pruned_df, indent_group = preprocessing(page_num)
    df_dict[page_num] = page_df
    pruned_dict[page_num] = pruned_df

# Reset the carry-over state and the result collectors.
genus = np.nan
epithet = np.nan
draw_genus = np.nan
draw_epithet = np.nan
result_ims_valid_words = []
df_list = []

# Pass 2: render each page and overlay the valid words found by the pre-processing.
# The images are only collected here; they are written to disk by a later cell,
# once OUTPUT_PATH has been defined.
for page_num in tqdm(index):
    page_df = df_dict[page_num]

    #for drawing
    pix_map = doc.get_page_pixmap(page_num,matrix=mat)
    image = Image.open(io.BytesIO(pix_map.tobytes()))
    draw = ImageDraw.Draw(image)

    plot_valid_words(page_df, draw, color = '#660044', w = 2)
    result_ims_valid_words.append(image)

100%|██████████| 23/23 [00:01<00:00, 19.17it/s]
100%|██████████| 23/23 [00:04<00:00,  5.36it/s]


# Final Results + SAVE

In [12]:
#Setting up files and directories for saving the results
# `os` is not among the imports of the notebook's first cell; importing it here
# lets this cell run on a freshly restarted kernel.
import os

SCRIPT_NAME = "vol1_index_synonyms.ipynb"
SCRIPT_OUTPUT_PATH = "../output/index/" + SCRIPT_NAME + "/"

# A single timestamp feeds both strings, so the date and the time can never
# come from two different sides of midnight.
RUN_STARTED_AT = datetime.now()
DATE_STR = RUN_STARTED_AT.strftime("%Y_%m_%d")
TIME_STR = RUN_STARTED_AT.strftime("%H%M")

# QUICK_FIX = True writes into one shared "QuickFix" folder per day instead of
# a new folder per run (one per HHMM).
QUICK_FIX = False
TAIL_STR = ''

if QUICK_FIX:
    OUTPUT_PATH = SCRIPT_OUTPUT_PATH + DATE_STR + "/QuickFix/"
    #TAIL_STR = '_' + DATE_STR + '_' + TIME_STR
else:
    OUTPUT_PATH = SCRIPT_OUTPUT_PATH + DATE_STR + "/" + TIME_STR + "/"

# Create the run folder and its two sub-folders; folders that already exist are left alone.
for sub_dir in ("", "preprocessed/", "raw/"):
    os.makedirs(OUTPUT_PATH + sub_dir, exist_ok=True)

In [13]:
# Stack the per-page pre-processed frames, in df_dict's insertion order, into one table.
pre_processed_df = pd.concat(list(df_dict.values()), axis = 0)

# Multi-page PDF of the valid-word overlays: the first image is saved and all
# remaining images are appended to it as further pages.
result_ims_valid_words[0].save(
    OUTPUT_PATH + "preprocessed/" + 'valid_words' + TAIL_STR + '.pdf',
    save_all=True,
    append_images=result_ims_valid_words[1:],
)

# The pre-processed table itself, once as HTML and once as CSV.
pre_processed_df.to_html(
    OUTPUT_PATH + "preprocessed/" + 'vol1_preprocessed_index' + TAIL_STR + '.html'
)
pre_processed_df.to_csv(
    OUTPUT_PATH + "preprocessed/" + 'vol1_preprocessed_index' + TAIL_STR + '.csv'
)

In [14]:
# Carry-over state handed to process_col() and taken back from it for every
# column, so the genus/epithet reached at the end of one column is still known
# at the top of the next column or page.
# np.nan is spelled in lower case: the np.NaN alias was removed in NumPy 2.0.
genus = np.nan
epithet = np.nan
draw_genus = np.nan
draw_epithet = np.nan
result_ims = []  # one annotated image per page
df_list = []     # one labelled frame per processed column

for page_num in tqdm(index):
    #process the pre-processed dfs
    page_df = df_dict[page_num]

    #for drawing
    pix_map = doc.get_page_pixmap(page_num,matrix=mat)
    image = Image.open(io.BytesIO(pix_map.tobytes()))
    draw = ImageDraw.Draw(image)

    #processing each column
    for c in page_df['col_no'].unique():
        col_df = page_df[page_df['col_no'] == c]
        col_df, genus, epithet, draw_genus, draw_epithet = process_col(col_df, genus, epithet, draw_genus, draw_epithet)
        df_list.append(col_df)

        #drawing boxes in each column
        plot_genus_blocks(col_df, draw)
        plot_epithet_blocks(col_df, draw)
        plot_author_blocks(col_df, draw)
        plot_infra_blocks(col_df, draw)

    result_ims.append(image)

# Annotated pages as one multi-page PDF.
result_ims[0].save(OUTPUT_PATH + 'vol1_index_ROI.pdf',save_all=True, append_images=result_ims[1:])

# Pre-processed outputs. These four lines repeat what the preceding cell writes;
# they are kept so that this cell produces the complete output folder by itself.
pre_processed_df = pd.concat([df_dict[k] for k in df_dict], axis = 0)
result_ims_valid_words[0].save(OUTPUT_PATH + "preprocessed/" + 'valid_words' + TAIL_STR + '.pdf',save_all=True, append_images=result_ims_valid_words[1:])
pre_processed_df.to_html(OUTPUT_PATH + "preprocessed/" + 'vol1_preprocessed_index' + TAIL_STR + '.html')
pre_processed_df.to_csv(OUTPUT_PATH + "preprocessed/" + 'vol1_preprocessed_index' + TAIL_STR + '.csv')

# Raw result: every word of every column, with all labelling columns.
df = pd.concat(df_list, axis = 0)
df.to_html(OUTPUT_PATH + 'raw/' + 'vol1_index' + TAIL_STR + '.html')
df.to_csv(OUTPUT_PATH + 'raw/' + 'vol1_index' + TAIL_STR + '.csv', index = False)

# Pruned result: only the rows that received a genus, reduced to the taxonomic columns.
pruned = df[(~df['genus'].isnull())]
pruned = pruned[["page_num", "genus", "epithet", "infra" ,"author", "taxon rank"]]
pruned.to_csv(OUTPUT_PATH + 'vol1_index_pruned' + TAIL_STR + '.csv', index = False)
pruned.to_html(OUTPUT_PATH + 'vol1_index_pruned' + TAIL_STR + '.html')

100%|██████████| 23/23 [00:05<00:00,  3.86it/s]
