In [6]:
import io
import fitz
import re
from PIL import Image, ImageDraw, ImageFont, ImageColor
import operator
import pandas as pd 
import numpy as np
from datetime import datetime
import math
from tqdm import tqdm
import matplotlib

In [9]:
def plot_blocks(page_df, draw, color = '#660044', w = 2):
    """Outline every text block of a page with a rectangle.

    Parameters
    ----------
    page_df : pandas.DataFrame
        Word-level table with at least the columns ``block_no``, ``x0``,
        ``y0``, ``x1`` and ``y1``.
    draw : object
        Drawing context exposing ``rectangle(xy, fill=, outline=, width=)``;
        the notebook passes a PIL ``ImageDraw.Draw`` instance.
    color : str
        Colour specification understood by ``ImageColor.getrgb``.
    w : int
        Outline width handed to ``draw.rectangle``.

    Returns
    -------
    None
        The rectangles are drawn on ``draw`` as a side effect.
    """
    # The colour is identical for every block, so parse it only once.
    outline = ImageColor.getrgb(color)

    # sort=False keeps the blocks in order of first appearance; rows whose
    # block_no is NaN are skipped by groupby instead of yielding a NaN box.
    for _, block_df in page_df.groupby('block_no', sort=False):
        box = (
            block_df['x0'].min(),
            block_df['y0'].min(),
            block_df['x1'].max(),
            block_df['y1'].max(),
        )
        draw.rectangle(box, fill=None, outline=outline, width=w)

In [2]:
# Path of the source PDF (the variable is called pdf_dir but holds a file path).
pdf_dir = "../input/NOUVELLE FLORE DU LIBAN ET DE LA SYRIE 3.pdf"
doc = fitz.open(pdf_dir)
# len(doc) is the number of pages; it replaces the legacy camelCase
# `doc.pageCount` alias so the cell keeps working on newer PyMuPDF releases.
pages = [doc[i] for i in range(len(doc))]
# NOTE(review): presumably the page numbers of the book's index section -- confirm.
index = list(range(555, 583))

# Rendering resolution; word coordinates and page images are both scaled by
# TARGET_DPI / 72 so that they line up.
TARGET_DPI = 300
mat = fitz.Matrix(TARGET_DPI/ 72, TARGET_DPI/ 72)

# Notebook-level defaults; preprocessing() uses its own local indent_groups
# and takes indent_err as a parameter.
indent_groups = []
indent_err = 15

In [13]:
def valid(word):
    """Return True when ``word`` should be kept in the page word table.

    A word is rejected when it
      * contains any digit,
      * is one of the tokens 'NOUVELLE' / 'FLORE',
      * is a single character other than a hybrid marker, or
      * contains no alphabetic character at all.

    The hybrid markers 'x', 'X' and the multiplication sign (U+00D7) are
    accepted. The multiplication sign is not alphabetic, so it is accepted
    explicitly here rather than falling through to the alphabetic check,
    which would reject it.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    bool
    """
    # A single digit anywhere disqualifies the token.
    if re.search(r"[0-9]", word):
        return False
    if word in ('NOUVELLE', 'FLORE'):
        return False
    # Hybrid markers are the only one-character tokens that are kept.
    if word in ('x', 'X', '\u00D7'):
        return True
    if len(word) <= 1:
        return False
    # Pure punctuation such as '--' carries no alphabetic character.
    return any(ch.isalpha() for ch in word)

In [65]:
def preprocessing(page_num, indent_err = 15):
    """Build the cleaned word table of one page and tag every text block.

    Steps:
      1. Read the words of ``pages[page_num]`` into a DataFrame.
      2. Add the page number and empty (NaN) annotation columns.
      3. Scale the coordinates by ``TARGET_DPI / 72`` into ``x0, y0, x1, y1``.
      4. Drop every word rejected by ``valid``.
      5. For each block: renumber ``word_no`` as 0..n-1 across the whole
         block, set ``col_no`` (0 when the block's left edge lies at or left
         of the horizontal midpoint of the unpruned page, otherwise 1) and
         assign an ``indent_group`` by clustering the block's left edge.

    Each block is processed exactly once, independently of which line
    numbers survived pruning.

    Parameters
    ----------
    page_num : int
        Index into the notebook-level ``pages`` list.
    indent_err : float
        Maximum distance between a block's left edge and the running mean of
        an indent group for the block to join that group.

    Returns
    -------
    pandas.DataFrame
        One row per kept word. Besides the columns described above it carries
        ``index`` (row position before pruning) and ``level_0`` (row position
        after pruning), produced by the two ``reset_index`` calls.
    """
    #initiate dataframe
    page_df = pd.DataFrame(
        pages[page_num].get_text_words(),
        columns=['in_x0', 'in_y0', 'in_x1', 'in_y1', 'word', 'block_no', 'line_no', 'word_no'],
    )

    #add page number to dataframe
    page_df['page_num'] = page_num
    #initiate all columns that will be added (np.nan: the np.NaN alias is gone in NumPy 2)
    for empty_col in ('genus', 'draw_genus', 'epithet', 'draw_epithet',
                      'author', 'draw_author', 'infra', 'draw_infra',
                      'taxon rank', 'error_check'):
        page_df[empty_col] = np.nan
    #updating coordinates to represent target DPI
    for coord in ('x0', 'y0', 'x1', 'y1'):
        page_df[coord] = page_df['in_' + coord] * TARGET_DPI / 72

    #horizontal extent of the page, measured before pruning
    x_min = page_df['x0'].min()
    x_max = page_df['x1'].max()
    x_mid = (x_min + x_max) / 2

    #prune out invalid words (based on function valid); astype(bool) keeps the
    #mask usable when the page has no words at all
    keep = page_df['word'].apply(valid).astype(bool)
    page_df = page_df[keep].reset_index()

    #create the per-block columns up front so they exist even on empty pages
    page_df['col_no'] = np.nan
    page_df['indent_group'] = np.nan

    indent_groups = []
    for b in page_df['block_no'].unique():
        cond = page_df['block_no'] == b
        num_words = int(cond.sum())

        #reset word_no values over the whole block (the word originally at
        #index 0 may have been pruned out)
        page_df.loc[cond, 'word_no'] = np.arange(num_words)

        x_0 = page_df.loc[cond, 'x0'].min()
        if np.isnan(x_0):
            continue

        #set column number (0 or 1)
        page_df.loc[cond, 'col_no'] = int(x_0 > x_mid)

        #join every indent group whose running mean lies within indent_err;
        #when several match, the block is labelled with the last one
        group_id = None
        for g_i, members in enumerate(indent_groups):
            centre = np.mean(members)
            if x_0 <= centre + indent_err and x_0 >= centre - indent_err:
                members.append(x_0)
                group_id = g_i
        if group_id is None:
            indent_groups.append([x_0])
            group_id = len(indent_groups) - 1
        page_df.loc[cond, 'indent_group'] = group_id

    return page_df.reset_index()

#https://stackoverflow.com/questions/53468558/adding-image-to-pandas-dataframe

In [66]:
def n_leftmost_indent(df, n):
    """Return the ``n`` indent groups whose first words start furthest left.

    Parameters
    ----------
    df : pandas.DataFrame
        Word table with the columns ``indent_group``, ``word_no`` and ``x0``.
    n : int
        Maximum number of groups to return.

    Returns
    -------
    list of tuple
        At most ``n`` tuples ``(indent_group, mean_x0, count)`` sorted by
        ascending ``mean_x0``, where mean and count are taken over the rows
        with ``word_no == 0`` of that group. Rows without an indent group
        (NaN label) are ignored.
    """
    # Only rows with word_no == 0 contribute to the statistics.
    first_words = df[df['word_no'] == 0]

    summary = []
    # dropna(): a NaN label never equals itself, so it would only produce a
    # (nan, nan, 0) entry that makes the sort order unreliable.
    for g in df['indent_group'].dropna().unique():
        xs = first_words.loc[first_words['indent_group'] == g, 'x0']
        summary.append((g, xs.mean(), len(xs)))

    # A group without any first word has a NaN mean; push it to the end.
    summary.sort(key=lambda item: np.inf if np.isnan(item[1]) else item[1])
    return summary[:n]

In [67]:
# Scratch cell: lists the values of matplotlib's `colors.cnames` mapping. The
# result is not stored, and the trailing `;` suppresses the cell output.
list(matplotlib.colors.cnames.values());


In [68]:
def plot_indent_blocks(page_df, draw):
    """Outline every text block, coloured by the block's indent group.

    Parameters
    ----------
    page_df : pandas.DataFrame
        Word table with the columns ``block_no``, ``indent_group``, ``x0``,
        ``y0``, ``x1`` and ``y1``.
    draw : object
        Drawing context exposing ``rectangle(xy, fill=, outline=, width=)``;
        the notebook passes a PIL ``ImageDraw.Draw`` instance.

    Returns
    -------
    None
        Rectangles of width 5 are drawn on ``draw``. A block whose indent
        group is NaN is drawn in white; otherwise the smallest group id of
        the block selects the colour, wrapping around the palette when there
        are more groups than colours.
    """
    color_list = (
        '#f54242', '#f59e42', '#f5e342', '#7bf542', '#42f5b6', '#42d1f5',
        '#426ff5', '#5d42f5', '#bf42f5', '#f542b3', '#b35959', '#b38659',
        '#b0b359', '#6eb359', '#59b392', '#59a4b3', '#596bb3', '#7159b3',
        '#b35998',
    )

    # sort=False keeps the blocks in order of first appearance.
    for _, block_df in page_df.groupby('block_no', sort=False):
        group = block_df['indent_group'].min()
        if np.isnan(group):
            color = '#FFFFFF'
        else:
            # Modulo keeps the lookup in range when a page has more indent
            # groups than palette entries.
            color = color_list[int(group) % len(color_list)]

        box = (
            block_df['x0'].min(),
            block_df['y0'].min(),
            block_df['x1'].max(),
            block_df['y1'].max(),
        )
        draw.rectangle(box, fill=None, outline=ImageColor.getrgb(color), width=5)

In [69]:
# Annotated page images, in page order, and the per-page word tables.
result_ims = []
df_dict = {}

# Process the third quarter of the document. len(doc) is the page count and
# replaces the legacy camelCase `doc.pageCount` alias.
for page_num in tqdm(range(2*len(doc)//4, 3*len(doc)//4)):
    page_df = preprocessing(page_num, indent_err = 30)
    df_dict[page_num] = page_df
    # Render the page with the same scale that preprocessing applies to the
    # word coordinates, so the rectangles line up with the text.
    pix_map = doc.get_page_pixmap(page_num,matrix=mat)
    image = Image.open(io.BytesIO(pix_map.tobytes()))
    draw = ImageDraw.Draw(image)
    plot_indent_blocks(page_df, draw)
    result_ims.append(image)

100%|██████████| 147/147 [01:10<00:00,  2.08it/s]


In [70]:
# Write all annotated page images into one file: the first image is saved with
# save_all=True and the remaining images are appended after it.
result_ims[0].save('block_regions_third_quarterV3.pdf',save_all=True, append_images=result_ims[1:])

In [60]:
from IPython.display import Markdown, display

# Palette preview: one monospace line per colour with its hex code and a swatch.
# (The former assignment from matplotlib's cnames was overwritten immediately
# by this list and has been dropped.)
colors = ['#f54242', '#f59e42', '#f5e342', '#7bf542', '#42f5b6', '#42d1f5', '#426ff5', '#5d42f5', '#bf42f5', '#f542b3', '#b35959', '#b38659', '#b0b359', '#6eb359', '#59b392', '#59a4b3', '#596bb3', '#7159b3', '#b35998']

display(Markdown('<br>'.join(
    f'<span style="font-family: monospace">{color} <span style="color: {color}">████████</span></span>'
    for color in colors
)))
#https://gist.github.com/wmayner/9b099a0e4a5f8e94f0c6ab2f570187a5

<span style="font-family: monospace">#f54242 <span style="color: #f54242">████████</span></span><br><span style="font-family: monospace">#f59e42 <span style="color: #f59e42">████████</span></span><br><span style="font-family: monospace">#f5e342 <span style="color: #f5e342">████████</span></span><br><span style="font-family: monospace">#7bf542 <span style="color: #7bf542">████████</span></span><br><span style="font-family: monospace">#42f5b6 <span style="color: #42f5b6">████████</span></span><br><span style="font-family: monospace">#42d1f5 <span style="color: #42d1f5">████████</span></span><br><span style="font-family: monospace">#426ff5 <span style="color: #426ff5">████████</span></span><br><span style="font-family: monospace">#5d42f5 <span style="color: #5d42f5">████████</span></span><br><span style="font-family: monospace">#bf42f5 <span style="color: #bf42f5">████████</span></span><br><span style="font-family: monospace">#f542b3 <span style="color: #f542b3">████████</span></span><br><span style="font-family: monospace">#b35959 <span style="color: #b35959">████████</span></span><br><span style="font-family: monospace">#b38659 <span style="color: #b38659">████████</span></span><br><span style="font-family: monospace">#b0b359 <span style="color: #b0b359">████████</span></span><br><span style="font-family: monospace">#6eb359 <span style="color: #6eb359">████████</span></span><br><span style="font-family: monospace">#59b392 <span style="color: #59b392">████████</span></span><br><span style="font-family: monospace">#59a4b3 <span style="color: #59a4b3">████████</span></span><br><span style="font-family: monospace">#596bb3 <span style="color: #596bb3">████████</span></span><br><span style="font-family: monospace">#7159b3 <span style="color: #7159b3">████████</span></span><br><span style="font-family: monospace">#b35998 <span style="color: #b35998">████████</span></span>