In [1]:
import pandas as pd
import numpy as np
from shapely.geometry import Polygon
import re

In [2]:
def area(x):
    """Return the area of the polygon whose vertices are given by ``x``.

    ``x`` is handed unchanged to ``shapely.geometry.Polygon``; whatever that
    constructor raises for an invalid vertex sequence propagates to the caller.
    """
    polygon = Polygon(x)
    return polygon.area


def join_list(x):
    """Concatenate the strings in ``x`` into a single string, no separator."""
    joined = str.join('', x)
    return joined


def text_type(x):
    """Classify ``x`` as numeric (0) or textual (1).

    ``x`` counts as numeric when ``float(x)`` succeeds *and* the parsed value
    is finite.  Anything ``float`` rejects with ``ValueError`` is textual.

    Returns
    -------
    int
        0 for a finite number, 1 otherwise.
    """
    try:
        parsed = float(x)
    except ValueError:
        return 1

    # float() also accepts the words "nan", "inf" and "infinity" (any case,
    # optional sign).  Those are ordinary words here, not numbers.
    return 0 if np.isfinite(parsed) else 1


def extrac_numbers(x):
    """Return ``x`` with every character except digits, ``,`` and ``.`` removed."""
    non_numeric = re.compile('[^0-9,.]')
    return non_numeric.sub("", x)


def number_type(x):
    """Return 1 if ``x`` parses as a float, else 0.

    Two attempts are made: ``x`` as given, then ``x`` with every comma
    replaced by a dot (decimal-comma notation).  Only ``ValueError`` from
    ``float`` is handled; other exceptions propagate.
    """
    # First attempt: the value exactly as given.
    try:
        float(x)
    except ValueError:
        pass
    else:
        return 1

    # Second attempt: treat commas as decimal points.
    try:
        float(x.replace(',', '.'))
    except ValueError:
        return 0
    return 1
        
        
def number(x):
    """Parse ``x`` as a float, falling back to 0 when it is not a number.

    Two attempts are made: ``x`` as given, then ``x`` with every comma
    replaced by a dot (decimal-comma notation).  Only ``ValueError`` from
    ``float`` is handled; other exceptions propagate.

    Returns
    -------
    float or int
        The parsed finite value, or ``0`` when neither attempt parses or the
        parsed value is NaN / infinite.
    """
    try:
        value = float(x)
    except ValueError:
        try:
            value = float(x.replace(',', '.'))
        except ValueError:
            return 0

    # float() accepts the words "nan", "inf" and "infinity".  Letting those
    # through would put NaN/inf into a numeric feature column, so they get the
    # same fallback as any other non-numeric text.
    return value if np.isfinite(value) else 0

In [3]:
def get_features(df):
    """Add geometry, text and neighbouring-row features to ``df`` and return it.

    The new columns are written onto ``df`` itself and the same object is
    returned, so pass a copy if the original frame must stay untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``block_vert``, ``paragraph_vert``,
        ``word_vert``, ``text`` and ``symbol_search_pos``.  The shift-based
        features use the current row order, so rows are presumably expected
        to be in reading order -- TODO confirm against the preprocessing step.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the feature columns added, in the order they are computed
        below.
    """
    levels = ('block', 'paragraph', 'word')

    # area of each vertex list: '<level>_vert' -> '<level>_area'
    for level in levels:
        df[level + '_area'] = df[level + '_vert'].apply(area)

    # weighted area: each area divided by the largest area of its kind in df
    for level in levels:
        level_area = df[level + '_area']
        df[level + '_weigh'] = level_area.divide(level_area.max())

    # relative area
    df.loc[:, 'rel_word_block_area'] = df['word_area'].divide(df['block_area'])
    df.loc[:, 'rel_word_parag_area'] = df['word_area'].divide(df['paragraph_area'])
    df.loc[:, 'rel_parag_block_area'] = df['paragraph_area'].divide(df['block_area'])

    # join text
    df.loc[:, 'text_join'] = df['text'].apply(join_list)

    # 1 where symbol_search_pos is truthy, 0 otherwise.  Computed once and
    # reused for the row itself and for its neighbours.
    symbol_flag = pd.Series(np.where(df['symbol_search_pos'], 1, 0), index=df.index)

    # Neighbouring-row symbol flags.  The flag is computed *before* shifting:
    # shift() fills the vacated boundary row with NaN, and np.where treats NaN
    # as truthy, so shifting first marked the first/last row as 1.  Boundary
    # rows are 0 here.
    # NOTE(review): 'prev_symbol' reads the following row (shift(-1)) while
    # 'prev_text_type' below reads the preceding row (shift(1)).  The original
    # directions are kept -- confirm which meaning is intended.
    df.loc[:, 'prev_symbol'] = symbol_flag.shift(-1).fillna(0).astype(int)
    df.loc[:, 'next_symbol'] = symbol_flag.shift(1).fillna(0).astype(int)

    # text type
    df.loc[:, 'text_type'] = df['text_join'].apply(text_type)

    # prev text type (0 for the first row, which has no predecessor)
    df.loc[:, 'prev_text_type'] = df['text_type'].shift(1).fillna(0)

    # next text type (0 for the last row, which has no successor)
    df.loc[:, 'next_text_type'] = df['text_type'].shift(-1).fillna(0)

    # text len
    df.loc[:, 'text_len'] = df['text_join'].str.len()

    # is symbol
    df.loc[:, 'is_a_symbol'] = symbol_flag

    # extract numbers
    df.loc[:, 'text_numbers'] = df['text_join'].apply(extrac_numbers)

    # number type
    df.loc[:, 'number_type'] = df['text_numbers'].apply(number_type)

    # number
    # NOTE(review): parsed from 'text_join', not from 'text_numbers' like
    # 'number_type' above, so text such as "$3.99" gets number_type 1 but
    # number 0 -- confirm this is intended.
    df.loc[:, 'number'] = df['text_join'].apply(number)

    return df

In [4]:
# Load the preprocessed table; the first two CSV columns form a two-level index.
df = pd.read_csv('../data/preprocessed/data.csv', index_col=[0,1])

In [5]:
# Sanity check: (rows, columns) of the loaded table.
df.shape

(432, 16)

In [6]:
from ast import literal_eval

In [7]:
# Columns whose values are converted with literal_eval in the following cell.
columns = ['block_vert', 'paragraph_vert', 'word_vert', 'text']

In [8]:
for col in columns:

    # Values that are already lists/tuples are left alone so this cell can be
    # re-run without literal_eval raising on its own output.  Everything else
    # (the raw CSV strings) is parsed exactly as before.
    df[col] = df[col].apply(
        lambda value: value if isinstance(value, (list, tuple)) else literal_eval(value)
    )

In [9]:
# Empty placeholder for the combined feature table built in the next cell.
df_features = pd.DataFrame()

In [10]:
# Compute features separately for each value of the first index level, so the
# shift-based neighbour features in get_features never read across groups.
# The pieces are collected and concatenated once: DataFrame.append was removed
# in pandas 2.0 and re-copies the accumulated frame on every iteration.
group_keys = df.index.get_level_values(0)
feature_frames = [
    get_features(df.loc[group_keys == key, :].copy())
    for key in group_keys.unique()
]

# With no groups there is nothing to concatenate; df_features keeps its
# previous (empty) value.
if feature_frames:
    df_features = pd.concat(feature_frames, ignore_index=True)

In [11]:
# Sanity check: (rows, columns) of the combined feature table.
df_features.shape

(432, 36)

In [12]:
# Peek at three rows, transposed so the wide rows stay readable.  The fixed
# seed keeps the displayed sample the same on every run.
df_features.sample(3, random_state=42).T

Unnamed: 0,296,165,293
n_page,0,0,0
page_height,581,960,581
page_width,1032,1280,1032
n_block,4,0,2
block_vert,"[(401, 271), (619, 255), (625, 330), (407, 346)]","[(294, 368), (1042, 334), (1044, 376), (296, 4...","[(569, 151), (647, 155), (646, 172), (568, 168)]"
block_confidence,0.97,0.93,0.95
n_paragraph,0,0,0
paragraph_vert,"[(401, 271), (619, 255), (625, 330), (407, 346)]","[(294, 368), (1042, 334), (1044, 376), (296, 4...","[(569, 151), (647, 155), (646, 172), (568, 168)]"
paragraph_confidence,0.97,0.93,0.95
n_word,0,2,1


In [13]:
# Inspect the rows where y == 1 (transposed: one column per row).
df_features[df_features['y']==1].T

Unnamed: 0,22,54,74,100,112,123,146,172,201,230,268,297,314,341,363,377,397,419
n_page,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
page_height,719,581,581,581,719,581,581,960,581,719,581,581,581,581,581,581,4032,4032
page_width,1280,1032,1032,1032,1280,1032,1032,1280,1032,1280,1032,1032,1032,1032,1032,1032,3024,3024
n_block,5,4,1,2,2,2,1,1,1,1,4,4,1,1,1,1,3,2
block_vert,"[(433, 437), (752, 423), (756, 526), (437, 540)]","[(411, 286), (638, 260), (646, 338), (420, 363)]","[(259, 273), (534, 271), (535, 353), (260, 355)]","[(449, 248), (587, 244), (589, 319), (451, 323)]","[(743, 318), (858, 296), (868, 348), (753, 370)]","[(479, 145), (704, 143), (705, 198), (480, 200)]","[(423, 152), (569, 144), (573, 224), (427, 232)]","[(509, 472), (654, 474), (653, 581), (508, 579)]","[(388, 216), (535, 236), (523, 323), (376, 303)]","[(677, 332), (814, 343), (807, 419), (671, 408)]","[(382, 274), (622, 245), (633, 330), (392, 359)]","[(401, 271), (619, 255), (625, 330), (407, 346)]","[(376, 243), (759, 248), (758, 320), (375, 315)]","[(454, 197), (547, 198), (546, 273), (453, 272)]","[(450, 250), (635, 247), (636, 309), (451, 312)]","[(272, 203), (568, 200), (569, 276), (273, 279)]","[(953, 2028), (1653, 2017), (1660, 2433), (960...","[(1024, 1856), (2173, 1880), (2162, 2391), (10..."
block_confidence,0.98,0.98,0.49,0.99,0.79,0.52,0.94,0.99,0.94,0.91,0.99,0.97,0.46,0.99,0.69,0.56,0.99,0.99
n_paragraph,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
paragraph_vert,"[(433, 437), (752, 423), (756, 526), (437, 540)]","[(411, 286), (638, 260), (646, 338), (420, 363)]","[(259, 273), (534, 271), (535, 353), (260, 355)]","[(449, 248), (587, 244), (589, 319), (451, 323)]","[(743, 318), (858, 296), (868, 348), (753, 370)]","[(479, 145), (704, 143), (705, 198), (480, 200)]","[(423, 152), (569, 144), (573, 224), (427, 232)]","[(509, 472), (654, 474), (653, 581), (508, 579)]","[(388, 216), (535, 236), (523, 323), (376, 303)]","[(677, 332), (814, 343), (807, 419), (671, 408)]","[(382, 274), (622, 245), (633, 330), (392, 359)]","[(401, 271), (619, 255), (625, 330), (407, 346)]","[(376, 243), (759, 248), (758, 320), (375, 315)]","[(454, 197), (547, 198), (546, 273), (453, 272)]","[(450, 250), (635, 247), (636, 309), (451, 312)]","[(272, 203), (568, 200), (569, 276), (273, 279)]","[(953, 2028), (1653, 2017), (1660, 2433), (960...","[(1024, 1856), (2173, 1880), (2162, 2391), (10..."
paragraph_confidence,0.98,0.98,0.49,0.99,0.79,0.52,0.94,0.99,0.94,0.91,0.99,0.97,0.46,0.99,0.69,0.56,0.99,0.99
n_word,1,1,2,0,0,0,1,1,1,1,1,1,1,0,1,1,0,0


In [14]:
# Save the feature table; by default to_csv also writes the index as the first column.
df_features.to_csv('../data/processed/data.csv')