Skip to content

Commit

Permalink
Okay, the model is more than we need. Parse it all into basic python …
Browse files Browse the repository at this point in the history
…objects, which is already easy to serialize.
  • Loading branch information
dcloud committed Feb 24, 2014
1 parent ee196dd commit 9036cdc
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 89 deletions.
1 change: 0 additions & 1 deletion hocrgeo/models/__init__.py

This file was deleted.

44 changes: 0 additions & 44 deletions hocrgeo/models/hocr.py

This file was deleted.

110 changes: 66 additions & 44 deletions hocrgeo/parsers/hocr.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from bs4 import BeautifulSoup
import re

from hocrgeo.models.hocr import HOCRDocument

import logging

logger = logging.getLogger(__name__)
Expand All @@ -12,12 +10,6 @@
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

PAGE = 'ocr_page'
CAREA = 'ocr_carea'
PAR = 'ocr_par'
LINE = 'ocr_line'
WORD = 'ocrx_word'

class HOCRParser:
"""
Parse hOCR documents
Expand Down Expand Up @@ -54,57 +46,87 @@ def load_file(self, inputfile):

@property
def document(self):
'''Parsed hOCR document, as stored in ``self._doc``.

``parse`` assigns this attribute; after a parse it holds a plain dict
with the keys 'ocr-system', 'ocr-capabilities' and 'pages'.
'''
return self._doc

def parse(self, inputfile=None):
    '''Parse an hOCR document into plain, easily serialized python objects.

    The result is stored on ``self._doc`` and also returned. It is a dict
    with three keys:

    * ``'ocr-system'`` -- content of the ``ocr-system`` meta tag, or None
      when the tag is missing.
    * ``'ocr-capabilities'`` -- content of the ``ocr-capabilities`` meta tag
      split on single spaces, or an empty list when the tag is missing.
    * ``'pages'`` -- list of page dicts. Each level nests the next one under
      the keys ``'careas'``, ``'paragraphs'``, ``'lines'`` and ``'words'``;
      every dict carries ``'id'`` and ``'bbox'``, and word dicts also carry
      ``'text'``.

    :param inputfile: optional input handed to ``load_file`` before parsing;
        when omitted, the raw data that is already loaded is parsed.
    :raises ValueError: if no input file is given and no raw data is loaded.
    :returns: the parsed document dict.
    '''
    if inputfile:
        self.load_file(inputfile)
    elif not self._rawdata:
        # ValueError is still caught by callers that catch Exception.
        raise ValueError('No inputfile specified. You must specify an input file when instantiating or as an argument to the parse method')

    soup = BeautifulSoup(self._rawdata, "lxml")

    # Reset the stored document up front so a failed parse never leaves the
    # previous document behind.
    doc = self._doc = {}

    # Extract ocr system metadata
    ocr_system = soup.find('meta', attrs={'name': 'ocr-system'})
    doc['ocr-system'] = ocr_system.get('content', None) if ocr_system else None

    # Extract capabilities. The meta tag may be absent, in which case
    # soup.find returns None and must not be dereferenced.
    ocr_capabilities = soup.find('meta', attrs={'name': 'ocr-capabilities'})
    if ocr_capabilities is not None:
        doc['ocr-capabilities'] = ocr_capabilities.get('content', ' ').split(' ')
    else:
        doc['ocr-capabilities'] = []

    doc['pages'] = self._extract_level(soup)
    return doc

def _extract_level(self, root, depth=0):
    '''Collect the elements of one hOCR hierarchy level found below ``root``.

    :param root: node to search; anything offering ``find_all(name, class)``.
    :param depth: index into the page/carea/paragraph/line/word hierarchy.
    :returns: list of feature dicts, each holding its children (or, for
        words, its text).
    '''
    # (tag name, CSS class, key under which the next level is stored)
    levels = (
        ('div', 'ocr_page', 'careas'),
        ('div', 'ocr_carea', 'paragraphs'),
        ('p', 'ocr_par', 'lines'),
        ('span', 'ocr_line', 'words'),
        ('span', 'ocrx_word', None),
    )
    tag, css_class, child_key = levels[depth]

    nodes = root.find_all(tag, css_class)
    if depth == 0:
        # Count the result list itself; a zip object has no len() on Python 3.
        logger.info('Found {0} page(s)'.format(len(nodes)))

    objects = []
    for node in nodes:
        obj = self._extract_features(node)
        if child_key is not None:
            obj[child_key] = self._extract_level(node, depth + 1)
        else:
            word_str = node.get_text(strip=True)
            if word_str:
                logger.info(word_str)
            # Always record the text so every word dict has the same keys.
            # NOTE(review): confirm empty words were not meant to omit 'text'.
            obj['text'] = node.get_text()
        objects.append(obj)
    return objects

def _extract_bbox(self, input_str):
    '''Match ``self._bboxreg`` against a string expected to hold hOCR bbox coordinates.

    :returns: the named groups of the match as a dict, or None when the
        pattern does not match.
    '''
    match = self._bboxreg.search(input_str)
    if match:
        return match.groupdict()
    return None

def _extract_features(self, element):
    '''Extract the basic hOCR features (``id`` and ``bbox``) of an element.

    The bbox is read from the element's ``title`` attribute; a missing
    title is treated as an empty string, which yields a ``bbox`` of None.
    '''
    return {
        'id': element.get('id'),
        'bbox': self._extract_bbox(element.get('title', '')),
    }




0 comments on commit 9036cdc

Please sign in to comment.