Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
executable file
161 lines (125 sloc)
5.21 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import __builtin__ as python | |
import random as pyrandom | |
import sys | |
import os.path | |
import re | |
import glob | |
import argparse | |
import codecs | |
import numpy as np | |
from matplotlib.pyplot import imread | |
import ocrolib | |
from ocrolib import hocr | |
# Command-line interface.
#
# The overview text is passed as ``description`` and rendered verbatim.
# Passing it as the first positional argument of ArgumentParser would set
# ``prog`` instead, which puts the whole text into the usage line and in
# front of every error message.
parser = argparse.ArgumentParser(
    description="""
Construct an HTML output file in hOCR format by putting together
the recognition results for each page in sequence.
You should usually invoke this program as
ocropus-hocr 'book/????.bin.png'
For each page like 'book/0001.bin.png', it uses the following files:
book/0001.bin.png # page image
book/0001.pseg.png # page segmentation
book/0001/010001.txt # recognizer output for lines
""",
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-b","--nobreaks",action="store_true",help="don't output line breaks")
parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
parser.add_argument("-o","--output",default="book.html",help="output file, default: %(default)s")
parser.add_argument('files',nargs='+')
args = parser.parse_args()

# The positional arguments may be quoted patterns such as 'book/????.bin.png'
# (see the overview text); presumably glob_all expands them into file names.
args.files = ocrolib.glob_all(args.files)

# All HTML output is written through this UTF-8 encoded stream.
ostream = codecs.open(args.output,"w","utf-8")
def E(*args):
    """Print a diagnostic line on stderr.

    Every argument is converted with ``str``; the results are joined with
    single spaces and followed by a newline.
    """
    stream = sys.stderr
    stream.write(" ".join(map(str, args)))
    stream.write("\n")
def P(*args):
    """Write the concatenated string arguments plus a newline to ``ostream``."""
    line = "".join(args)
    ostream.write(line + "\n")
def PN(*args):
    """Write the concatenated string arguments to ``ostream`` without a newline."""
    fragment = "".join(args)
    ostream.write(fragment)
E("writing to", args.output)

# Estimate a document-wide median x-height.  Each page image is expected to
# have a sibling directory named after the image path with all of its
# extensions removed.
median_xheight = None
page_dirs = [ocrolib.allsplitext(fname)[0] for fname in args.files]
xheight_files = python.sum(
    [glob.glob(d + "/??????.xheight") for d in page_dirs], [])

if len(xheight_files) > 5:
    # Enough per-line measurements exist: use their median directly.
    median_xheight = np.median(
        [float(ocrolib.read_text(path)) for path in xheight_files])
else:
    # Otherwise fall back to half the median first array dimension
    # (presumably the pixel height) of up to 100 randomly chosen line images.
    line_images = python.sum(
        [glob.glob(d + "/??????.bin.png") for d in page_dirs], [])
    pyrandom.shuffle(line_images)
    sample = line_images[:100]
    if sample:
        median_xheight = 0.5 * np.median(
            [imread(path).shape[0] for path in sample])

E("median_xheight", median_xheight)
# Emit the hOCR document: header, one <div class='ocr_page'> per input page
# image, then the footer.
P(hocr.header())

# (x0, y0) of the most recently visited text line, used for paragraph
# detection.  It is not reset between pages, so the first line of a page is
# compared against the last line visited on the previous page.
last_coords = None

for arg in args.files:
    base,_ = ocrolib.allsplitext(arg)
    try:
        E("===",arg)
        image = ocrolib.read_image_binary(arg)
        height, width = image.shape
        P("<div class='ocr_page' title='image %s; bbox 0 0 %d %d'>"%(arg,width,height))

        # to proceed, we need a pseg file and a
        # subdirectory containing text lines
        pseg_file = base+".pseg.png"
        if not os.path.exists(pseg_file):
            E("%s: no such file"%(pseg_file,))
            continue
        if not os.path.isdir(base):
            E("%s: no such directory"%base)
            continue

        # iterate through the text lines in reading order, based
        # on the page segmentation file
        pseg = ocrolib.read_page_segmentation(pseg_file)
        regions = ocrolib.RegionExtractor()
        regions.setPageLines(pseg)
        # region index 0 is skipped; iteration starts at 1
        for i in range(1,regions.length()):

            # keep track of the bounding box information for each line
            # and insert paragraph breaks as needed
            line_id = regions.id(i)
            y0,x0,y1,x1 = regions.bbox(i)
            if last_coords is not None:
                lx0,ly0 = last_coords
                dx,dy = x0-lx0,y1-ly0
                par = 0
                # dy>0 is treated as a column break (moving upwards) and
                # never starts a paragraph; without an x-height estimate
                # no paragraph detection is possible either
                if dy<=0 and median_xheight is not None:
                    if abs(dy)>5*median_xheight: par = 1 # whitespace separator
                    if dx>2*median_xheight: par = 1 # indented paragraph
                    if abs(dx)>10*median_xheight: par = 1 # something else
                if par and not args.nopars: P("<p />")
            last_coords = (x0,y0)

            # get the text for the line itself
            lbase = "%s/%06x"%(base,line_id)
            if not os.path.exists(lbase+".txt"):
                E("note: line %s produced no output (it may not have contained text)"%(lbase+".bin.png"))
                continue
            text = ocrolib.read_text(lbase+".txt")
            # The recognizer output is inserted into HTML, so markup
            # characters must be escaped.  '&' has to be replaced first,
            # otherwise the '&' of '&lt;' would be escaped a second time.
            text = text.replace("&","&amp;").replace("<","&lt;")

            # accumulate information for each line here
            style = ""
            info = ""

            # estimate the font size for this line relative to the median
            # x-height; --fscale scales the result and 0 disables it
            if args.fscale>0 and median_xheight is not None and os.path.exists(lbase+".xheight"):
                xheight = float(ocrolib.read_text(lbase+".xheight"))
                perc = int(np.clip(args.fscale*xheight*100.0/median_xheight,30,300))
                # round to the nearest multiple of 10 percent
                perc = 10*((perc+5)//10)
                if perc!=100:
                    style += "font-size:%d%%;"%perc

            # output geometric information
            info += "bbox %d %d %d %d"%(x0,y0,x1,y1)
            if os.path.exists(lbase+".baseline"):
                info += "; baseline "+ocrolib.read_text(lbase+".baseline")

            # put it all together into a SPAN
            PN("<span")
            if style!="": PN(" style='"+style+"'")
            PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
            if not args.nobreaks: P("<br />")
            else: P()
    finally:
        # close the page element even when the page was skipped via
        # `continue` or processing failed part-way
        P("</div>")

P(hocr.footer())
ostream.close()