-
Notifications
You must be signed in to change notification settings - Fork 588
/
ocropus-par
executable file
·121 lines (100 loc) · 4.09 KB
/
ocropus-par
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/python
### Simple command line tool driving the ocropus pipeline.
import sys,os,re,glob,math,glob,signal,traceback
import argparse,subprocess,multiprocessing
from itertools import *
import ocrolib
# On Ctrl-C (SIGINT) terminate the script via sys.exit(1) instead of letting
# Python's default handler raise KeyboardInterrupt.
signal.signal(signal.SIGINT,lambda *args:sys.exit(1))
def concat(l):
    """Flatten `l` by one level.

    `l` is an iterable of iterables; the items of each inner iterable are
    collected, in order, into a single newly created list. The inputs are
    not modified.
    """
    return [item for part in l for item in part]
# Command line definition. The parsed result is kept in the module-level
# name `args`, which the rest of the script reads.
parser = argparse.ArgumentParser(description = """
Parallel processing for the OCRopus pipeline.
""")
parser.add_argument("files",nargs='*',default=None,help="input images")
parser.add_argument("-C","--nocheck",action="store_true",help="don't check for correct installation")
parser.add_argument("-D","--Display",help="display",action="store_true")
parser.add_argument("-m","--model",default=ocrolib.default.model,help="character model")
parser.add_argument("-l","--lmodel",default=ocrolib.default.ngraphs,help="language model")
parser.add_argument("-b","--book",default=None,help="book directory to be used for intermediate computations")
parser.add_argument("-B","--keep",action="store_true",help="keep the book directory")
parser.add_argument("-o","--output",default="book.html",help="output file (HTML/hOCR format)")
parser.add_argument("--preproc",default="ocropus-nlbin",help="preprocessing command")
parser.add_argument("--pageseg",default="ocropus-gpageseg",help="page segmentation command")
parser.add_argument("--linerec",default="ocropus-lattices",help="line recognition command")
parser.add_argument("--langmod",default="ocropus-ngraphs",help="language modeling command")
parser.add_argument("-Q","--parallel",type=int,default=4,help="number of parallel worker processes")
parser.add_argument("--preproc_chunk",type=int,default=5,help="maximum number of files per preprocessing job")
parser.add_argument("--pageseg_chunk",type=int,default=5,help="maximum number of files per page segmentation job")
parser.add_argument("--linerec_chunk",type=int,default=100,help="maximum number of files per line recognition job")
parser.add_argument("--langmod_chunk",type=int,default=100,help="maximum number of files per language modeling job")
args = parser.parse_args()

# Expand glob patterns given as arguments (e.g. a quoted 'scans/*.png').
# - Each pattern's matches are sorted: glob.glob() returns them in arbitrary
#   order, and pages are numbered later in the order of this list.
# - The loop variable is deliberately not called `file`: at module level in
#   Python 2 that would rebind the builtin of the same name.
# - A pattern without matches is reported rather than dropped silently.
expanded_files = []
for pattern in args.files or []:
    matches = sorted(glob.glob(pattern))
    if not matches:
        sys.stderr.write("%s: no matching input file\n" % pattern)
    expanded_files.extend(matches)
args.files = expanded_files
# Start-up checks: each assert requires ocrolib.findfile(..., error=0) to
# return something other than None, first for the two default models and then
# for the models selected with -m / -l. On failure the assert message names
# the missing file.
# NOTE: these are assert statements, so Python skips them when run with -O.
# NOTE(review): confirm from the indentation of the original script whether
# the two args.model / args.lmodel asserts are under the --nocheck guard or
# run unconditionally.
if not args.nocheck:
assert ocrolib.findfile(ocrolib.default.model,error=0) is not None,\
"cannot find '%s'; run: ocropus-download-models"%ocrolib.default.model
assert ocrolib.findfile(ocrolib.default.ngraphs,error=0) is not None,\
"cannot find '%s'; run: ocropus-download-models"%ocrolib.default.ngraphs
assert ocrolib.findfile(args.model,error=0) is not None,\
"%s: cannot find"%args.model
assert ocrolib.findfile(args.lmodel,error=0) is not None,\
"%s: cannot find"%args.lmodel
def run(*args,**kw):
    """Echo and run an external command; exit the script if it fails.

    Every positional argument contributes to the command line in order:
    a string is taken as one word, any other iterable (list, tuple, ...)
    contributes each of its items. Keyword arguments are passed through
    to subprocess.call.

    Returns None when the command exits with status 0. Otherwise prints
    the status and terminates the process with sys.exit(1).
    """
    # (str, unicode) on Python 2, (str, str) on Python 3.
    string_types = (str, type(u""))
    argv = []
    for piece in args:
        if isinstance(piece, string_types):
            # list("ls") would yield ['l', 's']; a string must stay one word.
            argv.append(piece)
        else:
            argv.extend(piece)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(" ".join(["#"] + argv))
    status = subprocess.call(argv,**kw)
    if status != 0:
        print(" ".join(("exit", str(status))))
        sys.exit(1)
def execute(cmd):
    """Echo `cmd` and run it with subprocess.call, returning its exit status.

    `cmd` is the argument list of the command (program name first). A
    non-zero exit status does not stop the script; it is reported on
    stderr and returned so that callers can react to it.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(cmd)
    status = subprocess.call(cmd)
    if status != 0:
        sys.stderr.write("command failed with exit status %d: %r\n" % (status, cmd))
    return status
def prun(command,args,par=args.parallel,chunk=10):
    """Run `command` over the files in `args`, split into jobs of limited size.

    command -- list with the program name and its fixed leading arguments
    args    -- list of file names; each job is `command` plus a slice of it
    par     -- number of worker processes (values below 1 are treated as 1);
               the default is read from the command line options once, when
               this function is defined
    chunk   -- maximum number of files per job (values below 1 are treated
               as 1)

    The files are taken in windows of par*chunk items and each window is
    dealt out round-robin into up to `par` jobs. With par == 1 the jobs run
    one after another in this process; otherwise they are handed to a
    multiprocessing.Pool of `par` workers. Every job is run via execute().
    """
    par = max(par,1)
    # Clamp like `par`: chunk == 0 would make range() below raise ValueError,
    # and a negative chunk would give an empty range, silently skipping
    # every input file.
    chunk = max(chunk,1)
    window_size = par*chunk
    jobs = []
    for start in range(0,len(args),window_size):
        window = args[start:start+window_size]
        for lane in range(par):
            share = window[lane::par]
            if share:
                jobs.append(command+share)
    if par<2:
        for job in jobs:
            execute(job)
        return
    pool = multiprocessing.Pool(par)
    # One job per task (chunksize 1); the results are not collected.
    pool.imap_unordered(execute,jobs,1)
    # close() + join(): wait until the workers have processed all jobs.
    pool.close()
    pool.join()
# Working ("book") directory: the one given with -b, otherwise a name
# derived from this process id. os.mkdir raises if it already exists.
book = args.book if args.book is not None else "_book-%06d"%os.getpid()
# Single-argument print(...) behaves the same on Python 2 and 3.
print(" ".join(("book directory", book)))
os.mkdir(book)

# Bring every input image into the book directory under a four-digit page
# number (0001.png, 0002.png, ...), passing it through ocrolib's gray image
# reader and writer.
for page_no,fname in enumerate(args.files,1):
    out = book+"/%04d.png"%page_no
    print(" ".join((fname, "->", out)))
    ocrolib.write_image_gray(out,ocrolib.read_image_gray(fname))
# Command prefixes of the four stages; the file names are appended by prun().
preproc = [args.preproc]
pageseg = [args.pageseg]
linerec = [args.linerec,"-m",args.model]
langmod = [args.langmod,"-l",args.lmodel]

# One row per stage: banner, command prefix, glob pattern of the stage's
# inputs (relative to the book directory), files per job.
stages = [
    ("=== preprocess",preproc,"/????.png",args.preproc_chunk),
    ("=== page segmentation",pageseg,"/????.png",args.pageseg_chunk),
    ("=== line recognition",linerec,"/????/??????.png",args.linerec_chunk),
    ("=== language modeling",langmod,"/????/??????.lattice",args.langmod_chunk),
]
for banner,command,pattern,chunk in stages:
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(banner)
    # The glob is evaluated only now, right before the stage runs, because
    # earlier stages may have created the files it matches.
    prun(command,glob.glob(book+pattern),chunk=chunk)
    print("")
# Produce the final hOCR/HTML file from the book directory.
# ocropus-hocr is started directly with an argument list (no shell) and its
# stdout is redirected to the output file, so book and output names that
# contain quotes, spaces or shell metacharacters are passed through verbatim.
try:
    with open(args.output,"w") as hocr_file:
        status = subprocess.call(["ocropus-hocr",book],stdout=hocr_file)
except OSError as err:
    sys.exit("cannot run ocropus-hocr: %s" % err)
if status != 0:
    # Do not claim success when the converter failed.
    sys.stderr.write("ocropus-hocr failed with exit status %d\n" % status)
    sys.exit(1)
# Single-argument print(...) behaves the same on Python 2 and 3.
print(" ".join(("output in", args.output)))