-
Notifications
You must be signed in to change notification settings - Fork 588
/
ocroold-binarize
executable file
·158 lines (142 loc) · 5.67 KB
/
ocroold-binarize
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#!/usr/bin/python
# Command-line page binarization script: imports and process-wide setup.
# FIXME
# -- add more detailed preprocessing options, separate out stages explicitly
# -- page frame removal, noise removal, ...
# -- parallelize TIFF images
import sys,os,re,optparse,shutil,traceback
import matplotlib
# Pick the matplotlib backend before pylab is imported below: the
# non-interactive AGG backend when no DISPLAY variable is set, GTK otherwise.
if "DISPLAY" not in os.environ: matplotlib.use("AGG")
else: matplotlib.use("GTK")
from multiprocessing import Pool
import signal,fcntl
# Turn SIGINT (Ctrl-C) into a plain exit with status 1.
signal.signal(signal.SIGINT,lambda *args:sys.exit(1))
# pylab supplies the unqualified plotting names used later in this script
# (clf, imshow, draw, ginput, ion, show, cm).
from pylab import *
from scipy.ndimage import measurements
from scipy.misc import imsave
from PIL import Image
import ocrolib
from ocrolib import number_of_processors
import ocropreproc
from ocrolib import ocroold
# Command-line interface.  Options are registered in the order they should
# appear in the generated --help output.
parser = optparse.OptionParser(usage="""
%prog -o dir [options] image1 image2 ...
The following applies to "-B ocroold.Standardpreprocessing"
Performs preprocessing on each of the images on the command line and stores
the resulting output in a directory with "book structure". That is, the
input pages will be stored in dir/0001.png dir/0001.bin.png dir/0002.png
dir/0002.bin.png etc. dir/0001.png contains the deskewed grayscale image,
while dir/0001.bin.png contains the binarized version.
Preprocessing uses hysteresis thresholding; you control it mainly through the
-L and -H arguments, which take values between 0 and 1:
* large parts of characters are missing: decrease -H
* there is too much noise in the image: increase -H
* characters are too thin or broken up: increase -L
* characters are too thick or touching: decrease -L
""")

# -- where the results go
parser.add_option(
    "-o", "--output",
    default="book",
    help="output directory",
)
parser.add_option(
    "-O", "--Output",
    action="store_true",
    help="output image.png to image.bin.png (in place)",
)

# -- interactive inspection
parser.add_option(
    "-d", "--display",
    action="store_true",
    help="display result",
)
parser.add_option(
    "-D", "--Display",
    action="store_true",
    help="display continuously",
)

# -- thresholding parameters
parser.add_option(
    "-T", "--threshold",
    type=float,
    default=-1,
    help="threshold (simple Sauvola if set)",
)
parser.add_option(
    "-L", "--low",
    type=float,
    default=0.1,
    help="low threshold (0.1)",
)
parser.add_option(
    "-H", "--high",
    type=float,
    default=0.4,
    help="high threshold (0.4)",
)
parser.add_option(
    "-W", "--width",
    type=float,
    default=40,
    help="width parameter (40)",
)
parser.add_option(
    "-r", "--dpi",
    type=float,
    default=300,
    help="resolution (DPI) (300)",
)

# -- execution control
parser.add_option(
    "-q", "--silent",
    action="store_true",
    help="disable warnings",
)
parser.add_option(
    "-P", "--parallel",
    type=int,
    default=number_of_processors(),
    help="number of parallel processes to use",
)
parser.add_option(
    "-B", "--binarizer",
    default="ocropreproc.CommonPreprocessing",
    help="binarization component to be used",
)
parser.add_option(
    "-g", "--gtextension",
    default=None,
    help="ground truth extension for copying in ground truth (include all dots)",
)

# `options` holds the parsed flags, `args` the input image paths.
options, args = parser.parse_args()
# ---------------------------------------------------------------------------
# Turn the parsed options into the module-level state used by the workers:
# `binarizer` (the binarization component) and `files` (the FILES index of
# the output directory, or None in in-place mode).
# ---------------------------------------------------------------------------

# Without input images there is nothing to do: show the usage text and stop.
if not args:
    parser.print_help()
    sys.exit(0)

# -T overrides both hysteresis thresholds with a single value.
if options.threshold > 0:
    options.low = options.threshold
    options.high = options.threshold

# NOTE(review): -L/-H/-W are only handed to the binarizer in the
# "StandardPreprocessing" branch; any other -B value (including the default)
# is built by make_IBinarize without them.
if options.binarizer == "StandardPreprocessing":
    binarizer = ocrolib.StandardPreprocessing()
    binarizer.pset("binarizer", "BinarizeByHT")
    binarizer.reinit()
    binarizer.command("binarizer_pset", "k0", str(options.low))
    binarizer.command("binarizer_pset", "k1", str(options.high))
    binarizer.command("binarizer_pset", "width", str(options.width))
else:
    binarizer = ocrolib.make_IBinarize(options.binarizer)

# -D (continuous display) implies -d.
if options.Display:
    options.display = True
if options.display:
    ion()
    show()

if options.Output:
    # In-place mode writes next to the inputs; there is no output directory.
    options.output = None
elif os.path.exists(options.output):
    # Refuse to write into an existing directory.  This is an error, so exit
    # with a non-zero status to let calling scripts detect it.
    print("%s: already exists; please remove" % options.output)
    sys.exit(1)

# FILES maps each output page number to its source path (written by
# process_image); it only exists in book-structure mode.
files = None
if not options.Output:
    os.mkdir(options.output)
    files = open(options.output + "/FILES", "w")
def process_image(image, arg, count):
    """Binarize one page image and write the grayscale and binary results.

    Arguments:
    image -- page image, as yielded by ocrolib.page_iterator
    arg   -- path of the input file the page came from
    count -- page number; names the outputs in book-structure mode and is
             recorded in the FILES index

    Returns 0 when the page is skipped (the binarizer raised, or the page
    check failed) and None once the outputs have been written.

    Reads the module-level `options`, `binarizer` and `files`.
    """
    if options.display:
        clf()
        imshow(image, cmap=cm.gray)
        draw()
        ginput(1, timeout=1)

    # The binarize call must sit inside the try block, otherwise the
    # "skipping" handler can never run.  The traceback is printed so that a
    # skipped page still leaves a diagnosable trail.
    try:
        binary, gray = binarizer.binarize(image)
    except Exception:
        traceback.print_exc()
        print("# binarizer failed; skipping %s" % arg)
        return 0

    if options.display:
        clf()
        imshow(binary, cmap=cm.gray)
        draw()
        if options.Display:
            # Continuous display: pause briefly instead of blocking.
            ginput(1, timeout=1)
        else:
            raw_input("hit ENTER to continue")

    # -q turns the page check off entirely, so nothing is skipped then.
    if not options.silent:
        if ocrolib.quick_check_page_components(binary, dpi=options.dpi) < 0.5:
            print("# skipping page")
            return 0

    if options.Output:
        # In-place mode: write next to the input, named after it.
        dest, _ = ocrolib.allsplitext(arg)
        print("# writing %s" % dest)
    else:
        # Book structure: <output>/<4-digit page number>
        dest = "%s/%04d" % (options.output, count)
        print("# writing %s %s %s" % (dest, gray.shape, binary.shape))
    imsave(dest + ".png", gray)
    imsave(dest + ".bin.png", binary)

    # NOTE(review): the available source has lost its indentation, so it is
    # not certain whether this copy was meant for book-structure mode only;
    # it is kept unconditional here -- confirm against the upstream file.
    if options.gtextension is not None:
        base, _ = ocrolib.allsplitext(arg)
        shutil.copyfile(base + options.gtextension, dest + ".gt.txt")

    if files is not None:
        # Take an exclusive lock around the append and always release it,
        # even if the write fails.
        fcntl.flock(files, fcntl.LOCK_EX)
        try:
            files.write("%04d\t%s\n" % (count, arg))
            files.flush()
        finally:
            fcntl.flock(files, fcntl.LOCK_UN)
def process1(t):
    """Pool worker: binarize the single-page input file described by `t`.

    t -- (path, count) tuple: the input file and the page number to store
         its result under.

    Raises ValueError if the file yields more than one page: every page of
    a file would be written under the same `count` and overwrite the
    previous one.  Exceptions from process_image are printed and re-raised.
    """
    path, count = t
    for n, (image, arg) in enumerate(ocrolib.page_iterator([path])):
        # Reject the second page already (the old check `n<2` only tripped
        # on the third).  An explicit raise also survives `python -O`.
        if n > 0:
            raise ValueError("no multipage files with parallel processing; use -P 0")
        print("=== %s %s %s" % (arg, count, image.shape))
        try:
            process_image(image, arg, count)
        except Exception:
            # Print the traceback here, in the worker, before re-raising;
            # presumably because the parent would not otherwise show it.
            traceback.print_exc()
            raise
# ---------------------------------------------------------------------------
# Driver: process every input page, sequentially or on a process pool, and
# close the FILES index whether or not processing succeeded.
# ---------------------------------------------------------------------------
try:
    if options.parallel < 2:
        # Sequential mode handles multi-page inputs: pages are numbered
        # consecutively from 1 across all files.  The logged number is the
        # one the page is written under, as in the parallel path.
        for count, (image, arg) in enumerate(ocrolib.page_iterator(args), 1):
            print("=== %s %s %s" % (arg, count, image.shape))
            process_image(image, arg, count)
    else:
        # Parallel mode: one job per input file, numbered from 1.
        jobs = [(path, number) for number, path in enumerate(args, 1)]
        pool = Pool(processes=options.parallel)
        try:
            pool.map(process1, jobs)
        except BaseException:
            # Stop the remaining workers instead of waiting for them.
            pool.terminate()
            raise
        else:
            pool.close()
        finally:
            pool.join()
finally:
    if files is not None:
        files.close()