-
Notifications
You must be signed in to change notification settings - Fork 588
/
ocroold-cluster-kmeans2
executable file
·83 lines (66 loc) · 2.59 KB
/
ocroold-cluster-kmeans2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/python
import code,pickle,sys,os,re
import matplotlib
# Select the matplotlib backend here, ahead of the pylab import that follows:
# "AGG" when the environment has no DISPLAY variable, "GTK" when it does.
if "DISPLAY" not in os.environ: matplotlib.use("AGG")
else: matplotlib.use("GTK")
from pylab import *
from optparse import OptionParser
from scipy import stats
import ocrolib
from ocrolib import dbtables,quant,ocroold
# Command-line interface.  The usage text is spelled as adjacent string
# literals; note that it deliberately starts and ends with a newline.
parser = OptionParser(
    "\n"
    "usage: %prog [options] chars.db clusters.db\n"
    "Perform kmeans clustering of characters in a database. This is fairly slow, loads\n"
    "all characters into memory, and can't be applied to big databases. It is usually\n"
    "applied after epsilon clustering if a further reduction in size is desired.\n"
)

# One entry per option -- (short flag, long flag, add_option keywords) -- in
# the order they are listed by --help.
# NOTE: a "-M"/"--minchange" option (minimum number of changes as a fraction,
# float, default 0.005) is disabled in this script and is not registered.
for _short, _long, _settings in (
    ("-D", "--display", dict(help="display chars", action="store_true")),
    ("-v", "--verbose", dict(help="verbose output", action="store_true")),
    ("-t", "--table", dict(help="table name", default="chars")),
    ("-k", "--k", dict(help="k", type=int, default=300)),
    ("-m", "--minvecs", dict(help="minimum number of vectors in a cluster",
                             type=int, default=3)),
    ("-n", "--niter", dict(help="max number of iterations",
                           type=int, default=100)),
    ("-O", "--outlier", dict(help="outlier range", type=float, default=3.0)),
):
    parser.add_option(_short, _long, **_settings)
from scipy import mgrid,linalg,ndimage
import sys,os,random,math
import numpy,pylab,scipy
from numpy import *
# Module-level flag kept from the original; no visible code in this script
# reads it (the -v/--verbose option is stored separately in options.verbose).
verbose = 1

# Parse the command line.  Exactly two positional arguments are required:
# the source character database and the destination cluster database.
(options, args) = parser.parse_args()
if len(args) != 2:
    parser.print_help()
    # A wrong argument count is a usage error, so report failure to the
    # caller.  Status 2 is what optparse itself uses for usage errors.
    sys.exit(2)
input, output = args

# Turn on pylab's interactive mode and call show() once before any work.
ion()
show()
# Open the source table and route its "image" column through the SmallImage
# converter.
table = dbtables.Table(input, options.table)
table.converter("image", dbtables.SmallImage())
# NOTE(review): create() is also issued on the *input* table; presumably it
# is a no-op when the table already exists -- confirm in dbtables.
table.create(image="blob", cls="text", classes="text")

extractor = ocroold.ScaledFE()

# Collect one feature vector per usable character image.  Everything is held
# in memory at once.
data = []
print("loading")
for row in table.get():
    raw = row.image
    # Images with a side longer than 255 pixels are skipped.
    if raw.shape[0] > 255 or raw.shape[1] > 255:
        continue
    peak = amax(raw)
    # An all-zero image cannot be normalised: dividing by its maximum would
    # produce NaN features that then contaminate the clustering input.
    if peak == 0:
        continue
    # Scale so that the brightest pixel becomes 1.0 before feature extraction.
    c = raw / float(peak)
    v = extractor.extract(c)
    data.append(v)

# Stop with a readable message instead of failing later inside k-means.
if not data:
    sys.exit("no usable character images in table '%s' of %s"
             % (options.table, input))
# Clustering stage: pack the collected feature vectors into one array of
# typecode 'f' and pass it to quant.kmeans.
print("clustering")
data = numpy.array(data, dtype='f')
print(" ".join(("data", str(data.shape))))

# Only k and the iteration limit are forwarded.  The minchange, outlier and
# minvecs arguments are disabled in this script, so the -m and -O command-line
# options do not influence this call.
_kmeans_settings = dict(k=options.k, maxiter=options.niter)
means, counts = quant.kmeans(data, **_kmeans_settings)
# Output stage: store every cluster mean as an 8-bit image in the cluster
# table, together with the count reported for that cluster.
print("writing")
table = dbtables.ClusterTable(output)
table.create(image="blob", cls="text", count="integer", classes="text")
table.converter("image", dbtables.SmallImage())
for i, v in enumerate(means):
    peak = amax(v)
    if peak == 0:
        # Dividing by a zero maximum would yield NaN/inf, whose cast to
        # unsigned bytes is undefined; write a blank image instead.
        image = zeros(v.shape, 'B')
    else:
        # Rescale so the largest component maps to 255.
        image = array(v / peak * 255.0, 'B')
    # The 30x30 shape is hard-coded; the assignment raises if a mean does not
    # have exactly 900 elements.  Presumably this matches the ScaledFE output
    # size -- confirm against ocroold.
    image.shape = (30, 30)
    # Cluster rows carry the placeholder class "_" and an empty classes field.
    table.set(image=image, cls="_", count=counts[i], classes="")