-
Notifications
You must be signed in to change notification settings - Fork 588
/
ocropus-dbinfo
executable file
·80 lines (72 loc) · 2.54 KB
/
ocropus-dbinfo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/python
import code,pickle,sys,os,re
import matplotlib
if "DISPLAY" not in os.environ: matplotlib.use("AGG")
else: matplotlib.use("GTK")
from ocrolib import dbtables
from pylab import *
from optparse import OptionParser
import traceback
import sqlite3
parser = OptionParser("""
usage: %prog [options] database.db ...
Show information about the given databases.
""")
parser.add_option("-f","--full",help="full class counts",action="store_true")

# Category names, in the order they are listed by the --full report.
CATEGORIES = ("empty", "space", "control", "reject", "unlabeled",
              "upper", "lower", "digit", "nonascii", "multi")


def quote_identifier(name):
    """Return `name` as a double-quoted SQL identifier.

    Embedded double quotes are doubled, so table names containing spaces,
    quotes or keywords can be interpolated into a statement safely.
    """
    return '"%s"' % name.replace('"', '""')


def classify_label(cls):
    """Return the category name for one class label.

    The checks run in a fixed order: None or the empty string is "empty",
    anything longer than one character is "multi", then the single
    characters " ", "_" and "~" are "space", "unlabeled" and "reject".
    Remaining single characters are sorted by code point into "control",
    "lower", "upper", "digit" or "nonascii".  Any other character (for
    example ASCII punctuation) belongs to no category and yields None.
    """
    if cls is None or len(cls) < 1:
        return "empty"
    if len(cls) > 1:
        return "multi"
    if cls == " ":
        return "space"
    if cls == "_":
        return "unlabeled"
    if cls == "~":
        return "reject"
    if ord(cls) < 32:
        return "control"
    if "a" <= cls <= "z":
        return "lower"
    if "A" <= cls <= "Z":
        return "upper"
    if "0" <= cls <= "9":
        return "digit"
    if ord(cls) >= 128:
        return "nonascii"
    return None


def tally_labels(counts):
    """Sum (label, n) pairs into a dict of per-category totals.

    Every name in CATEGORIES is present in the result, starting at zero.
    Labels that classify_label() maps to None are not counted anywhere.
    """
    totals = dict((name, 0) for name in CATEGORIES)
    for cls, n in counts:
        category = classify_label(cls)
        if category is not None:
            totals[category] += n
    return totals


def table_report(cur, dbfile, table, full=False, fw=0):
    """Return the report for one table as a list of output lines.

    cur    -- sqlite3 cursor on the database being inspected
    dbfile -- database file name, used only for labelling the output
    table  -- name of the table to summarize; it must have a `cls` column,
              otherwise sqlite3 raises OperationalError
    full   -- if true, return the table schema, one line per class label
              and one line per category total; otherwise return a single
              summary line
    fw     -- minimum field width for `dbfile` in the summary line
    """
    cur.execute("select cls,count(*) from %s group by cls order by cls"
                % quote_identifier(table))
    # Fetch the rows now: running another statement on this cursor would
    # discard whatever has not been read yet.
    counts = cur.fetchall()
    totals = tally_labels(counts)

    if not full:
        prefix = "%*s: " % (fw, dbfile)
        stats = "rej %7d uc %7d lc %7d dig %7d na %7d _ %7d lig %7d" % (
            totals["reject"], totals["upper"], totals["lower"],
            totals["digit"], totals["nonascii"], totals["unlabeled"],
            totals["multi"])
        # These used to be two Python 2 print statements, the first ending
        # in a comma, which put a separator space between the two parts.
        return [prefix + " " + stats]

    cur.execute("select sql from sqlite_master where type='table' and name=?",
                (table,))
    schema = cur.fetchone()[0]
    lines = ["", "=== %s : %s ===" % (dbfile, table), "", "%s" % (schema,), ""]
    lines.extend("%6s\t%6d" % ("[%s]" % (cls,), n) for cls, n in counts)
    lines.extend("%s %d" % (name, totals[name]) for name in CATEGORIES)
    return lines


def main(argv=None):
    """Print a report for every table of every database named in `argv`.

    argv -- argument list without the program name; None means sys.argv[1:]

    Prints the usage text and exits with status 0 when no database is given.
    """
    (options, args) = parser.parse_args(argv)
    if len(args) < 1:
        parser.print_help()
        sys.exit(0)

    # A list (not a generator) wrapped in int() keeps this working even if
    # the file's wildcard pylab import shadows max() with an array reduction
    # -- NOTE(review): defensive only, confirm whether that can happen.
    fw = int(max([len(s) for s in args]))

    for dbfile in args:
        db = sqlite3.connect(dbfile)
        try:
            cur = db.cursor()
            cur.execute("select name from sqlite_master where type='table'")
            tables = [row[0] for row in cur.fetchall()]
            for table in tables:
                for line in table_report(cur, dbfile, table, options.full, fw):
                    # One string per call prints identically on Python 2 and 3.
                    print(line)
        finally:
            db.close()


if __name__ == "__main__":
    main()