-
Notifications
You must be signed in to change notification settings - Fork 2
/
pdf_gen_by_size.py
189 lines (174 loc) · 8.01 KB
/
pdf_gen_by_size.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import os, sys
import datetime
import logging
from optparse import OptionParser
import pdf_gen
#from pdf_gen import OAC_EADtoPDFGenerator as PDFGen
#from pdf_gen import Bunch
# Upper bound (bytes) of the first size bucket; main() doubles it each pass.
FILESIZE = 2**13  # 8 KiB
# main() stops once the current bucket bound reaches this value.
FILESIZE_MAX = 2**27  # 128 MiB
# Number of rotated backups kept for each log file.
LOG_BACKUP_COUNT = 10
LOG_LEVEL = logging.INFO
# Threshold for the ".timeouts" log file; numerically above logging.CRITICAL (50).
TIMEOUT_LOG_LEVEL = 90
# Per-file timeout (seconds) used for the first pass unless --timeout is given.
TIMEOUT_SECS_START = 2**5  # 32

# Shared run state and defaults, built on the project's Bunch container.
info = pdf_gen.Bunch(numtimeouts=0,
                     numerrs=0,
                     numfileattempt=0,
                     numfilecomplete=0,
                     timer=datetime.timedelta(0),
                     starttime=datetime.datetime.now(),
                     nohtml=False,
                     savehtml=False,
                     debug=False,
                     timeout_start=TIMEOUT_SECS_START,
                     css=pdf_gen.CSSFILE,
                     path_to_exe=pdf_gen.PATH_TO_PROGS,
                     xslt=pdf_gen.XSLTFILE,
                     )

# The value needs a %s placeholder to be rendered; without one the argument is
# never formatted into the message and logging reports a formatting error if
# the record is emitted.
# NOTE(review): this runs at import time, before setup_logger(); with the
# default root level (WARNING) the record is discarded - confirm that is
# acceptable or move the call into main().
logging.info("PATH_TO_PROGS: %s", os.environ.get("PATH_TO_PROGS"))

# Text shown by optparse in the --help output.
description = '''
PDF Run by size: Runs the OAC EAD xml files through our PDF Generaotor. Runs
them by increasing size.
'''
def process_args(args_os):
    """Parse the command line with optparse and return the options object.

    Args:
        args_os: list of command-line arguments handed to
            ``OptionParser.parse_args``.

    Returns:
        The optparse options object with attributes ``dir``, ``logprefix``,
        ``outdir``, ``force``, ``timeout``, ``debug``, ``savehtml`` and
        ``nohtml``.

    Exits through ``OptionParser.error`` when ``--dir`` is missing.

    When ``--outdir`` is given as a bare keyword (no ``=``), it is expanded to
    ``<keyword>=<parent of DIR>/pdf``. Values that already carry their own
    ``=`` target (``parallel=/abs/dir``, ``subdir=name``, ``outdir=/abs/dir``)
    are passed through unchanged.
    """
    usage = "%prog --dir DIR [options]"
    arg_parser = OptionParser(usage=usage, description=description)
    arg_parser.add_option("-d", "--dir", dest="dir",
                          help="Recurse from directory <DIR> and create pdfs for all xml files in subtree.")
    arg_parser.add_option("-p", "--logprefix", dest="logprefix",
                          help="Use <LOGPREFIX> for log file names.")
    arg_parser.add_option("-o", "--outdir", dest="outdir",
                          help=""" parallel=/abs/dir - produce output in parallele directory structure, with root of /abs/dir
subdir=name - produce output in subdir "name"
outdir=/abs/dir - put all pdf files in /abs/dir
"""
                          )
    arg_parser.add_option("--force", dest="force", action="store_true",
                          help="Force regeneration of existing pdf files")
    arg_parser.add_option("-t", "--timeout", dest="timeout",
                          help="Set <TIMEOUT> in seconds to start run at. Default=%s" %
                          info.timeout_start)
    arg_parser.add_option("--debug", action="store_true", dest="debug",
                          default=False, help="Run in debug mode")
    arg_parser.add_option("--savehtml", action="store_true", dest="savehtml",
                          default=False, help="Save intermediate html files")
    arg_parser.add_option("--nohtml", action="store_true", dest="nohtml",
                          default=False,
                          help="Do not generate html files, use existing files.")
    (options, args) = arg_parser.parse_args(args_os)
    if not options.dir:
        arg_parser.error("-d (--dir) must be set.")
    # Only a bare keyword needs a default target. The previous test
    # (absence of 'parallel=') also rewrote the documented 'subdir=name' and
    # 'outdir=/abs/dir' forms, lower-casing them and appending a second
    # '=<path>' to a value that was already complete.
    if options.outdir and '=' not in options.outdir:
        # Default target is a 'pdf' directory beside the input directory.
        root_dir, drop = os.path.split(options.dir)
        if not drop:
            # DIR ended with a path separator, so the first split only
            # stripped it; split again to reach the parent directory.
            root_dir, drop = os.path.split(root_dir)
        options.outdir = options.outdir.lower() + '=' + os.path.join(root_dir, 'pdf')
    return options
def setup_logger(logprefix='pdf_by_size'):
    """Attach three rotating file handlers to the root logger and return it.

    The files are ``<logprefix>.log`` (threshold ``LOG_LEVEL``),
    ``<logprefix>.err`` (threshold ``logging.ERROR``) and
    ``<logprefix>.timeouts`` (threshold ``TIMEOUT_LOG_LEVEL``). Each handler
    is rolled over as soon as it is created, so every run starts with fresh
    files and up to ``LOG_BACKUP_COUNT`` earlier files are kept as numbered
    backups. No ``maxBytes`` is passed, so this start-up rollover is the only
    rotation performed here.

    Args:
        logprefix: path prefix shared by the three log files.

    Returns:
        The root logger, with its level set to ``LOG_LEVEL``.
    """
    # ``import logging`` alone does not load the ``logging.handlers``
    # submodule; import the handler class explicitly so this function does
    # not depend on some other module having imported it first.
    from logging.handlers import RotatingFileHandler

    log = logging.getLogger()
    log.setLevel(LOG_LEVEL)
    formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s")
    # (file suffix, handler threshold), attached in this order.
    targets = (
        ('.log', LOG_LEVEL),
        ('.err', logging.ERROR),
        ('.timeouts', TIMEOUT_LOG_LEVEL),
    )
    for suffix, level in targets:
        handler = RotatingFileHandler(logprefix + suffix,
                                      backupCount=LOG_BACKUP_COUNT)
        handler.setLevel(level)
        handler.setFormatter(formatter)
        # Push any file left by a previous run into the backup chain.
        handler.doRollover()
        log.addHandler(handler)
    return log
def main(args):
    """Run the PDF generator over all eligible files, smallest sizes first.

    Files under ``options.dir`` that ``generator.canGeneratePDF`` accepts are
    grouped into size buckets: the upper bound starts at ``FILESIZE`` and
    doubles after every pass until it reaches ``FILESIZE_MAX``. Files that
    timed out in one pass are appended to the next pass, and the per-file
    timeout doubles whenever a pass produced at least one timeout.

    Args:
        args: command-line argument list, forwarded to ``process_args``.
    """
    options = process_args(args)
    if options.logprefix is None:
        logprefix = 'pdf_gen_by_size'
    else:
        logprefix = options.logprefix
    LOGGER = setup_logger(logprefix=logprefix)
    LOGGER.info("Process id: %s" % os.getpid())

    xslt_path = os.path.join(info.path_to_exe, info.xslt)
    generator = pdf_gen.OAC_EADtoPDFGenerator(xslt_path)

    errorlist = []
    size_floor = 0
    size_ceiling = FILESIZE
    # An explicit --timeout overrides the module default.
    timeout_cur = int(options.timeout) if options.timeout else 0 + info.timeout_start

    LOGGER.info("Force:%s" % options.force)
    LOGGER.info("savehtml:%s" % options.savehtml)
    LOGGER.info("nohtml: %s" % options.nohtml)
    LOGGER.info("outdir:%s" % options.outdir)

    retry_paths = []
    # NOTE(review): with the module constants the last bucket bound is 2**26,
    # and a size must be > 0 to match the first bucket, so empty files and
    # files above 2**26 bytes are never selected - confirm this is intended.
    while size_ceiling < FILESIZE_MAX:
        # Collect every eligible file whose size falls in the current bucket.
        batch = []
        for root, _dirs, names in os.walk(options.dir):
            for name in names:
                if not generator.canGeneratePDF(name):
                    continue
                candidate = os.path.join(root, name)
                if size_floor < os.stat(candidate).st_size <= size_ceiling:
                    batch.append(candidate)
        # Retry whatever timed out during the previous pass.
        batch.extend(retry_paths)

        bucket_stats = (size_floor, size_ceiling, len(batch),
                        len(retry_paths), timeout_cur)
        LOGGER.info("For %d < size <= %d, %d files, %d are previous timeouts to process, TimeOut in seconds:%d" % bucket_stats)

        complete, timeouts, errs, _skips = generator.pdf_gen_list(
            batch,
            timeoutSecs=timeout_cur,
            outdir_option=options.outdir,
            data_root=options.dir,
            force=options.force,
            nohtml=options.nohtml,
            savehtml=options.savehtml,
            debug=options.debug,
        )
        errorlist.extend(errs)

        elapsed = datetime.datetime.now() - info.starttime
        LOGGER.info("RUNNING TIME: %s" % elapsed)
        pass_stats = (len(batch), len(complete), len(timeouts),
                      len(errs), timeout_cur)
        LOGGER.info("Attempts:%d Complete:%d Timeouts:%d Errs:%d TimeOut seconds:%d" % pass_stats)

        # Carry this pass's timeouts forward and give them more time.
        retry_paths = timeouts
        if len(timeouts) > 0:
            timeout_cur = timeout_cur * 2
        # Advance to the next, twice as large, size bucket.
        size_floor = size_ceiling
        size_ceiling = size_ceiling * 2

    LOGGER.info("\n++++++++++++++++++++++++++++++++++++++\n")
    total_elapsed = datetime.datetime.now() - info.starttime
    LOGGER.info("TOTAL RUNNING TIME: %s" % total_elapsed)
    final_stats = (generator.numfileattempt, generator.numfilecomplete,
                   generator.numtimeouts, generator.numerrs)
    LOGGER.info("FINAL Attempts:%d Complete:%d Timeouts:%d Errs:%d" % final_stats)
    # Timeouts reported here are those of the final pass only.
    LOGGER.info("TIMEOUT LIST:")
    LOGGER.info(timeouts)
    LOGGER.info("ERROR LIST:")
    LOGGER.info(errorlist)
# Script entry point: hand the complete argument vector (program name
# included) to main().
if __name__ == "__main__":
    main(sys.argv)