-
Notifications
You must be signed in to change notification settings - Fork 0
/
update_dates_in_place.py
executable file
·315 lines (254 loc) · 11.3 KB
/
update_dates_in_place.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
#!/usr/bin/env python3
from appscript import *
from datetime import datetime
from osax import *
from plistlib import load, loads, dump
from subprocess import call, check_output, CalledProcessError
import aem
import os
import re
# Property list in the user's Preferences folder; read_sources() and
# write_sources() keep the list of known sources here.
PREFERENCES_PATH = os.path.expanduser(
    '~/Library/Preferences/net.sabi.UpdateDates.plist')
# (strptime format, regex) pairs.  date_re() turns the regexes into the
# numbered alternatives of one pattern, so at any position the first entry
# whose regex matches wins and later entries are only tried if it fails.
# Order therefore matters: an entry is unreachable if an earlier entry's
# regex matches everything it does.  Spaces are stripped from the matched
# text before strptime is applied, which is why the formats contain none.
DATE_FORMATS = (
    ('%m-%d-%y', r'\d{1,2}-\d{1,2}-\d{2}'),                # Busey new
    ('%m/%d/%y', r'\d{1,2}/\d{1,2}/\d{1,2}'),              # T-Mobile
    ('%m.%d.%y', r'\d{1,2}\.\d{1,2}\.\d{1,2}'),            # iFixit
    ('%b%d,%Y', r'[A-Z][a-z][a-z] ?\d{1,2}, ?\d{4}'),      # AmerenIP
    ('%B%d,%Y', r'[A-Z][a-z]+ *\d{1,2}, *\d{4}'),          # Amazon
    # The abbreviated-month variant must precede the full-month one and be
    # limited to three letters; with identical regexes the '%B' entry always
    # matched first and strptime then rejected abbreviations such as "Jan".
    ('%b%d.%Y', r'[A-Z][a-z][a-z] *\d{1,2}\. *\d{4}'),     # Bloomie's
    ('%B%d.%Y', r'[A-Z][a-z]+ *\d{1,2}\. *\d{4}'),         # Amazon
    ('of%Y%m%d', r'of \d{8}'),                             # Amazon
    ('%m/%d/%Y', r'\d{1,2}/\d{1,2}/\d{4}'),                # Busey
    ('%b%d%Y', r'[A-Z]{3} \d{1,2} \d{4}'),                 # State Farm
    ('%b%d,%Y', r'[A-Z]{3} \d{1,2}, \d{4}'),               # State Farm
    ('%d%b%Y', r'\d{1,2} ?[A-Z][A-Za-z]{2} ?\d{4}'),       # Apple
    ('%Y-%m-%d', r'\d{4}-\d{2}-\d{2}'),                    # MacSpeech
    ('%d%B%Y', r'\d{1,2} *[A-Z][a-z]+ *\d{4}'),            # Vagabond Inn
    ('%Y-%m', r'\d{4}-\d{2}'),                             # title
    # bad OCR formats - keep at bottom
    ('%m1%d/%y', r'\d{1,2}1\d{1,2}/\d{1,2}'),              # T-Mo bad
    ('%m/%d1%y', r'\d{1,2}/\d{1,2}1\d{1,2}'),              # T-Mo bad
    ('%m/%d/%y', r'\d{1,2}/ \d{1,2}/ \d{1,2}'),            # T-Mo bad
    ('%m1%d/%Y', r'\d{2}1\d{2}/\d{4}'),                    # Temple bad
    ('%m/%d1%Y', r'\d{1,2}/\d{1,2}1\d{4}'),                # TotalVac bad
    ('%m/%d/%Y',
     r'(?:\d ?){1,2}/ (?:\d ?){1,2}/ (?:\d ?){4}'),        # Busey bad
)
# Titles may additionally carry a bare four-digit year.
TITLE_DATE_FORMATS = DATE_FORMATS + (('%Y', r'20\d{2}'),)  # title only
def date_re(formats):
    """Compile one pattern matching any of the date regexes in *formats*.

    *formats* is a sequence of (strptime format, regex) pairs.  Each regex
    becomes its own capturing group, wrapped in word boundaries, and the
    groups are joined as alternatives in the order given.  Provided the
    individual regexes define no capturing groups themselves, a match's
    ``lastindex`` is the 1-based position of the pair that matched.
    """
    alternatives = []
    for _strptime_format, pattern in formats:
        alternatives.append(r'(\b%s\b)' % pattern)
    return re.compile('|'.join(alternatives))
def date_extractor(formats):
    """Return a function that extracts a date from text using *formats*.

    The returned callable has the signature ``(text, match=None)`` and
    returns whatever extract_date() returns for that text: a
    ``(date or None, rejected candidates)`` pair.
    """
    # Build the combined pattern once, when the extractor is created, rather
    # than on every call of the extractor.
    compiled = date_re(formats)

    def extract(text, match=None):
        return extract_date(text, match, compiled, formats)

    return extract
# Extractor applied to a document's text: recognizes the DATE_FORMATS patterns.
extract_date_from_contents = date_extractor(DATE_FORMATS)
# Extractor applied to a record's title: also accepts the title-only formats.
extract_date_from_title = date_extractor(TITLE_DATE_FORMATS)
def extract_date(text, match, re_date, formats):
    """Find the first acceptable date in *text*.

    text: string to scan.
    match: optional date-like object; when given, only candidates falling in
        the same year and month are accepted.
    re_date: compiled pattern with one capturing group per entry of
        *formats*, in the same order.
    formats: sequence of (strptime format, regex) pairs.

    Returns ``(found, rejected)``.  *found* is a ``datetime.date`` whose year
    lies strictly between 1990 and 2100, or None if no candidate qualified.
    *rejected* lists the candidates passed over before the result:
    ``(text, format, error)`` tuples for those strptime refused, and the bare
    matched string for those that parsed but were not accepted.
    """
    rejected = []
    for candidate in re_date.finditer(text):
        group_index = candidate.lastindex
        strptime_format = formats[group_index - 1][0]
        raw = candidate.group(group_index)
        compact = raw.replace(' ', '')  # the formats are written without spaces
        try:
            parsed = datetime.strptime(compact, strptime_format)
        except ValueError as error:  # looked like a date but is not one
            rejected.append((compact, strptime_format, error))
            continue
        wanted_month = (not match or
                        (match.year, match.month) == (parsed.year, parsed.month))
        if wanted_month and 1990 < parsed.year < 2100:
            return parsed.date(), rejected
        rejected.append(raw)
    return None, rejected
# Compiled title-date pattern; extract_source_from_title() uses it to find
# where the date starts within a title.
RE_TITLE_DATE = date_re(TITLE_DATE_FORMATS)
def extract_source_from_title(title, title_date):
    """Return the source part of *title*: the text preceding its date.

    title: the record title.
    title_date: the date previously extracted from the title, or a false
        value when none was found.

    When there is no date, or no date-like text can be located in the title,
    the whole title is returned unchanged.  Otherwise the title is cut at the
    first date-like match and trailing whitespace is removed.
    """
    if not title_date:
        return title
    found = RE_TITLE_DATE.search(title)
    if found is None:
        # The date did not come from this title text, so there is nothing to
        # cut off; avoid calling .start() on None.
        return title
    return title[:found.start(0)].rstrip()
# appscript application object for EagleFiler, addressed by bundle id.
EagleFiler = app(id='com.c-command.EagleFiler')
# Reference to the 'Paper.eflibrary' library document used throughout.
Paper = EagleFiler.library_documents['Paper.eflibrary']
def read_sources():
    """Load the saved source names from the preferences property list.

    Returns the plist's 'Sources' entry as a list of str, or an empty list
    when the key is absent.  Errors from opening or parsing the file
    propagate to the caller.
    """
    # Close the file promptly instead of leaving the handle to the garbage
    # collector.
    with open(PREFERENCES_PATH, 'rb') as plist_file:
        preferences = load(plist_file)
    return list(map(str, preferences.get('Sources', [])))
def write_sources():
    """Save the module-level ``sources`` list to the preferences plist.

    The file is rewritten from scratch with a single 'Sources' key.
    """
    # The with-block guarantees the data is flushed and the file closed.
    with open(PREFERENCES_PATH, 'wb') as plist_file:
        dump({'Sources': sources}, plist_file)
def add_source(source, contents):
    """Record *source* as most recently used if *contents* mentions it.

    source: candidate source name (converted with str()).
    contents: document text, searched case-insensitively for the name.

    When the name is non-empty and occurs in the text, it is moved (or
    added) to the front of the module-level ``sources`` list and the function
    returns True if it was not in the list before, False if it was.
    Otherwise the list is left alone and None is returned.  Progress is
    printed along the way.
    """
    print(f'Considering adding source "{source}".')
    source = str(source)
    mentioned = bool(source) and re.search(
        re.escape(source), contents, re.IGNORECASE) is not None
    if not mentioned:
        print('- Source not found in document; not added.')
        return None
    print('- Found source in document, adding.')
    already_known = source in sources
    if already_known:
        print('- Source is not new; moving to top of list.')
        sources.remove(source)
    else:
        print('- Source is new.')
    sources.insert(0, source)  # most recently referenced ones at top
    return not already_known
def has_encoding_application(path, encoding_application):
    """Report whether *path*'s metadata lists *encoding_application*.

    Runs ``/usr/bin/mdls -plist - path``, parses the output as a property
    list and looks for *encoding_application* in its
    'kMDItemEncodingApplications' entry.  Returns False when mdls exits with
    an error, when the parsed output is not a dictionary, or when the entry
    is missing.
    """
    command = ['/usr/bin/mdls', '-plist', '-', path]
    try:
        metadata = loads(check_output(command))
    except CalledProcessError:
        return False
    if isinstance(metadata, dict):
        applications = metadata.get('kMDItemEncodingApplications', [])
        return encoding_application in applications
    return False
def update_all():
    """Re-date every PDF record in the Paper library, interactively.

    For each PDF record not tagged 'impossible', a date is extracted from the
    title and then from the text content (restricted to the title's year and
    month when the title has a date).  The record's creation date is set to
    the content date, falling back to the title date.

    When the title has a date but the content yields none, the record is
    tagged 'no_format' or 'no_regex', its text is copied into its note, it is
    selected in the browser window, and one line is read from stdin:

      i  tag the record 'impossible' and clear the note
      d  repeatedly prompt for a strptime format and a regex and print what
         they extract from this document, until a blank entry
      q  stop immediately (no summary is printed, sources are not saved)

    Any other input simply continues.  At the end a summary is printed and
    the sources list is written back to the preferences file.
    """
    record_count = 0
    no_regex_count = 0
    no_format_count = 0
    new_sources = []
    record_ids = Paper.library_records.id()
    record_utis = Paper.library_records.universal_type_identifier()
    for record_id, record_uti in zip(record_ids, record_utis):
        if record_uti != 'com.adobe.pdf':
            continue
        record = Paper.library_records.ID(record_id)
        tags = record.assigned_tag_names()
        if 'impossible' in tags:
            continue  # OCR inadequate/data missing from document
        record_count += 1
        title = record.title()
        title_date, no_format = extract_date_from_title(title)
        source = extract_source_from_title(title, title_date)
        contents = record.text_content()
        if add_source(source, contents):
            new_sources.append(source)
        contents_date, no_format = extract_date_from_contents(contents,
                                                              title_date)
        if not contents_date:
            print('%s (extracted: %s)' % (title, title_date))
            for nf in no_format:
                print(' ', nf)
            if not title_date:
                continue  # nothing to fall back on; leave the record alone
            if no_format:
                no_format_count += 1
                tags.append('no_format')
            else:
                no_regex_count += 1
                tags.append('no_regex')
            # Show the problem record to the user before asking what to do.
            record.note_text.set(contents)
            Paper_window.selected_records.set([record])
            EagleFiler.activate()
            record.assigned_tag_names.set(tags)
            disposition = input()
            if disposition == 'i':
                tags.append('impossible')
                record.note_text.set('')
                record.assigned_tag_names.set(tags)
            elif disposition == 'd':
                # Try candidate (strptime format, regex) pairs against this
                # document until a blank line is entered.
                while True:
                    date_format = input('date format: ')
                    if not date_format:
                        break
                    regex = input('regex: ')
                    if not regex:
                        break
                    date_formats = ((date_format.replace(' ', ''), regex),)
                    # date_re builds the compiled pattern extract_date needs;
                    # the name used here before was never defined.
                    print(extract_date(contents, title_date,
                                       date_re(date_formats), date_formats))
            elif disposition == 'q':
                return
        record.creation_date.set(contents_date or title_date)
    print()
    print('-' * 50)
    print(' Total records:', record_count)
    print(' No regex match:', no_regex_count)
    print(' No format match:', no_format_count)
    print('Successful match:', record_count - no_regex_count - no_format_count)
    print('%d new sources:' % len(new_sources))
    for source in sorted(new_sources):
        print('\t%s' % source)
    write_sources()
def title_date_record(record):
    """Ask the user to title *record*, then apply the title and its date.

    The record is selected in the browser window.  If its current title
    carries no date, a title is proposed instead: the first saved source
    found in the text content (or '???' if none is found), followed by
    ' YYYY-MM' when the content yields a date.  The proposal is shown in a
    dialog for editing; cancelling the dialog leaves the record untouched.

    After the dialog, a date in the entered title replaces the content date
    when there was no content date or when the two differ in year or month.
    The resulting date, if any, becomes the record's creation date; the
    title's source is recorded via add_source() (saving the sources list if
    it is new); and the record's title and filename are both set to the
    entered title.
    """
    Paper_window.selected_records.set([record])
    title = record.title()
    contents = record.text_content()
    date, no_format = extract_date_from_contents(contents)
    title_date, no_format = extract_date_from_title(title)
    if not title_date:
        # With no saved sources the alternation would be empty, match the
        # empty string, and the lookup below would raise ValueError; so only
        # search when there is something to look for.
        m = None
        if sources:
            m = re.search('(%s)' % '|'.join(map(re.escape, sources)),
                          contents, re.IGNORECASE)
        if m:
            # use the saved source's case
            lowered = list(map(str.lower, sources))
            title = sources[lowered.index(m.group(1).lower())]
        else:
            title = '???'
        if date:
            title += date.strftime(' %Y-%m')
    SA = OSAX(id='com.apple.systemevents')
    try:
        result = SA.display_dialog('Title this document:',
                                   buttons=['Cancel', 'Title'],
                                   cancel_button=1, default_button=2,
                                   default_answer=title)
    except CommandError:
        return
    if not result:
        return
    title = result[k.text_returned]
    title_date, no_format = extract_date_from_title(title)
    if title_date and (not date or (title_date.year, title_date.month) !=
                       (date.year, date.month)):
        date = title_date
    if date:
        record.creation_date.set(date)
    if add_source(extract_source_from_title(title, title_date),
                  record.text_content()):
        write_sources()
    record.title.set(title)
    record.filename.set(title)
def optimize_record(record):
    """Run Acrobat's 'Optimize Scanned PDF' on *record*'s file and save it.

    Nothing is done when the file's creator type is 'CARO' or when
    has_encoding_application() does not report 'IJ Scan Utility' for it.
    Otherwise the file is opened in Acrobat, the menu command is clicked
    through System Events, its dialog is confirmed with OK, and the document
    is saved back over the original file and closed.
    """
    acrobat = app(id='com.adobe.Acrobat.Pro')
    system_events = app(id='com.apple.systemevents')
    acrobat_process = system_events.application_processes[u'Acrobat']
    pdf_file = record.file()
    document_name = os.path.basename(pdf_file.path)
    if system_events.files[pdf_file.hfspath].creator_type() == 'CARO':
        return  # already written by Acrobat
    scanned = has_encoding_application(pdf_file.path, 'IJ Scan Utility')
    if not scanned:
        return  # not a scanned document
    acrobat.activate()
    acrobat.open(record.file())
    document_menu = \
        acrobat_process.menu_bars[1].menu_bar_items['Document'].menus[1]
    document_menu.menu_items['Optimize Scanned PDF'].click()
    optimize_dialog = acrobat_process.windows['Optimize Scanned PDF']
    optimize_dialog.buttons['OK'].click()
    document = acrobat.documents[document_name]
    document.save(to=pdf_file)
    document.close()
def update_selected():
    """Title and date each selected record, then optimize the selected PDFs.

    Works on the records currently selected in ``Paper_window``.
    Optimization runs as a separate pass, after every record has been
    through title_date_record().
    """
    chosen = Paper_window.selected_records()
    for record in chosen:
        title_date_record(record)
    for record in chosen:
        if record.universal_type_identifier() == 'com.adobe.pdf':
            optimize_record(record)
if __name__ == '__main__':
    # Make sure the Paper library is open in EagleFiler.
    if not Paper.exists():
        EagleFiler.open(os.path.expanduser('~/Documents/Paper/Paper.eflibrary'))
    # XXX filtering doesn't work, even in AppleScript
    # Paper_window = EagleFiler.browser_windows[aem.its.property(aem.app.elements('docu')).eq(Paper)]()
    # appscript gets confused between the property and class 'document'
    window_documents = EagleFiler.AS_newreference(
        aem.app.elements('BroW').property('docu'))()
    # Find the browser window showing the Paper library.  Starting from None
    # lets a missing window be reported clearly instead of surfacing later as
    # a NameError.
    Paper_window = None
    for window_index, window_document in enumerate(window_documents):
        if window_document == Paper:
            # we can't store a persistent reference, because the class returned
            # is '\0\0\0\0'
            # Paper_window = EagleFiler.browser_windows[window_index + 1].get()
            Paper_window = EagleFiler.browser_windows.ID(
                EagleFiler.browser_windows[window_index + 1].id.get())
    if Paper_window is None:
        raise SystemExit(
            'No EagleFiler browser window is showing Paper.eflibrary.')
    # Load the saved sources; start with an empty list on first run.
    if os.path.exists(PREFERENCES_PATH):
        try:
            sources = read_sources()
        except Exception:
            # Reading failed: convert the plist to XML and retry once.  A
            # second failure propagates.  Catching Exception rather than
            # everything lets Ctrl-C and SystemExit through.
            call(['/usr/bin/plutil', '-convert', 'xml1', PREFERENCES_PATH])
            sources = read_sources()
    else:
        sources = []
    # update_all()
    update_selected()
    EagleFiler.activate()
# XXX incremental source recording from EagleFiler (use tag to record)