This repository has been archived by the owner on Feb 5, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
checks.sh
471 lines (419 loc) · 13.2 KB
/
checks.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
#######################################################################
#
# Bash:
# perform language checks for LaTeX files
#
# Actions and usage:
# - see README.md
#
#
# Matthias Baumann, 2018-2019
#
# check these, if no argument files given
#
all_tex_files="vorwort.tex */*.tex"
# LanguageTool for check of native-language text
#
LTprefix=../LT/LanguageTool-4.7
LTcommand="$LTprefix/languagetool-commandline.jar \
--language de-DE --encoding utf-8 \
--disable \
WHITESPACE_RULE,\
KOMMA_VOR_UND_ODER"
# regular expression for filtering of LT's output
# LT produces: '1.) Line 25, column 13, Rule ID: ...'
#
LT_info_line='^\d+\.\) Line (\d+), column (\d+), Rule ID: '
tooldir=Tools/LT # Python scripts and private dictionaries
txtdir=../Tex2txt # directory for extraction of raw text
# (subdirectories will be created if necessary)
ext=txt # file name extension for raw text
num=lin # ... for line number information
chr=chr # ... for character position information
foreign=en # ... for foreign-language text
ori=ori # ... for original dictionary files in LT tree
# Tex2txt script
#
tex2txt_py=$tooldir/tex2txt.py
tex2txt_repl="--repl $tooldir/repls.txt"
tex2txt_defs="--defs $tooldir/defs.py"
# sed filter for hunspell (on option --no-lt):
# '0' --> 'Null'
#
repls_hunspell=' -e s/\<0\>/Null/g'
# repls_hunspell+=' -e s/\$\$/GCHQ/g'
# list of accepted acronyms during search for single letters;
# use normal space here (is replaced by nbsp)
#
check_for_single_letters=yes
acronyms='d\. h\.|f\. ü\.|i\. A\.|u\. a\.|v\. g\. V\.|z\. B\.'
equation_repls='|B-B-B|C-C-C|D-D-D|E-E-E|F-F-F|G-G-G'
equation_repls+='|U-U-U|V-V-V|W-W-W|X-X-X|Y-Y-Y|Z-Z-Z'
utf8_nbsp=$(python3 -c 'print("\N{NO-BREAK SPACE}", end="")')
acronyms=$(echo -n $acronyms | sed -E "s/ /$utf8_nbsp/g")
acronyms+=$equation_repls
# LAB:ADAPT-LT
# append private hunspell dictionary to LT's spelling.txt
# ATTENTION: LT does not like empty lines
#
lt_dic_native=$LTprefix/org/languagetool/resource/de/hunspell/spelling.txt
priv_dic_native=$tooldir/hunspell.de
# prohibit certain words
# ATTENTION: LT does not like empty lines
#
lt_prohib_native=$LTprefix/org/languagetool/resource/de/hunspell/prohibit.txt
priv_prohib_native=$tooldir/prohibit.de
# check of foreign-language text
#
foreign_macro=engl # name of macro for foreign-language text
hunsp_foreign_lang=en_GB # foreign-language code for hunspell -d ...
priv_dic_foreign=$tooldir/hunspell.en
# private hunspell dictionary
# LAB:RECURSE scan file inclusions
#
# recognized file inclusion macros
input_macros=input,include
# do not read these files at all (regular expression)
# - our files from fig2dev, placed in different sub-directories
input_do_not_read='(.*/)?figure\.tex'
# scan these files for inclusions, but don't perform language checks on them
# - our root files containing mainly fancy things, but no "real text"
input_do_not_check='(defs|main)\.tex'
##########################################################
##########################################################
trap exit SIGINT
# parse command-line
#
declare -A options
while [ "${1:0:1}" == "-" ]
do
case $1 in
--recurse|--adapt-lt|--no-lt|--delete|--columns)
options[$1]=1
;;
--)
shift
break
;;
*)
echo unknown option "'$1'" >&2
echo usage: bash $0 '[--recurse] [--adapt-lt] [--no-lt] [--delete] [--columns] [files]' >&2
exit 1
esac
shift
done
if (( $# == 0 ))
then
args=$all_tex_files
else
args=$*
fi
# only delete auxiliary directory $txtdir?
#
if [ -n "${options[--delete]}" ]
then
read -p "enter YES to remove directory '$txtdir': " repl
if [ "$repl" == YES ]
then
rm -fr $txtdir
fi
exit
fi
# adjust LT's spelling files?
#
if [ -n "${options[--adapt-lt]}" ]
then
echo Updating $lt_dic_native ... >&2
if [ -f $lt_dic_native.$ori ]
then
cp $lt_dic_native.$ori $lt_dic_native
else
# create backup copy, if not yet present
cp $lt_dic_native $lt_dic_native.$ori
fi
cat $priv_dic_native >> $lt_dic_native
echo Updating $lt_prohib_native ... >&2
if [ -f $lt_prohib_native.$ori ]
then
cp $lt_prohib_native.$ori $lt_prohib_native
else
# create backup copy, if not yet present
cp $lt_prohib_native $lt_prohib_native.$ori
fi
cat $priv_prohib_native >> $lt_prohib_native
fi
# Python3:
# add original line numbers to messages
# - argv[1]: RE search pattern;
# first () group must contain the number;
# text till second () group is removed
# - argv[2]: file with original line numbers
#
repl_lines='
import re, sys
expr = sys.argv[1]
numbers = open(sys.argv[2]).readlines()
def repl(m):
lin = int(m.group(1)) - 1
if lin >= 0 and lin < len(numbers):
n = numbers[lin].strip()
return (m.string[m.start(0):m.end(1)] + " [" + n + "]"
+ m.string[m.end(2):m.end(0)])
return m.group(0)
for lin in sys.stdin:
sys.stdout.write(re.sub(expr, repl, lin))
'
#
# Python3:
# replace line and column numbers
# - argv[1]: regular expression:
# matching group 1 is line number, group 2 is column number
# - argv[2]: original LaTeX file
# - argv[3]: derived file with plain text
# - argv[4]: file with character offset mapping
#
repl_lines_columns='
import re, sys
expr = sys.argv[1]
tex = open(sys.argv[2]).read()
plain = open(sys.argv[3]).read()
def f(s):
s = s.strip()
if s[-1] == "+":
return -int(s[:-1])
return int(s)
map = list(f(s) for s in open(sys.argv[4]))
starts = list(m.start(0) for m in re.finditer(r"\n", "\n" + plain))
def f(m):
def ret(s1, s2):
s = m.group(0)
return (s[:m.start(1)] + "[" + s1 + "]" + s[m.end(1):m.start(2)]
+ "[" + s2 + "]" + s[m.end(2):])
def unkn():
return ret("?", "?")
lin = int(m.group(1))
col = int(m.group(2))
if lin < 1 or col < 1:
return unkn()
# find start of line number lin in plain file
if lin > len(starts):
return unkn()
n = starts[lin - 1]
# add column number col
s = plain[n:]
i = s.find("\n")
if i >= 0 and col > i or i < 0 and col > len(s):
return unkn()
n += col - 1
# map to character position in tex file
if n >= len(map):
return unkn()
n = map[n]
mark = ""
if n < 0:
mark = "+"
n = -n
# get line and column in tex file
if n > len(tex):
return unkn()
s = tex[:n]
lin = s.count("\n") + 1
col = len(s) - (s.rfind("\n") + 1)
return ret(str(lin) + mark, str(col) + mark)
for s in sys.stdin:
sys.stdout.write(re.sub(expr, f, s))
'
# Python3:
# - scan given files for inclusion macros
# - search recursively
# - print list of files
#
track_input_macro='
import re, os, sys
todo = sys.argv[1:]
done = []
while todo:
f = todo.pop(0)
if re.fullmatch(r"'$input_do_not_read'", f):
continue
if f not in done:
done.append(f)
p = os.popen(
"python3 '$tex2txt_py' '"$tex2txt_defs"' --extr '$input_macros' " + f)
fs = p.read()
if p.close():
sys.stderr.write("\n'$0': error while checking file \"" + f + "\"\n")
exit(1)
for f in fs.split():
if not f.endswith(".tex"):
f += ".tex"
if f not in done + todo:
todo.append(f)
iter = (f for f in done if not re.fullmatch(r"'$input_do_not_check'", f))
print(" ".join(iter))
'
# track file inclusions?
#
if [ -n "${options[--recurse]}" ]
then
echo Scanning file inclusions starting with $args ... >&2
args=$(python3 -c "$track_input_macro" $args)
fi
##########################################################
##########################################################
txtdir_path=$(readlink -m $txtdir)/
for i in $args
do
# avoid creation of aux files outside of txtdir
#
i_path=$(readlink -m $txtdir/$i.$ext)
if [ "${i_path:0:${#txtdir_path}}" != "$txtdir_path" ]
then
echo "$0: ERROR:" >&2
echo "for input file '$i'," >&2
echo "auxiliary text file '$txtdir/$i.$ext'" >&2
echo "would be created outside of directory '$txtdir'" >&2
echo "--> exit" >&2
exit 1
fi
if [ ! -r $i ]
then
echo "$0: cannot read file '$i'" >&2
exit 1
fi
dir=$(dirname $txtdir/$i)
if [ ! -d $dir ]
then
if ! mkdir -p $dir
then
echo "$0: cannot create directory '$dir'" >&2
exit 1
fi
fi
#####################################################
# extract raw text, save line numbers / character positions
#####################################################
python3 $tex2txt_py \
$tex2txt_repl $tex2txt_defs --nums $txtdir/$i.$num $i \
> $txtdir/$i.$ext
if [ -n "${options[--columns]}" ]
then
python3 $tex2txt_py \
$tex2txt_repl $tex2txt_defs --char --nums $txtdir/$i.$chr $i \
> /dev/null
fi
#####################################################
# call LT or hunspell
#####################################################
if [ -z "${options[--no-lt]}" ]
then
if [[ "$OS" =~ Windows ]]
then
# under Cygwin: LT with Windows Java produces Latin-1
LTfilter() { iconv -f latin1 -t utf8; }
else
LTfilter() { cat; }
fi
LT_output=$(java -jar $LTcommand $txtdir/$i.$ext \
| LTfilter)
LT_output_lines=$(echo "$LT_output" | wc -l)
if (( $LT_output_lines == 1 ))
then
echo $LT_output >&2
fi
else
echo Hunspell $txtdir/$i.$ext ... >&2
errs_hunspell=$(sed -E $repls_hunspell $txtdir/$i.$ext \
| hunspell -l -p $priv_dic_native)
fi
#####################################################
# check for single letters
#####################################################
if [ "$check_for_single_letters" == yes ]
then
single_letters=$(grep -n '^' $txtdir/$i.$ext \
| sed -E "s/\\<$acronyms//g" \
| grep -E '\<[[:alpha:]]\>')
fi
#####################################################
# check foreign-language text with hunspell
#####################################################
errs_foreign=$(python3 $tex2txt_py $tex2txt_defs \
--extr $foreign_macro --nums $txtdir/$i.$foreign.$num $i \
| grep -n '^' \
| hunspell -L -d $hunsp_foreign_lang -p $priv_dic_foreign)
#####################################################
# output of results
#####################################################
if [[ ( -n "$single_letters" ) \
|| ( -n "$errs_foreign" ) \
|| ( "$LT_output_lines" > 1 ) \
|| ( -n "$errs_hunspell" ) ]]
then
echo '=================================================='
echo $i
echo '=================================================='
echo
fi
if [ -n "$single_letters" ]
then
echo '=============='
echo 'Single letters'
echo '=============='
echo "$single_letters" \
| python3 -c "$repl_lines" '^(\d+)():' $txtdir/$i.$num
echo
fi
if [ -n "$errs_foreign" ]
then
echo '==============================='
echo 'Errors in foreign-language text'
echo '==============================='
echo "$errs_foreign" \
| python3 -c "$repl_lines" '^(\d+)():' $txtdir/$i.$foreign.$num
echo
fi
if [[ "$LT_output_lines" > 1 ]]
then
if [ -z "${options[--columns]}" ]
then
echo "$LT_output" \
| python3 -c "$repl_lines" "$LT_info_line" $txtdir/$i.$num
else
echo "$LT_output" \
| python3 -c "$repl_lines_columns" "$LT_info_line" \
$i $txtdir/$i.txt $txtdir/$i.$chr
fi
echo
fi
if [ -n "$errs_hunspell" ]
then
echo '============'
echo 'Faulty lines'
echo '============'
grep -n '^' $txtdir/$i.$ext \
| sed -E $repls_hunspell \
| hunspell -L -p $priv_dic_native \
| python3 -c "$repl_lines" '^(\d+)():' $txtdir/$i.$num
echo
echo '============='
echo 'Unknown words'
echo '============='
echo "$errs_hunspell"
echo
fi
done