Skip to content

Commit

Permalink
Merge pull request #19538 from anntzer/psfontsmap
Browse files (browse the repository at this point in the history)
Speedup pdftex.map parsing.
  • Loading branch information
jkseppan committed May 1, 2021
2 parents d7c70bb + 04d28e9 commit b85e958
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 97 deletions.
169 changes: 79 additions & 90 deletions lib/matplotlib/dviread.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,24 +828,30 @@ class PsfontsMap:
{'slant': 0.16700000000000001}
>>> entry.filename
"""
__slots__ = ('_font', '_filename')
__slots__ = ('_filename', '_unparsed', '_parsed')

# Create a filename -> PsfontsMap cache, so that calling
# `PsfontsMap(filename)` with the same filename a second time immediately
# returns the same object.
@lru_cache()
def __new__(cls, filename):
self = object.__new__(cls)
self._font = {}
self._filename = os.fsdecode(filename)
# Some TeX distributions have enormous pdftex.map files which would
# take hundreds of milliseconds to parse, but it is easy enough to just
# store the unparsed lines (keyed by the first word, which is the
# texname) and parse them on-demand.
with open(filename, 'rb') as file:
self._parse(file)
self._unparsed = {line.split(b' ', 1)[0]: line for line in file}
self._parsed = {}
return self

def __getitem__(self, texname):
assert isinstance(texname, bytes)
if texname in self._unparsed:
self._parse_and_cache_line(self._unparsed.pop(texname))
try:
result = self._font[texname]
return self._parsed[texname]
except KeyError:
fmt = ('A PostScript file for the font whose TeX name is "{0}" '
'could not be found in the file "{1}". The dviread module '
Expand All @@ -854,100 +860,83 @@ def __getitem__(self, texname):
'This problem can often be solved by installing '
'a suitable PostScript font package in your (TeX) '
'package manager.')
msg = fmt.format(texname.decode('ascii'), self._filename)
msg = textwrap.fill(msg, break_on_hyphens=False,
break_long_words=False)
_log.info(msg)
_log.info(textwrap.fill(
fmt.format(texname.decode('ascii'), self._filename),
break_on_hyphens=False, break_long_words=False))
raise
fn, enc = result.filename, result.encoding
if fn is not None and not fn.startswith(b'/'):
fn = find_tex_file(fn)
if enc is not None and not enc.startswith(b'/'):
enc = find_tex_file(result.encoding)
return result._replace(filename=fn, encoding=enc)

def _parse(self, file):
"""
Parse the font mapping file.
The format is, AFAIK: texname fontname [effects and filenames]
Effects are PostScript snippets like ".177 SlantFont",
filenames begin with one or two less-than signs. A filename
ending in enc is an encoding file, other filenames are font
files. This can be overridden with a left bracket: <[foobar
indicates an encoding file named foobar.

There is some difference between <foo.pfb and <<bar.pfb in
subsetting, but I have no example of << in my TeX installation.
def _parse_and_cache_line(self, line):
"""
Parse a line in the font mapping file.
The format is (partially) documented at
http://mirrors.ctan.org/systems/doc/pdftex/manual/pdftex-a.pdf
https://tug.org/texinfohtml/dvips.html#psfonts_002emap
Each line can have the following fields:
- tfmname (first, only required field),
- psname (defaults to tfmname, must come immediately after tfmname if
present),
- fontflags (integer, must come immediately after psname if present,
ignored by us),
- special (SlantFont and ExtendFont, only field that is double-quoted),
- fontfile, encodingfile (optional, prefixed by <, <<, or <[; << always
precedes a font, <[ always precedes an encoding, < can precede either
but then an encoding file must have extension .enc; < and << also
request different font subsetting behaviors but we ignore that; < can
be separated from the filename by whitespace).
special, fontfile, and encodingfile can appear in any order.
"""
# If the map file specifies multiple encodings for a font, we
# follow pdfTeX in choosing the last one specified. Such
# entries are probably mistakes but they have occurred.
# http://tex.stackexchange.com/questions/10826/
# http://article.gmane.org/gmane.comp.tex.pdftex/4914

empty_re = re.compile(br'%|\s*$')
word_re = re.compile(
br'''(?x) (?:
"<\[ (?P<enc1> [^"]+ )" | # quoted encoding marked by [
"< (?P<enc2> [^"]+.enc)" | # quoted encoding, ends in .enc
"<<? (?P<file1> [^"]+ )" | # quoted font file name
" (?P<eff1> [^"]+ )" | # quoted effects or font name
<\[ (?P<enc3> \S+ ) | # encoding marked by [
< (?P<enc4> \S+ .enc) | # encoding, ends in .enc
<<? (?P<file2> \S+ ) | # font file name
(?P<eff2> \S+ ) # effects or font name
)''')
effects_re = re.compile(
br'''(?x) (?P<slant> -?[0-9]*(?:\.[0-9]+)) \s* SlantFont
| (?P<extend>-?[0-9]*(?:\.[0-9]+)) \s* ExtendFont''')

lines = (line.strip()
for line in file
if not empty_re.match(line))
for line in lines:
effects, encoding, filename = b'', None, None
words = word_re.finditer(line)

# The named groups are mutually exclusive and are
# referenced below at an estimated order of probability of
# occurrence based on looking at my copy of pdftex.map.
# The font names are probably unquoted:
w = next(words)
texname = w.group('eff2') or w.group('eff1')
w = next(words)
psname = w.group('eff2') or w.group('eff1')

for w in words:
# Any effects are almost always quoted:
eff = w.group('eff1') or w.group('eff2')
if eff:
effects = eff
continue
# Encoding files usually have the .enc suffix
# and almost never need quoting:
enc = (w.group('enc4') or w.group('enc3') or
w.group('enc2') or w.group('enc1'))
if enc:
if encoding is not None:
_log.debug('Multiple encodings for %s = %s',
texname, psname)
encoding = enc
continue
# File names are probably unquoted:
filename = w.group('file2') or w.group('file1')

effects_dict = {}
for match in effects_re.finditer(effects):
slant = match.group('slant')
if slant:
effects_dict['slant'] = float(slant)
else:
effects_dict['extend'] = float(match.group('extend'))

self._font[texname] = PsFont(
texname=texname, psname=psname, effects=effects_dict,
encoding=encoding, filename=filename)
if not line or line.startswith((b" ", b"%", b"*", b";", b"#")):
return
tfmname = basename = special = encodingfile = fontfile = None
matches = re.finditer(br'"([^"]*)(?:"|$)|(\S+)', line)
for match in matches:
quoted, unquoted = match.groups()
if unquoted:
if unquoted.startswith(b"<<"): # font
fontfile = unquoted[2:]
elif unquoted.startswith(b"<["): # encoding
encodingfile = unquoted[2:]
elif unquoted.startswith(b"<"): # font or encoding
word = (
# <foo => foo
unquoted[1:]
# < by itself => read the next word
or next(filter(None, next(matches).groups())))
if word.endswith(b".enc"):
encodingfile = word
else:
fontfile = word
elif tfmname is None:
tfmname = unquoted
elif basename is None:
basename = unquoted
elif quoted:
special = quoted
if basename is None:
basename = tfmname
effects = {}
if special:
words = reversed(special.split())
for word in words:
if word == b"SlantFont":
effects["slant"] = float(next(words))
elif word == b"ExtendFont":
effects["extend"] = float(next(words))
if encodingfile is not None and not encodingfile.startswith(b"/"):
encodingfile = find_tex_file(encodingfile)
if fontfile is not None and not fontfile.startswith(b"/"):
fontfile = find_tex_file(fontfile)
self._parsed[tfmname] = PsFont(
texname=tfmname, psname=basename, effects=effects,
encoding=encodingfile, filename=fontfile)


# Note: this function should ultimately replace the Encoding class, which
Expand Down
14 changes: 7 additions & 7 deletions lib/matplotlib/tests/baseline_images/dviread/test.map
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
% used by test_dviread.py
TeXfont1 PSfont1 <font1.pfb "<font1.enc"
TeXfont2 PSfont2 <font2.enc "<font2.pfa"
"TeXfont3" PSfont3 "1.23 UnknownEffect" <[enc3.foo <font3.pfa
TeXfont1 PSfont1 <font1.pfb <font1.enc
TeXfont2 PSfont2 <font2.enc <font2.pfa
TeXfont3 PSfont3 "1.23 UnknownEffect" <[enc3.foo < font3.pfa
TeXfont4 PSfont4 "-0.1 SlantFont 2.2 ExtendFont" <font4.enc <font4.pfa
TeXfont5 "PSfont5" <encoding1.enc <encoding2.enc <font5.pfb
TeXfont5 PSfont5 <encoding1.enc <encoding2.enc <font5.pfb
TeXfont6 PSfont6
TeXfont7 PSfont7 <font7.enc
TeXfont8 PSfont8 <font8.pfb
TeXfont9 PSfont9 </absolute/font9.pfb
TeXfont7 PSfont7 < font7.enc
TeXfont8 PSfont8 <<font8.pfb
TeXfont9 </absolute/font9.pfb
3 changes: 3 additions & 0 deletions lib/matplotlib/tests/test_dviread.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,13 @@ def test_PsfontsMap(monkeypatch):
assert entry.filename == b'font8.pfb'
assert entry.encoding is None
entry = fontmap[b'TeXfont9']
assert entry.psname == b'TeXfont9'
assert entry.filename == b'/absolute/font9.pfb'
# Missing font
with pytest.raises(KeyError, match='no-such-font'):
fontmap[b'no-such-font']
with pytest.raises(KeyError, match='%'):
fontmap[b'%']


@pytest.mark.skipif(shutil.which("kpsewhich") is None,
Expand Down

0 comments on commit b85e958

Please sign in to comment.