Merge pull request #19538 from anntzer/psfontsmap

Speedup pdftex.map parsing.
matplotlib · May 1, 2021 · b85e958
2 parents d7c70bb + 04d28e9
commit b85e958
Show file tree

Hide file tree

Showing 3 changed files with 89 additions and 97 deletions.
diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py
@@ -828,24 +828,30 @@ class PsfontsMap:
     {'slant': 0.16700000000000001}
     >>> entry.filename
     """
-    __slots__ = ('_font', '_filename')
+    __slots__ = ('_filename', '_unparsed', '_parsed')
 
     # Create a filename -> PsfontsMap cache, so that calling
     # `PsfontsMap(filename)` with the same filename a second time immediately
     # returns the same object.
     @lru_cache()
     def __new__(cls, filename):
         self = object.__new__(cls)
-        self._font = {}
         self._filename = os.fsdecode(filename)
+        # Some TeX distributions have enormous pdftex.map files which would
+        # take hundreds of milliseconds to parse, but it is easy enough to just
+        # store the unparsed lines (keyed by the first word, which is the
+        # texname) and parse them on-demand.
         with open(filename, 'rb') as file:
-            self._parse(file)
+            self._unparsed = {line.split(b' ', 1)[0]: line for line in file}
+        self._parsed = {}
         return self
 
     def __getitem__(self, texname):
         assert isinstance(texname, bytes)
+        if texname in self._unparsed:
+            self._parse_and_cache_line(self._unparsed.pop(texname))
         try:
-            result = self._font[texname]
+            return self._parsed[texname]
         except KeyError:
             fmt = ('A PostScript file for the font whose TeX name is "{0}" '
                    'could not be found in the file "{1}". The dviread module '
@@ -854,100 +860,83 @@ def __getitem__(self, texname):
                    'This problem can often be solved by installing '
                    'a suitable PostScript font package in your (TeX) '
                    'package manager.')
-            msg = fmt.format(texname.decode('ascii'), self._filename)
-            msg = textwrap.fill(msg, break_on_hyphens=False,
-                                break_long_words=False)
-            _log.info(msg)
+            _log.info(textwrap.fill(
+                fmt.format(texname.decode('ascii'), self._filename),
+                break_on_hyphens=False, break_long_words=False))
             raise
-        fn, enc = result.filename, result.encoding
-        if fn is not None and not fn.startswith(b'/'):
-            fn = find_tex_file(fn)
-        if enc is not None and not enc.startswith(b'/'):
-            enc = find_tex_file(result.encoding)
-        return result._replace(filename=fn, encoding=enc)
-
-    def _parse(self, file):
-        """
-        Parse the font mapping file.
-
-        The format is, AFAIK: texname fontname [effects and filenames]
-        Effects are PostScript snippets like ".177 SlantFont",
-        filenames begin with one or two less-than signs. A filename
-        ending in enc is an encoding file, other filenames are font
-        files. This can be overridden with a left bracket: <[foobar
-        indicates an encoding file named foobar.
 
-        There is some difference between <foo.pfb and <<bar.pfb in
-        subsetting, but I have no example of << in my TeX installation.
+    def _parse_and_cache_line(self, line):
+        """
+        Parse a line in the font mapping file.
+
+        The format is (partially) documented at
+        http://mirrors.ctan.org/systems/doc/pdftex/manual/pdftex-a.pdf
+        https://tug.org/texinfohtml/dvips.html#psfonts_002emap
+        Each line can have the following fields:
+
+        - tfmname (first, only required field),
+        - psname (defaults to tfmname, must come immediately after tfmname if
+          present),
+        - fontflags (integer, must come immediately after psname if present,
+          ignored by us),
+        - special (SlantFont and ExtendFont, only field that is double-quoted),
+        - fontfile, encodingfile (optional, prefixed by <, <<, or <[; << always
+          precedes a font, <[ always precedes an encoding, < can precede either
+          but then an encoding file must have extension .enc; < and << also
+          request different font subsetting behaviors but we ignore that; < can
+          be separated from the filename by whitespace).
+
+        special, fontfile, and encodingfile can appear in any order.
         """
         # If the map file specifies multiple encodings for a font, we
         # follow pdfTeX in choosing the last one specified. Such
         # entries are probably mistakes but they have occurred.
         # http://tex.stackexchange.com/questions/10826/
-        # http://article.gmane.org/gmane.comp.tex.pdftex/4914
-
-        empty_re = re.compile(br'%|\s*$')
-        word_re = re.compile(
-            br'''(?x) (?:
-                 "<\[ (?P<enc1>  [^"]+    )" | # quoted encoding marked by [
-                 "<   (?P<enc2>  [^"]+.enc)" | # quoted encoding, ends in .enc
-                 "<<? (?P<file1> [^"]+    )" | # quoted font file name
-                 "    (?P<eff1>  [^"]+    )" | # quoted effects or font name
-                 <\[  (?P<enc3>  \S+      )  | # encoding marked by [
-                 <    (?P<enc4>  \S+  .enc)  | # encoding, ends in .enc
-                 <<?  (?P<file2> \S+      )  | # font file name
-                      (?P<eff2>  \S+      )    # effects or font name
-            )''')
-        effects_re = re.compile(
-            br'''(?x) (?P<slant> -?[0-9]*(?:\.[0-9]+)) \s* SlantFont
-                    | (?P<extend>-?[0-9]*(?:\.[0-9]+)) \s* ExtendFont''')
-
-        lines = (line.strip()
-                 for line in file
-                 if not empty_re.match(line))
-        for line in lines:
-            effects, encoding, filename = b'', None, None
-            words = word_re.finditer(line)
-
-            # The named groups are mutually exclusive and are
-            # referenced below at an estimated order of probability of
-            # occurrence based on looking at my copy of pdftex.map.
-            # The font names are probably unquoted:
-            w = next(words)
-            texname = w.group('eff2') or w.group('eff1')
-            w = next(words)
-            psname = w.group('eff2') or w.group('eff1')
-
-            for w in words:
-                # Any effects are almost always quoted:
-                eff = w.group('eff1') or w.group('eff2')
-                if eff:
-                    effects = eff
-                    continue
-                # Encoding files usually have the .enc suffix
-                # and almost never need quoting:
-                enc = (w.group('enc4') or w.group('enc3') or
-                       w.group('enc2') or w.group('enc1'))
-                if enc:
-                    if encoding is not None:
-                        _log.debug('Multiple encodings for %s = %s',
-                                   texname, psname)
-                    encoding = enc
-                    continue
-                # File names are probably unquoted:
-                filename = w.group('file2') or w.group('file1')
-
-            effects_dict = {}
-            for match in effects_re.finditer(effects):
-                slant = match.group('slant')
-                if slant:
-                    effects_dict['slant'] = float(slant)
-                else:
-                    effects_dict['extend'] = float(match.group('extend'))
 
-            self._font[texname] = PsFont(
-                texname=texname, psname=psname, effects=effects_dict,
-                encoding=encoding, filename=filename)
+        if not line or line.startswith((b" ", b"%", b"*", b";", b"#")):
+            return
+        tfmname = basename = special = encodingfile = fontfile = None
+        matches = re.finditer(br'"([^"]*)(?:"|$)|(\S+)', line)
+        for match in matches:
+            quoted, unquoted = match.groups()
+            if unquoted:
+                if unquoted.startswith(b"<<"):  # font
+                    fontfile = unquoted[2:]
+                elif unquoted.startswith(b"<["):  # encoding
+                    encodingfile = unquoted[2:]
+                elif unquoted.startswith(b"<"):  # font or encoding
+                    word = (
+                        # <foo => foo
+                        unquoted[1:]
+                        # < by itself => read the next word
+                        or next(filter(None, next(matches).groups())))
+                    if word.endswith(b".enc"):
+                        encodingfile = word
+                    else:
+                        fontfile = word
+                elif tfmname is None:
+                    tfmname = unquoted
+                elif basename is None:
+                    basename = unquoted
+            elif quoted:
+                special = quoted
+        if basename is None:
+            basename = tfmname
+        effects = {}
+        if special:
+            words = reversed(special.split())
+            for word in words:
+                if word == b"SlantFont":
+                    effects["slant"] = float(next(words))
+                elif word == b"ExtendFont":
+                    effects["extend"] = float(next(words))
+        if encodingfile is not None and not encodingfile.startswith(b"/"):
+            encodingfile = find_tex_file(encodingfile)
+        if fontfile is not None and not fontfile.startswith(b"/"):
+            fontfile = find_tex_file(fontfile)
+        self._parsed[tfmname] = PsFont(
+            texname=tfmname, psname=basename, effects=effects,
+            encoding=encodingfile, filename=fontfile)
 
 
 # Note: this function should ultimately replace the Encoding class, which

diff --git a/lib/matplotlib/tests/baseline_images/dviread/test.map b/lib/matplotlib/tests/baseline_images/dviread/test.map
@@ -1,10 +1,10 @@
 % used by test_dviread.py
-TeXfont1 PSfont1 <font1.pfb "<font1.enc"
-TeXfont2 PSfont2 <font2.enc "<font2.pfa"
-"TeXfont3" PSfont3 "1.23 UnknownEffect" <[enc3.foo <font3.pfa
+TeXfont1 PSfont1 <font1.pfb <font1.enc
+TeXfont2 PSfont2 <font2.enc <font2.pfa
+TeXfont3 PSfont3 "1.23 UnknownEffect" <[enc3.foo < font3.pfa
 TeXfont4 PSfont4 "-0.1 SlantFont 2.2 ExtendFont" <font4.enc <font4.pfa
-TeXfont5 "PSfont5" <encoding1.enc <encoding2.enc <font5.pfb
+TeXfont5 PSfont5 <encoding1.enc <encoding2.enc <font5.pfb
 TeXfont6 PSfont6
-TeXfont7 PSfont7 <font7.enc
-TeXfont8 PSfont8 <font8.pfb
-TeXfont9 PSfont9 </absolute/font9.pfb
+TeXfont7 PSfont7 < font7.enc
+TeXfont8 PSfont8 <<font8.pfb
+TeXfont9 </absolute/font9.pfb
diff --git a/lib/matplotlib/tests/test_dviread.py b/lib/matplotlib/tests/test_dviread.py
@@ -42,10 +42,13 @@ def test_PsfontsMap(monkeypatch):
     assert entry.filename == b'font8.pfb'
     assert entry.encoding is None
     entry = fontmap[b'TeXfont9']
+    assert entry.psname == b'TeXfont9'
     assert entry.filename == b'/absolute/font9.pfb'
     # Missing font
     with pytest.raises(KeyError, match='no-such-font'):
         fontmap[b'no-such-font']
+    with pytest.raises(KeyError, match='%'):
+        fontmap[b'%']
 
 
 @pytest.mark.skipif(shutil.which("kpsewhich") is None,