Parse PsfontsMap entries on demand.

See previous commit for description of pdftex.map. The vast majority of entries (tens of thousands) in pdftex.map actually end up being unused, and parsing them is just wasted work. This patch takes advantage of the fact that we can quickly recover the tex font name from a pdftex.map entry (it's just the first word), so we can very quickly build a mapping of tex font names to unparsed pdftex.map entries, and then parse only the few entries that we actually need, on demand. This speeds up e.g.

```
python -c 'from pylab import *; rcParams["text.usetex"] = True; plot(); savefig("/tmp/test.pdf")'
```

by ~700ms (~20%) on the matplotlib macos.
matplotlib · Feb 19, 2021 · ba7f9fd · ba7f9fd
1 parent 6e7ed9d
commit ba7f9fd
Show file tree

Hide file tree

Showing 2 changed files with 63 additions and 64 deletions.
diff --git a/lib/matplotlib/dviread.py b/lib/matplotlib/dviread.py
@@ -838,24 +838,30 @@ class PsfontsMap:
     {'slant': 0.16700000000000001}
     >>> entry.filename
     """
-    __slots__ = ('_font', '_filename')
+    __slots__ = ('_filename', '_unparsed', '_parsed')
 
     # Create a filename -> PsfontsMap cache, so that calling
     # `PsfontsMap(filename)` with the same filename a second time immediately
     # returns the same object.
     @lru_cache()
     def __new__(cls, filename):
         self = object.__new__(cls)
-        self._font = {}
         self._filename = os.fsdecode(filename)
+        # Some TeX distributions have enormous pdftex.map files which would
+        # take hundreds of milliseconds to parse, but it is easy enough to just
+        # store the unparsed lines (keyed by the first word, which is the
+        # texname) and parse them on-demand.
         with open(filename, 'rb') as file:
-            self._parse(file)
+            self._unparsed = {line.split(b' ', 1)[0]: line for line in file}
+        self._parsed = {}
         return self
 
     def __getitem__(self, texname):
         assert isinstance(texname, bytes)
+        if texname in self._unparsed:
+            self._parse_and_cache_line(self._unparsed.pop(texname))
         try:
-            result = self._font[texname]
+            return self._parsed[texname]
         except KeyError:
             fmt = ('A PostScript file for the font whose TeX name is "{0}" '
                    'could not be found in the file "{1}". The dviread module '
@@ -864,21 +870,14 @@ def __getitem__(self, texname):
                    'This problem can often be solved by installing '
                    'a suitable PostScript font package in your (TeX) '
                    'package manager.')
-            msg = fmt.format(texname.decode('ascii'), self._filename)
-            msg = textwrap.fill(msg, break_on_hyphens=False,
-                                break_long_words=False)
-            _log.info(msg)
+            _log.info(textwrap.fill(
+                fmt.format(texname.decode('ascii'), self._filename),
+                break_on_hyphens=False, break_long_words=False))
             raise
-        fn, enc = result.filename, result.encoding
-        if fn is not None and not fn.startswith(b'/'):
-            fn = find_tex_file(fn)
-        if enc is not None and not enc.startswith(b'/'):
-            enc = find_tex_file(result.encoding)
-        return result._replace(filename=fn, encoding=enc)
-
-    def _parse(self, file):
+
+    def _parse_and_cache_line(self, line):
         """
-        Parse the font mapping file.
+        Parse a line in the font mapping file.
 
         The format is, AFAIK: texname fontname [effects and filenames]
         Effects are PostScript snippets like ".177 SlantFont",
@@ -898,50 +897,50 @@ def _parse(self, file):
         # http://tex.stackexchange.com/questions/10826/
         # http://article.gmane.org/gmane.comp.tex.pdftex/4914
 
-        word_re = re.compile(br'"([^"]*)(?:"|$)|(\S+)')
-        for line in file:
-            if not line or line.startswith((b" ", b"%", b"*", b";", b"#")):
-                continue
-            tfmname = basename = special = encodingfile = fontfile = None
-            matches = word_re.finditer(line)
-            for match in matches:
-                quoted, unquoted = match.groups()
-                if unquoted:
-                    if unquoted.startswith(b"<<"):  # font
-                        fontfile = unquoted[2:]
-                    elif unquoted.startswith(b"<["):  # encoding
-                        encodingfile = unquoted[2:]
-                    elif unquoted.startswith(b"<"):  # font or encoding
-                        if unquoted == b"<":
-                            word = next(filter(None, next(matches).groups()))
-                            if unquoted.endswith(b".enc"):
-                                encodingfile = word
-                            else:
-                                fontfile = word
-                        else:
-                            if unquoted.endswith(b".enc"):
-                                encodingfile = unquoted[1:]
-                            else:
-                                fontfile = unquoted[1:]
-                    elif tfmname is None:
-                        tfmname = unquoted
-                    elif basename is None:
-                        basename = unquoted
-                elif quoted:
-                    special = quoted
-            if basename is None:
-                basename = tfmname
-            effects = {}
-            if special:
-                words = reversed(special.split())
-                for word in words:
-                    if word == b"SlantFont":
-                        effects["slant"] = float(next(words))
-                    elif word == b"ExtendFont":
-                        effects["extend"] = float(next(words))
-            self._font[tfmname] = PsFont(
-                texname=tfmname, psname=basename, effects=effects,
-                encoding=encodingfile, filename=fontfile)
+        if not line or line.startswith((b" ", b"%", b"*", b";", b"#")):
+            return
+        tfmname = basename = special = encodingfile = fontfile = None
+        matches = re.finditer(br'"([^"]*)(?:"|$)|(\S+)', line)
+        for match in matches:
+            quoted, unquoted = match.groups()
+            if unquoted:
+                if unquoted.startswith(b"<<"):  # font
+                    fontfile = unquoted[2:]
+                elif unquoted.startswith(b"<["):  # encoding
+                    encodingfile = unquoted[2:]
+                elif unquoted.startswith(b"<"):  # font or encoding
+                    word = (
+                        # <foo => foo
+                        unquoted[1:]
+                        # < by itself => read the next word
+                        or next(filter(None, next(matches).groups())))
+                    if word.endswith(b".enc"):
+                        encodingfile = word
+                    else:
+                        fontfile = word
+                elif tfmname is None:
+                    tfmname = unquoted
+                elif basename is None:
+                    basename = unquoted
+            elif quoted:
+                special = quoted
+        if basename is None:
+            basename = tfmname
+        effects = {}
+        if special:
+            words = reversed(special.split())
+            for word in words:
+                if word == b"SlantFont":
+                    effects["slant"] = float(next(words))
+                elif word == b"ExtendFont":
+                    effects["extend"] = float(next(words))
+        if encodingfile is not None and not encodingfile.startswith(b"/"):
+            encodingfile = find_tex_file(encodingfile)
+        if fontfile is not None and not fontfile.startswith(b"/"):
+            fontfile = find_tex_file(fontfile)
+        self._parsed[tfmname] = PsFont(
+            texname=tfmname, psname=basename, effects=effects,
+            encoding=encodingfile, filename=fontfile)
 
 
 @_api.deprecated("3.3")

diff --git a/lib/matplotlib/tests/baseline_images/dviread/test.map b/lib/matplotlib/tests/baseline_images/dviread/test.map
@@ -1,10 +1,10 @@
 % used by test_dviread.py
 TeXfont1 PSfont1 <font1.pfb <font1.enc
 TeXfont2 PSfont2 <font2.enc <font2.pfa
-TeXfont3 PSfont3 "1.23 UnknownEffect" <[enc3.foo <font3.pfa
+TeXfont3 PSfont3 "1.23 UnknownEffect" <[enc3.foo < font3.pfa
 TeXfont4 PSfont4 "-0.1 SlantFont 2.2 ExtendFont" <font4.enc <font4.pfa
 TeXfont5 PSfont5 <encoding1.enc <encoding2.enc <font5.pfb
 TeXfont6 PSfont6
-TeXfont7 PSfont7 <font7.enc
-TeXfont8 PSfont8 <font8.pfb
+TeXfont7 PSfont7 < font7.enc
+TeXfont8 PSfont8 <<font8.pfb
 TeXfont9 </absolute/font9.pfb