In [1]:
from bs4 import BeautifulSoup as BS
import os
from pathlib import Path

In [9]:
def adjust(root, file, dry = False):
    '''
    Mark an html file deprecated, incl' canonical link for crawlers.

    The file root/file is parsed with BeautifulSoup (lxml). A
    <link rel="canonical"> element is appended to <head> and an
    "old-version-banner" <div> is inserted as the first child of <body>;
    both point at the best-guess URL of the current version of the page.
    The modified document is then written back to disk.

    root: directory containing the file. Its first path component is treated
          as the version string and dropped when the target URL is built, so
          a relative path such as '2.0.2/api' is expected.
    file: name of the html file inside root.
    dry: do not overwrite, but go to temp file ('<stem>.tmp<suffix>', written
         next to the original).

    Returns None. Nothing is read or written when file is itself a temp file
    ('.tmp' among its suffixes), and nothing is written when the page already
    contains an element with id "old-version-banner".
    Raises AttributeError when the parsed document has no <html> element
    (parse.html is None), and ValueError when root has no path components.
    '''
    base = Path(root)
    # if (Path(file).suffix != '.html') raise ValueError("Not an html file.")
    if '.tmp' in Path(file).suffixes:
        return # prefer to ignore own temp files.

    # Base path contains version string, so '..' is wrong: drop the leading
    # component instead. '/'.join(rest) is our best-guess current version.
    # ADJUST as necessary.
    _, *rest = base.parts
    # URLs always use forward slashes; os.path.join would produce backslashes
    # on Windows, so the pieces are joined explicitly.
    target = '/'.join(['https://matplotlib.org', *rest, file])

    # The document is parsed while the file is open; the handle is closed
    # before the same path is (possibly) reopened for writing below.
    with open(base / file, "rt") as f:
        parse = BS(f, 'lxml')

    if parse.find(id='old-version-banner') is not None:
        # Already adjusted by an earlier run: do not stack a second banner
        # and a second canonical link onto the page.
        return

    for child in parse.html.children:
        # accessing children outside of iteration confuses beautifulsoup
        if child.name == 'head':
            # lxml would wrap this snippet, hence html.parser here.
            redirect_bots = BS(
                '''<link href="{}" rel="canonical">'''.format(target),
                'html.parser')
            child.append(redirect_bots)

        elif child.name == 'body':
            banner = BS('''<div id="old-version-banner">
            You are reading documentation for a static version of Matplotlib.
            <a href="{}">This page may have been updated.</a>
            </div>'''.format(target), 'html.parser')
            child.insert(0, banner)

    if dry:
        # 'index.html' -> 'index.tmp.html'. Only the final suffix of the file
        # name is considered, so names without a suffix or directories that
        # contain dots are handled too.
        original = Path(file)
        destination = base / original.with_suffix('.tmp' + original.suffix)
    else:
        destination = base / file

    with open(destination, "wt") as g:
        g.write(str(parse))

In [10]:
# Disabled profiling run of adjust() in dry mode. The code is kept inside a
# bare string literal so that running this cell executes nothing; the trailing
# ';' suppresses the notebook's echo of the string.
'''
import profile

pr = profile.Profile()
pr.run("adjust(Path('./2.0.2'), 'index.html', dry=True)")
pr.create_stats()
pr.print_stats(sort=1) # want to initialize lxml only once!
''';

         18035 function calls (17694 primitive calls) in 0.057 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    170/1    0.006    0.000    0.022    0.022 element.py:1119(decode)
      164    0.004    0.000    0.019    0.000 _lxml.py:145(start)
     2438    0.003    0.000    0.003    0.000 :0(isinstance)
        1    0.003    0.003    0.032    0.032 _lxml.py:246(feed)
    170/1    0.003    0.000    0.021    0.021 element.py:1218(decode_contents)
      508    0.002    0.000    0.009    0.000 __init__.py:337(endData)
     2075    0.002    0.000    0.002    0.000 :0(append)
      738    0.002    0.000    0.002    0.000 element.py:203(setup)
      284    0.002    0.000    0.004    0.000 __init__.py:367(object_was_parsed)
      170    0.002    0.000    0.005    0.000 element.py:813(__init__)
      877    0.001    0.000    0.001    0.000 :0(len)
      167    0.001    0.000    0.013    0.000 __init__.py:447(handle_starttag)
    

In [4]:
# base, file = Path('./2.0.2/api'), 'pyplot_summary.html'
# with open(base / file, "rt") as f:
#    parse = BS(f, 'lxml')

In [5]:
def process(target, verbose = False, dry = False):
    '''
    Walk the directory tree below target and run adjust() on every '.html' file.

    target: root directory of one site version, e.g. '2.0.2'.
    verbose: print a coarse progress meter. The second path component of the
             directory being visited is printed when it changes, and the third
             component (tab-indented) below it. Inside 'mpl_examples' and
             'examples' the third level is abbreviated to '...'.
    dry: forwarded to adjust(); when true, results go to temp files instead
         of overwriting the originals.

    Returns None. UnicodeDecodeError and AttributeError raised by adjust()
    are reported with print() and the walk continues with the next file.
    '''
    delve = ''; deeper = ''
    for root, _, files in os.walk(Path(target)):
        path = Path(root)

        # progress meter, inspired loosely by https://stackoverflow.com/a/2165062
        if verbose:
            parts = path.parts
            # The top directory itself has no second component: print nothing.
            if len(parts) > 1:
                top = parts[1]
                if top != delve:
                    # New top-level section: forget the previous sub-level so
                    # a repeated label (e.g. '...') is shown again.
                    delve = top; deeper = ''
                    print(top)
                else:
                    if top in ('mpl_examples', 'examples'):
                        sub = '...'
                    elif len(parts) > 2:
                        sub = parts[2]
                    else:
                        sub = None
                    if sub is not None and sub != deeper:
                        deeper = sub
                        print('\t' + sub)

        for f in files:
            # expect .pdf, .png, .py files. ignore them.
            if Path(f).suffix != '.html':
                continue
            try:
                adjust(path, f, dry)
            except (UnicodeDecodeError, AttributeError) as e:
                print(f, e)

#### main tool

In [6]:
# convert a single site (for re-use or testing)
# ! svn checkout https://github.com/matplotlib/matplotlib.github.com/trunk/2.0.2

# process('2.0.2')

#### batch utility

In [7]:
import re

# Collect the version directories ('X.Y.Z') found in the current working
# directory. fullmatch (rather than match) rejects names that merely start
# with a version string, and the isdir test drops plain files, because
# process() walks each entry as a directory tree.
targets = [d for d in os.listdir()
           if re.fullmatch(r'''\d+\.\d+\.\d+''', d) and os.path.isdir(d)]
targets

# Warning: the cell below will mutate, in place, all HTML files in the directories listed.

['1.4.0',
 '1.2.1',
 '1.5.0',
 '2.0.2',
 '2.1.1',
 '2.0.0',
 '1.4.3',
 '2.0.1',
 '1.4.1',
 '1.3.1',
 '2.2.0',
 '1.5.1',
 '2.2.2',
 '1.5.3',
 '2.1.0',
 '1.3.0',
 '2.1.2',
 '1.4.2']

# Run this as a code cell to batch-process every directory in `targets`,
# printing the progress meter as it goes.
for t in targets:
    process(t, verbose=True)