In [1]:
from bs4 import BeautifulSoup as BS
import os
from pathlib import Path

In [2]:
def adjust(root, file, dry = False):
    '''
    mark an html file deprecated, incl' canonical link for crawlers.
    dry: do not overwrite, but go to temp file.
    '''
    base = Path(root)
    # if (Path(file).suffix != '.html') raise ValueError("Not an html file.")
    if ('.tmp' in Path(file).suffixes): return # prefer to ignore own temp files.
    
    with open(base / file, "rt") as f:
        parse = BS(f, 'lxml')
        for child in parse.html.children: # beautifulsoup likes explicit iteration
            
            if child.name == 'head':
                ver, *rest = base.parts # '/'.join(rest) is our best-guess current version.
                target = os.path.join('https://matplotlib.org', *rest, file)
                    # ADJUST as necessary.
                    # Base path contains version string, so '..' is wrong.
                    
                redirect_bots = \
                    BS('''<link href="{}" rel="canonical">'''.format(target), 'html.parser')
                    # lxml would wrap this snippet. yuck.
                child.append(redirect_bots)

            elif child.name == 'body':
                banner = BS('''<div id="old-version-banner">
            You are reading documentation for a static version of Matplotlib.
            <a href="{}">This page may have been updated.</a>
            </div>'''.format(target), 'html.parser')
                child.insert(0, banner)

        *name, ext = file.split('.')
        dummy = ('.'.join([*name, 'tmp', ext]))
                # file if (name[-1] == 'tmp') else
        
        with open(base/dummy if dry else base/file, "wt") as g:
            g.write(str(parse))

In [3]:
import profile

pr = profile.Profile()
pr.run("adjust(Path('./2.0.2'), 'index.html', dry=True)")
pr.create_stats()
pr.print_stats(sort=1) # want to initialize lxml only once!

         17847 function calls (17501 primitive calls) in 0.063 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    167/1    0.006    0.000    0.021    0.021 element.py:1119(decode)
        1    0.004    0.004    0.035    0.035 _lxml.py:246(feed)
2410/2402    0.003    0.000    0.004    0.000 :0(isinstance)
      161    0.003    0.000    0.019    0.000 _lxml.py:145(start)
      499    0.003    0.000    0.010    0.000 __init__.py:337(endData)
    167/1    0.003    0.000    0.021    0.021 element.py:1218(decode_contents)
     2053    0.002    0.000    0.002    0.000 :0(append)
      167    0.002    0.000    0.006    0.000 element.py:813(__init__)
      281    0.002    0.000    0.004    0.000 __init__.py:367(object_was_parsed)
      729    0.002    0.000    0.002    0.000 element.py:203(setup)
      115    0.002    0.000    0.003    0.000 __init__.py:152(_replace_cdata_list_attribute_values)
      164    0.002    0.000    0.014 

In [4]:
# base, file = Path('./2.0.2/api'), 'pyplot_summary.html'
# with open(base / file, "rt") as f:
#    parse = BS(f, 'lxml')

In [5]:
def process(target, verbose = False, dry = False):
    delve = ''; deeper = ''
    for root, _, files in os.walk(Path(target)):
        path = Path(root)

        # progress meter, inspired loosely by https://stackoverflow.com/a/2165062
        if verbose:
            try:
                s = path.parts[1] # toplevel.index()
                if s != delve:
                    delve = s; print(s)
                else:
                    r = path.parts[2] if s not in ['mpl_examples', 'examples'] else '...'
                    if r != deeper:
                        deeper = r; print('\t' + r)
            except: pass

        for f in files:
            try:
                # expect .pdf, .png, .py files. ignore them.
                if (Path(f).suffix == '.html'):
                    adjust(path, f, False) # time.sleep(.0001)

            except UnicodeDecodeError as e:
                print(f, e)
            except AttributeError as e:
                print(f, e)

In [6]:
# test on a single legacy site
# ! svn checkout https://github.com/matplotlib/matplotlib.github.com/trunk/2.0.2
# process('2.0.2', dry=True)

In [7]:
import re
targets = [d for d in os.listdir() if re.match(r'''\d+\.\d+\.\d+''', d)]
targets

# Below cell will mutate all HTML files in the directories listed.

['1.4.0',
 '1.2.1',
 '1.5.0',
 '2.0.2',
 '2.1.1',
 '2.0.0',
 '1.4.3',
 '2.0.1',
 '1.4.1',
 '1.3.1',
 '2.2.0',
 '1.5.1',
 '2.2.2',
 '1.5.3',
 '2.1.0',
 '1.3.0',
 '2.1.2',
 '1.4.2']

In [8]:
# run me as code cell to proceed.
for t in targets: process(t, verbose=True)

mpl_toolkits
	mplot3d
	axes_grid
api
faq
devel
	gitwash
_sources
	mpl_toolkits
	api
	faq
	devel
	glossary
	examples
	users
	resources
glossary
mpl_examples
	...
pyplots
plot_directive
	mpl_examples
examples
	...
_images
	mathmpl
users
	plotting
_static
resources
mpl_toolkits
	mplot3d
	axes_grid
api
faq
devel
	gitwash
_sources
	mpl_toolkits
	api
	faq
	devel
	glossary
	examples
	users
glossary
mpl_examples
	...
pyplots
plot_directive
	mpl_examples
examples
	...
_images
	mathmpl
users
	plotting
_static
mpl_toolkits
	mplot3d
	axes_grid
api
faq
devel
	gitwash
	MEP
_sources
	mpl_toolkits
	api
	faq
	devel
	glossary
	examples
	users
	resources
glossary
mpl_examples
	...
pyplots
plot_directive
	mpl_examples
examples
	...
_images
	mathmpl
users
	plotting
_static
resources
mpl_toolkits
	mplot3d
	axes_grid
api
	_as_gen
faq
devel
	gitwash
	MEP
_sources
	mpl_toolkits
	api
	faq
	devel
	glossary
	examples
	users
	resources
glossary
mpl_examples
	...
plot_directive
	mpl_examples
examples
	...
_images
	