In [1]:
import re
import numpy as np

# Numbering Headers
This notebook provides a few helper functions for numbering markdown headers and for generating a table of contents from them.

In [2]:
def renumber(headers, baselevel=1):
    '''Renumbers a list of headers based on the number of # symbols
    
    
    Parameters
    ----------
    headers : list of str
        List of markdown headers beginning with # symbols
    baselevel : int >= 1
        The number of # symbols that marks a chapter (the top numbering level).
        If the first header is deeper than baselevel, no chapter has been
        opened yet, so its number starts with 0 (e.g. '0.1').
    
    
    Output
    ------
    List of str with the chapter/section numberings of the headers
    
    
    Raises
    ------
    ValueError
        If baselevel is not an int >= 1, if a header does not begin with a
        # symbol, or if a header begins with fewer # symbols than baselevel.
    
    
    Example
    -------
    renumber(headers=['## Chapter 1', '### Section',
                      '#### Subsection', '#### Subsection',
                      '### Section', '#### Subsection',
                      '## Chapter 2', '### Section',
                      '## Chapter 3'],
             baselevel=2)
    Returns:
    ['1', '1.1', '1.1.1', '1.1.2', '1.2', '1.2.1', '2', '2.1', '3']
    '''

    if (not isinstance(baselevel, int)) or (baselevel < 1):
        raise ValueError('baselevel must be at least 1')

    # Counters for the current position at each depth:
    # [Chapter, Section, Subsection, etc.]
    # A plain list is enough here; the chapter counter starts at 0 because
    # no chapter has been seen yet.
    num_stack = [0]

    numbering = []

    for h in headers:
        level = re.match(r"^[#]+", h)
        if not level:
            raise ValueError('\'{}\' is not a header.'.format(h))

        # Depth relative to the chapter level (1 = chapter, 2 = section, ...)
        curlev = len(level[0]) - baselevel + 1
        if curlev < 1:
            raise ValueError('{} has fewer \'#\' symbols than the baselevel.'.format(h))

        if len(num_stack) < curlev:
            # Going deeper: every newly opened level starts at 1
            num_stack.extend([1] * (curlev - len(num_stack)))
        else:
            # Same depth or shallower: drop the deeper counters (no-op when
            # the depth is unchanged), then advance the counter at this depth
            del num_stack[curlev:]
            num_stack[-1] += 1

        numbering.append('.'.join(map(str, num_stack)))

    return numbering

In [3]:
def norm_header(header):
    '''Turns a header into an anchor-style name.

    The steps are applied in this order:
    1. Lower-case the header
    2. Drop every character that is not a letter, digit, space or hyphen
    3. Trim leading and trailing spaces
    4. Replace each remaining space with a hyphen'''

    lowered = header.lower()
    kept = re.sub(r'[^a-zA-Z0-9 -]', '', lowered)
    trimmed = kept.strip()

    # Every single space becomes its own hyphen, so runs of spaces are not collapsed
    return re.sub(r' ', '-', trimmed)

In [4]:
def toc(headers, normed, baselevel=2):
    '''Generates table of contents for markdown file

    Prints one markdown bullet per header, linking the header title to its
    anchor name.

    Parameters
    ----------
    headers : list of str
        Markdown headers beginning with # symbols
    normed : list of str
        Anchor names for the headers, paired with headers by position
    baselevel : int
        Number of # symbols of the headers printed without indentation;
        each additional # indents the bullet by four spaces.
        Defaults to 2, the value that used to be hard-coded.

    Raises
    ------
    ValueError
        If a header does not begin with a # symbol
    '''

    for h, n in zip(headers, normed):
        marker = re.match(r'[#]+', h)
        if marker is None:
            raise ValueError('\'{}\' is not a header.'.format(h))

        indent = '    ' * (marker.end() - baselevel)
        # Cut off only the leading run of # symbols so that a '#' inside the
        # title itself (e.g. "C# Tips") is kept in the link text
        title = h[marker.end():].strip()

        print('{}- [{}](#{})'.format(indent, title, n))

def tag_headers(headers, normed):
    '''Prints each header followed by an HTML anchor tag carrying its id.

    headers and normed are paired by position; one line is printed per pair.'''

    template = '{} <a id=\"{}"/>'
    for pair in zip(headers, normed):
        print(template.format(*pair))

In [5]:
def get_toc_heads(headers, number_headers=False, baselevel=1):
    '''Prints a table of contents followed by the anchor-tagged headers.

    Parameters
    ----------
    headers : list of str
        Markdown headers beginning with # symbols
    number_headers : bool
        If True, the chapter/section number from renumber is inserted between
        the # symbols and the title before the anchor names are generated
    baselevel : int >= 1
        Passed to renumber; only used when number_headers is True
    '''
    if number_headers:
        numbered = []
        for head, num in zip(headers, renumber(headers, baselevel)):
            marker = re.match(r"^[#]+", head)[0]
            # Cut off only the leading run of # symbols so that a '#' inside
            # the title itself (e.g. "C# Tips") is not deleted
            title = head[len(marker):].strip()
            numbered.append(marker + ' ' + num + ' ' + title)
        headers = numbered

    normed = list(map(norm_header, headers))

    # NOTE(review): baselevel is not forwarded to toc, so the bullet
    # indentation does not follow baselevel - confirm this is intended.
    print('## Table of Contents')
    toc(headers, normed)

    print('\nHeaders:')
    tag_headers(headers, normed)

## Titanic

In [6]:
# Sample markdown headers (levels '##' through '#####') used as input
# for the examples below.
titanic = ["## Data Preparation",
           "### Pair Plot",
           "### Correlation Matrix",
           "### Treating Missing Values",
           "### Missing Embarked Values",
           "### Missing Age Values",
           "#### Age vs Survived",
           "#### Age vs Pclass",
           "#### Age vs Sex",
           "#### Age vs Siblings and Spouses",
           "#### Age vs Parents and Children",
           "#### Age vs Port Embarked",
           "#### Grouping FareRange",
           "##### FareRange vs Pclass",
           "##### First Class 0-10 Passengers",
           "##### Second Class 0-10 Passengers",
           "##### Updating 0 Fare Passengers",
           "##### Second Class 50+ Passengers",
           "##### Third Class 50+ Passengers",
           "##### Replacing FareRange for Third Class 50+ Passengers",
           "##### 50+S Sage Family",
           "##### 50+A Asian Descent",
           "#### Remaining Missing Age Values",
           "### Missing Cabin Values",
           "### Number of Passengers on Same Ticket",
           "## Processing Data: New Attributes",
           "### Group Age Ranges",
           "## Analyzing Survived",
           "### Survived vs Pclass",
           "### Survived vs Sex",
           "### Survived vs Siblings and Spouses",
           "### Survived vs Parents and Children",
           "### Survived vs Port Embarked",
           "### Survived vs AgeRange",
           "### Survived vs FareRange",
           "### Survived vs Pclass and Sex",
           "### Survived vs AgeRange and Siblings",
           "### Survived vs AgeRange and FareRange",
           "### Survived vs Sex and AgeRange",
           "### Survived vs Sex and FareRange",
           "### Survived vs Sex, Pclass and AgeRange",
           "### Survived vs Sex, Pclass and FareRange",
           "## Visualizing Data: Age vs Fare",
           "## Summary",
           ]

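We can see the difference in numbering when we set `baselevel=1` vs `baselevel=2`. The headers above start at `##`, so with `baselevel=1` they are treated as sections of a chapter that never appears and every number begins with `0.`; with `baselevel=2` they are the chapters themselves and the numbering starts at `1`.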

In [7]:
# If we set baselevel = 1, a single '#' would be a chapter. The headers here
# start at '##', so they are numbered as sections under chapter 0 ('0.1', '0.2', ...).
renumber(titanic, 1)

['0.1',
 '0.1.1',
 '0.1.2',
 '0.1.3',
 '0.1.4',
 '0.1.5',
 '0.1.5.1',
 '0.1.5.2',
 '0.1.5.3',
 '0.1.5.4',
 '0.1.5.5',
 '0.1.5.6',
 '0.1.5.7',
 '0.1.5.7.1',
 '0.1.5.7.2',
 '0.1.5.7.3',
 '0.1.5.7.4',
 '0.1.5.7.5',
 '0.1.5.7.6',
 '0.1.5.7.7',
 '0.1.5.7.8',
 '0.1.5.7.9',
 '0.1.5.8',
 '0.1.6',
 '0.1.7',
 '0.2',
 '0.2.1',
 '0.3',
 '0.3.1',
 '0.3.2',
 '0.3.3',
 '0.3.4',
 '0.3.5',
 '0.3.6',
 '0.3.7',
 '0.3.8',
 '0.3.9',
 '0.3.10',
 '0.3.11',
 '0.3.12',
 '0.3.13',
 '0.3.14',
 '0.4',
 '0.5']

In [8]:
# If we set baselevel = 2, the '##' headers are the chapters,
# so the numbering starts at '1'.
renumber(titanic, 2)

['1',
 '1.1',
 '1.2',
 '1.3',
 '1.4',
 '1.5',
 '1.5.1',
 '1.5.2',
 '1.5.3',
 '1.5.4',
 '1.5.5',
 '1.5.6',
 '1.5.7',
 '1.5.7.1',
 '1.5.7.2',
 '1.5.7.3',
 '1.5.7.4',
 '1.5.7.5',
 '1.5.7.6',
 '1.5.7.7',
 '1.5.7.8',
 '1.5.7.9',
 '1.5.8',
 '1.6',
 '1.7',
 '2',
 '2.1',
 '3',
 '3.1',
 '3.2',
 '3.3',
 '3.4',
 '3.5',
 '3.6',
 '3.7',
 '3.8',
 '3.9',
 '3.10',
 '3.11',
 '3.12',
 '3.13',
 '3.14',
 '4',
 '5']

In [9]:
# Number the headers with '##' as the chapter level, then print the
# table of contents and the anchor-tagged headers.
get_toc_heads(titanic, number_headers=True, baselevel=2)

## Table of Contents
- [1 Data Preparation](#1-data-preparation)
    - [1.1 Pair Plot](#11-pair-plot)
    - [1.2 Correlation Matrix](#12-correlation-matrix)
    - [1.3 Treating Missing Values](#13-treating-missing-values)
    - [1.4 Missing Embarked Values](#14-missing-embarked-values)
    - [1.5 Missing Age Values](#15-missing-age-values)
        - [1.5.1 Age vs Survived](#151-age-vs-survived)
        - [1.5.2 Age vs Pclass](#152-age-vs-pclass)
        - [1.5.3 Age vs Sex](#153-age-vs-sex)
        - [1.5.4 Age vs Siblings and Spouses](#154-age-vs-siblings-and-spouses)
        - [1.5.5 Age vs Parents and Children](#155-age-vs-parents-and-children)
        - [1.5.6 Age vs Port Embarked](#156-age-vs-port-embarked)
        - [1.5.7 Grouping FareRange](#157-grouping-farerange)
            - [1.5.7.1 FareRange vs Pclass](#1571-farerange-vs-pclass)
            - [1.5.7.2 First Class 0-10 Passengers](#1572-first-class-0-10-passengers)
            - [1.5.7.3 Second Class 0-10 Passengers](#1573