In [12]:
from PyPDF2 import PdfFileWriter, PdfFileReader
import os
import csv
from pprint import pprint
from collections import namedtuple
from io import StringIO

In [13]:
os.chdir(os.path.expanduser('~/Documents/le-nozze-di-figaro/'))
os.listdir()

['outline.sh',
 'files.txt',
 'pages-103-108.pdf',
 'pages-066-067.pdf',
 'pages-144-146.pdf',
 'le-nozze-di-figaro-act-i.pdf',
 'figaro-toc-act-i.txt',
 'pages-005-028.pdf',
 'le-nozze-di-figaro-act-i-page-labels.pdf',
 'pages-109-126.pdf',
 'pages-068-079.pdf',
 'pages-139-144.pdf',
 'pages-090-093.pdf',
 'pages-042-053.pdf',
 'pages-053-057.pdf',
 'pages-137-139.pdf',
 'le-nozze-di-figaro-act-i-copy.pdf',
 'pages-161-166.pdf',
 'figaro-toc-act-i.csv',
 'concat.sh',
 'pages-029-040.pdf',
 'pages-146-160.pdf',
 'pages-127-128.pdf',
 'pages-057-066.pdf',
 'front-matter.pdf',
 'pages-081-090.pdf',
 'pages-080-080.pdf',
 'pages-094-103.pdf',
 'figaro-toc-act-i_bookmarks.txt',
 'nma_305_-30_-3_eng.pdf',
 'pages-041-042.pdf',
 'pages-129-136.pdf']

In [14]:

Record = namedtuple('Record', 'level title label page')

def get_record(record_dict):
    level = int(record_dict['Level'])
    title = record_dict['Title']
    page_string = record_dict['Page']
    if page_string:
        page = int(page_string)
    else:
        page = None
    label = record_dict['Label']
    return Record(level, title, label, page)
    

In [15]:
class Node:
    
    
    def __init__(self, rec):
        self._rec = rec
        self._children = []
        
        
    @property
    def label(self):
        return self._rec.label
    
    @property
    def level(self):
        return self._rec.level
    
    @property
    def title(self):
        return self._rec.title
    
    @property
    def page(self):
        return self._rec.page
    
    @property
    def children(self):
        return self._children
    
    
    def append(self, child):
        self._children.append(child)
        
        
    def __repr__(self):
        with StringIO() as sp:
            sp.write((self.level) * '  ')
            sp.write(f'{self.title} (page {self.page})')
            sp.write('\n')
            
            for child in self._children:
                sp.write(str(child))
                
            return sp.getvalue()
        
    def __str__(self):
        return self.__repr__()
    

def get_raw_toc():
    with open('figaro-toc-act-i.csv') as fp:
        reader = csv.DictReader(fp)
        return [get_record(row) for row in reader]
    
def build_toc_tree():
    raw_toc = get_raw_toc()
    root = Node(Record(title='root', level=0, label=None, page=None))
    parents = []
    for i in range(len(raw_toc)):
        rec = raw_toc[i]
        node = Node(rec)
        if rec.level == 1:
            root.append(node)
            parents.clear()
            parents.append(node)
        else:
            parent_node = parents[-1]
            parent_level = parent_node.level
            if rec.level <= parent_level:
                parents.pop()
                parent_node = parents[-1]
                parent_node.append(node)
                parents.append(node)
            elif rec.level > parent_node.level:
                parents[-1].append(node)
                parents.append(node)

    return root

tree = build_toc_tree()
    
pprint(tree)

root (page None)
  Zur Edition (page 4)
  Vorwort (page 5)
  Faksimiles (page 21)
  Sinfonia (page 29)
  Atto primo (page 53)
    Scena I (page 53)
      No. 1 Duettino, ‟Cinque ... dieci ...” (page 53)
      Recitativo, ‟Cosa stai misurando” (page 65)
      No. 2 Duettino, ‟Se a caso madama la notte ti chiama” (page 66)
      Recitativo,‟Or bene; ascolta, e taci!” (page 77)
    Scena II (page 80)
      Recitativo, ‟Bravo signor padrone!” (page 80)
      No. 3 Cavatina, ‟Se vuol ballare signor Contino” (page 81)
    Scena III  (page 90)
      Recitativo, ‟Ed aspettaste il giorno fissato” (page 90)
      No. 4 Aria, ‟La vendetta” (page 92)
    Scena IV (page 104)
      Recitativo, ‟Tutto ancor no ho perso” (page 104)
      No. 5 Duettino, ‟Va resti vita, madama brillante” (page 105)
    Scena V (page 114)
      Recitativo, ‟Va’ là, vecchia pedante” (page 114)
      No. 6 Aria, ‟No so più cosa son, cosa faccio” (page 118)
    Scena VI (page 127)
      Recitativo, ‟Ah son perduto!” (page 

In [16]:
writer = PdfFileWriter()
with open('le-nozze-di-figaro-act-i-page-labels.pdf', mode='rb') as fp1:
    reader = PdfFileReader(fp1)
    for i in range(reader.getNumPages()):
        writer.addPage(reader.getPage(i))
    
#     parents = []
#     previous_record = None
#     for record in get_toc():
#         if record.level == 1:
#             previous_record = writer.addBookmark(record.title, record.page)
#             parents.clear()
#             parents.append(previous_record)
#         else:
#             writer.addBookmark(record.title, record.page, parents[-1])
    
    for record in get_toc():
        writer.addBookmark(record.title, record.page - 1)
    
        
    with open('le-nozze-di-figaro-act-i-copy.pdf', mode='wb') as fp2:
        writer.write(fp2)

NameError: name 'get_toc' is not defined

In [17]:
dir(PdfFileWriter)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_addObject',
 '_addPage',
 '_sweepIndirectReferences',
 '_valid_layouts',
 '_valid_modes',
 'addAttachment',
 'addBlankPage',
 'addBookmark',
 'addBookmarkDestination',
 'addBookmarkDict',
 'addJS',
 'addLink',
 'addMetadata',
 'addNamedDestination',
 'addNamedDestinationObject',
 'addPage',
 'addURI',
 'appendPagesFromReader',
 'cloneDocumentFromReader',
 'cloneReaderDocumentRoot',
 'encrypt',
 'getNamedDestRoot',
 'getNumPages',
 'getObject',
 'getOutlineRoot',
 'getPage',
 'getPageLayout',
 'getPageMode',
 'getReference',
 'insertBlankPage',
 'insertPage',
 'pageLayout',
 'pageMode',
 'removeImages',
 'removeLinks',
 

In [11]:
class NumContainer():

    def __init__(self, i):
        self._number = i
        self._children = None

    @property
    def number(self):
        return self._number

    @property
    def children(self):
        return self._children

    @children.setter
    def children(self, value):
        self._children = value


def get_alpha():
    for i in range(ord('a'), ord('z')):
        yield chr(i)

def get_numbers():
    for i in range(1, 5):
       cont = NumContainer(i)
       cont.children = get_alpha()
       yield cont

for cont in get_numbers():
    print(cont.number)
    for alpha in cont.children:
        print(alpha)



KeyboardInterrupt: 