# Preprocessing

## Parsing the html in the epubs

In [3]:
import glob
from collections import OrderedDict
from bs4 import BeautifulSoup

### UK edition (Bloomsbury version)

In [4]:
def EnglishHarry(uk_path='../../data/rowling/potter/UK/OEBPS'):
    """Parse the UK (Bloomsbury) chapter files into nested ordered dicts.

    Args:
        uk_path: Folder holding one sub-folder per book, named ``1`` to
            ``7``; each sub-folder contains the chapter files, which must
            match ``hp*_ch*.html``.

    Returns:
        An ``OrderedDict`` mapping book title -> ``OrderedDict`` mapping
        chapter title -> list of paragraph strings with all runs of
        whitespace collapsed to single spaces. Books without any matching
        file are absent from the result.
    """
    import os

    HP = OrderedDict()

    for i in range(1, 8):
        # iterate over individual book folders; chapter order is the
        # lexicographic order of the file names
        # (assumes chapter numbers are zero-padded -- TODO confirm).
        pattern = os.path.join(uk_path, str(i), 'hp*_ch*.html')
        files = sorted(glob.glob(pattern))

        book_title = None

        for html_file in files:
            # Read raw bytes so that BeautifulSoup detects the document's
            # declared encoding instead of relying on the platform default.
            with open(html_file, 'rb') as f:
                tree = BeautifulSoup(f.read(), 'lxml')

            # get title of current book (taken from the first chapter file):
            if book_title is None:
                book_title = tree.title.text.split(' - ')[0].strip()
                HP[book_title] = OrderedDict()

            # get title of current chapter; files without an <h1> heading
            # carry no chapter and are skipped (as AmericanHarry does).
            chapter = tree.h1
            if chapter is None:
                continue
            chapter_title = chapter.text.split(' - ')[0].strip()

            # Only <p> siblings that follow the heading count as paragraphs.
            HP[book_title][chapter_title] = [
                ' '.join(element.text.split())
                for element in chapter.next_siblings
                if element.name == 'p'
            ]

    return HP

In [5]:
# Parse the UK edition and print a per-chapter paragraph count as a sanity check.
UK_HP = EnglishHarry()

for title, chapters in UK_HP.items():
    print(title)
    for heading, paragraphs in chapters.items():
        print(f'   {heading} ({len(paragraphs)} paragraphs)')

Harry Potter and the Philosopher's Stone
   The Boy Who Lived (111 paragraphs)
   The Vanishing Glass (100 paragraphs)
   The Letters from No One (128 paragraphs)
   The Keeper of the Keys (137 paragraphs)
   Diagon Alley (260 paragraphs)
   The Journey from Platform Nine and Three-Quarters (288 paragraphs)
   The Sorting Hat (143 paragraphs)
   The Potions Master (84 paragraphs)
   The Midnight Duel (214 paragraphs)
   Hallowe’en (162 paragraphs)
   Quidditch (138 paragraphs)
   The Mirror of Erised (210 paragraphs)
   Nicolas Flamel (125 paragraphs)
   Norbert the Norwegian Ridgeback (140 paragraphs)
   The Forbidden Forest (188 paragraphs)
   Through the Trapdoor (305 paragraphs)
   The Man with Two Faces (233 paragraphs)
Harry Potter and the Chamber of Secrets
   The Worst Birthday (95 paragraphs)
   The Burrow (183 paragraphs)
   At Flourish and Blotts (192 paragraphs)
   The Whomping Willow (196 paragraphs)
   Gilderoy Lockhart (148 paragraphs)
   Mudbloods and Murmurs (170 parag

### US edition (Scholastic)

*Note: manually remove these files, which have not been properly inserted: part0024.html, part0113.html*

In [6]:
def AmericanHarry(us_path='../../data/rowling/potter/US/text/'):
    """Parse the US (Scholastic) HTML files into nested ordered dicts.

    All ``*.html`` files directly inside ``us_path`` are visited in sorted
    file-name order. A new book starts whenever the ``<title>`` of a file
    differs from that of the previous one; files whose title contains
    "collection" are ignored. The chapter heading is the first ``<h3>`` in
    the body, or else the first ``<h2>``; files without either, and tables
    of contents, contribute no chapter.

    Args:
        us_path: Folder containing the HTML files of all books.

    Returns:
        An ``OrderedDict`` mapping book title -> ``OrderedDict`` mapping
        chapter title -> list of paragraph strings with all runs of
        whitespace collapsed to single spaces. Curly apostrophes in book
        and chapter titles are replaced by straight ones.
    """
    import os

    HP = OrderedDict()

    book_title = ''

    for fn in sorted(glob.glob(os.path.join(us_path, '*.html'))):
        # Read raw bytes so that BeautifulSoup detects the document's
        # declared encoding instead of relying on the platform default.
        with open(fn, 'rb') as f:
            tree = BeautifulSoup(f.read(), 'lxml')

        title = tree.title.text.split(' - ')[0].strip()
        title = title.replace('’', "'")

        if 'collection' in title.lower():
            continue

        # detect start of new book:
        if title != book_title:
            book_title = title
            HP[book_title] = OrderedDict()

        chapter = tree.html.body.h3
        if not chapter:
            chapter = tree.html.body.h2
        if not chapter:
            continue

        chapter_title = chapter.text.strip().replace('’', "'")

        # skip ToC
        if 'contents' in chapter_title.lower():
            continue

        # Only <p> siblings that follow the heading count as paragraphs.
        HP[book_title][chapter_title] = [
            ' '.join(element.text.split())
            for element in chapter.next_siblings
            if element.name == 'p'
        ]

    return HP

In [7]:
# Parse the US edition and print a per-chapter paragraph count as a sanity check.
US_HP = AmericanHarry()

for title, chapters in US_HP.items():
    print(title)
    for heading, paragraphs in chapters.items():
        print(f'   {heading} ({len(paragraphs)} paragraphs)')

Harry Potter and the Sorcerer's Stone
   THE BOY WHO LIVED (110 paragraphs)
   THE VANISHING GLASS (100 paragraphs)
   THE LETTERS FROM NO ONE (121 paragraphs)
   THE KEEPER OF THE KEYS (137 paragraphs)
   DIAGON ALLEY (252 paragraphs)
   THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS (282 paragraphs)
   THE SORTING HAT (143 paragraphs)
   THE POTIONS MASTER (84 paragraphs)
   THE MIDNIGHT DUEL (211 paragraphs)
   HALLOWEEN (161 paragraphs)
   QUIDDITCH (137 paragraphs)
   THE MIRROR OF ERISED (203 paragraphs)
   NICOLAS FLAMEL (120 paragraphs)
   NORBERT THE NORWEGIAN RIDGEBACK (134 paragraphs)
   THE FORBIDDEN FOREST (184 paragraphs)
   THROUGH THE TRAPDOOR (304 paragraphs)
   THE MAN WITH TWO FACES (229 paragraphs)
Harry Potter and the Chamber of Secrets
   THE WORST BIRTHDAY (96 paragraphs)
   THE BURROW (184 paragraphs)
   AT FLOURISH AND BLOTTS (198 paragraphs)
   THE WHOMPING WILLOW (199 paragraphs)
   GILDEROY LOCKHART (148 paragraphs)
   MUDBLOODS AND MURMURS (170 paragraph

### Compare

First at the level of paragraphs - or rather text blocks - which we extracted per chapter:

In [8]:
# Compare the number of extracted text blocks per chapter, UK vs US.
# Books and chapters are paired purely by position (zip), so a chapter that
# is missing in one edition would shift every later pair in that book.
diffs = []
for uk_book, us_book in zip(UK_HP, US_HP):
    print(uk_book, 'vs', us_book)
    for uk_chap, us_chap in zip(UK_HP[uk_book], US_HP[us_book]):
        print('   ', uk_chap, 'vs', us_chap)
        uk_len = len(UK_HP[uk_book][uk_chap])
        us_len = len(US_HP[us_book][us_chap])
        diff = abs(uk_len - us_len)
        # UK count first, matching the "UK vs US" order of the titles above.
        print('   ', uk_len, 'vs', us_len, '-> diff of ', diff)
        diffs.append(diff)

# default=0 keeps the cell from crashing when nothing was parsed.
print('Maximum difference in text blocks between chapters:', max(diffs, default=0))

Harry Potter and the Philosopher's Stone vs Harry Potter and the Sorcerer's Stone
    The Boy Who Lived vs THE BOY WHO LIVED
    111 vs 110 -> diff of  1
    The Vanishing Glass vs THE VANISHING GLASS
    100 vs 100 -> diff of  0
    The Letters from No One vs THE LETTERS FROM NO ONE
    128 vs 121 -> diff of  7
    The Keeper of the Keys vs THE KEEPER OF THE KEYS
    137 vs 137 -> diff of  0
    Diagon Alley vs DIAGON ALLEY
    260 vs 252 -> diff of  8
    The Journey from Platform Nine and Three-Quarters vs THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS
    288 vs 282 -> diff of  6
    The Sorting Hat vs THE SORTING HAT
    143 vs 143 -> diff of  0
    The Potions Master vs THE POTIONS MASTER
    84 vs 84 -> diff of  0
    The Midnight Duel vs THE MIDNIGHT DUEL
    214 vs 211 -> diff of  3
    Hallowe’en vs HALLOWEEN
    162 vs 161 -> diff of  1
    Quidditch vs QUIDDITCH
    138 vs 137 -> diff of  1
    The Mirror of Erised vs THE MIRROR OF ERISED
    210 vs 203 -> diff of  7
   

    207 vs 202 -> diff of  5
    The Unknowable Room vs THE UNKNOWABLE ROOM
    191 vs 192 -> diff of  1
    After the Burial vs AFTER THE BURIAL
    220 vs 221 -> diff of  1
    Horcruxes vs HORCRUXES
    160 vs 162 -> diff of  2
    Sectumsempra vs SECTUMSEMPRA
    164 vs 161 -> diff of  3
    The Seer Overheard vs THE SEER OVERHEARD
    220 vs 212 -> diff of  8
    The Cave vs THE CAVE
    225 vs 222 -> diff of  3
    The Lightning-Struck Tower vs THE LIGHTNING-STRUCK TOWER
    166 vs 160 -> diff of  6
    Flight of the Prince vs FLIGHT OF THE PRINCE
    99 vs 92 -> diff of  7
    The Phoenix Lament vs THE PHOENIX LAMENT
    187 vs 185 -> diff of  2
    The White Tomb vs THE WHITE TOMB
    133 vs 131 -> diff of  2
Harry Potter and the Deathly Hallows vs Harry Potter and the Deathly Hallows
    The Dark Lord Ascending vs THE DARK LORD ASCENDING
    108 vs 107 -> diff of  1
    In Memoriam vs IN MEMORIAM
    25 vs 26 -> diff of  1
    The Dursleys Departing vs THE DURSLEYS DEPARTING
 

Now compare at the per-chapter character level:

In [9]:
# Compare the number of characters per chapter (paragraphs joined by
# newlines), UK vs US. The differences go into their own list: appending to
# the paragraph-level `diffs` of the previous cell would mix two units in the
# reported maximum and would keep growing each time this cell is re-run.
char_diffs = []
for uk_book, us_book in zip(UK_HP, US_HP):
    print(uk_book, 'vs', us_book)
    for uk_chap, us_chap in zip(UK_HP[uk_book], US_HP[us_book]):
        print('   ', uk_chap, 'vs', us_chap)
        uk_len = len('\n'.join(UK_HP[uk_book][uk_chap]))
        us_len = len('\n'.join(US_HP[us_book][us_chap]))
        diff = abs(uk_len - us_len)
        # UK count first, matching the "UK vs US" order of the titles above.
        print('   ', uk_len, 'vs', us_len, '-> diff of ', diff)
        char_diffs.append(diff)

# default=0 keeps the cell from crashing when nothing was parsed.
print('Maximum character difference between two chapters: ', max(char_diffs, default=0))

Harry Potter and the Philosopher's Stone vs Harry Potter and the Sorcerer's Stone
    The Boy Who Lived vs THE BOY WHO LIVED
    25677 vs 25842 -> diff of  165
    The Vanishing Glass vs THE VANISHING GLASS
    18983 vs 19070 -> diff of  87
    The Letters from No One vs THE LETTERS FROM NO ONE
    21246 vs 21338 -> diff of  92
    The Keeper of the Keys vs THE KEEPER OF THE KEYS
    19558 vs 19627 -> diff of  69
    Diagon Alley vs DIAGON ALLEY
    35890 vs 36080 -> diff of  190
    The Journey from Platform Nine and Three-Quarters vs THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS
    34257 vs 34277 -> diff of  20
    The Sorting Hat vs THE SORTING HAT
    23648 vs 23809 -> diff of  161
    The Potions Master vs THE POTIONS MASTER
    16332 vs 16425 -> diff of  93
    The Midnight Duel vs THE MIDNIGHT DUEL
    28013 vs 28026 -> diff of  13
    Hallowe’en vs HALLOWEEN
    23695 vs 23737 -> diff of  42
    Quidditch vs QUIDDITCH
    18919 vs 19025 -> diff of  106
    The Mirror of Er

    38277 vs 38503 -> diff of  226
    Elf Tails vs ELF TAILS
    35887 vs 36157 -> diff of  270
    Lord Voldemort’s Request vs LORD VOLDEMORT'S REQUEST
    37639 vs 37819 -> diff of  180
    The Unknowable Room vs THE UNKNOWABLE ROOM
    32698 vs 33009 -> diff of  311
    After the Burial vs AFTER THE BURIAL
    32747 vs 33234 -> diff of  487
    Horcruxes vs HORCRUXES
    33124 vs 33264 -> diff of  140
    Sectumsempra vs SECTUMSEMPRA
    33821 vs 34111 -> diff of  290
    The Seer Overheard vs THE SEER OVERHEARD
    29250 vs 29457 -> diff of  207
    The Cave vs THE CAVE
    36282 vs 36555 -> diff of  273
    The Lightning-Struck Tower vs THE LIGHTNING-STRUCK TOWER
    25705 vs 25997 -> diff of  292
    Flight of the Prince vs FLIGHT OF THE PRINCE
    20571 vs 20388 -> diff of  183
    The Phoenix Lament vs THE PHOENIX LAMENT
    32077 vs 32287 -> diff of  210
    The White Tomb vs THE WHITE TOMB
    30882 vs 31122 -> diff of  240
Harry Potter and the Deathly Hallows vs Harry Potte

## Save as XML

Simple dump:

In [13]:
import lxml
import lxml.etree  # a bare `import lxml` does not load the etree submodule
import spacy
nlp = spacy.load('en')  # not used by simple_xml itself


def simple_xml(HP, fn):
    """Write a parsed edition to ``fn`` as a plain XML document.

    The document has the shape
    ``<HarryPotterSeries><book title n><chapter title n><p n>text</p>...``,
    where every ``n`` attribute is a 1-based position. Progress is printed
    with 0-based book and chapter indices.

    Args:
        HP: Mapping of book title -> mapping of chapter title -> list of
            paragraph strings, as returned by ``EnglishHarry`` or
            ``AmericanHarry``.
        fn: Path of the XML file to create or overwrite.
    """
    series = lxml.etree.Element('HarryPotterSeries')

    for book_idx, book_title in enumerate(HP):
        print(book_idx, book_title)
        book_node = lxml.etree.Element('book')
        book_node.attrib['title'] = book_title
        book_node.attrib['n'] = str(book_idx + 1)

        for chapter_idx, chapter_title in enumerate(HP[book_title]):
            print('   ', chapter_idx, chapter_title)
            chapter_node = lxml.etree.Element('chapter')
            chapter_node.attrib['title'] = chapter_title
            chapter_node.attrib['n'] = str(chapter_idx + 1)

            for paragraph_idx, paragraph in enumerate(HP[book_title][chapter_title]):
                paragraph_node = lxml.etree.Element('p')
                paragraph_node.attrib['n'] = str(paragraph_idx + 1)
                paragraph_node.text = paragraph

                chapter_node.append(paragraph_node)

            book_node.append(chapter_node)

        series.append(book_node)

    # tostring(..., encoding='utf-8') already yields UTF-8 bytes; writing them
    # in binary mode guarantees the file matches its XML declaration, whereas
    # a text-mode file would re-encode with the platform default encoding.
    with open(fn, 'wb') as f:
        f.write(lxml.etree.tostring(series, xml_declaration=True,
                                    pretty_print=True, encoding='utf-8'))

In [14]:
# Dump both editions, UK first, each to its own plain-XML file.
for edition, target in ((UK_HP, 'simple_potter_uk.xml'),
                        (US_HP, 'simple_potter_us.xml')):
    simple_xml(edition, target)

0 Harry Potter and the Philosopher's Stone
    0 The Boy Who Lived
    1 The Vanishing Glass
    2 The Letters from No One
    3 The Keeper of the Keys
    4 Diagon Alley
    5 The Journey from Platform Nine and Three-Quarters
    6 The Sorting Hat
    7 The Potions Master
    8 The Midnight Duel
    9 Hallowe’en
    10 Quidditch
    11 The Mirror of Erised
    12 Nicolas Flamel
    13 Norbert the Norwegian Ridgeback
    14 The Forbidden Forest
    15 Through the Trapdoor
    16 The Man with Two Faces
1 Harry Potter and the Chamber of Secrets
    0 The Worst Birthday
    2 The Burrow
    3 At Flourish and Blotts
    4 The Whomping Willow
    5 Gilderoy Lockhart
    6 Mudbloods and Murmurs
    7 The Deathday Party
    8 The Writing on the Wall
    9 The Rogue Bludger
    10 The Duelling Club
    11 The Polyjuice Potion
    12 The Very Secret Diary
    13 Cornelius Fudge
    14 Aragog
    15 The Chamber of Secrets
    16 The Heir of Slytherin
    17 Dobby’s Reward
2 Harry Potter and the 

For the US edition, in which different quotation marks are used, the solution is relatively simple. (But not perfect, because there are still pages or names that also get tagged):

In [11]:
import lxml
import lxml.etree  # a bare `import lxml` does not load the etree submodule
import spacy


nlp = spacy.load('en')

# Build <HarryPotterSeries>/<book>/<chapter>/<p>/<said> for the US edition.
# Every paragraph is cut into <said> runs: direct="true" for text between
# “ and ”, direct="false" for everything else.
series = lxml.etree.Element('HarryPotterSeries')

for book_idx, book_title in enumerate(US_HP):
    print(book_idx, book_title)
    book_node = lxml.etree.Element('book')
    book_node.attrib['title'] = book_title
    book_node.attrib['n'] = str(book_idx + 1)

    for chapter_idx, chapter_title in enumerate(US_HP[book_title]):
        print('   ', chapter_idx, chapter_title)
        chapter_node = lxml.etree.Element('chapter')
        chapter_node.attrib['title'] = chapter_title
        chapter_node.attrib['n'] = str(chapter_idx + 1)

        for paragraph_idx, paragraph in enumerate(US_HP[book_title][chapter_title]):
            paragraph_node = lxml.etree.Element('p')
            paragraph_node.attrib['n'] = str(paragraph_idx + 1)

            # Run currently being filled; starts out as narration.
            said_node = lxml.etree.Element('said')
            said_node.attrib['direct'] = 'false'
            said_node.text = ''
            # True right after a closing quote attached said_node to the
            # paragraph; the next ordinary token must then start a new run.
            just_flushed = False

            tokens = nlp(paragraph)

            for token in tokens:

                # opening quotation mark:
                if token.text == '“':
                    # Attach the pending run, unless a closing quote has
                    # already attached it.
                    if said_node.text and not just_flushed:
                        paragraph_node.append(said_node)

                    said_node = lxml.etree.Element('said')
                    said_node.attrib['direct'] = 'true'
                    said_node.attrib['who'] = 'unknown'
                    said_node.text = token.text_with_ws
                    # Without this reset, a quote opening directly after a
                    # closing quote would be thrown away by the narration
                    # branch below and its words tagged direct="false".
                    just_flushed = False

                # closing quotation mark:
                elif token.text[-1] == '”':
                    said_node.text += token.text_with_ws
                    paragraph_node.append(said_node)
                    just_flushed = True
                else:
                    if just_flushed:
                        said_node = lxml.etree.Element('said')
                        said_node.attrib['direct'] = 'false'
                        said_node.text = ''
                        just_flushed = False

                    said_node.text += token.text_with_ws

            # don't forget last bit dangling (a run that was never closed):
            if said_node.text and not just_flushed:
                paragraph_node.append(said_node)

            chapter_node.append(paragraph_node)

        book_node.append(chapter_node)

    series.append(book_node)

# tostring(..., encoding='utf-8') already yields UTF-8 bytes; writing them in
# binary mode guarantees the file matches its XML declaration.
with open('potter_us.xml', 'wb') as f:
    f.write(lxml.etree.tostring(series, xml_declaration=True,
                                pretty_print=True, encoding='utf-8'))

0 Harry Potter and the Sorcerer's Stone
    0 THE BOY WHO LIVED
    1 THE VANISHING GLASS
    2 THE LETTERS FROM NO ONE
    3 THE KEEPER OF THE KEYS
    4 DIAGON ALLEY
    5 THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS
    6 THE SORTING HAT


KeyboardInterrupt: 

Add basic NLP annotations using spaCy. This is a very good illustration of the ambiguity of natural language. The UK edition doesn't differentiate between apostrophes and closing quotation marks, so this requires a bit of hacking (the problem is surprisingly simple for the US edition).
An apostrophe at the end of a token is typically a closing quotation mark, but not always:
- genitive of plural nouns (e.g. `The Dursleys' house`)
- abbreviated ing-forms, e.g. `flyin'` (i.e. slang, typically uttered by Hagrid)
- words with special emphasis or verbatim quotes, e.g. `What is his name? 'Harry'`

The latter is very hard to detect, but we can try to solve, to some extent, the first two issues below. (Note that the problem gets extra emphasis because Spacy doesn't properly recognize the closing quotes, cf. `token.is_quote` property)

In [158]:
import lxml
import lxml.etree  # a bare `import lxml` does not load the etree submodule
import spacy

nlp = spacy.load('en')

# Build <HarryPotterSeries>/<book>/<chapter>/<p>/<said> for the UK edition.
# Every paragraph is cut into <said> runs: direct="true" for text between
# ‘ and ’, direct="false" for everything else. Because ’ doubles as the
# apostrophe, token-final ’ is only treated as a closing quote when it is
# neither a plural genitive nor a clipped form such as "flyin’".
series = lxml.etree.Element('HarryPotterSeries')

for book_idx, book_title in enumerate(UK_HP):
    print(book_idx, book_title)
    book_node = lxml.etree.Element('book')
    book_node.attrib['title'] = book_title
    book_node.attrib['n'] = str(book_idx + 1)

    for chapter_idx, chapter_title in enumerate(UK_HP[book_title]):
        print('   ', chapter_idx, chapter_title)
        chapter_node = lxml.etree.Element('chapter')
        chapter_node.attrib['title'] = chapter_title
        chapter_node.attrib['n'] = str(chapter_idx + 1)

        for paragraph_idx, paragraph in enumerate(UK_HP[book_title][chapter_title]):
            paragraph_node = lxml.etree.Element('p')
            paragraph_node.attrib['n'] = str(paragraph_idx + 1)

            # Run currently being filled; starts out as narration.
            said_node = lxml.etree.Element('said')
            said_node.attrib['direct'] = 'false'
            said_node.text = ''
            # True right after a closing quote attached said_node to the
            # paragraph; the next ordinary token must then start a new run.
            just_flushed = False

            tokens = nlp(paragraph)

            for idx, token in enumerate(tokens):

                # An apostrophe can only belong to the word itself if the
                # sentence carries on after it. At the very end of the
                # paragraph there is no next token, so it is a closing quote.
                # (The explicit bounds check replaces a bare `except: pass`,
                # which also swallowed KeyboardInterrupt, and an unguarded
                # tokens[idx + 1] that raised IndexError on a final token.)
                sentence_continues = (idx + 1 < len(tokens)
                                      and not tokens[idx + 1].is_sent_start)

                # catch potential plural genitive: the part before the
                # apostrophe is tagged as a plural noun on its own
                plural_genitive = False
                stem = token.text[:-1]
                if token.text[-1] == '’' and stem and sentence_continues:
                    stem_doc = nlp(stem)
                    if len(stem_doc):
                        plural_genitive = (stem_doc[0].tag_ == 'NNS')

                # catch potential abbreviation
                abbreviation = (token.text.endswith(('an’', 'in’', 'o’'))
                                and sentence_continues)

                # opening quotation mark:
                if token.text == '‘':
                    # Attach the pending run, unless a closing quote has
                    # already attached it.
                    if said_node.text and not just_flushed:
                        paragraph_node.append(said_node)

                    said_node = lxml.etree.Element('said')
                    said_node.attrib['direct'] = 'true'
                    said_node.attrib['who'] = 'unknown'
                    said_node.text = token.text_with_ws
                    # Without this reset, a quote opening directly after a
                    # closing quote would be thrown away by the narration
                    # branch below and its words tagged direct="false".
                    just_flushed = False

                # closing quotation mark:
                elif token.text[-1] == '’' and not (plural_genitive or abbreviation):
                    said_node.text += token.text_with_ws
                    paragraph_node.append(said_node)
                    just_flushed = True
                else:
                    if just_flushed:
                        said_node = lxml.etree.Element('said')
                        said_node.attrib['direct'] = 'false'
                        said_node.text = ''
                        just_flushed = False

                    said_node.text += token.text_with_ws

            # don't forget last bit dangling (a run that was never closed):
            if said_node.text and not just_flushed:
                paragraph_node.append(said_node)

            chapter_node.append(paragraph_node)

        book_node.append(chapter_node)

    series.append(book_node)

# tostring(..., encoding='utf-8') already yields UTF-8 bytes; writing them in
# binary mode guarantees the file matches its XML declaration.
with open('potter_uk.xml', 'wb') as f:
    f.write(lxml.etree.tostring(series, xml_declaration=True,
                                pretty_print=True, encoding='utf-8'))

0 Harry Potter and the Philosopher's Stone
    0 The Boy Who Lived
    1 The Vanishing Glass
    2 The Letters from No One
    3 The Keeper of the Keys
    4 Diagon Alley
    5 The Journey from Platform Nine and Three-Quarters
    6 The Sorting Hat
    7 The Potions Master
    8 The Midnight Duel
    9 Hallowe’en
    10 Quidditch
    11 The Mirror of Erised
    12 Nicolas Flamel
    13 Norbert the Norwegian Ridgeback
    14 The Forbidden Forest
    15 Through the Trapdoor
    16 The Man with Two Faces
1 Harry Potter and the Chamber of Secrets
    0 The Worst Birthday
    2 The Burrow
    3 At Flourish and Blotts
    4 The Whomping Willow
    5 Gilderoy Lockhart
    6 Mudbloods and Murmurs
    7 The Deathday Party
    8 The Writing on the Wall
    9 The Rogue Bludger
    10 The Duelling Club
    11 The Polyjuice Potion
    12 The Very Secret Diary
    13 Cornelius Fudge
    14 Aragog
    15 The Chamber of Secrets
    16 The Heir of Slytherin
    17 Dobby’s Reward
2 Harry Potter and the 

Now, keep annotations:

In [167]:
import lxml
import lxml.etree  # a bare `import lxml` does not load the etree submodule
import spacy

nlp = spacy.load('en')

# Same <said> segmentation as for potter_uk.xml, but every token is kept as
# its own <token> element carrying the spaCy annotations.
series = lxml.etree.Element('HarryPotterSeries')


def token_to_xml(token):
    """Return a ``<token>`` element for one spaCy token.

    The element text is the token including its trailing whitespace; the
    attributes hold the lemma, the fine-grained tag (``pos``), the entity
    type (``ent``) and the entity IOB code (``ent_iob``).
    """
    token_node = lxml.etree.Element('token')

    token_node.text = token.text_with_ws
    token_node.attrib['lemma'] = token.lemma_
    token_node.attrib['pos'] = token.tag_
    token_node.attrib['ent'] = token.ent_type_
    token_node.attrib['ent_iob'] = token.ent_iob_

    return token_node


cnt = 0  # NOTE(review): never updated or read in this cell

for book_idx, book_title in enumerate(UK_HP):
    print(book_idx, book_title)
    book_node = lxml.etree.Element('book')
    book_node.attrib['title'] = book_title
    book_node.attrib['n'] = str(book_idx + 1)

    for chapter_idx, chapter_title in enumerate(UK_HP[book_title]):
        print('   ', chapter_idx, chapter_title)
        chapter_node = lxml.etree.Element('chapter')
        chapter_node.attrib['title'] = chapter_title
        chapter_node.attrib['n'] = str(chapter_idx + 1)

        for paragraph_idx, paragraph in enumerate(UK_HP[book_title][chapter_title]):
            paragraph_node = lxml.etree.Element('p')
            paragraph_node.attrib['n'] = str(paragraph_idx + 1)

            # Run currently being filled; starts out as narration.
            said_node = lxml.etree.Element('said')
            said_node.attrib['direct'] = 'false'
            said_node.text = ''
            # True right after a closing quote attached said_node to the
            # paragraph; the next ordinary token must then start a new run.
            just_flushed = False

            tokens = nlp(paragraph)

            for idx, token in enumerate(tokens):

                # An apostrophe can only belong to the word itself if the
                # sentence carries on after it. At the very end of the
                # paragraph there is no next token, so it is a closing quote.
                # (The explicit bounds check replaces a bare `except: pass`,
                # which also swallowed KeyboardInterrupt, and an unguarded
                # tokens[idx + 1] that raised IndexError on a final token.)
                sentence_continues = (idx + 1 < len(tokens)
                                      and not tokens[idx + 1].is_sent_start)

                # catch potential plural genitive: the part before the
                # apostrophe is tagged as a plural noun on its own
                plural_genitive = False
                stem = token.text[:-1]
                if token.text[-1] == '’' and stem and sentence_continues:
                    stem_doc = nlp(stem)
                    if len(stem_doc):
                        plural_genitive = (stem_doc[0].tag_ == 'NNS')

                # catch potential abbreviation
                abbreviation = (token.text.endswith(('an’', 'in’', 'o’'))
                                and sentence_continues)

                # opening quotation mark:
                if token.text == '‘':
                    # Attach the pending run if it holds any tokens, unless
                    # a closing quote has already attached it.
                    if len(said_node) and not just_flushed:
                        paragraph_node.append(said_node)

                    said_node = lxml.etree.Element('said')
                    said_node.attrib['direct'] = 'true'
                    said_node.attrib['who'] = 'unknown'
                    said_node.append(token_to_xml(token))
                    # Without this reset, a quote opening directly after a
                    # closing quote would be thrown away by the narration
                    # branch below and its words tagged direct="false".
                    just_flushed = False

                # closing quotation mark:
                elif token.text[-1] == '’' and not (plural_genitive or abbreviation):
                    said_node.append(token_to_xml(token))
                    paragraph_node.append(said_node)
                    just_flushed = True
                else:
                    if just_flushed:
                        said_node = lxml.etree.Element('said')
                        said_node.attrib['direct'] = 'false'
                        just_flushed = False

                    said_node.append(token_to_xml(token))

            # don't forget last bit dangling (a run that was never closed):
            if len(said_node) and not just_flushed:
                paragraph_node.append(said_node)

            chapter_node.append(paragraph_node)

        book_node.append(chapter_node)

    series.append(book_node)

# tostring(..., encoding='utf-8') already yields UTF-8 bytes; writing them in
# binary mode guarantees the file matches its XML declaration.
with open('rich_potter_uk.xml', 'wb') as f:
    f.write(lxml.etree.tostring(series, xml_declaration=True,
                                pretty_print=True, encoding='utf-8'))

0 Harry Potter and the Philosopher's Stone
    0 The Boy Who Lived
    1 The Vanishing Glass
    2 The Letters from No One
    3 The Keeper of the Keys
    4 Diagon Alley
    5 The Journey from Platform Nine and Three-Quarters
    6 The Sorting Hat
    7 The Potions Master
    8 The Midnight Duel
    9 Hallowe’en
    10 Quidditch
    11 The Mirror of Erised
    12 Nicolas Flamel
    13 Norbert the Norwegian Ridgeback
    14 The Forbidden Forest
    15 Through the Trapdoor
    16 The Man with Two Faces
1 Harry Potter and the Chamber of Secrets
    0 The Worst Birthday
    2 The Burrow
    3 At Flourish and Blotts
    4 The Whomping Willow
    5 Gilderoy Lockhart
    6 Mudbloods and Murmurs
    7 The Deathday Party
    8 The Writing on the Wall
    9 The Rogue Bludger
    10 The Duelling Club
    11 The Polyjuice Potion
    12 The Very Secret Diary
    13 Cornelius Fudge
    14 Aragog
    15 The Chamber of Secrets
    16 The Heir of Slytherin
    17 Dobby’s Reward
2 Harry Potter and the 

## Tagging character names

Use the Wikipedia ids from this list:
https://en.m.wikipedia.org/wiki/List_of_Harry_Potter_characters