# Preprocessing

## Parsing the html in the epubs

In [13]:
import glob
from collections import OrderedDict
from bs4 import BeautifulSoup

### UK edition (Bloomsbury version)

In [16]:
def EnglishHarry():
    """Parse the UK (Bloomsbury) edition into a nested, ordered mapping.

    Walks the book folders ``1`` .. ``7`` below the UK ``OEBPS`` directory and
    reads every ``hp*_ch*.html`` file of each folder in sorted filename order.

    Returns
    -------
    OrderedDict
        ``{book title: OrderedDict({chapter title: [paragraph, ...]})}``.
        The book title is the part of the first file's ``<title>`` text before
        the first ``' - '``; the chapter title is the same part of the
        ``<h1>`` text. Each paragraph is the text of a ``<p>`` element that
        follows the ``<h1>`` as a sibling, with runs of whitespace collapsed
        to single spaces. Files without an ``<h1>`` contribute no chapter.
    """
    HP = OrderedDict()
    uk_path = '../../data/rowling/potter/UK/OEBPS'

    for i in range(1, 8):
        # iterate over individual book folders
        # NOTE(review): sorted() orders filenames lexicographically, so the
        # chapter order presumably relies on zero-padded numbers -- confirm.
        files = sorted(glob.glob(f'{uk_path}/{i}/hp*_ch*.html'))

        book_title = None

        for html_file in files:
            # Explicit encoding so the result does not depend on the platform
            # default (assumes the EPUB content files are UTF-8 -- TODO confirm).
            with open(html_file, 'r', encoding='utf-8') as f:
                markup = f.read()
            tree = BeautifulSoup(markup, 'lxml')

            # get title of current book (taken from the first file only):
            if book_title is None:
                book_title = tree.title.text.split(' - ')[0].strip()
                HP[book_title] = OrderedDict()

            # get title of current chapter:
            chapter = tree.h1
            if chapter is None:
                # No chapter heading in this file: nothing to anchor the
                # paragraphs on, so skip it instead of raising AttributeError.
                continue
            chapter_title = chapter.text.split(' - ')[0].strip()

            paragraphs = HP[book_title][chapter_title] = []
            for element in chapter.next_siblings:
                # Only <p> siblings of the heading count as text blocks.
                if element.name == 'p':
                    paragraphs.append(' '.join(element.text.split()))

    return HP

In [17]:
# Build the UK corpus, then list each book with its chapters and the number
# of text blocks extracted per chapter.
UK_HP = EnglishHarry()

for book, chapters in UK_HP.items():
    print(book)
    for chapter, paragraphs in chapters.items():
        print(f'   {chapter} ({len(paragraphs)} paragraphs)')

Harry Potter and the Philosopher's Stone
   The Boy Who Lived (111 paragraphs)
   The Vanishing Glass (100 paragraphs)
   The Letters from No One (128 paragraphs)
   The Keeper of the Keys (137 paragraphs)
   Diagon Alley (260 paragraphs)
   The Journey from Platform Nine and Three-Quarters (288 paragraphs)
   The Sorting Hat (143 paragraphs)
   The Potions Master (84 paragraphs)
   The Midnight Duel (214 paragraphs)
   Hallowe’en (162 paragraphs)
   Quidditch (138 paragraphs)
   The Mirror of Erised (210 paragraphs)
   Nicolas Flamel (125 paragraphs)
   Norbert the Norwegian Ridgeback (140 paragraphs)
   The Forbidden Forest (188 paragraphs)
   Through the Trapdoor (305 paragraphs)
   The Man with Two Faces (233 paragraphs)
Harry Potter and the Chamber of Secrets
   The Worst Birthday (95 paragraphs)
   The Burrow (183 paragraphs)
   At Flourish and Blotts (192 paragraphs)
   The Whomping Willow (196 paragraphs)
   Gilderoy Lockhart (148 paragraphs)
   Mudbloods and Murmurs (170 parag

### US edition (Scholastic)

*Note: before running the cell below, manually remove these files from the US `text/` folder, as they have not been properly inserted: part0024.html, part0113.html*

In [18]:
def AmericanHarry():
    """Parse the US (Scholastic) edition into a nested, ordered mapping.

    Reads every ``*.html`` file in the US ``text/`` folder in sorted filename
    order. The part of a file's ``<title>`` text before the first ``' - '``
    is the book title; a change of that title starts a new book. Files whose
    title contains ``'collection'`` and files whose heading contains
    ``'contents'`` (table of contents) are skipped.

    Returns
    -------
    OrderedDict
        ``{book title: OrderedDict({chapter title: [paragraph, ...]})}``.
        The chapter title is the stripped text of the first ``<h3>`` in the
        body, or of the first ``<h2>`` when there is no ``<h3>``. Each
        paragraph is the text of a ``<p>`` element that follows that heading
        as a sibling, with runs of whitespace collapsed to single spaces.
        Typographic apostrophes in titles are replaced by ``'``.
    """
    HP = OrderedDict()
    us_path = '../../data/rowling/potter/US/text/'

    book_title = ''

    for fn in sorted(glob.glob(us_path + '*.html')):
        # Explicit encoding so the result does not depend on the platform
        # default (assumes the EPUB content files are UTF-8 -- TODO confirm).
        with open(fn, 'r', encoding='utf-8') as f:
            markup = f.read()
        tree = BeautifulSoup(markup, 'lxml')

        # A file without <title> is treated as part of the current book
        # (the original `title is not None` test could never be False because
        # the missing tag had already raised AttributeError).
        title_tag = tree.title
        if title_tag is not None:
            title = title_tag.text.split(' - ')[0].strip()
            title = title.replace('’', "'")

            if 'collection' in title.lower():
                continue

            # detect start of new book:
            if title != book_title:
                book_title = title
                # setdefault: a title that shows up again later must not wipe
                # the chapters already collected for that book.
                HP.setdefault(book_title, OrderedDict())

        if book_title not in HP:
            # No book has been started yet, so there is nowhere to file a chapter.
            continue

        body = tree.html.body
        chapter = body.h3
        if chapter is None:
            chapter = body.h2
        if chapter is None:
            continue

        chapter_title = chapter.text.strip().replace('’', "'")

        # skip ToC
        if 'contents' in chapter_title.lower():
            continue

        paragraphs = HP[book_title][chapter_title] = []
        for element in chapter.next_siblings:
            # Only <p> siblings of the heading count as text blocks.
            if element.name == 'p':
                paragraphs.append(' '.join(element.text.split()))

    return HP

In [19]:
# Build the US corpus, then list each book with its chapters and the number
# of text blocks extracted per chapter.
US_HP = AmericanHarry()

for book, chapters in US_HP.items():
    print(book)
    for chapter, paragraphs in chapters.items():
        print(f'   {chapter} ({len(paragraphs)} paragraphs)')

Harry Potter and the Sorcerer's Stone
   THE BOY WHO LIVED (110 paragraphs)
   THE VANISHING GLASS (100 paragraphs)
   THE LETTERS FROM NO ONE (121 paragraphs)
   THE KEEPER OF THE KEYS (137 paragraphs)
   DIAGON ALLEY (252 paragraphs)
   THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS (282 paragraphs)
   THE SORTING HAT (143 paragraphs)
   THE POTIONS MASTER (84 paragraphs)
   THE MIDNIGHT DUEL (211 paragraphs)
   HALLOWEEN (161 paragraphs)
   QUIDDITCH (137 paragraphs)
   THE MIRROR OF ERISED (203 paragraphs)
   NICOLAS FLAMEL (120 paragraphs)
   NORBERT THE NORWEGIAN RIDGEBACK (134 paragraphs)
   THE FORBIDDEN FOREST (184 paragraphs)
   THROUGH THE TRAPDOOR (304 paragraphs)
   THE MAN WITH TWO FACES (229 paragraphs)
Harry Potter and the Chamber of Secrets
   THE WORST BIRTHDAY (96 paragraphs)
   THE BURROW (184 paragraphs)
   AT FLOURISH AND BLOTTS (198 paragraphs)
   THE WHOMPING WILLOW (199 paragraphs)
   GILDEROY LOCKHART (148 paragraphs)
   MUDBLOODS AND MURMURS (170 paragraph

### Compare

First, at the level of paragraphs (or rather, text blocks) that we extracted per chapter:

In [20]:
# Compare the number of extracted text blocks per chapter. Books and chapters
# of the two editions are paired purely by position.
diffs = []
for uk_book, us_book in zip(UK_HP, US_HP):
    print(uk_book, 'vs', us_book)
    # NOTE: zip() stops at the shorter side, so surplus books or chapters in
    # one edition are silently left out of the comparison.
    for uk_chap, us_chap in zip(UK_HP[uk_book], US_HP[us_book]):
        print('   ', uk_chap, 'vs', us_chap)
        # UK count first, US count second -- same order as the titles above.
        uk_len = len(UK_HP[uk_book][uk_chap])
        us_len = len(US_HP[us_book][us_chap])
        diff = abs(uk_len - us_len)
        print('   ', uk_len, 'vs', us_len, '-> diff of ', diff)
        diffs.append(diff)

print('Maximum difference in text blocks between chapters:', max(diffs))

Harry Potter and the Philosopher's Stone vs Harry Potter and the Sorcerer's Stone
    The Boy Who Lived vs THE BOY WHO LIVED
    111 vs 110 -> diff of  1
    The Vanishing Glass vs THE VANISHING GLASS
    100 vs 100 -> diff of  0
    The Letters from No One vs THE LETTERS FROM NO ONE
    128 vs 121 -> diff of  7
    The Keeper of the Keys vs THE KEEPER OF THE KEYS
    137 vs 137 -> diff of  0
    Diagon Alley vs DIAGON ALLEY
    260 vs 252 -> diff of  8
    The Journey from Platform Nine and Three-Quarters vs THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS
    288 vs 282 -> diff of  6
    The Sorting Hat vs THE SORTING HAT
    143 vs 143 -> diff of  0
    The Potions Master vs THE POTIONS MASTER
    84 vs 84 -> diff of  0
    The Midnight Duel vs THE MIDNIGHT DUEL
    214 vs 211 -> diff of  3
    Hallowe’en vs HALLOWEEN
    162 vs 161 -> diff of  1
    Quidditch vs QUIDDITCH
    138 vs 137 -> diff of  1
    The Mirror of Erised vs THE MIRROR OF ERISED
    210 vs 203 -> diff of  7
   

Now compare at the per-chapter character level:

In [21]:
# Compare the chapters by character count (paragraphs joined with newlines).
# A dedicated list is used here: appending to the paragraph-level `diffs`
# would mix two different units in one maximum and would make this cell
# depend on the previous one having been run exactly once.
char_diffs = []
for uk_book, us_book in zip(UK_HP, US_HP):
    print(uk_book, 'vs', us_book)
    # NOTE: zip() stops at the shorter side, so surplus books or chapters in
    # one edition are silently left out of the comparison.
    for uk_chap, us_chap in zip(UK_HP[uk_book], US_HP[us_book]):
        print('   ', uk_chap, 'vs', us_chap)
        # UK count first, US count second -- same order as the titles above.
        uk_len = len('\n'.join(UK_HP[uk_book][uk_chap]))
        us_len = len('\n'.join(US_HP[us_book][us_chap]))
        diff = abs(uk_len - us_len)
        print('   ', uk_len, 'vs', us_len, '-> diff of ', diff)
        char_diffs.append(diff)

print('Maximum character difference between two chapters: ', max(char_diffs))

Harry Potter and the Philosopher's Stone vs Harry Potter and the Sorcerer's Stone
    The Boy Who Lived vs THE BOY WHO LIVED
    25677 vs 25842 -> diff of  165
    The Vanishing Glass vs THE VANISHING GLASS
    18983 vs 19070 -> diff of  87
    The Letters from No One vs THE LETTERS FROM NO ONE
    21246 vs 21338 -> diff of  92
    The Keeper of the Keys vs THE KEEPER OF THE KEYS
    19558 vs 19627 -> diff of  69
    Diagon Alley vs DIAGON ALLEY
    35890 vs 36080 -> diff of  190
    The Journey from Platform Nine and Three-Quarters vs THE JOURNEY FROM PLATFORM NINE AND THREE-QUARTERS
    34257 vs 34277 -> diff of  20
    The Sorting Hat vs THE SORTING HAT
    23648 vs 23809 -> diff of  161
    The Potions Master vs THE POTIONS MASTER
    16332 vs 16425 -> diff of  93
    The Midnight Duel vs THE MIDNIGHT DUEL
    28013 vs 28026 -> diff of  13
    Hallowe’en vs HALLOWEEN
    23695 vs 23737 -> diff of  42
    Quidditch vs QUIDDITCH
    18919 vs 19025 -> diff of  106
    The Mirror of Er