### Imports

In [1]:
import spacy
from spacy.tokenizer import Tokenizer
import re

In [2]:
import requests
url = "https://raw.githubusercontent.com/formcept/whiteboard/2feafb5818784118cf19efda29b5525aca4255cd/nbviewer/notebooks/data/harrypotter/Book%201%20-%20The%20Philosopher's%20Stone.txt"
page = requests.get(url)
first_book = page.text

### EDA

In [3]:
first_book[0:1000]

'/ \n\n\n\n\nTHE BOY WHO LIVED \n\nMr. and Mrs. Dursley, of number four, Privet Drive, \nwere proud to say that they were perfectly normal, \nthank you very much. They were the last people you’d \nexpect to be involved in anything strange or \nmysterious, because they just didn’t hold with such \nnonsense. \n\nMr. Dursley was the director of a firm called \nGrunnings, which made drills. He was a big, beefy \nman with hardly any neck, although he did have a \nvery large mustache. Mrs. Dursley was thin and \nblonde and had nearly twice the usual amount of \nneck, which came in very useful as she spent so \nmuch of her time craning over garden fences, spying \non the neighbors. The Dursley s had a small son \ncalled Dudley and in their opinion there was no finer \nboy anywhere. \n\nThe Dursleys had everything they wanted, but they \nalso had a secret, and their greatest fear was that \nsomebody would discover it. They didn’t think they \ncould bear it if anyone found out about the Potters

In [4]:
# Can see in the second paragraph "The Dursley s had a small son...",
# but then in the third paragraph "The Dursleys had everything..."
# Need to find how many times this file has the plural "s" separated
# from the root word...

# Lists out all the occurrences of a stand-alone "s" or "es"
matches = re.finditer("[\s]e*s[\s]", first_book)
starts = [match.start() for match in matches]
starts

[673]

### Cleaning

In [5]:
first_book = page.text

def clean_first_book(first_book):
    # Now removing the space before that stand-alone "s"
    first_book = re.sub(r"[\s][s][\s]", "s ", first_book)
    # Turns all the occurrences of double "\n" into a <new_para> tag
    first_book = re.sub(r"\n\n", "<new_para> ", first_book)
    # Removes the rest of the occurrences of single"\n"
    first_book = re.sub(r"\n", "", first_book)
    # Removes double <new_para>
    first_book = re.sub(r"<new_para> <new_para> ", "<new_para> ", first_book)
    # Replaces the page markers with a "<new_page>" tag
    first_book = re.sub(r"Page [|] [\d]+ Harry Potter and the Philosophers " \
                        "Stone -[\s]?J.K. Rowling", "<new_page>", first_book)
    # New pages have lots of '\n\n', so we'll remove the "<new_para>" that
    # automatically comes with "<new_page>"
    first_book = re.sub("<new_para> <new_page>", "<new_page>", first_book)
    first_book = re.sub("<new_page> <new_para>", "<new_page>", first_book)
    return first_book

first_book = clean_first_book(first_book)
first_book[:5000]

'/ <new_para> THE BOY WHO LIVED <new_para> Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense. <new_para> Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. <new_para> The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn’t think they could bear it if anyone found out about the Potters. Mrs. Potter

### Tests

In [6]:
# Tests to make sure there are no pipes ("|")
pipes = re.finditer(r"[|]+", first_book)
pipes = [pipe.start() for pipe in pipes]
assert pipes == []

### EDA, round 2

In [7]:
# Finds index positions of all the places where one or more slashes
# or backslashes appear
backslashes = re.finditer(r"[\\]{1,2}", first_book)
backslashes = [backslash.start() for backslash in backslashes]
print("Backslash Locations: ", backslashes)
slashes = re.finditer(r"[/]{1,2}", first_book)
slashes = [slash.start() for slash in slashes]
print("Slash Locations: ", slashes)

Backslash Locations:  [53610, 60402, 80769, 147783, 152029, 179135, 179171, 179259, 243300, 267351, 304264, 332526, 333828, 349322, 397677, 401627, 422308, 426247, 438468]
Slash Locations:  [0, 78370, 88025, 100062, 236628, 439118]


In [8]:
# Shows the context of each slash and backslash
print("BACKSLASHES:\n")
for i in backslashes:
    print(first_book[i-20:i+20])
print("\nSLASHES:")
for i in slashes:
    print(first_book[i-20:i+20])

BACKSLASHES:

w_para> “That’s mine\” said Harry, tryin
at — something alive\ <new_para> Lights 
unia suddenly. “Knew\ Of course we knew!
<new_para> “ Georg e\” <new_para> “Only 
 You-Know-Who’s name\” said Ron, soundin
n! Don’t be afraid l\ <new_para> And don
 don’t get in a flap\ <new_para> You’re 
r I’m a Thinking Cap\” <new_para> The wh
hispered, “Alohomora\” <new_para> The lo
> “Come on, run, run\” Harry yelled at H
. <new_para> “ Weird\ ” he said, “What a
ara> “I’ve found him\” he whispered. “I’
the Sorcerer’s Stone\” <new_para> This d
 ” <new_para> “Shhhh\” Hagrid looked aro
stop saying the name\” Ron hissed. <new_
knew what this means\” he burst out angr
n. <new_para> “Lucky\” shrieked Hermione
denly. “They’re keys\ Winged keys — look
ell. <new_para> “You\” gasped Harry. <ne

SLASHES:

a> HOGWARTS SCHOOL o/WITCHCRAFT and WIZA
hadn’t counted on — / dunno what it was,
a> HOGWARTS SCHOOL o/WITCHCRAFT and WIZA
e about yourselves, / don’t want Slyther
_page> “No, no, no. / tried to k

<font color="hotpink" face="Times New Roman">
    
1) Looks like each backslash is meant to be an exclamation mark.
    
2) Need to fix "Georg e" and "afraid l!"
    
3) Each forward slash is meant to be either an apostrophe or "I".

In [None]:
backslashes = re.finditer(r"[\\]{1,2}", first_book)
backslashes = [backslash.start() for backslash in backslashes]
print("Backslash Locations: ", backslashes)
slashes = re.finditer(r"[/]{1,2}", first_book)
slashes = [slash.start() for slash in slashes]
print("Slash Locations: ", slashes)

### Cleaning, round 2

In [10]:
def clean_first_book_2(first_book):
    # Fixes backslashes (should be exclamation marks)
    first_book = re.sub(r"[\\]{1,2}", "!", first_book)
    # Fixes forward slashes
    first_book = re.sub(r"o[/]WITCHCRAFT", "o'WITCHCRAFT", first_book)
    first_book = re.sub(r"[/][\s]", "I ", first_book)

    return first_book

first_book = clean_first_book_2(first_book)

In [11]:
print("BACKSLASHES:\n")
for i in backslashes:
    print(first_book[i-20:i+20])
print("SLASHES:\n")
for i in slashes:
    print(first_book[i-20:i+20])

BACKSLASHES:

w_para> “That’s mine!” said Harry, tryin
at — something alive! <new_para> Lights 
unia suddenly. “Knew! Of course we knew!
<new_para> “ Georg e!” <new_para> “Only 
 You-Know-Who’s name!” said Ron, soundin
n! Don’t be afraid l! <new_para> And don
 don’t get in a flap! <new_para> You’re 
r I’m a Thinking Cap!” <new_para> The wh
hispered, “Alohomora!” <new_para> The lo
> “Come on, run, run!” Harry yelled at H
. <new_para> “ Weird! ” he said, “What a
ara> “I’ve found him!” he whispered. “I’
the Sorcerer’s Stone!” <new_para> This d
 ” <new_para> “Shhhh!” Hagrid looked aro
stop saying the name!” Ron hissed. <new_
knew what this means!” he burst out angr
n. <new_para> “Lucky!” shrieked Hermione
denly. “They’re keys! Winged keys — look
ell. <new_para> “You!” gasped Harry. <ne
SLASHES:


a> HOGWARTS SCHOOL o'WITCHCRAFT and WIZA
hadn’t counted on — I dunno what it was,
a> HOGWARTS SCHOOL o'WITCHCRAFT and WIZA
e about yourselves, I don’t want Slyther
_page> “No, no, no. I tried to k

In [12]:
nlp = spacy.load('en_core_web_lg')
tokenizer = Tokenizer(nlp.vocab)
doc = tokenizer(first_book)