In [24]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import spacy
from collections import Counter
%matplotlib inline

In [4]:
from nltk.corpus import gutenberg, stopwords

In [7]:
# List the file ids available in NLTK's Gutenberg corpus reader.
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [8]:
# Load the two texts to compare via gutenberg.raw(); later cells slice and
# regex-substitute these values, i.e. they are handled as plain strings.
persuasion = gutenberg.raw(fileids='austen-persuasion.txt')
alice = gutenberg.raw(fileids='carroll-alice.txt')

In [11]:
# Preview the start of the raw text before any cleanup.
print(alice[:100])  # first 100 characters

[Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


In [12]:
# Remove every span of text enclosed in square brackets (e.g. the
# "[Title by Author Year]" line at the top of Alice).
#
# The pattern is a raw string so that the regex escapes "\[" and "\]" reach
# the `re` module untouched; in a normal string literal they are invalid
# escape sequences and trigger a Deprecation/SyntaxWarning.
# Note: "." does not match newlines here (no re.DOTALL), so only brackets
# that open and close on the same line are removed.
pattern = r"\[.*?\]"
persuasion = re.sub(pattern, "", persuasion)
alice = re.sub(pattern, "", alice)

# Print the first 100 characters of Alice again.
print(alice[0:100])



CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on


In [13]:
# Strip chapter headings. Each text gets its own pattern:
#   - Persuasion: the word "Chapter" followed by a space and digits.
#   - Alice: upper-case "CHAPTER " plus everything up to the end of that line.
persuasion_heading = re.compile(r'Chapter \d+')
alice_heading = re.compile(r'CHAPTER .*')

persuasion = persuasion_heading.sub('', persuasion)
alice = alice_heading.sub('', alice)

# Check the start of Alice to confirm the heading is gone.
print('Chapter headings removed:\n', alice[:100])

Chapter headings removed:
 



Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothin


In [14]:
# Collapse newlines and runs of whitespace into single spaces:
# str.split() with no argument splits on any whitespace and drops empty
# pieces, so re-joining with ' ' normalises the spacing.
persuasion, alice = (
    ' '.join(raw_text.split())
    for raw_text in (persuasion, alice)
)

# Cleanup finished; inspect the result.
print('Extra whitespace removed:\n', alice[:100])

Extra whitespace removed:
 Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to


In [18]:
# Load the spaCy pipeline named 'en_core_web_sm'; `nlp` is then called on
# the cleaned text to produce parsed documents.
nlp = spacy.load('en_core_web_sm')

In [21]:
# Run both cleaned texts through the pipeline. The resulting objects are
# what the frequency, sentence and entity cells operate on.
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [23]:
# Explore the parsed object: its type, token count, a short slice, and the
# type of an individual token. Each message template is paired with the
# value that fills it.
doc_facts = [
    ("The alice_doc object is a {} object.", type(alice_doc)),
    ("It is {} tokens long", len(alice_doc)),
    ("The first three tokens are '{}'", alice_doc[:3]),
    ("The type of each token is {}", type(alice_doc[0])),
]
for template, value in doc_facts:
    print(template.format(value))

The alice_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 34408 tokens long
The first three tokens are 'Alice was beginning'
The type of each token is <class 'spacy.tokens.token.Token'>


In [31]:
def word_frequencies(text, include_stop=True):
    """Count how often each token's text occurs in ``text``.

    Args:
        text: Iterable of token objects exposing ``is_punct``, ``is_stop``
            and ``text`` attributes (e.g. a spaCy ``Doc``).
        include_stop: When False, tokens flagged ``is_stop`` are left out.
            Punctuation tokens are always left out.

    Returns:
        A ``collections.Counter`` mapping token text to its count. Counting
        is on the exact text, so differently-cased forms are separate keys.
    """
    kept_words = (
        token.text
        for token in text
        if not token.is_punct and (include_stop or not token.is_stop)
    )
    return Counter(kept_words)
    
# Ten most frequent words per text, stop words included.
alice_counts = word_frequencies(alice_doc)
persuasion_counts = word_frequencies(persuasion_doc)

alice_freq = alice_counts.most_common(10)
persuasion_freq = persuasion_counts.most_common(10)

print('Alice:', alice_freq)
print('Persuasion:', persuasion_freq)

Alice: [('the', 1524), ('and', 796), ('to', 724), ('a', 611), ('I', 533), ('it', 524), ('she', 508), ('of', 499), ('said', 453), ('Alice', 394)]
Persuasion: [('the', 3120), ('to', 2775), ('and', 2738), ('of', 2563), ('a', 1529), ('in', 1346), ('was', 1329), ('had', 1177), ('her', 1159), ('I', 1118)]


In [32]:
# Same ranking as before, but pass include_stop=False so stop words are
# dropped before counting.
alice_freq, persuasion_freq = (
    word_frequencies(doc, include_stop=False).most_common(10)
    for doc in (alice_doc, persuasion_doc)
)
print('Alice:', alice_freq)
print('Persuasion:', persuasion_freq)

Alice: [('said', 453), ('Alice', 394), ('little', 124), ('like', 84), ('went', 83), ('know', 83), ('thought', 74), ('Queen', 73), ('time', 68), ('King', 61)]
Persuasion: [('Anne', 496), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('Mr', 254), ('Wentworth', 217), ('Lady', 191), ('good', 181), ('little', 175), ('Charles', 166)]


In [34]:
# Keep just the words, discarding the count from each (word, count) pair.
alice_common = [word for word, count in alice_freq]
persuasion_common = [word for word, count in persuasion_freq]

In [42]:
# Set differences: words that appear in one text's most-common list but not
# in the other's. (Label typo "unqiues" corrected to "uniques".)
alice_only = set(alice_common) - set(persuasion_common)
persuasion_only = set(persuasion_common) - set(alice_common)
print('Alice uniques: {}'.format(alice_only))
print('Persuasion uniques: {}'.format(persuasion_only))

Alice unqiues: {'said', 'like', 'went', 'Alice', 'Queen', 'thought', 'time', 'King', 'know'}
Persuasion uniques: {'Captain', 'Wentworth', 'Mr', 'Elliot', 'Lady', 'Anne', 'Charles', 'Mrs', 'good'}


In [46]:
def lemma_frequencies(text, include_stop=True):
    """Count how often each lemma occurs in ``text``.

    Args:
        text: Iterable of token objects exposing ``is_punct``, ``is_stop``
            and ``lemma_`` attributes (e.g. a spaCy ``Doc``).
        include_stop: When False, tokens flagged ``is_stop`` are left out.
            Punctuation tokens are always left out.

    Returns:
        A ``collections.Counter`` mapping each lemma string to its count.
    """
    counts = Counter()
    for token in text:
        # Punctuation never contributes to the tally.
        if token.is_punct:
            continue
        # Stop words are skipped only when the caller asks for it.
        if token.is_stop and not include_stop:
            continue
        counts[token.lemma_] += 1
    return counts

# Ten most common lemmas per text, with stop words excluded.
alice_lemma_counts = lemma_frequencies(alice_doc, include_stop=False)
persuasion_lemma_counts = lemma_frequencies(persuasion_doc, include_stop=False)
alice_lemma_freq = alice_lemma_counts.most_common(10)
persuasion_lemma_freq = persuasion_lemma_counts.most_common(10)
print('\nAlice:', alice_lemma_freq)
print('Persuasion:', persuasion_lemma_freq)

# As with the raw words, find the lemmas that rank in one text but not the other.
alice_lemma_common = [lemma for lemma, count in alice_lemma_freq]
persuasion_lemma_common = [lemma for lemma, count in persuasion_lemma_freq]
alice_lemma_set = set(alice_lemma_common)
persuasion_lemma_set = set(persuasion_lemma_common)
print('Unique to Alice:', alice_lemma_set - persuasion_lemma_set)
print('Unique to Persuasion:', persuasion_lemma_set - alice_lemma_set)


Alice: [('say', 476), ('Alice', 394), ('think', 130), ('go', 130), ('little', 124), ('look', 105), ('know', 103), ('come', 96), ('like', 92), ('begin', 91)]
Persuasion: [('Anne', 496), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('think', 258), ('Mr', 254), ('know', 252), ('good', 222), ('Wentworth', 215), ('Lady', 191)]
Unique to Alice: {'go', 'begin', 'like', 'look', 'Alice', 'say', 'little', 'come'}
Unique to Persuasion: {'Captain', 'Wentworth', 'Mr', 'Elliot', 'Lady', 'Anne', 'Mrs', 'good'}


In [73]:
# First look at sentences: collect them into a list (indexed below) and
# report how many there are.
sentences = list(alice_doc.sents)
sentence_total = len(sentences)
print("Alice in Wonderland has {} sentences.".format(sentence_total))

# Use the third sentence as a worked example.
example_sentence = sentences[2]
example_message = "Here is an example: \n{}\n".format(example_sentence)
print(example_message)

Alice in Wonderland has 1989 sentences.
Here is an example: 
There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!



In [74]:
# Metrics for the example sentence. "Words" are its non-punctuation tokens;
# uniqueness is judged on the exact token text.
example_words = [token for token in example_sentence if not token.is_punct]
unique_words = {token.text for token in example_words}

word_total = len(example_words)
unique_total = len(unique_words)
print(("There are {} words in this sentence, and {} of them are"
       " unique.").format(word_total, unique_total))

There are 29 words in this sentence, and 25 of them are unique.


In [93]:
# For every token in the fifth sentence, print the token, its pos_ tag, and
# a (dep_, head text) pair describing how it attaches to its head token.
for token in sentences[4]:
    dependency = (token.dep_, token.head.orth_)
    print(token, token.pos_, dependency)

I PRON ('nsubj', 'be')
shall VERB ('aux', 'be')
be AUX ('ROOT', 'be')
late ADJ ('acomp', 'be')
! PUNCT ('punct', 'be')
' PUNCT ('punct', 'be')


In [97]:
# Print the label and space-joined token text of the first twenty entities
# in the Alice document.
first_entities = list(alice_doc.ents)[:20]
for entity in first_entities:
    entity_text = ' '.join(t.orth_ for t in entity)
    print(entity.label_, entity_text)

PERSON Alice
PERSON Alice
ORG White Rabbit
PERSON Alice
ORG WAISTCOAT - POCKET
PERSON Alice
PERSON Alice
PERSON Alice
ORDINAL First
WORK_OF_ART ' ORANGE MARMALADE '
CARDINAL one
PERSON Alice
QUANTITY four thousand miles
PERSON Alice
PERSON Latitude
PERSON Longitude
PERSON Alice
PERSON Latitude
PERSON Longitude
WORK_OF_ART Antipathies


In [98]:
# All of the unique entities spaCy thinks are people (label "PERSON").
# `people` keeps every mention; the set removes the repeats for display.
people = [
    entity.text
    for entity in alice_doc.ents
    if entity.label_ == "PERSON"
]
unique_people = set(people)
print(unique_people)

{'Adventures', 'Seaography', 'Mine', '--it', 'Mabel', 'Somebody', 'Frog', 'Mouse', 'Owl', 'Alice', 'Cat', 'Cheshire Puss', 'Edwin', 'Fury', 'William the Conqueror', 'Mary Ann', 'Lobster', 'Dinah', 'Shark', 'Knave', 'William', 'Bill', 'Hatter', 'riddles.--I', 'Alice aloud', 'Soo', 'Curiouser', 'Latin Grammar', 'then!--Bill', 'Jack', 'Boots', 'Sha', 'Tillie', 'Latitude', 'Morcar', 'Pat', 'ALICE', 'the Queen of Hearts', 'Miss', 'Beau', 'Shakespeare', 'Queen', 'Swim', 'Wonderland', 'Lizard', 'Duchess', 'Lacie', 'Edgar Atheling', 'Herald', 'Run', "I'LL", 'to--', 'Ou', 'Gryphon', 'Rabbit', 'WILLIAM', 'Tut', 'Laughing', 'hippopotamus', 'Mercia', 'Longitude', "Dinah'll", 'Grief', 'Geography', 'Lory', 'Ma', 'Treacle', 'Footman', 'Pray', 'Dodo'}


In [100]:
# Scratch example: building one set from two lists of different element types.
a = [1, 2, 3, 4]
b = ['a', 'b', 'c', 'd']
set([*a, *b])

{1, 2, 3, 4, 'a', 'b', 'c', 'd'}