<a href="https://colab.research.google.com/github/loupendley/Thinkful/blob/master/.%5Cstuff%5CTextPreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from collections import Counter
import nltk
import spacy
import re

In [6]:
# fetch the Project Gutenberg sample corpus used in the cells below
nltk.download("gutenberg")

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [0]:
# Open question: why are all of the corpora lowercase after removing stopwords in the lesson?

In [8]:
# pull the two novels out of the corpus we just downloaded
from nltk.corpus import gutenberg

alice = gutenberg.raw('carroll-alice.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')

# peek at the opening 100 characters of Alice
print('\nRaw:\n', alice[:100])


Raw:
 [Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


## Basic Text Cleaning

In [9]:
# this pattern matches text between square brackets on a single line
# (non-greedy; `.` does not cross newlines). It is a raw string so that
# the regex escapes \[ and \] are not parsed as invalid Python string escapes.
pattern = r"\[.*?\]"
persuasion = re.sub(pattern, "", persuasion)
alice = re.sub(pattern, "", alice)

# print the first 100 characters of Alice again
print("Title removed:", alice[0:100])

Title removed: 

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on


In [10]:
# strip the chapter headings: "Chapter <number>" in Persuasion,
# and the whole "CHAPTER ..." line in Alice
alice = re.compile(r'CHAPTER .*').sub('', alice)
persuasion = re.compile(r'Chapter \d+').sub('', persuasion)

# check the result
print('Chapter headings removed from alice:', alice[:102])

Chapter headings removed from alice: 



Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing 


In [11]:
# split on any run of whitespace (newlines included) and rejoin with single spaces
alice = " ".join(alice.split())
persuasion = " ".join(persuasion.split())

# inspect the start of the cleaned text
print('Extra whitespace removed:\n', alice[:103])

Extra whitespace removed:
 Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do


## Tokenization

In [0]:
# load the small English pipeline by its full package name; the 'en'
# shortcut link is no longer supported from spaCy v3 onwards
nlp = spacy.load('en_core_web_sm')

# all the processing work is done below, so it may take a while, Lou.

alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [13]:
# inspect the parsed document: its type, its length in tokens,
# its first three tokens, and the type of a single token
print(
    "the alice_doc is a {} object.".format(type(alice_doc)),
    "It is {} tokens long.".format(len(alice_doc)),
    "the first three tokens are: '{}'".format(alice_doc[:3]),
    "the type of each token is {}".format(type(alice_doc[0])),
    sep="\n",
)

the alice_doc is a <class 'spacy.tokens.doc.Doc'> object.
It is 34408 tokens long.
the first three tokens are: 'Alice was beginning'
the type of each token is <class 'spacy.tokens.token.Token'>


## Removing stopwords

In [0]:
# keep only the tokens that spaCy does not flag as stop words
persuasion_without_stopwords = [tok for tok in persuasion_doc if not tok.is_stop]
alice_without_stopwords = [tok for tok in alice_doc if not tok.is_stop]

In [15]:
# first 30 tokens that survived stop-word removal
print(alice_without_stopwords[:30])

[Alice, beginning, tired, sitting, sister, bank, ,, having, :, twice, peeped, book, sister, reading, ,, pictures, conversations, ,, ', use, book, ,, ', thought, Alice, ', pictures, conversation, ?, ']


In [16]:
# utility function to calculate how frequently words appear in the text
def word_frequencies(text, lowercase=False):
    """Count how often each non-punctuation token appears in ``text``.

    Args:
        text: iterable of token-like objects exposing ``is_punct`` and
            ``text`` attributes (for example a spaCy ``Doc`` or a list of
            its tokens).
        lowercase: when True, fold every word to lower case before counting,
            so that e.g. "The" and "the" share one count. Defaults to False,
            which keeps each token's original casing.

    Returns:
        A ``collections.Counter`` mapping each word to its number of
        occurrences.
    """
    # skip punctuation tokens; everything else is counted by its text
    words = (token.text for token in text if not token.is_punct)
    if lowercase:
        words = (word.lower() for word in words)

    # build and return a Counter object containing the word counts
    return Counter(words)

# ten most common words in each book (stop words already removed)
alice_word_freq = word_frequencies(alice_without_stopwords).most_common(10)
print('\nAlice', alice_word_freq)

persuasion_word_freq = word_frequencies(persuasion_without_stopwords).most_common(10)
print('Persuasion:', persuasion_word_freq)


Alice [('said', 453), ('Alice', 394), ('little', 124), ('like', 84), ('went', 83), ('know', 83), ('thought', 74), ('Queen', 73), ('time', 68), ('King', 61)]
Persuasion: [('Anne', 496), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('Mr', 254), ('Wentworth', 217), ('Lady', 191), ('good', 181), ('little', 175), ('Charles', 166)]


## Lemmatization

In [17]:
def lemma_frequencies(text):
    """Tally the lemma of every non-punctuation token in ``text``.

    ``text`` is any iterable of objects exposing ``is_punct`` and ``lemma_``.
    Returns a Counter keyed by lemma.
    """
    return Counter(tok.lemma_ for tok in text if not tok.is_punct)

# ten most common lemmas in each book (stop words already removed)
alice_lemma_freq = lemma_frequencies(alice_without_stopwords).most_common(10)
persuasion_lemma_freq = lemma_frequencies(persuasion_without_stopwords).most_common(10)

print('\nAlice:', alice_lemma_freq)
print('Persuasion:', persuasion_lemma_freq)


Alice: [('say', 476), ('Alice', 394), ('think', 130), ('go', 130), ('little', 124), ('look', 105), ('know', 103), ('come', 96), ('like', 92), ('begin', 91)]
Persuasion: [('Anne', 496), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('think', 258), ('Mr', 254), ('know', 252), ('good', 222), ('Wentworth', 215), ('Lady', 191)]


In [18]:
# keep just the lemma from each (lemma, count) pair
alice_lemma_common = [lemma for lemma, _ in alice_lemma_freq]
persuasion_lemma_common = [lemma for lemma, _ in persuasion_lemma_freq]

# lemmas that appear in one book's top list but not in the other's
print('Unique to Alice:', set(alice_lemma_common) - set(persuasion_lemma_common))
print('Unique to Persuasion:', set(persuasion_lemma_common) - set(alice_lemma_common))

Unique to Alice: {'Alice', 'little', 'say', 'like', 'look', 'come', 'go', 'begin'}
Unique to Persuasion: {'Lady', 'Elliot', 'Mrs', 'good', 'Mr', 'Wentworth', 'Captain', 'Anne'}


## Sentences

In [19]:
# materialise the sentence spans so they can be counted and indexed
sentences = [sentence for sentence in alice_doc.sents]
print("Alice in Wonderland has {} sentences.".format(len(sentences)))

# use the third sentence as a worked example
example_sentence = sentences[2]
print("Here is an example: \n{}\n".format(example_sentence))

Alice in Wonderland has 1989 sentences.
Here is an example: 
There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!



In [20]:
# word-level metrics for the example sentence

# drop punctuation, then collect the distinct surface forms
example_words = [word for word in example_sentence if not word.is_punct]
unique_words = {word.text for word in example_words}

print(
    "this sentence is {}".format(example_words),
    "There are {} words in this sentence, and {} of them are unique".format(len(example_words), len(unique_words)),
    sep="\n",
)

this sentence is [There, was, nothing, so, VERY, remarkable, in, that, nor, did, Alice, think, it, so, VERY, much, out, of, the, way, to, hear, the, Rabbit, say, to, itself, Oh, dear]
There are 29 words in this sentence, and 25 of them are unique
