# Practice
Sources for more natural language processing techniques:
    Universal Sentence Encoder: https://tfhub.dev/google/universal-sentence-encoder/4
Source for Visualizing Matplot library & Word Clouds: https://colorcet.holoviz.org/user_guide/Categorical.html

In [41]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [42]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [43]:
import nltk
import re
# Launch the installer to download "gutenberg" and "stop words" corpora.

In [44]:
# Import the data we just downloaded and installed.
from nltk.corpus import gutenberg, stopwords

# Grab and process the raw data.
print(gutenberg.fileids())

persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# Print the first 100 characters of Alice in Wonderland.
print('\nRaw:\n', alice[0:100])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']

Raw:
 [Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was


In [45]:
# This pattern matches all text between square brackets.
pattern = "[\[].*?[\]]"
persuasion = re.sub(pattern, "", persuasion)
alice = re.sub(pattern, "", alice)

# Print the first 100 characters of Alice again.
print('Title removed:\n', alice[0:100])

Title removed:
 

CHAPTER I. Down the Rabbit-Hole

Alice was beginning to get very tired of sitting by her sister on


In [46]:
# Now we'll match and remove chapter headings.
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)

# Ok, what's it look like now?
print('Chapter headings removed:\n', alice[0:100])

Chapter headings removed:
 



Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothin


In [47]:
# Remove newlines and other extra whitespace by splitting and rejoining.
persuasion = ' '.join(persuasion.split())
alice = ' '.join(alice.split())

# All done with cleanup? Let's see how it looks.
print('Extra whitespace removed:\n', alice[0:100])

Extra whitespace removed:
 Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to


In [8]:
# Here is a list of the stopwords identified by NLTK.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [48]:
import spacy
!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

# All the processing work is done here, so it may take a while.
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [49]:
# Let's explore the objects we've built.
print("The alice_doc object is a {} object.".format(type(alice_doc)))
print("It is {} tokens long".format(len(alice_doc)))
print("The first three tokens are '{}'".format(alice_doc[:3]))
print("The type of each token is {}".format(type(alice_doc[0])))

The alice_doc object is a <class 'spacy.tokens.doc.Doc'> object.
It is 34408 tokens long
The first three tokens are 'Alice was beginning'
The type of each token is <class 'spacy.tokens.token.Token'>


In [50]:
from collections import Counter

# Utility function to calculate how frequently words appear in the text.
def word_frequencies(text, include_stop=True):
    
    # Build a list of words.
    # Strip out punctuation and, optionally, stop words.
    words = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            words.append(token.text)
            
    # Build and return a Counter object containing word counts.
    return Counter(words)
    
# The most frequent words:
alice_freq = word_frequencies(alice_doc).most_common(10)
persuasion_freq = word_frequencies(persuasion_doc).most_common(10)
print('Alice:', alice_freq)
print('Persuasion:', persuasion_freq)

Alice: [('the', 1524), ('and', 796), ('to', 724), ('a', 611), ('I', 533), ('it', 524), ('she', 508), ('of', 499), ('said', 453), ('Alice', 394)]
Persuasion: [('the', 3120), ('to', 2775), ('and', 2738), ('of', 2563), ('a', 1529), ('in', 1346), ('was', 1329), ('had', 1177), ('her', 1159), ('I', 1118)]


In [51]:
# Use our optional keyword argument to remove stop words.
alice_freq = word_frequencies(alice_doc, include_stop=False).most_common(10)
persuasion_freq = word_frequencies(persuasion_doc, include_stop=False).most_common(10)
print('Alice:', alice_freq)
print('Persuasion:', persuasion_freq)

Alice: [('said', 453), ('Alice', 394), ('little', 124), ('like', 84), ('went', 83), ('know', 83), ('thought', 74), ('Queen', 73), ('time', 68), ('King', 61)]
Persuasion: [('Anne', 496), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('Mr', 254), ('Wentworth', 217), ('Lady', 191), ('good', 181), ('little', 175), ('Charles', 166)]


In [13]:
# Pull out just the text from our frequency lists.
alice_common = [pair[0] for pair in alice_freq]
persuasion_common = [pair[0] for pair in persuasion_freq]

# Use sets to find the unique values in each top ten.
print('Unique to Alice:', set(alice_common) - set(persuasion_common))
print('Unique to Persuasion:', set(persuasion_common) - set(alice_common))

Unique to Alice: {'King', 'Queen', 'went', 'Alice', 'thought', 'know', 'time', 'like', 'said'}
Unique to Persuasion: {'Captain', 'Mr', 'Anne', 'Wentworth', 'Mrs', 'Lady', 'Charles', 'good', 'Elliot'}


In [14]:
# Utility function to calculate how frequently lemas appear in the text.
def lemma_frequencies(text, include_stop=True):
    
    # Build a list of lemas.
    # Strip out punctuation and, optionally, stop words.
    lemmas = []
    for token in text:
        if not token.is_punct and (not token.is_stop or include_stop):
            lemmas.append(token.lemma_)
            
    # Build and return a Counter object containing word counts.
    return Counter(lemmas)

# Instantiate our list of most common lemmas.
alice_lemma_freq = lemma_frequencies(alice_doc, include_stop=False).most_common(10)
persuasion_lemma_freq = lemma_frequencies(persuasion_doc, include_stop=False).most_common(10)
print('\nAlice:', alice_lemma_freq)
print('Persuasion:', persuasion_lemma_freq)

# Again, identify the lemmas common to one text but not the other.
alice_lemma_common = [pair[0] for pair in alice_lemma_freq]
persuasion_lemma_common = [pair[0] for pair in persuasion_lemma_freq]
print('Unique to Alice:', set(alice_lemma_common) - set(persuasion_lemma_common))
print('Unique to Persuasion:', set(persuasion_lemma_common) - set(alice_lemma_common))


Alice: [('say', 476), ('Alice', 394), ('think', 130), ('go', 130), ('little', 124), ('look', 105), ('know', 103), ('come', 96), ('like', 92), ('begin', 91)]
Persuasion: [('Anne', 496), ('Captain', 297), ('Mrs', 291), ('Elliot', 288), ('think', 258), ('Mr', 254), ('know', 252), ('good', 222), ('Wentworth', 215), ('Lady', 191)]
Unique to Alice: {'say', 'little', 'look', 'Alice', 'begin', 'like', 'come', 'go'}
Unique to Persuasion: {'Captain', 'Mr', 'Anne', 'Wentworth', 'Mrs', 'Lady', 'good', 'Elliot'}


In [15]:
# Initial exploration of sentences.
sentences = list(alice_doc.sents)
print("Alice in Wonderland has {} sentences.".format(len(sentences)))

example_sentence = sentences[2]
print("Here is an example: \n{}\n".format(example_sentence))

Alice in Wonderland has 1989 sentences.
Here is an example: 
There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!



In [16]:
# Look at some metrics around this sentence.
example_words = [token for token in example_sentence if not token.is_punct]
unique_words = set([token.text for token in example_words])

print(("There are {} words in this sentence, and {} of them are"
       " unique.").format(len(example_words), len(unique_words)))

There are 29 words in this sentence, and 25 of them are unique.


In [17]:
print(nlp("I need a break")[3].pos_)
print(nlp("I need to break the glass")[3].pos_)

NOUN
VERB


In [18]:
# View the part of speech for some tokens in our sentence.
print('\nParts of speech:')
for token in example_sentence[:9]:
    print(token.orth_, token.pos_)


Parts of speech:
There PRON
was AUX
nothing PRON
so ADV
VERY ADV
remarkable ADJ
in ADP
that DET
; PUNCT


In [19]:
# View the dependencies for some tokens.
print('\nDependencies:')
for token in example_sentence[:9]:
    print(token.orth_, token.dep_, token.head.orth_)


Dependencies:
There expl was
was ROOT was
nothing attr was
so advmod VERY
VERY advmod remarkable
remarkable amod nothing
in prep remarkable
that pobj in
; punct was


In [20]:
# Extract the first ten entities.
entities = list(alice_doc.ents)[0:10]
for entity in entities:
    print(entity.label_, ' '.join(t.orth_ for t in entity))

PERSON Alice
PERSON Alice
ORG White Rabbit
PERSON Alice
ORG WAISTCOAT - POCKET
PERSON Alice
PERSON Alice
PERSON Alice
ORDINAL First
WORK_OF_ART ' ORANGE MARMALADE '


In [21]:
# All of the unique entities spaCy thinks are people.
people = [entity.text for entity in list(alice_doc.ents) if entity.label_ == "PERSON"]
print(set(people))

{'Pray', 'Ma', 'Somebody', 'Mary Ann', 'Jack', 'Mine', 'Edgar Atheling', 'Alice', 'Dodo', 'Tut', 'Seaography', 'Beau', 'Mabel', 'Herald', 'Shark', 'Longitude', 'Lory', 'Duchess', 'Latitude', 'Adventures', 'hippopotamus', 'Run', 'Geography', 'Shakespeare', 'Lacie', 'William the Conqueror', 'Edwin', 'William', 'Grief', 'Miss', 'Lizard', "Dinah'll", 'Sha', 'Mercia', 'Pat', 'Hatter', 'Tillie', 'then!--Bill', 'Curiouser', 'Alice aloud', 'ALICE', 'Morcar', 'Gryphon', "I'LL", 'Footman', 'the Queen of Hearts', 'Cat', 'Lobster', 'Frog', 'Knave', 'Latin Grammar', 'Bill', 'Laughing', 'riddles.--I', 'Mouse', 'Fury', 'to--', 'Boots', 'Queen', 'Cheshire Puss', 'WILLIAM', 'Rabbit', '--it', 'Wonderland', 'Dinah', 'Soo', 'Treacle', 'Ou', 'Owl', 'Swim'}


# Challenge

In [52]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk

nltk.download('gutenberg')
!python -m spacy download en

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/mehrunisaqayyum/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/opt/anaconda3/lib/python3.7/site-packages/en_core_web_sm -->
/opt/anaconda3/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [53]:
# Utility function for standard text cleaning. 
##We will take a 12th of the text.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice[:int(len(alice)/12)])
persuasion = text_cleaner(persuasion[:int(len(persuasion)/12)])

In [54]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [25]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [26]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [27]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350


Unnamed: 0,little,responsible,house,call,lock,society,indifference,Nay,candle,lose,...,example,II,prematurely,bespeak,receive,anxiously,hole,Gloucester,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll


In [82]:
word_counts.loc[y_train.index]

Unnamed: 0,little,responsible,house,call,lock,society,indifference,Nay,candle,lose,...,example,II,prematurely,bespeak,receive,anxiously,hole,Gloucester,text_sentence,text_source
179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Then, might, she, again, take, up, the, book,...",Austen
374,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(At, this, moment, I, can, not, recollect, his...",Austen
228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Every, emendation, of, Anne, 's, had, been, o...",Austen
341,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(`, Forty, ,, ', replied, Sir, Basil, ,, `, fo...",Austen
108,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,"(She, ate, a, little, bit, ,, and, said, anxio...",Carroll
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"("", Very, true, ,, very, true, .)",Austen
192,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Mr, Elliot, had, attempted, no, apology, ,, a...",Austen
117,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,(*),Carroll
47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, here, Alice, began, to, get, rather, sle...",Carroll


## BOW Method

In [83]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
# X = np.array(word_counts.drop(['text_sentence','text_source'], 1))
X = word_counts.drop(columns=['text_sentence','text_source'])

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9737991266375546

Test set score: 0.7987012987012987


## Bag of Words with Logistic Model

In [84]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2') # No need to specify l2 as it's the default. But we put it for demonstration.
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(229, 1417) (229,)
Training set score: 0.9737991266375546

Test set score: 0.8831168831168831


## Bag of Words with Gradient Boosting

In [85]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.8820960698689956

Test set score: 0.7792207792207793


### New example: Emma

In [86]:
#Process # Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma[:int(len(emma)/60)])
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [87]:
# Parse our cleaned data.
emma_doc = nlp(emma)

In [88]:
# Group into sentences.
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

In [89]:
# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.
emma_sentences = pd.DataFrame(emma_sents)
emma_bow = bow_features(emma_sentences, common_words)

print('done')

Processing row 0
Processing row 50
Processing row 100
done


In [95]:
X_Emma_test = pd.concat([
    X_train.loc[y_train[y_train=='Carroll'].index],
    emma_bow.drop(columns=['text_sentence','text_source'])])

In [97]:
# Now we can model it!
# Let's use logistic regression again.

# Combine the Emma sentence data with the Alice data from the test set.
# X_Emma_test = np.concatenate((
#     X_train[y_train[y_train=='Carroll'].index],
#     emma_bow.drop(['text_sentence','text_source'], 1)
# ), axis=0)

X_Emma_test = pd.concat([
    X_train.loc[y_train[y_train=='Carroll'].index],
    emma_bow.drop(columns=['text_sentence','text_source'])])

y_Emma_test = pd.concat([y_train[y_train=='Carroll'],
                         pd.Series(['Austen'] * emma_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
lr_Emma_predicted = lr.predict(X_Emma_test)
pd.crosstab(y_Emma_test, lr_Emma_predicted)


Test set score: 0.630901287553648


col_0,Austen,Carroll
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,64,86
Carroll,0,83


# Challenge 0: Support Vector Model

In [98]:
#Process # Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma[:int(len(emma)/60)])
print(emma[:100])

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [99]:
# Parse our cleaned data.
emma_doc = nlp(emma)

In [100]:
# Group into sentences.
#Every single token uses a lemma 
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]
emma_sents = [[sent, "Austen"] for sent in emma_doc.sents]

In [101]:
# Build a new Bag of Words data frame for Emma word counts.
# We'll use the same common words from Alice and Persuasion.
emma_sentences = pd.DataFrame(emma_sents)
emma_bow = bow_features(emma_sentences, common_words)

print('done')

Processing row 0
Processing row 50
Processing row 100
done


In [105]:
from sklearn.svm import SVC
svr = SVC()
#Y = word_counts['text_source']
#X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

Y = word_counts['text_source']
X = word_counts.drop(columns=['text_sentence','text_source'])
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
svr.fit(X_train, y_train)


# Model.
#print('\nTest set score:', lr.score(X_Emma_test, y_Emma_test))
#lr_Emma_predicted = lr.predict(X_Emma_test)
#pd.crosstab(y_Emma_test, lr_Emma_predicted)

SVC()

In [106]:
#Obtain score for SVR model
svr.score(X, Y)

0.8485639686684073

In [107]:
from sklearn.model_selection import cross_val_score
cross_val_score(svr, X, Y, cv=5)

array([0.79220779, 0.77922078, 0.66233766, 0.68421053, 0.77631579])

# Challenge 1
Find out whether your new model is good at identifying Alice in Wonderland vs any other work, Persuasion vs any other work, or Austen vs any other work. This will involve pulling a new book from the Project Gutenberg corpus (print(gutenberg.fileids()) for a list) and processing it.

In [108]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [109]:
#Selecting "Sense and Sensibility" from Austen to compare with Persuasion: 
    # 'austen-sense.txt'
# Utility function for standard text cleaning. 
##We will take a 12th of the text.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
sense = gutenberg.raw('austen-sense.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
sense = re.sub(r'CHAPTER .*', '', sense)
    
sense = text_cleaner(sense[:int(len(sense)/10)])
persuasion = text_cleaner(persuasion[:int(len(persuasion)/10)])

In [110]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
sense_doc = nlp(sense)
persuasion_doc = nlp(persuasion)

In [111]:
# Group into sentences.
sense_sents = [[sent, "Austen"] for sent in sense_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(sense_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(The, family, of, Dashwood, had, long, been, s...",Austen
1,"(Their, estate, was, large, ,, and, their, res...",Austen
2,"(The, late, owner, of, this, estate, was, a, s...",Austen
3,"(But, her, death, ,, which, happened, ten, yea...",Austen
4,"(In, the, society, of, his, nephew, and, niece...",Austen


In [112]:
# Now we can model it!
# Let's use logistic regression again.

# Combine the Sense sentence data with the Persuasion data from the test set.
X_persuasion_test = np.concatenate((
    X_train[y_train[y_train=='Austen'].index],
    persuasion_bow.drop(['text_sentence','text_source'], 1)
), axis=0)
y_sense_test = pd.concat([y_train[y_train=='Austen'],
                         pd.Series(['Austen'] * persuasion_bow.shape[0])])

# Model.
print('\nTest set score:', lr.score(X_sense_test, y_sense_test))
lr_sense_predicted = lr.predict(X_sense_test)
pd.crosstab(y_sense_test, lr_sense_predicted)

KeyError: "None of [Int64Index([179, 374, 228, 341, 306, 382, 199, 364, 360, 186,\n            ...\n            292, 242, 277, 211, 359, 195, 251, 323, 192, 172],\n           dtype='int64', length=146)] are in the [columns]"

In [113]:
y_train[y_train=='Austen'].index

Int64Index([179, 374, 228, 341, 306, 382, 199, 364, 360, 186,
            ...
            292, 242, 277, 211, 359, 195, 251, 323, 192, 172],
           dtype='int64', length=146)

In [114]:
X_train[y_train[y_train=='Austen'].index]

KeyError: "None of [Int64Index([179, 374, 228, 341, 306, 382, 199, 364, 360, 186,\n            ...\n            292, 242, 277, 211, 359, 195, 251, 323, 192, 172],\n           dtype='int64', length=146)] are in the [columns]"

In [115]:
type(X_train)

pandas.core.frame.DataFrame

In [69]:
type(y_train)

pandas.core.series.Series

In [116]:
y_train.index

Int64Index([179, 374, 228, 341, 108,  46, 306, 382, 199,  73,
            ...
            211,   9, 359, 195, 251, 323, 192, 117,  47, 172],
           dtype='int64', length=229)

# Word2Vec or "Word to Vector": word2vec has two options that are the inverse of one another:

Continuous Bag of Words (CBOW): the identity of a word is predicted using the words near it in a sentence.
_Skip-gram_: The identities of words are predicted from the word they surround. Skip-gram seems to work better for larger corpuses.

In [117]:
pip install gensim 

Collecting gensim
[?25l  Downloading https://files.pythonhosted.org/packages/70/cf/87b25b265d23498b2b70ce873495cf7ef91394c4baff240210e26f3bc18a/gensim-3.8.3-cp37-cp37m-macosx_10_9_x86_64.whl (24.2MB)
[K     |████████████████████████████████| 24.2MB 863kB/s eta 0:00:01    |█████████▉                      | 7.5MB 3.6MB/s eta 0:00:05     |███████████████████▌            | 14.7MB 586kB/s eta 0:00:17     |██████████████████████▎         | 16.8MB 650kB/s eta 0:00:12
Collecting smart-open>=1.8.1 (from gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/74/77/744c79da6e66691e3500b6dffff29bdd787015eae817d594791edc7b719b/smart_open-2.0.0.tar.gz (103kB)
[K     |████████████████████████████████| 112kB 3.6MB/s eta 0:00:01
Collecting boto3 (from smart-open>=1.8.1->gensim)
[?25l  Downloading https://files.pythonhosted.org/packages/ed/17/9370fbd932444ea77de83543b4ac3338816e5202f7d89b3184165f285ae4/boto3-1.14.10-py2.py3-none-any.whl (128kB)
[K     |████████████████████████████████|

In [118]:
# Utility function to clean text.
def text_cleaner(text):
    
    # Visual inspection shows spaCy does not recognize the double dash '--'.
    # Better get rid of it now!
    text = re.sub(r'--',' ',text)
    
    # Get rid of headings in square brackets.
    text = re.sub("[\[].*?[\]]", "", text)
    
    # Get rid of chapter titles.
    text = re.sub(r'Chapter \d+','',text)
    
    # Get rid of extra whitespace.
    text = ' '.join(text.split())
    
    return text[0:900000]


# Import all the Austen in the Project Gutenberg corpus.
austen = ""
for novel in ['persuasion','emma','sense']:
    work = gutenberg.raw('austen-' + novel + '.txt')
    austen = austen + work

# Clean the data.
austen_clean = text_cleaner(austen)

In [119]:
# Parse the data. This can take some time.
nlp = spacy.load('en')
austen_doc = nlp(austen_clean)

In [120]:
# Organize the parsed doc into sentences, while filtering out punctuation
# and stop words, and converting words to lower case lemmas.
sentences = []
for sentence in austen_doc.sents:
    sentence = [
        token.lemma_.lower()
        for token in sentence
        if not token.is_stop
        and not token.is_punct
    ]
    sentences.append(sentence)


print(sentences[20])
print('We have {} sentences and {} tokens.'.format(len(sentences), len(austen_clean)))

['daughter', 'eld', 'give', 'thing', 'tempt']
We have 8323 sentences and 900000 tokens.


In [121]:
import gensim
from gensim.models import word2vec

model = word2vec.Word2Vec(
    sentences,
    workers=4,     # Number of threads to run in parallel (if your computer does parallel processing).
    min_count=10,  # Minimum word count threshold.
    window=6,      # Number of words around target word to consider.
    sg=0,          # Use CBOW because our corpus is small.
    sample=1e-3 ,  # Penalize frequent words.
    size=300,      # Word vector length.
    hs=1           # Use hierarchical softmax.
)

print('done!')

done!


In [122]:
# List of words in model.
vocab = model.wv.vocab.keys()

print(model.wv.most_similar(positive=['lady', 'man'], negative=['woman']))

# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.
print(model.wv.similarity('mr', 'mrs'))

# One of these things is not like the other...
print(model.doesnt_match("breakfast marriage dinner lunch".split()))

[('goddard', 0.9235806465148926), ('musgrove', 0.9205349087715149), ('benwick', 0.9172942042350769), ('harville', 0.9101818799972534), ('clay', 0.9054336547851562), ('wentworth', 0.8567952513694763), ('smith', 0.8566604852676392), ('excessively', 0.8455789685249329), ('colonel', 0.8392648100852966), ('navy', 0.8337574005126953)]
0.888222
marriage


  # This is added back by InteractiveShellApp.init_path()
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


In [123]:
# Load Google's pre-trained Word2Vec model.
model_google = gensim.models.KeyedVectors.load_word2vec_format 
('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary=True)

SyntaxError: invalid syntax (<ipython-input-123-84ccc2c5adc8>, line 3)