In [55]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk import pos_tag
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import random
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
import warnings
warnings.simplefilter(action='ignore')

Supervised NLP requires a pre-labelled dataset for training and testing, and is generally interested in categorizing text in various ways. In this case, we are going to try to predict whether a sentence comes from _Alice in Wonderland_ by Lewis Carroll or _Persuasion_ by Jane Austen. We can use any of the supervised models we've covered previously, as long as they allow categorical outcomes. In this case, we'll try Random Forests, SVM, and KNN.

Our feature-generation approach will be something called _BoW_, or _Bag of Words_. BoW is quite simple: For each sentence, we count how many times each word appears. We will then use those counts as features.  

In [4]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [5]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [6]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.columns = ['text', 'author']
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


Time to bag some words!  Since spaCy has already tokenized and labelled our data, we can move directly to recording how often various words occur.  We will exclude stopwords and punctuation.  In addition, in an attempt to keep our feature space from exploding, we will work with lemmas (root words) rather than the raw text terms, and we'll only use the 2000 most common words for each text.

In [7]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    
    
def bag_of_lemma(sentences, common_token_lemmas, verbose=False, batch_size=250):
    # use word counts
    lemma_counts = pd.DataFrame(columns=common_token_lemmas)
    i = 0
    for sentence in sentences:
        # repeat this for every sentence
        x = pd.Series(sentence.lemma_.split(' ')).value_counts()
        x = x[pd.Series([str(x) for x in x.index]).isin(common_token_lemmas).values]
        lemma_counts.loc[i,:] = 0
        lemma_counts.loc[i,[str(x) for x in x.index]] = x.values
        if verbose:
            if i % batch_size == 0:
                    print("Processing row {}".format(i))
        i += 1
    return lemma_counts

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [8]:
len(common_words)

3062

In [9]:
# Convert tokens to lemmas
lmtzr = nltk.WordNetLemmatizer()
tokens = nltk.word_tokenize(alice)
token_lemma = [ lmtzr.lemmatize(token) for token in tokens ]
token_lemma_alice = set(token_lemma)

lmtzr = nltk.WordNetLemmatizer()
tokens = nltk.word_tokenize(persuasion)
token_lemma = [ lmtzr.lemmatize(token) for token in tokens ]
token_lemma_persuasion = set(token_lemma)
common_token_lemmas = token_lemma_alice.union(token_lemma_persuasion)

In [10]:
len(common_token_lemmas)

7146

**Bag of Words**

In [11]:
def my_bag_of_words(sentences_df, common_words, verbose=False, batch_size=250):
    word_counts = pd.DataFrame(columns=common_words)
    i = 0
    for sentence in sentences_df.iloc[:,0]:
        # repeat this for every sentence
        x = pd.Series(sentence).value_counts()
        x = x[pd.Series([str(x) for x in x.index]).isin(common_words).values]
        word_counts.loc[i,:] = 0
        word_counts.loc[i,[str(x) for x in x.index]] = x.values
        if verbose:
            if i % batch_size == 0:
                print("Processing row {}".format(i))
        i += 1
    return word_counts

In [12]:
word_counts = my_bag_of_words(sentences, common_words, verbose=True)

Processing row 0
Processing row 250
Processing row 500
Processing row 750
Processing row 1000
Processing row 1250
Processing row 1500
Processing row 1750
Processing row 2000
Processing row 2250
Processing row 2500
Processing row 2750
Processing row 3000
Processing row 3250
Processing row 3500
Processing row 3750
Processing row 4000
Processing row 4250
Processing row 4500
Processing row 4750
Processing row 5000
Processing row 5250


In [60]:
# # use word counts
# word_counts = pd.DataFrame(columns=common_words)
# i = 0
# for sentence in sentences.iloc[:,0]:
#     # repeat this for every sentence
#     x = pd.Series(sentence).value_counts()
#     x = x[pd.Series([str(x) for x in x.index]).isin(common_words).values]
#     word_counts.loc[i,:] = 0
#     word_counts.loc[i,[str(x) for x in x.index]] = x.values
#     i += 1
#     if i % 250 == 0:
#             print("Processing row {}".format(i))

Processing row 300
Processing row 600
Processing row 900
Processing row 1200
Processing row 1500
Processing row 1800
Processing row 2100
Processing row 2400
Processing row 2700
Processing row 3000
Processing row 3300
Processing row 3600
Processing row 3900
Processing row 4200
Processing row 4500
Processing row 4800
Processing row 5100


In [13]:
lemma_counts = bag_of_lemma(sentences.iloc[:,0], common_token_lemmas, verbose=True)

Processing row 0
Processing row 250
Processing row 500
Processing row 750
Processing row 1000
Processing row 1250
Processing row 1500
Processing row 1750
Processing row 2000
Processing row 2250
Processing row 2500
Processing row 2750
Processing row 3000
Processing row 3250
Processing row 3500
Processing row 3750
Processing row 4000
Processing row 4250
Processing row 4500
Processing row 4750
Processing row 5000
Processing row 5250


In [99]:
# # use word counts
# lemma_counts = pd.DataFrame(columns=common_token_lemmas)
# i = 0
# for sentence in sentences.iloc[:,0]:
#     # repeat this for every sentence
#     x = pd.Series(0sentence.lemma_.split(' ')).value_counts()
#     x = x[pd.Series([str(x) for x in x.index]).isin(common_token_lemmas).values]
#     lemma_counts.loc[i,:] = 0
#     lemma_counts.loc[i,[str(x) for x in x.index]] = x.values
#     i += 1
#     if i % 250 == 0:
#             print("Processing row {}".format(i))

Processing row 100
Processing row 200
Processing row 300
Processing row 400
Processing row 500
Processing row 600
Processing row 700
Processing row 800
Processing row 900
Processing row 1000
Processing row 1100
Processing row 1200
Processing row 1300
Processing row 1400
Processing row 1500
Processing row 1600
Processing row 1700
Processing row 1800
Processing row 1900
Processing row 2000
Processing row 2100
Processing row 2200
Processing row 2300
Processing row 2400
Processing row 2500
Processing row 2600
Processing row 2700
Processing row 2800
Processing row 2900
Processing row 3000
Processing row 3100
Processing row 3200
Processing row 3300
Processing row 3400
Processing row 3500
Processing row 3600
Processing row 3700
Processing row 3800
Processing row 3900
Processing row 4000
Processing row 4100
Processing row 4200
Processing row 4300
Processing row 4400
Processing row 4500
Processing row 4600
Processing row 4700
Processing row 4800
Processing row 4900
Processing row 5000
Processin

In [14]:
# create target variable
# Here 0 indicates that Caroll was the author
# and if 1 then Austen is the author
Y = (sentences.author == 'Austen').astype(int).values
X = word_counts.astype(int).values

rfc = ensemble.RandomForestClassifier()

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.977742946708464

Test set score: 0.7570488721804511


In [15]:
# now use lemma instead of token
# create target variable
Y_lemma = Y
X_lemma = lemma_counts.astype(int).values

rfc_lemma = ensemble.RandomForestClassifier()

X_lemma_train, X_lemma_test, y_lemma_train, y_lemma_test = train_test_split(X_lemma, 
                                                    Y_lemma,
                                                    test_size=0.4,
                                                    random_state=0)
train_lemma = rfc_lemma.fit(X_lemma_train, y_lemma_train)

print('Training set score:', rfc_lemma.score(X_lemma_train, y_lemma_train))
print('\nTest set score:', rfc_lemma.score(X_lemma_test, y_lemma_test))

Training set score: 0.9937304075235109

Test set score: 0.8834586466165414


## Trying out BoW

Now let's give the bag of words features a whirl by trying a random forest.

Holy overfitting, Batman! Overfitting is a known problem when using bag of words, since it basically involves throwing a massive number of features at a model – some of those features (in this case, word frequencies) will capture noise in the training set. Since overfitting is also a known problem with Random Forests, the divergence between training score and test score is expected.


## BoW with Logistic Regression

Let's try a technique with some protection against overfitting due to extraneous features – logistic regression with ridge regularization (from ridge regression, also called L2 regularization).

In [16]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3190, 3062) (3190,)
Training set score: 0.9178683385579938

Test set score: 0.8369360902255639


In [17]:
lr_lemma = LogisticRegression()
train_lemma = lr_lemma.fit(X_lemma_train, y_lemma_train)
print(X_lemma_train.shape, y_lemma_train.shape)
print('Training set score:', lr_lemma.score(X_lemma_train, y_lemma_train))
print('\nTest set score:', lr_lemma.score(X_lemma_test, y_lemma_test))

(3190, 7146) (3190,)
Training set score: 0.9733542319749217

Test set score: 0.943609022556391


Logistic regression performs a bit better than the random forest.  

# BoW with Gradient Boosting

And finally, let's see what gradient boosting can do:

In [18]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.8213166144200627

Test set score: 0.7800751879699248


In [19]:
clf_lemma = ensemble.GradientBoostingClassifier()
train_lemma = clf_lemma.fit(X_lemma_train, y_lemma_train)

print('Training set score:', clf_lemma.score(X_lemma_train, y_lemma_train))
print('\nTest set score:', clf_lemma.score(X_lemma_test, y_lemma_test))

Training set score: 0.9282131661442006

Test set score: 0.9295112781954887


Looks like logistic regression is the winner, but there's room for improvement.

# Same model, new inputs

What if we feed the model a different novel by Jane Austen, like _Emma_?  Will it be able to distinguish Austen from Carroll with the same level of accuracy if we insert a different sample of Austen's writing?

First, we need to process _Emma_ the same way we processed the other data, and combine it with the Alice data:

In [20]:
# Clean the Emma data.
emma = gutenberg.raw('austen-emma.txt')
emma = re.sub(r'VOLUME \w+', '', emma)
emma = re.sub(r'CHAPTER \w+', '', emma)
emma = text_cleaner(emma)
print(emma[:100])
# Parse our cleaned data.
emma_doc = nlp(emma)
emma_sents = [sent for sent in emma_doc.sents]
# instead of truncating the sentences at the total number in Allice In Wonderland,
# let's sample from the entire text instead. 
# It is my belief that the other technique introduces bias.
emma_sents = random.sample(emma_sents, len(alice_sents))

Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to


In [23]:
word_counts_emma = my_bag_of_words(emma_sents, common_words, verbose=True)

Processing row 0
Processing row 250
Processing row 500
Processing row 750
Processing row 1000
Processing row 1250
Processing row 1500


In [41]:
X_emma_test = np.concatenate((X_test, word_counts_emma.astype(int).values))
y_emma_test = np.concatenate((y_test, np.ones((word_counts_emma.shape[0], ))))
# Model.
print('\nTest set score:', lr.score(X_emma_test, y_emma_test))
lr_emma_predicted = lr.predict(X_emma_test)
pd.crosstab(y_emma_test, lr_emma_predicted)


Test set score: 0.9046615749275744


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,431,225
1.0,137,3004


Well look at that!  NLP approaches are generally effective on the same type of material as they were trained on. It looks like this model is actually able to differentiate multiple works by Austen from Alice in Wonderland.  Now the question is whether the model is very good at identifying Austen, or very good at identifying Alice in Wonderland, or both...

# Challenge 0:

Recall that the logistic regression model's best performance on the test set was 93%.  See what you can do to improve performance.  Suggested avenues of investigation include: Other modeling techniques (SVM?), making more features that take advantage of the spaCy information (include grammar, phrases, POS, etc), making sentence-level features (number of words, amount of punctuation), or including contextual information (length of previous and next sentences, words repeated from one sentence to the next, etc), and anything else your heart desires.  Make sure to design your models on the test set, or use cross_validation with multiple folds, and see if you can get accuracy above 90%.  

An easy way to improve the overall accuracy is to use **lemma counts**, not just token/word counts. This will increase our feature space from **3062 to 7146**.   

In [45]:
lemma_emma_counts = bag_of_lemma(emma_sents, common_token_lemmas)
# add the emma stuff to the X_test and y_test matrices
X_lemma_emma_test = np.concatenate((X_lemma_test, lemma_emma_counts.astype(int).values))
y_lemma_emma_test = np.concatenate((y_lemma_test, np.ones((lemma_emma_counts.shape[0], ))))
# Model.
print('\nTest set score:', lr_lemma.score(X_lemma_emma_test, y_lemma_emma_test))
lr_emma_lemma_predicted = lr_lemma.predict(X_lemma_emma_test)
pd.crosstab(y_lemma_emma_test, lr_emma_lemma_predicted)


Test set score: 0.9515406900184356


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,560,96
1.0,88,3053


**All I did here was use lemma counts instead of token (word) counts.**

# Evaluation

In [95]:
# SVC
svc_lemma = SVC(C=0.05, kernel='linear', random_state=0)
svc_lemma.fit(X_lemma_train, y_lemma_train)
svc_lemma.score(X_lemma_emma_test, y_lemma_emma_test)

0.9570713721358968

**XGBoost**  

In [63]:
xgbc = XGBClassifier()
xgbc.fit(X_lemma_train, y_lemma_train)
xgbc.score(X_lemma_emma_test, y_lemma_emma_test)

0.9363148479427549

In [96]:
nb = MultinomialNB()
nb.fit(X_lemma_train, y_lemma_train)
nb.score(X_lemma_emma_test, y_lemma_emma_test)

0.9328417171451145

In the above cases, our models were trained on 3190 sentences and used to predict 3797 examples.

# Challenge 1:
Find out whether your new model is good at identifying Alice in Wonderland vs any other work, Persuasion vs any other work, or Austen vs any other work.  This will involve pulling a new book from the Project Gutenberg corpus (print(gutenberg.fileids()) for a list) and processing it.

Record your work for each challenge in a notebook and submit it below.

In [260]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


In [67]:
# Clean the Emma data.
milton = gutenberg.raw('milton-paradise.txt')
milton = re.sub(r'VOLUME \w+', '', milton)
milton = re.sub(r'CHAPTER \w+', '', milton)
milton = text_cleaner(milton)
print(milton[:100])
# Parse our cleaned data.
milton_doc = nlp(milton)
milton_sents = [sent for sent in milton_doc.sents]
# instead of truncating the sentences at the total number in Allice In Wonderland,
# let's sample from the entire text instead. 
# It is my belief that the other technique introduces bias.
milton_sents = random.sample(milton_sents, len(alice_sents))

lemma_milton_counts = bag_of_lemma(milton_sents, common_token_lemmas)
# add the emma stuff to the X_test and y_test matrices
X_lemma_milton_test = np.concatenate((X_lemma_test, lemma_milton_counts.astype(int).values))
y_lemma_milton_test = np.concatenate((y_lemma_test, np.ones((lemma_milton_counts.shape[0], ))))
# Model.
print('\nTest set score:', lr_lemma.score(X_lemma_milton_test, y_lemma_milton_test))
lr_milton_lemma_predicted = lr_lemma.predict(X_lemma_milton_test)
pd.crosstab(y_lemma_milton_test, lr_milton_lemma_predicted)

Book I Of Man's first disobedience, and the fruit Of that forbidden tree whose mortal taste Brought 

Test set score: 0.8964972346589413


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,560,96
1.0,297,2844


In [99]:
# SVC
svc_lemma_milton = SVC(C=0.05, kernel='linear', random_state=0)
svc_lemma_milton.fit(X_lemma_train, y_lemma_train)
svc_lemma_milton.score(X_lemma_milton_test, y_lemma_milton_test)

0.9362654727416382

# Conclusion  

Just tokenization resulted in poor performance compared to lemmatization. This simply change resulted in multiple models that achieved over 90% accuracy. With a little tuning, an SVM classifier was the most accurate (on the test set) at 95.7%. The features that I engineered did not add any predictive power, so I did not include that code in this notebook but is available on request. Those features included: sentence length, average word length, POS tagging (nouns, verbs, adjectives, etc.), polarity (negative, neutral, positive, compound).  

It should be noted that a Naive Bayes classifier worked very well for this data and was especially fast to train. The logistic regression model was also quick to train and was the very accurate. 