In [12]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import shakespeare, stopwords, gutenberg
import nltk
from nltk.collocations import *
from collections import Counter

from sklearn import ensemble
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

import xml.etree.ElementTree as ET

In [13]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [20]:
caesar = gutenberg.raw('shakespeare-caesar.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')

In [21]:
print('\nRaw:\n', caesar[0:250])


Raw:
 [The Tragedie of Julius Caesar by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Flauius, Murellus, and certaine Commoners ouer the Stage.

  Flauius. Hence: home you idle Creatures, get you home:
Is this a Holiday? What, know you not


In [22]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [23]:
#Clean the docs
caesar = text_cleaner(caesar)
hamlet = text_cleaner(hamlet)
macbeth = text_cleaner(macbeth)

In [24]:
print('\nRaw:\n', caesar[0:250])


Raw:
 Actus Primus. Scoena Prima. Enter Flauius, Murellus, and certaine Commoners ouer the Stage. Flauius. Hence: home you idle Creatures, get you home: Is this a Holiday? What, know you not (Being Mechanicall) you ought not walke Vpon a labouring day, wit


In [25]:
#Parse the docs
nlp = spacy.load('en')
caesar_doc = nlp(caesar)
hamlet_doc = nlp(hamlet)
macbeth_doc = nlp(macbeth)

In [26]:
# Group into sentences.
caesar_sents = [[sent, "Caesar"] for sent in caesar_doc.sents]
hamlet_sents = [[sent, "Hamlet"] for sent in hamlet_doc.sents]
macbeth_sents = [[sent, "MacBeth"] for sent in macbeth_doc.sents]

In [27]:
sentences = pd.DataFrame(caesar_sents + hamlet_sents + macbeth_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Actus, Primus, .)",Caesar
1,"(Scoena, Prima, .)",Caesar
2,"(Enter, Flauius, ,, Murellus, ,, and, certaine...",Caesar
3,"(Flauius, .)",Caesar
4,"(Hence, :, home, you, idle, Creatures, ,, get,...",Caesar


In [28]:
# Create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

In [30]:
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

In [31]:
caesar_words = bag_of_words(caesar_doc)
hamlet_words = bag_of_words(hamlet_doc)
macbeth_words = bag_of_words(macbeth_doc)

In [32]:
# Combine bags to create a set of unique words.
common_words = set(caesar_words + hamlet_words + macbeth_words)

In [33]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450
Processing row 500
Processing row 550
Processing row 600
Processing row 650
Processing row 700
Processing row 750
Processing row 800
Processing row 850
Processing row 900
Processing row 950
Processing row 1000
Processing row 1050
Processing row 1100
Processing row 1150
Processing row 1200
Processing row 1250
Processing row 1300
Processing row 1350
Processing row 1400
Processing row 1450
Processing row 1500
Processing row 1550
Processing row 1600
Processing row 1650
Processing row 1700
Processing row 1750
Processing row 1800
Processing row 1850
Processing row 1900
Processing row 1950
Processing row 2000
Processing row 2050
Processing row 2100
Processing row 2150
Processing row 2200
Processing row 2250
Processing row 2300
Processing row 2350
Processing row 2400
Processing row 2450
Processing row 2500
Pro

Unnamed: 0,knottie,dishonourable,neu'r,of,briefe,rore,bar,expedition,meeting,lapt,...,fierie,brow,weaknesse,article,terror,naughty,stalke,vnmannerly,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Actus, Primus, .)",Caesar
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Scoena, Prima, .)",Caesar
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Enter, Flauius, ,, Murellus, ,, and, certaine...",Caesar
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Flauius, .)",Caesar
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Hence, :, home, you, idle, Creatures, ,, get,...",Caesar


In [199]:
# Define X and Y and the train, test split
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)

### Models

In [34]:
# Random Forest
rfc = ensemble.RandomForestClassifier()

train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9574518100611189

Test set score: 0.6826516220028209


As with every other corpus, the random forest suffers from gross overfitting.

In [35]:
# Logistic Regression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(4254, 4061) (4254,)
Training set score: 0.8951574988246357

Test set score: 0.7291960507757405


Logistic Regression is better than Random Forest, but still pretty bad. In these exercises, one model has always beaten them out. And that is...

In [36]:
# Gradient Boosting

clf1 = ensemble.GradientBoostingClassifier()
train1 = clf1.fit(X_train, y_train)

print('Training set score:', clf1.score(X_train, y_train))
print('\nTest set score:', clf1.score(X_test, y_test))

Training set score: 0.6817113305124589

Test set score: 0.6734837799717912


As with every other one of these supervised learning methods for NLP, Gradient Boosting well outperforms the Logistic Progression and Random Forest. I'll tune this to maximize my score without overfitting. 

In [204]:
# Gradient Boosting with double the n_estimators
# This took several minutes

clf2 = ensemble.GradientBoostingClassifier(n_estimators=200, max_depth=3)
train2 = clf2.fit(X_train, y_train)

print('Training set score:', clf2.score(X_train, y_train))
print('\nTest set score:', clf2.score(X_test, y_test))

Training set score: 0.7512929007992478

Test set score: 0.7009873060648801


In [201]:
# Gradient Boosting with max_depth=4 instead of 3

clf3 = ensemble.GradientBoostingClassifier(n_estimators=100, max_depth=4)
train3 = clf3.fit(X_train, y_train)

print('Training set score:', clf3.score(X_train, y_train))
print('\nTest set score:', clf3.score(X_test, y_test))

Training set score: 0.7186177715091678

Test set score: 0.688293370944993


So the increase in estimators had the better effect. increasing depth only added to the overfitting. Adding more estimators might increase it. But the processingn time will increase as well. 

In [205]:
# Gradient Boosting with triple the n_estimators
# This took several minutes

clf4 = ensemble.GradientBoostingClassifier(n_estimators=300, max_depth=3)
train4 = clf4.fit(X_train, y_train)

print('Training set score:', clf4.score(X_train, y_train))
print('\nTest set score:', clf4.score(X_test, y_test))

Training set score: 0.7837329572167372

Test set score: 0.7165021156558533


In [203]:
# Gradient Boosting with quadruple the n_estimators
# This took several minutes

clf5 = ensemble.GradientBoostingClassifier(n_estimators=400, max_depth=3)
train5 = clf5.fit(X_train, y_train)

print('Training set score:', clf5.score(X_train, y_train))
print('\nTest set score:', clf5.score(X_test, y_test))

Training set score: 0.8178185237423601

Test set score: 0.7182651622002821


In [207]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf2, X_test, y_test, cv=5)

array([0.66725352, 0.67253521, 0.68838028, 0.68077601, 0.68849558])

In [208]:
cross_val_score(clf4, X_test, y_test, cv=5)

array([0.66901408, 0.67429577, 0.71126761, 0.6984127 , 0.70088496])

Determining which play among three plays written by the same author was always going to be a lower scoring model. However, the Gradient Boosting with a higher number of estimators seems to outperform the other models and yield good results. 

## TFIDF

In [37]:
word_counts.shape

(7090, 4063)

In [None]:
emma=gutenberg.paras('austen-emma.txt')
#processing
emma_paras=[]
for paragraph in emma:
    para=paragraph[0]
    #removing the double-dash from all words
    para=[re.sub(r'--','',word) for word in para]
    #Forming each paragraph into a string and adding it to the list of strings.
    emma_paras.append(' '.join(para))

print(emma_paras[0:4])

In [159]:
caesar_p = gutenberg.paras('shakespeare-caesar.txt')
macbeth_p = gutenberg.paras('shakespeare-macbeth.txt')
hamlet_p = gutenberg.paras('shakespeare-hamlet.txt')

In [160]:
print(len(caesar_p))
print(len(macbeth_p))
print(len(caesar_p + macbeth_p + hamlet_p))

744
678
2372


In [161]:
caesar_p[10:500]

[[['Fla', '.'], ['But', 'wherefore', 'art', 'not', 'in', 'thy', 'Shop', 'to', 'day', '?'], ['Why', 'do', "'", 'st', 'thou', 'leade', 'these', 'men', 'about', 'the', 'streets', '?'], ['Cob', '.'], ['Truly', 'sir', ',', 'to', 'weare', 'out', 'their', 'shooes', ',', 'to', 'get', 'my', 'selfe', 'into', 'more', 'worke', '.'], ['But', 'indeede', 'sir', ',', 'we', 'make', 'Holyday', 'to', 'see', 'Caesar', ',', 'and', 'to', 'reioyce', 'in', 'his', 'Triumph']], [['Mur', '.'], ['Wherefore', 'reioyce', '?'], ['What', 'Conquest', 'brings', 'he', 'home', '?'], ['What', 'Tributaries', 'follow', 'him', 'to', 'Rome', ',', 'To', 'grace', 'in', 'Captiue', 'bonds', 'his', 'Chariot', 'Wheeles', '?'], ['You', 'Blockes', ',', 'you', 'stones', ',', 'you', 'worse', 'then', 'senslesse', 'things', ':', 'O', 'you', 'hard', 'hearts', ',', 'you', 'cruell', 'men', 'of', 'Rome', ',', 'Knew', 'you', 'not', 'Pompey', 'many', 'a', 'time', 'and', 'oft', '?'], ['Haue', 'you', 'climb', "'", 'd', 'vp', 'to', 'Walles', 'and

In [162]:
len(caesar_p[5])

3

In [163]:
caesar_p[5]

[['Mur', '.'],
 ['But', 'what', 'Trade', 'art', 'thou', '?'],
 ['Answer', 'me', 'directly']]

In [164]:
# Important to note that the vectorizer has to iterate over a corpus, not a string, list, or even a parsed doc
type(caesar_p)

nltk.corpus.reader.util.StreamBackedCorpusView

In [185]:
caesar_paras = []
macbeth_paras = []
hamlet_paras = []

In [186]:
for paragraph in caesar_p:
    para = paragraph[0]
    caesar_paras.append(' '.join(para))
    
for paragraph in macbeth_p:
    para = paragraph[0]
    macbeth_paras.append(' '.join(para))

for paragraph in hamlet_p:
    para = paragraph[0]
    hamlet_paras.append(' '.join(para))

In [187]:
print(caesar_paras[0:4])

print(macbeth_paras[0:4])

print(hamlet_paras[100:110])

['[ The Tragedie of Julius Caesar by William Shakespeare 1599 ]', 'Actus Primus .', 'Enter Flauius , Murellus , and certaine Commoners ouer the Stage .', 'Flauius .']
['[ The Tragedie of Macbeth by William Shakespeare 1603 ]', 'Actus Primus .', 'Thunder and Lightning .', '1 .']
['Ham .', 'Ham .', 'Ham .', 'Ham .', 'Ham .', 'Ham .', 'Ham .', 'Ham .', 'Hor .', 'Ham .']


In [168]:
type(caesar_paras)

list

In [172]:
s_paras = list(caesar_paras+macbeth_paras+hamlet_paras)

In [173]:
type(s_paras)

list

In [188]:
X_train, X_test = train_test_split(caesar_paras, test_size=0.4, random_state=0)

In [189]:
vectorizer = TfidfVectorizer(max_df=0.2, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

In [190]:
#Applying the vectorizer for all three docs
s_paras_tfidf=vectorizer.fit_transform(s_paras)
print("Number of features: %d" % s_paras_tfidf.get_shape()[1])

Number of features: 330


In [180]:
#Applying the vectorizer for Ceasar
caesar_paras_tfidf=vectorizer.fit_transform(caesar_paras)
print("Number of features: %d" % caesar_paras_tfidf.get_shape()[1])

Number of features: 138


In [182]:
#Applying the vectorizer for Macbeth
macbeth_paras_tfidf=vectorizer.fit_transform(macbeth_paras)
print("Number of features: %d" % macbeth_paras_tfidf.get_shape()[1])

Number of features: 109


In [183]:
#Applying the vectorizer for Hamlet
# Look how few features it generated compared to the others. 
hamlet_paras_tfidf=vectorizer.fit_transform(hamlet_paras)
print("Number of features: %d" % hamlet_paras_tfidf.get_shape()[1])

Number of features: 69


In [184]:
# Here is the length of Hamlet
len(hamlet_paras)

950

In [209]:
# WHich is longer than Caesar, which yielded more features than Hamlet. Interesting
len(caesar_paras)

744

In [191]:
#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(s_paras_tfidf, test_size=0.4, random_state=0)

In [192]:
#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

In [193]:
#number of paragraphs
n = X_train_tfidf_csr.shape[0]

In [194]:
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]

In [195]:
#List of features
terms = vectorizer.get_feature_names()

In [196]:
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

In [197]:
#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[50])
print('Tf_idf vector:', tfidf_bypara[50])

Original sentence: Luc .
Tf_idf vector: {'ham': 1.0}


THe paragraphs are breaking up in to words, each with a vector of 1.0. Useless clearly. Maybe I'll try sentences. Clean them first. 

In [117]:
caesar = text_cleaner(caesar)
hamlet = text_cleaner(hamlet)
macbeth = text_cleaner(macbeth)

In [118]:
type(caesar)

str

In [119]:
type(caesar_p)

nltk.corpus.reader.util.StreamBackedCorpusView

In [126]:
len(caesar)

109364

In [127]:
#Parse the docs
nlp = spacy.load('en')
caesar_doc = nlp(caesar)
hamlet_doc = nlp(hamlet)
macbeth_doc = nlp(macbeth)

In [128]:
# Group into sentences.
caesar_sents = [[sent] for sent in caesar_doc.sents]
hamlet_sents = [[sent] for sent in hamlet_doc.sents]
macbeth_sents = [[sent] for sent in macbeth_doc.sents]

In [130]:
caesar_sents[:10]

[[Actus Primus.],
 [Scoena Prima.],
 [Enter Flauius, Murellus, and certaine Commoners ouer the Stage.],
 [Flauius.],
 [Hence: home you idle Creatures, get you home:],
 [Is this a Holiday?],
 [What, know you not (Being Mechanicall)],
 [you ought not walke],
 [Vpon a labouring day, without the signe Of your Profession?],
 [Speake, what Trade art thou?]]

In [133]:
type(caesar_sents)

list

In [131]:
len(caesar_sents)

2144

In [145]:
caesar_tfidf=vectorizer.fit_transform(caesar)
print("Number of features: %d" % caesar_tfidf.get_shape()[1])

ValueError: Iterable over raw text documents expected, string object received.

I've started running into errors where my vectorizer won't iterate over the other forms of the docs. If this is the case, then this unsupervised learning method may not be effective. 

In [144]:
caesar = gutenberg.raw('shakespeare-caesar.txt')

In [136]:
type(caesar)

str

In [137]:
caesar_paras = []
macbeth_paras = []
hamlet_paras = []

In [138]:
for paragraph in caesar:
    para = paragraph[0]
    caesar_paras.append(' '.join(para))

In [139]:
print(caesar_paras[:10])

['[', 'T', 'h', 'e', ' ', 'T', 'r', 'a', 'g', 'e']


It's the way the .paras function of the text doc right at the beginning is reading over it. Anything else isn't working.

In [146]:
len(caesar)

112310

In [147]:
print(caesar[:10000])

[The Tragedie of Julius Caesar by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Flauius, Murellus, and certaine Commoners ouer the Stage.

  Flauius. Hence: home you idle Creatures, get you home:
Is this a Holiday? What, know you not
(Being Mechanicall) you ought not walke
Vpon a labouring day, without the signe
Of your Profession? Speake, what Trade art thou?
  Car. Why Sir, a Carpenter

   Mur. Where is thy Leather Apron, and thy Rule?
What dost thou with thy best Apparrell on?
You sir, what Trade are you?
  Cobl. Truely Sir, in respect of a fine Workman, I am
but as you would say, a Cobler

   Mur. But what Trade art thou? Answer me directly

   Cob. A Trade Sir, that I hope I may vse, with a safe
Conscience, which is indeed Sir, a Mender of bad soules

   Fla. What Trade thou knaue? Thou naughty knaue,
what Trade?
  Cobl. Nay I beseech you Sir, be not out with me: yet
if you be out Sir, I can mend you

   Mur. What mean'st thou by that? Mend mee, thou
sawcy Fellow?

Bru


In [148]:
print(caesar_p[:200])

[[['[', 'The', 'Tragedie', 'of', 'Julius', 'Caesar', 'by', 'William', 'Shakespeare', '1599', ']']], [['Actus', 'Primus', '.'], ['Scoena', 'Prima', '.']], ...]


In [149]:
type(caesar)

str

In [150]:
type(caesar_p)

nltk.corpus.reader.util.StreamBackedCorpusView

In [151]:
type(caesar_doc)

spacy.tokens.doc.Doc

In [152]:
caesar_doc_tfidf=vectorizer.fit_transform(caesar_doc)
print("Number of features: %d" % caesar_doc_tfidf.get_shape()[1])

TypeError: 'int' object is not callable

In [153]:
caesar_paras = []

for paragraph in caesar_doc:
    para = paragraph[0]
    caesar_paras.append(' '.join(para))

TypeError: 'spacy.tokens.token.Token' object does not support indexing

This cevtorizer hasn't worked on anything else. THat's odd. It's only worked on a:

- corpus .paras function (nltk.corpus.reader.util.StreamBackedCorpusView)
- to a list of those paragraphs through a for loop (list)

### TF-IDF is not working for these docs. It's just a bad fit. The supervised method is proving superior in this set of corpuses. 