In [5]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords

## Load and clean data

In [6]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load our datasets
caesar = gutenberg.raw('shakespeare-caesar.txt')
moby_dick = gutenberg.raw('melville-moby_dick.txt')

caesar = text_cleaner(caesar)
moby_dick = text_cleaner(moby_dick)

In [7]:
nlp = spacy.load('en_core_web_sm')
caesar_doc = nlp(caesar)
moby_doc = nlp(moby_dick[:100000])

In [8]:
caesar_sent = [[sentence, 'Shakespeare'] for sentence in caesar_doc.sents ]
moby_sent = [[sentence, 'Melville'] for sentence in moby_doc.sents ]

In [9]:
sentences = pd.DataFrame(caesar_sent + moby_sent, columns=['sentence', 'author'])
sentences.head()

Unnamed: 0,sentence,author
0,"(Actus, Primus, .)",Shakespeare
1,"(Scoena, Prima, .)",Shakespeare
2,"(Enter, Flauius, ,, Murellus, ,, and, certaine...",Shakespeare
3,"(Flauius, .)",Shakespeare
4,"(Hence, :, home, you, idle, Creatures, ,, get,...",Shakespeare


In [10]:
#convert our sentnces from spaCy spans to str
sentences.sentence = sentences.sentence.apply(lambda x: str(x))

## TFIDF Tokenizer

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english',
                             max_df=10,
                             min_df=2 )

vector = vectorizer.fit_transform(sentences.sentence)

columns = vectorizer.get_feature_names()

X = pd.DataFrame(vector.toarray(), columns=columns)
y = sentences.author

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

## Building Models

In [13]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

rfc.fit(X_train, y_train)

print('The train score is {}'.format(rfc.score(X_train, y_train)))
print('The test score is {}'.format(rfc.score(X_test, y_test)))



The train score is 0.9054820415879017
The test score is 0.7854107648725213


In [14]:
from sklearn.svm import SVC

svc = SVC()

svc.fit(X_train, y_train)

print('The train score is {}'.format(svc.score(X_train, y_train)))
print('The test score is {}'.format(svc.score(X_test, y_test)))



The train score is 0.6625708884688091
The test score is 0.6692634560906515


In [15]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(400)
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

Percent variance captured by all components: 60.79290995428792


In [16]:
X_train_lsa.shape

(2116, 400)

In [17]:
X_test_lsa = lsa.fit_transform(X_test)

rfc.fit(X_train_lsa, y_train)
svc.fit(X_train_lsa, y_train)

print('The train score is {}'.format(rfc.score(X_train_lsa, y_train)))
print('The test score is {}'.format(rfc.score(X_test_lsa, y_test)))
print('')
print('The train score is {}'.format(svc.score(X_train_lsa, y_train)))
print('The test score is {}'.format(svc.score(X_test_lsa, y_test)))



The train score is 0.9215500945179584
The test score is 0.646600566572238

The train score is 0.6625708884688091
The test score is 0.6692634560906515


So, using just our tfidf vectorizer and our lsa seem to produce similar results.

Let's see if we can increase our svc to above 70%.

In [18]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

svc = SVC()

clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(X_train, y_train)

print('The score is {}'.format(clf.score(X_test, y_test)))
print('The best parameters are:')
print(clf.best_estimator_)



The score is 0.7861189801699717
The best parameters are:
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)


In [19]:
print('Training Score:{}'.format(clf.score(X_train, y_train)))
print('Test Score:{}'.format(clf.score(X_test, y_test)))

Training Score:0.9064272211720227
Test Score:0.7861189801699717


Hmm, this seems to overfit. Let's try to hit our 70% benchmark in other ways.

## SpaCY and BOW

In [35]:
#revert back to before we cleaned our 'sentence' column
sentences = pd.DataFrame(caesar_sent + moby_sent, columns=['sentence', 'author'])

In [36]:
sentences.head()

Unnamed: 0,sentence,author
0,"(Actus, Primus, .)",Shakespeare
1,"(Scoena, Prima, .)",Shakespeare
2,"(Enter, Flauius, ,, Murellus, ,, and, certaine...",Shakespeare
3,"(Flauius, .)",Shakespeare
4,"(Hence, :, home, you, idle, Creatures, ,, get,...",Shakespeare


In [37]:
from collections import Counter

#creates 2000 most used words
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['sentence'] = sentences.sentence
    df['author'] = sentences.author
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
caesar_words = bag_of_words(caesar_doc)
moby_words = bag_of_words(moby_doc)

# Combine bags to create a set of unique words.
common_words = set(caesar_words + moby_words)

In [38]:
#create our df
df = bow_features(sentences, common_words)
df.head()

Processing row 0
Processing row 50
Processing row 100
Processing row 150
Processing row 200
Processing row 250
Processing row 300
Processing row 350
Processing row 400
Processing row 450
Processing row 500
Processing row 550
Processing row 600
Processing row 650
Processing row 700
Processing row 750
Processing row 800
Processing row 850
Processing row 900
Processing row 950
Processing row 1000
Processing row 1050
Processing row 1100
Processing row 1150
Processing row 1200
Processing row 1250
Processing row 1300
Processing row 1350
Processing row 1400
Processing row 1450
Processing row 1500
Processing row 1550
Processing row 1600
Processing row 1650
Processing row 1700
Processing row 1750
Processing row 1800
Processing row 1850
Processing row 1900
Processing row 1950
Processing row 2000
Processing row 2050
Processing row 2100
Processing row 2150
Processing row 2200
Processing row 2250
Processing row 2300
Processing row 2350
Processing row 2400
Processing row 2450
Processing row 2500
Pro

Unnamed: 0,Tho,suite,Spirits,LUCIAN,heap'd,father,Flint,eye,plunge,obey,...,flatter,oft,petticoat,Job,ARCTIC,presse,lend,swet,need,sentence
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"(Actus, Primus, .)"
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"(Scoena, Prima, .)"
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"(Enter, Flauius, ,, Murellus, ,, and, certaine..."
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"(Flauius, .)"
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"(Hence, :, home, you, idle, Creatures, ,, get,..."


In [45]:
for columns in df.columns:
    print(columns)

Tho
suite
Spirits
LUCIAN
heap'd
father
Flint
eye
plunge
obey
ECKERMANN
VESSEL
hugest
counsel
let
euer
anchor
Commoners
Moluccas
reioyce
Mother
like
Tapor
Pompey
dreamy
MEMORIAL
woodland
comical
gill
heere
Earth
grim
writ
bench
magic
drawne
window
ring
know'st
Slumber
tablet
intend
finish
neuer
dusky
Seas
statement
preuayl'd
mortal
avenue
SPERMACETI
Gray
bright
abundant
stuff
seaward
Cin
sweatie
vp
legge
breeding
black
worm
Floundered
Manhattoes
selfe
corn
-PRON-
joy
largely
loose
beleeue
APOLOGY
Alchymie
pocket
wee
sense
deadly
Master
Prairies
deale
desert
arm'd
teach
final
lexicon
soule
cowper
right
help
trust
agony
HOMEWARD
Workman
person
piggledy
choake
BENNETT
english
Balaene
Kerchiefe
ERECTION
behold
Chariot
LETTERS
satisfactory
willing
passion
reciprocal
aliue
drag
bid
swiftness
preuaile
drinke
swam
Sacrifice
businesse
noise
YORK
Vaticans
returne
public
Romane
Skie
pipe
vnderling
JOB
Neighbors
swear
prevent
bread
Oathes
perisheth
triumph
breath
hound
OCEAN
button
daye
prove
spend

SCORESBY
tongue
Insurrection
hook
spile
bird
call'd
deck
howl
restless
answere
tattoo
coast
Vnder
Iulius
prickt
Womanish
fate
Lucillius
hold
harke
murther
CHARLES
seruile
incompast
roll
pulpit
Whiles
Free
storm
tempt
iust
vnaccustom'd
cloth
woollen
dreadful
view
Physicall
endeavor
benefit
vrge
moe
Speeches
Treb
Ser
arch
ancient
remain'd
pale
Noble
league
retaking
Indian
unbroken
bulwark
maske
attention
RABELAIS
dost
enrol
tomahawk
insupportable
Tempests
broil
whit
Suburbs
sound
breed
impossible
creature
Whil'st
Closet
Ciuill
forecastle
whilst
braue
big
fare
scan
theology
tweene
opinion
withall
inform
tyding
boat
Inn
hate
Round
Wee'l
vse
fear'd
Sweare
HVALT
eld
sawe
cover
boldness
Niagara
man
tell
extremity
sutor
Course
Brasse
preferre
nightmare
Winds
region
telling
did'st
Point
fleare
Glories
mention
incredible
stranger
DAVENANT
humble
Glasse
thine
pike
mee
chest
authentic
monkey
sore
calm
vile
SPEECH
wonder
field
strew
applause
ioy
corner
anglo
behauiour
wherefore
weigh
Seale
Captiuit

limitation
Wit
Flowers
steady
teare
DICTIONARY
forme
saue
foure
mile
high
craft
HOLLAND
Crossed
Villaines
East
northern
1690
October
work
2
handful
Queequeg
bruise
random
certainly
Ledyard
Friends
fold
devilish
cottage
middle
annals
Hosmannus
Sennit
mock'd
cride
Death
strength
repute
backe
sowre
wave
puzzle
Primus
note
seeme
MONTGOMERY
Vnto
ASIA
Run
Octa
vnscorch'd
discover
merry
Cicero
watery
try
WILLIAM
Instrument
Cassius
velocity
ceti
stabb'd
awful
RAPE
foam
shark
qualitie
sting
repair
weepe
thee
Publius
honour'd
beneath
object
Children
brutus
magnetic
strange
WAL
thriue
American
chapter
red
Brutus
gowne
rest
FIFE
loud
plague
stern
vnknowne
silent
matter
skill
SEBOND
aduantage
monstrous
Kingdome
Liberty
leave
mend
Limbes
SIR
see
MINISTER
bow
deliberate
Ant
saw
bloodie
drawe
seaport
HWAL
worse
Octauius
shewes
youthfull
generally
mattress
Instruments
meditate
neck
painstaking
strengthen
execution
Watch
possession
watch
waistcoat
mooue
sailor
mariner
exasperate
vngentle
artificial
Vali

on't
lock
rash
JOHN
TOOKE
proper
root
remaine
1668
marry
previously
truth
bumpkin
Conspiracie
turne
describe
go
spar
spring
elizabeth
IBID
CAPTORS
Motiues
cetology
holiday
numerous
ay
1
brow
coward
Cas
bespeak
expensive
disperse
natiue
explain
knowne
huge
Street
Cachalot
hideous
Nantuckois
contagion
Brow
shew
dog
quietly
motion
1729
COLNETT
start
hit
spot
fill
Freedome
Calender
veritable
commit
outlandish
tremble
wrong'd
modestly
Warre
gate
BROWNE
vpon
watchfull
augment
FREDERICK
ayme
patient
thought
tomb
vicinity
Exit
gouern'd
car
Gold
Spaine
Steele
sway'd
sheet
persevering
euery
wall
finny
moneth
Beach
mildly
begin
Friend
ascend
hugge
clinch
intermit
save
preserve
cattle
Meane
veyl'd
tiger
Accoutred
grego
Arme
store
greefe
grace
Ishmael
suppose
BEALE
amount
vnderneath
raise
suit
rate
emerge
say
scene
baleen
WHALING
Meales
palter
create
hist
HUNTER
resurrection
comfort
rul'd
scratch'd
nation
property
hamlet
plaster
Green
push
wine
i'th
Sunne
meete
wear
convivial
bin
small
assault
cons

In [47]:
X = df.drop(columns=['sentence', 'author'])
y = sentences['author']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

svc = SVC()

svc.fit(X_train, y_train)
print('Training Score: {}'.format(svc.score(X_train, y_train)))
print('Test Score: {}'.format(svc.score(X_test, y_test)))



Training Score: 0.6593762656946132
Test Score: 0.6789423984891407


Well, that's a little better...