In [194]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re 
from nltk.corpus import gutenberg, stopwords
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
pd.options.display.float_format = '{:.4f}'.format

In [204]:
def text_cleaner(text):
    text = re.sub(r'--', ' ', text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r'Chapter \d+', '', text)
    text = re.sub('[^A-Za-z]', ' ', text)
    text = ' '.join(text.split())
    return text

moby_dick = ""
moby_dick = gutenberg.raw('melville-moby_dick.txt')
moby_clean = text_cleaner(moby_dick)

macbeth = ""
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
macbeth_clean = text_cleaner(macbeth)

In [205]:
nlp = spacy.load('en', max_length = 1209177)
moby_doc = nlp(moby_clean)
macbeth_doc = nlp(macbeth_clean)

In [206]:
moby_sents = [[sent, 'Moby_Dick'] for sent in moby_doc.sents]
macbeth_sents = [[sent, 'Macbeth'] for sent in macbeth_doc.sents]
sentences = pd.DataFrame(moby_sents + macbeth_sents)
sentences.head(10)

Unnamed: 0,0,1
0,(ETYMOLOGY),Moby_Dick
1,"(Supplied, by, a, Late, Consumptive, Usher, to...",Moby_Dick
2,"(The, pale, Usher, threadbare, in, coat, heart...",Moby_Dick
3,"(I, see, him, now)",Moby_Dick
4,"(He, was, ever, dusting, his, old, lexicons, a...",Moby_Dick
5,"(He, loved, to, dust, his, old, grammars, it, ...",Moby_Dick
6,"(This, animal, is, named, from, roundness, or,...",Moby_Dick
7,"(It, is, more, immediately, from, the, Dut, an...",Moby_Dick
8,"(WHOEL, ANGLO)",Moby_Dick
9,"(SAXON, HVALT, DANISH, WAL, DUTCH, HWAL, SWEDI...",Moby_Dick


In [207]:
df = pd.DataFrame()
df['text_sentence'] = sentences[0]
df['text_source'] = sentences[1]
data = df['text_sentence'].astype('str')

In [227]:
for i, sentence in enumerate(df['text_sentence']):
    words = [token.lemma
            for token in sentence
            if (
                not token.is_punct
                and not token.is_stop)]
    


We will performe in measure time and place So thankes to all at once and to each one Whom we inuite to see vs Crown d at Scone Flourish Exeunt Omnes FINIS THE TRAGEDIE OF MACBETH

In [208]:
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features=1000, stop_words='english')
X = matrix.fit_transform(data).toarray()
columns = matrix.get_feature_names()
array = pd.DataFrame(X, columns=[columns])
array.shape

(9118, 1000)

In [209]:
array['text_sentence'] = df['text_sentence']
array['text_source'] = df['text_source']
array.tail()

Unnamed: 0,aboard,according,account,act,added,advance,advancing,aft,againe,age,...,wound,yards,ye,year,years,yellow,yes,young,text_sentence,text_source
9113,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(That, fled, the, Snares, of, watchfull, Tyran...",Macbeth
9114,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Who, as, tis, thought, by, selfe, and, violen...",Macbeth
9115,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(This, and, what, need, full, else)",Macbeth
9116,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(That, call, s, vpon, vs, by, the, Grace, of, ...",Macbeth
9117,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(We, will, performe, in, measure, time, and, p...",Macbeth


In [210]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = np.ravel(array['text_source'])
X = np.array(array.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9422303473491773

Test set score: 0.8758223684210527


In [162]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(7301, 1000) (7301,)
Training set score: 0.9312422955759485

Test set score: 0.9309778142974527


In [211]:
from sklearn.feature_extraction.text import TfidfVectorizer

X = data
y = np.ravel(array['text_source'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

word_vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

words = word_vectorizer.fit_transform(data).toarray()
features = word_vectorizer.get_feature_names()
huh = pd.DataFrame(words, columns=[features])


In [214]:
huh['text_source'] = df['text_source']
huh.head()

Unnamed: 0,aback,abaft,abandon,abandoned,abandonment,abased,abashed,abated,abating,abed,...,youth,youthful,zeal,zealand,zodiac,zone,zoned,zones,zoology,text_source
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Moby_Dick
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Moby_Dick
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Moby_Dick
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Moby_Dick
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Moby_Dick


In [216]:
rfc = ensemble.RandomForestClassifier()
Y = np.ravel(huh['text_source'])
X = np.array(huh.drop(['text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9817184643510055

Test set score: 0.9202302631578947


In [217]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(5470, 9868) (5470,)
Training set score: 0.9120658135283364

Test set score: 0.8884320175438597
