In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk

In [2]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


# CLEAN

In [3]:
ch_b = gutenberg.raw('chesterton-ball.txt')
ch_br = gutenberg.raw('chesterton-brown.txt')
ch_t = gutenberg.raw('chesterton-thursday.txt')

au_e = gutenberg.raw('austen-emma.txt')
au_p = gutenberg.raw('austen-persuasion.txt')
au_s = gutenberg.raw('austen-sense.txt')

ch_raws = [ch_b, ch_br, ch_t]
au_raws = [au_e, au_p, au_s]
raws = [ch_raws + au_raws]

In [4]:
end_ch_b = re.search("End of Project Gutenberg's The Ball and The Cross, by G.K. Chesterton", ch_raws[0]).span()[0]
ch_raws[0] = ch_raws[0][:end_ch_b]

In [5]:
def text_clean(text):
    text = re.sub('[\[].*?[\]]', '', text) #get rid of book titles
    text = re.sub('CHAPTER [IVXLCDM]+\n\n.*\n', '', text) #get rid of chapter titles
    text = re.sub('[IVXLCDM]+\..*\n', '', text) #get rid of other chapter titles
    text = re.sub('volume [IVXLCDM]+\n', '', text, flags=2) #get rid of volume titles
    text = re.sub('chapter [IVXLCDM]+\n', '', text, flags=2) # get rid of more chapter titles (case insensitive)
    text = re.sub('chapter [1-9]+\n', '', text, flags=2) #get rid of even more chapter titles
    text = re.sub('\n.*finis', '', text, flags=2) #get rid of the end 'finis'
    text = re.sub('\nthe end', '', text, flags=2) #get rid of 'the end'
    text = re.sub('--', ' ', text) #get rid of any double dashes
    text = ' '.join(text.split())
    return text

In [6]:
ch_clean = []
au_clean = []
for i, text in enumerate(ch_raws):
    ch_clean.append(text_clean(text))
for i, text in enumerate(au_raws):
    au_clean.append(text_clean(text))

In [7]:
ch_clean[2] = ch_clean[2][re.search('G. K.', ch_clean[2]).span()[1]+1:]

# DO NLP

In [8]:
nlp = spacy.load('en_core_web_sm')

In [9]:
%%time
nlp_ch = []
nlp_au = []
for text1, text2 in zip(ch_clean, au_clean):
    nlp_ch.append(nlp(text1))
    nlp_au.append(nlp(text2))

Wall time: 3min 14s


In [10]:
chest_sents = []
for i in range(len(nlp_ch)):
    book_sents = [[sent, 'Chesterton'] for sent in nlp_ch[i].sents]
    chest_sents.append(book_sents)
    
aust_sents = []
for i in range(len(nlp_au)):
    book_sents = [[sent, 'Austen'] for sent in nlp_au[i].sents]
    aust_sents.append(book_sents)

In [11]:
ball_sents = pd.DataFrame(chest_sents[0])
brown_sents = pd.DataFrame(chest_sents[1])
thurs_sents = pd.DataFrame(chest_sents[2])

emma_sents = pd.DataFrame(aust_sents[0])
pers_sents = pd.DataFrame(aust_sents[1])
sense_sents = pd.DataFrame(aust_sents[2])

In [12]:
all_sents = [ball_sents, brown_sents, thurs_sents, emma_sents, pers_sents, sense_sents]

In [13]:
sent_df = pd.concat(all_sents).reset_index(drop=True)
sent_df.columns = ['Sentence', 'Author']

In [14]:
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

In [15]:
ch_bows = []
for doc in nlp_ch:
    ch_bows.append(bag_of_words(doc))
    
au_bows = []
for doc in nlp_au:
    au_bows.append(bag_of_words(doc))

In [16]:
common_words = list(set(ch_bows[0] + ch_bows[1] +ch_bows[2] + au_bows[0] + au_bows[1] + au_bows[2]))

In [17]:
df = pd.DataFrame()
df['text_sentence'] = sent_df['Sentence']
df['text_source'] = sent_df['Author']

In [18]:
def lemma_convert(sentence):
    words = [token.lemma_
                     for token in sentence
                     if (
                         not token.is_punct
                         and not token.is_stop
                         and token.lemma_ in common_words
                     )]
    return words

In [19]:
df['sent_lemmas'] = df.text_sentence.apply(lemma_convert)

In [20]:
for i, word in enumerate(common_words):    
    df[word] = df.sent_lemmas.apply(lambda x: np.sum([1 for z in x if z==word]))
    
    if i % 500 == 0:
        print('Done with {} columns'.format(i+1))

Done with 1 columns
Done with 501 columns
Done with 1001 columns
Done with 1501 columns
Done with 2001 columns
Done with 2501 columns
Done with 3001 columns
Done with 3501 columns
Done with 4001 columns
Done with 4501 columns


In [34]:
int_df = df[['text_source']].join(df.iloc[:, 3:].astype(np.int32)).join(df[['text_sentence', 'sent_lemmas']]).copy()

In [36]:
target_features = int_df.drop(columns=['text_sentence', 'sent_lemmas']).copy()

In [37]:
target_features

Unnamed: 0,text_source,essential,case,king,cavern,recommend,single,hand,contain,greasy,...,resolve,trace,engaged,raillery,hide,assassin,concerned,polished,alloy,door
0,Chesterton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Chesterton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Chesterton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Chesterton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Chesterton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31634,Austen,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31635,Austen,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31636,Austen,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31637,Austen,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [139]:
Y = target_features['text_source'].apply(lambda x: 1 if x=='Chesterton' else 0)

In [140]:
X = target_features.drop(columns='text_source')

In [141]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

from sklearn.preprocessing import StandardScaler

In [142]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=3434)

In [143]:
log = LogisticRegression(max_iter=1000)

In [144]:
log.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [145]:
%%time
cross_val_score(log, X_train, Y_train, cv=5, n_jobs=-1)

Wall time: 3min 4s


array([0.86213707, 0.8709996 , 0.87870407, 0.87732122, 0.87653102])

In [146]:
log.score(X_test, Y_test)

0.8780025284450064

Pretty solid. Took pretty long time for vectorization though...maybe there is a better library or something to do this stuff on? More direct method?

In [147]:
rf = RandomForestClassifier(max_depth=5, n_estimators=1000)

In [148]:
%%time
rf.fit(X_train, Y_train)

Wall time: 1min 25s


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [149]:
rf.score(X_test, Y_test)

0.6073008849557522

In [150]:
%%time
cross_val_score(rf, X_train, Y_train, cv=5, n_jobs=-1)

Wall time: 2min 42s


array([0.60616235, 0.6074674 , 0.6066772 , 0.60726985, 0.6070723 ])

In [151]:
mnb = MultinomialNB()
bnb = BernoulliNB()

In [152]:
mnb.fit(X_train, Y_train)
bnb.fit(X_train, Y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [153]:
mnb.score(X_test, Y_test), bnb.score(X_test, Y_test)

(0.8878002528445006, 0.8847977243994943)

In [154]:
cross_val_score(mnb, X_train, Y_train)

array([0.88188821, 0.88680363, 0.88937179, 0.88542078, 0.89035954])

In [155]:
cross_val_score(bnb, X_train, Y_train)

array([0.88307328, 0.88660608, 0.88798894, 0.87850652, 0.88877914])

Looks like Naive Bayes may rule supreme for NLP (text classification). Quite impressive indeed. Granted, it does use 3 entire books worth of information from each author. Even so, quite impressive.

In [156]:
from sklearn.metrics import confusion_matrix

In [157]:
confusion_matrix(Y_test, mnb.predict(X_test))

array([[3604,  224],
       [ 486, 2014]], dtype=int64)

In [159]:
mnb_preds = mnb.predict(X_train)

In [166]:
mnb_preds_df = pd.DataFrame(mnb_preds, index=X_train.index, columns=['MNB_PREDS'])

In [169]:
X_cooked_train = X_train.join(mnb_preds_df)

In [170]:
mnb_test_preds = pd.DataFrame(mnb.predict(X_test), index=X_test.index, columns=['MNB_PREDS'])
X_cooked_test = X_test.join(mnb_test_preds)

In [171]:
log.fit(X_cooked_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [172]:
log.score(X_cooked_test, Y_test)

0.8879582806573957

In [173]:
confusion_matrix(Y_test, log.predict(X_cooked_test))

array([[3600,  228],
       [ 481, 2019]], dtype=int64)

Vanilla naive bayes still looks like it's the best. Maybe just need more cooks? But how many cooks is too many?