Dataset is 20 (or 19, for BoW testing) novels by four authors.

tf-idf and BoW were attempted as NLP methods. BoW and proved to be _far_ worse than tf-idf. SVD was performed on the tf-idf data, but proved to be harmful to performance rather than helpful.

Current best-attempt is an RFC model attempting to predict the author of a text using tf-idf data collected from all of the texts.  

RFC results:
 
Model score:
0.7949438202247191
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.84      0.85      0.85      2525
     Dickens       0.79      0.87      0.83      4375
   Stevenson       0.79      0.46      0.58      1376
       Twain       0.76      0.78      0.77      3116

   micro avg       0.79      0.79      0.79     11392
   macro avg       0.80      0.74      0.76     11392
weighted avg       0.79      0.79      0.79     11392

 
Model cross-valuation:
[0.70307018 0.67954346 0.68261633 0.69315189 0.70851624]



In [346]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 50)
import math
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report

from IPython.display import display

import spacy
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk
nltk.download('gutenberg')

from spacy.lang.en import English

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)

from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.decomposition import PCA
from sklearn import neighbors
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor

from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.preprocessing import LabelEncoder, Imputer
from sklearn.model_selection import train_test_split

from timeit import default_timer as timer

from nltk.tokenize import BlanklineTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import urllib.request
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

import os

import pydotplus
from sklearn import tree
from sklearn import preprocessing
%matplotlib inline
sns.set_style('white')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/morgankauffman/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


In [6]:
def clean_punct(text):
    text = text.replace('"','')
    text = text.replace('“','')
    text = text.replace('”','')
    text = text.replace('\'','')
    text = text.replace('.',' .')
    text = text.replace(',',' ,')
    text = text.replace('?',' ?')
    text = text.replace('!',' !')
    text = text.replace(':',' :')
    text = text.replace(';',' ;')
    text = text.replace('(','( ')
    text = text.replace(')',' )')
    text = text.replace('-', ' - ')
    return text

def get_texts(text_url):
    values = {'q' : 'python programming tutorials'}
    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8') # data should be bytes
    req = urllib.request.Request(text_url, data)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    text = respData.decode("utf-8")
    text = text.replace('_','')
    text = text.replace('\xa0', '')
    text = text.replace(',—', ', ')
    text = text.replace('--', ' - ')
    text = clean_punct(text)
    return text
    
def get_paragraphs(text_url):
    values = {'q' : 'python programming tutorials'}
    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8') # data should be bytes
    req = urllib.request.Request(text_url, data)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    text = respData.decode("utf-8")
    text = text.replace('_','')
    text = text.replace('\xa0', '')
    text = text.replace(',—', ', ')
    text = text.replace('--', ' - ')
    text = clean_punct(text)
    paragraphs = BlanklineTokenizer().tokenize(text)
    cleaned_paras = []
    for para in paragraphs:
        cleaned_paras.append(para.replace("\r\n", " "))
    return cleaned_paras


In [67]:
def tfidf_vectorizer(text,maxdf,mindf):
    vectorizer = TfidfVectorizer(max_df=maxdf, # drop words that occur in more than X% the paragraphs
                                 min_df=mindf, # only use words that appear at least Y times
                                 stop_words='english', 
                                 lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                                 use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                                 norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                                 smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                                )


    #Applying the vectorizer
    text_tfidf=vectorizer.fit_transform(text)
    print("Number of features: %d" % text_tfidf.get_shape()[1])

    #Reshapes the vectorizer output into something people can read
    text_tfidf_csr = text_tfidf.tocsr()
    #number of paragraphs
    n = text_tfidf_csr.shape[0]
    #A list of dictionaries, one per paragraph
    tfidf_bypara = [{} for _ in range(0,n)]
    #List of features
    terms = vectorizer.get_feature_names()
    #for each paragraph, lists the feature words and their tf-idf scores
    for i, j in zip(*text_tfidf_csr.nonzero()):
        tfidf_bypara[i][terms[j]] = text_tfidf_csr[i, j]

    vocabulary_list = list(vectorizer.vocabulary_.keys())

    #Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present 
    #once in that sentence.
    return text_tfidf_csr, text_tfidf, tfidf_bypara, vocabulary_list

In [89]:
def svd_reducer(text_tfidf, text, svdnum):
    #Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
    svd= TruncatedSVD(svdnum)
    lsa = make_pipeline(svd, Normalizer(copy=False))
    # Run SVD on the training data, then project the training data.
    text_lsa = lsa.fit_transform(text_tfidf)

    variance_explained=svd.explained_variance_ratio_
    total_variance = variance_explained.sum()
    print("Percent variance captured by all components:",total_variance*100)

    return text_lsa

In [90]:
def write_dfidf_dataframe(author, title, paragraph_list, book_text):
    df_idf_dataframe = pd.DataFrame(columns = ['author','title','paragraph'])
    counter = 0

    for para in paragraph_list:
        df_idf_dataframe.loc[counter,'paragraph'] = book_text[counter]

        word_list = list(para.keys())

        for word in word_list:
            df_idf_dataframe.loc[counter,word] = para[word]

        counter += 1

    df_idf_dataframe['author'] = author
    df_idf_dataframe['title'] = title

    df_idf_dataframe = df_idf_dataframe.fillna(0)
    return df_idf_dataframe

In [91]:
def grab_text_make_df(url,author,title,max_df,min_df):
    book_text = get_paragraphs(url)
    tfidf_csr, text_tfidf, book_paras, vocabulary_list = tfidf_vectorizer(book_text, max_df, min_df)
    book_df = write_dfidf_dataframe(author,title,book_paras,book_text)
    os.system('say "all done."'); print('\a')  # this could take a while, let me know when it's done
    return book_df


In [93]:
max_df = .1
min_df = 7

In [94]:
prideandprejudice_Austen = grab_text_make_df('http://www.gutenberg.org/files/1342/1342-0.txt',
                                             'Austen',
                                             'Pride and Prejudice',
                                             max_df, min_df)

Number of features: 1416



In [100]:
ladysusan_Austen = grab_text_make_df('http://www.gutenberg.org/cache/epub/21839/pg21839.txt', 
                                            'Austen', 
                                            'Lady Susan',
                                             max_df, min_df)

Number of features: 1398



In [101]:
mansfieldpark_Austen = grab_text_make_df('http://www.gutenberg.org/files/141/141-0.txt', 
                                            'Austen', 
                                            'Mansfield Park',
                                             max_df, min_df)

Number of features: 1660



In [102]:
emma_Austen = grab_text_make_df('http://www.gutenberg.org/files/158/158-0.txt', 
                                            'Austen', 
                                            'Emma',
                                             max_df, min_df)

Number of features: 1582



In [103]:
sensesensibility_Austen = grab_text_make_df('http://www.gutenberg.org/cache/epub/161/pg161.txt', 
                                            'Austen', 
                                            'Sense and Sensibility',
                                             max_df, min_df)

Number of features: 1361



In [104]:
huckleberryfinn_Twain = grab_text_make_df('http://www.gutenberg.org/files/76/76-0.txt', 
                                            'Twain', 
                                            'The Adventures of Huckleberry Finn',
                                             max_df, min_df)

Number of features: 1135



In [105]:
princeandpauper_Twain = grab_text_make_df('http://www.gutenberg.org/files/1837/1837-0.txt', 
                                            'Twain', 
                                            'The Prince and the Pauper',
                                             max_df, min_df)

Number of features: 1005



In [106]:
trampabroad_Twain = grab_text_make_df('http://www.gutenberg.org/files/119/119-0.txt', 
                                            'Twain', 
                                            'A Tramp Abroad',
                                             max_df, min_df)

Number of features: 1995



In [107]:
yankeeinkingarthurscourt_Twain = grab_text_make_df('http://www.gutenberg.org/files/86/86-0.txt', 
                                            'Twain', 
                                            'A Connecticut Yankee in King Arthurs Court',
                                             max_df, min_df)

Number of features: 1398



In [108]:
tomsawyer_Twain = grab_text_make_df('http://www.gutenberg.org/files/74/74-0.txt', 
                                            'Twain', 
                                            'The Adventures of Tom Sawyer',
                                             max_df, min_df)

Number of features: 933



In [109]:
lifeonthemississippi_Twain = grab_text_make_df('http://www.gutenberg.org/files/245/245-0.txt', 
                                            'Twain', 
                                            'Life on the Mississippi',
                                             max_df, min_df)

Number of features: 1819



In [110]:
greatexpectations_Dickens = grab_text_make_df('http://www.gutenberg.org/files/1400/1400-0.txt',
                                             'Dickens',
                                             'Great Expectations',
                                             max_df, min_df)

Number of features: 1837



In [96]:
taleoftwocities_Dickens = grab_text_make_df('http://www.gutenberg.org/files/98/98-0.txt', 
                                            'Dickens', 
                                            'A Tale of Two Cities',
                                             max_df, min_df)

Number of features: 1562



In [111]:
christmascarol_Dickens = grab_text_make_df('http://www.gutenberg.org/files/46/46-0.txt', 
                                            'Dickens', 
                                            'A Christmas Carol',
                                             max_df, min_df)

Number of features: 337



In [112]:
hardtimes_Dickens = grab_text_make_df('http://www.gutenberg.org/files/786/786-0.txt', 
                                            'Dickens', 
                                            'Hard Times',
                                             max_df, min_df)

Number of features: 1106



In [113]:
olivertwist_Dickens = grab_text_make_df('http://www.gutenberg.org/cache/epub/730/pg730.txt', 
                                            'Dickens', 
                                            'Oliver Twist',
                                             max_df, min_df)

Number of features: 1946



In [114]:
drjekyllmrhyde_Stevenson = grab_text_make_df('http://www.gutenberg.org/files/43/43-0.txt', 
                                            'Stevenson', 
                                            'Treasure Island',
                                             max_df, min_df)

Number of features: 287



In [115]:
treasureisland_Stevenson = grab_text_make_df('http://www.gutenberg.org/files/120/120-0.txt', 
                                            'Stevenson', 
                                            'Treasure Island',
                                             max_df, min_df)

Number of features: 868



In [116]:
blackarrow_Stevenson = grab_text_make_df('http://www.gutenberg.org/cache/epub/32954/pg32954.txt', 
                                            'Stevenson', 
                                            'The Black Arrow',
                                             max_df, min_df)

Number of features: 1087



In [117]:
kidnapped_Stevenson = grab_text_make_df('http://www.gutenberg.org/files/421/421-0.txt', 
                                            'Stevenson', 
                                            'Kidnapped',
                                             max_df, min_df)

Number of features: 931



In [118]:
merrymen_Stevenson = grab_text_make_df('http://www.gutenberg.org/cache/epub/344/pg344.txt', 
                                            'Stevenson', 
                                            'The Merry Men',
                                             max_df, min_df)

Number of features: 994



In [119]:
small_book_list = [prideandprejudice_Austen, 
                   huckleberryfinn_Twain,
                   greatexpectations_Dickens,
                   drjekyllmrhyde_Stevenson]

small_combined_df = pd.DataFrame(columns = ['author','title','paragraph'])
small_combined_df = combined_df.append(small_book_list, sort=False)
small_combined_df = small_combined_df.fillna(0)
os.system('say "all done."'); print('\a')  # this could take a while, let me know when it's done
small_combined_df.shape




(21158, 3201)

In [123]:
big_book_list = [prideandprejudice_Austen,
             ladysusan_Austen,
             mansfieldpark_Austen,
             emma_Austen,
             sensesensibility_Austen,
             huckleberryfinn_Twain,
             princeandpauper_Twain,
             trampabroad_Twain,
             yankeeinkingarthurscourt_Twain,
             tomsawyer_Twain,
             lifeonthemississippi_Twain,
             greatexpectations_Dickens,
             taleoftwocities_Dickens,
             christmascarol_Dickens,
             hardtimes_Dickens,
             olivertwist_Dickens,
             drjekyllmrhyde_Stevenson,
             treasureisland_Stevenson,
             blackarrow_Stevenson,
             kidnapped_Stevenson,
             merrymen_Stevenson]

big_combined_df = pd.DataFrame(columns = ['author','title','paragraph'])
big_combined_df = combined_df.append(big_book_list, sort=False)
big_combined_df = big_combined_df.fillna(0)
os.system('say "all done."'); print('\a')  # this could take a while, let me know when it's done
big_combined_df.shape




(56960, 5916)

In [136]:
def accuracy_report(testing_X, testing_Y, model, cross_valuation_yesno):
    predictions = model.predict(testing_X)
    print('Model score:')
    print(model.score(testing_X, testing_Y))
    print(" ")
    print("Classification Report:")
    y_prediction = model.predict(testing_X)
    print(classification_report(testing_Y, y_prediction))
    
# Sometimes we don't want to spend the processor time calculating the cross-valuation, so we need a way to toggle it.
    if cross_valuation_yesno == 1:
        print(" ")
        print('Model cross-valuation:')
        print(sklearn.model_selection.cross_val_score(model, testing_X, testing_Y, cv = 5))
    return

In [140]:
Y = small_combined_df['author']
X = small_combined_df.drop(['author','title','paragraph'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

In [138]:
rfc = ensemble.RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.9187145557655955
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.96      0.90      0.93       869
     Dickens       0.90      0.97      0.93      2294
   Stevenson       0.71      0.11      0.20        87
       Twain       0.95      0.88      0.91       982

   micro avg       0.92      0.92      0.92      4232
   macro avg       0.88      0.72      0.74      4232
weighted avg       0.92      0.92      0.91      4232

 
Model cross-valuation:
[0.79009434 0.78773585 0.80614657 0.8250591  0.78909953]



In [137]:
gbc = ensemble.GradientBoostingClassifier()
gbc.fit(X_train, y_train)
print('GBC results:')
print(' ')
accuracy_report(X_test, y_test, gbc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

GBC results:
 
Model score:
0.7651228733459358
 
Classification Report:
              precision    recall  f1-score   support

      Austen       1.00      0.49      0.66       869
     Dickens       0.70      0.99      0.82      2294
   Stevenson       0.85      0.33      0.48        87
       Twain       0.98      0.52      0.68       982

   micro avg       0.77      0.77      0.77      4232
   macro avg       0.88      0.58      0.66      4232
weighted avg       0.83      0.77      0.75      4232

 
Model cross-valuation:
[0.72169811 0.74646226 0.74231678 0.76122931 0.73815166]



In [139]:
neighbors = KNeighborsClassifier()
neighbors.fit(X_train, y_train)
print('KNN results:')
print(' ')
accuracy_report(X_test, y_test, neighbors, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

KNN results:
 
Model score:
0.581758034026465
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.54      0.24      0.33       869
     Dickens       0.65      0.75      0.70      2294
   Stevenson       0.25      0.02      0.04        87
       Twain       0.44      0.55      0.49       982

   micro avg       0.58      0.58      0.58      4232
   macro avg       0.47      0.39      0.39      4232
weighted avg       0.57      0.58      0.56      4232

 
Model cross-valuation:
[0.4504717  0.45636792 0.46808511 0.4964539  0.47156398]



In [143]:
Y = big_combined_df['author']
X = big_combined_df.drop(['author','title','paragraph'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

In [144]:
rfc = ensemble.RandomForestClassifier(n_estimators=100)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.7949438202247191
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.84      0.85      0.85      2525
     Dickens       0.79      0.87      0.83      4375
   Stevenson       0.79      0.46      0.58      1376
       Twain       0.76      0.78      0.77      3116

   micro avg       0.79      0.79      0.79     11392
   macro avg       0.80      0.74      0.76     11392
weighted avg       0.79      0.79      0.79     11392

 
Model cross-valuation:
[0.70307018 0.67954346 0.68261633 0.69315189 0.70851624]



# Bag of Words

In [166]:
nlp = spacy.load('en')

In [167]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text


In [182]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words, title):
    column_names = ['author', 'title', 'text_sentence']
    column_names = column_names.append(common_words)
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=column_names)
    df['text_sentence'] = sentences[1]
    df['author'] = sentences[0]
    df['title'] = title
    for word in common_words:
        df.loc[:, word] = 0

    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence'][::100]):
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 50 == 0:
            print("Processing row {}".format(i))

    return df



In [188]:

def get_bow_df(url,author,title):
    bow_df_txt = get_texts(url)  # extract the text from the url
    bow_df_txt = text_cleaner(bow_df_txt)  # do a basic clean of the text
    bow_df_doc = nlp(bow_df_txt)  # convert the massive str into a nlp document
    bow_df_sents = [[author, sent] for sent in bow_df_doc.sents]  # convert the document into sentences
    sentences = pd.DataFrame(bow_df_sents)  # create a dataframe of those sentences
    bow_df_words = bag_of_words(bow_df_doc)  # Create a bag of words with the top 2000 most common words
#    common_words = set(bow_df_words)
    
    # Create a new data frame with the combined features of words & sentences; this can take a while to run.
    bow_df = bow_features(sentences, common_words, title)
    bow_df['author'] = author
    return bow_df

    

In [192]:
prideandprejudice_Austen_bow = get_bow_df('http://www.gutenberg.org/files/1342/1342-0.txt',
                                             'Austen',
                                             'Pride and Prejudice')

Processing row 0
Processing row 50


In [193]:
ladysusan_Austen_bow = get_bow_df('http://www.gutenberg.org/cache/epub/21839/pg21839.txt', 
                                            'Austen', 
                                            'Lady Susan')

Processing row 0
Processing row 50


In [194]:
mansfieldpark_Austen_bow = get_bow_df('http://www.gutenberg.org/files/141/141-0.txt', 
                                            'Austen', 
                                            'Mansfield Park')

Processing row 0
Processing row 50


In [195]:
emma_Austen_bow = get_bow_df('http://www.gutenberg.org/files/158/158-0.txt', 
                                            'Austen', 
                                            'Emma')

Processing row 0
Processing row 50
Processing row 100


In [196]:
sensesensibility_Austen_bow = get_bow_df('http://www.gutenberg.org/cache/epub/161/pg161.txt', 
                                            'Austen', 
                                            'Sense and Sensibility')

Processing row 0
Processing row 50


In [197]:
huckleberryfinn_Twain_bow = get_bow_df('http://www.gutenberg.org/files/76/76-0.txt', 
                                            'Twain', 
                                            'The Adventures of Huckleberry Finn')

Processing row 0
Processing row 50


In [198]:
princeandpauper_Twain_bow = get_bow_df('http://www.gutenberg.org/files/1837/1837-0.txt', 
                                            'Twain', 
                                            'The Prince and the Pauper')

Processing row 0


In [199]:
trampabroad_Twain_bow = get_bow_df('http://www.gutenberg.org/files/119/119-0.txt', 
                                            'Twain', 
                                            'A Tramp Abroad')

Processing row 0
Processing row 50


In [200]:
yankeeinkingarthurscourt_Twain_bow = get_bow_df('http://www.gutenberg.org/files/86/86-0.txt', 
                                            'Twain', 
                                            'A Connecticut Yankee in King Arthurs Court')

Processing row 0
Processing row 50


In [201]:
tomsawyer_Twain_bow = get_bow_df('http://www.gutenberg.org/files/74/74-0.txt', 
                                            'Twain', 
                                            'The Adventures of Tom Sawyer')

Processing row 0
Processing row 50


In [202]:
lifeonthemississippi_Twain_bow = get_bow_df('http://www.gutenberg.org/files/245/245-0.txt', 
                                            'Twain', 
                                            'Life on the Mississippi')

Processing row 0
Processing row 50


In [None]:
# Great Expectations exceeded the text limit for python, and was removed from the working lists.

In [204]:
taleoftwocities_Dickens_bow = get_bow_df('http://www.gutenberg.org/files/98/98-0.txt', 
                                            'Dickens', 
                                            'A Tale of Two Cities')

Processing row 0
Processing row 50


In [205]:
christmascarol_Dickens_bow = get_bow_df('http://www.gutenberg.org/files/46/46-0.txt', 
                                            'Dickens', 
                                            'A Christmas Carol')

Processing row 0


In [206]:
hardtimes_Dickens_bow = get_bow_df('http://www.gutenberg.org/files/786/786-0.txt', 
                                            'Dickens', 
                                            'Hard Times')

Processing row 0
Processing row 50


In [207]:
olivertwist_Dickens_bow = get_bow_df('http://www.gutenberg.org/cache/epub/730/pg730.txt', 
                                            'Dickens', 
                                            'Oliver Twist')

Processing row 0
Processing row 50
Processing row 100


In [208]:
drjekyllmrhyde_Stevenson_bow = get_bow_df('http://www.gutenberg.org/files/43/43-0.txt', 
                                            'Stevenson', 
                                            'Treasure Island')

Processing row 0


In [209]:
treasureisland_Stevenson_bow = get_bow_df('http://www.gutenberg.org/files/120/120-0.txt', 
                                            'Stevenson', 
                                            'Treasure Island')

Processing row 0


In [210]:
kidnapped_Stevenson_bow = get_bow_df('http://www.gutenberg.org/files/421/421-0.txt', 
                                            'Stevenson', 
                                            'Kidnapped')

Processing row 0


In [211]:
merrymen_Stevenson_bow = get_bow_df('http://www.gutenberg.org/cache/epub/344/pg344.txt', 
                                            'Stevenson', 
                                            'The Merry Men')

Processing row 0


In [212]:
blackarrow_Stevenson_bow = get_bow_df('http://www.gutenberg.org/cache/epub/32954/pg32954.txt', 
                                            'Stevenson', 
                                            'The Black Arrow')

Processing row 0
Processing row 50


In [213]:
os.system('say "all done."'); print('\a')  # this could take a while, let me know when it's done





### Setting up the BoW compiled dataframes

In [216]:
small_bow_list = [prideandprejudice_Austen_bow, 
                   huckleberryfinn_Twain_bow,
                   christmascarol_Dickens_bow,
                   drjekyllmrhyde_Stevenson_bow]

combined_df = pd.DataFrame(columns = ['text_sentence','author','title'])
small_bow_df = combined_df.append(small_bow_list, sort=False)
small_bow_df = small_bow_df.fillna(0)
os.system('say "all done."'); print('\a')  # this could take a while, let me know when it's done
small_bow_df.shape




(18158, 2002)

In [229]:
big_bow_list = [prideandprejudice_Austen_bow,
             ladysusan_Austen_bow,
             mansfieldpark_Austen_bow,
             emma_Austen_bow,
             sensesensibility_Austen_bow,
             huckleberryfinn_Twain_bow,
             princeandpauper_Twain_bow,
             trampabroad_Twain_bow,
             yankeeinkingarthurscourt_Twain_bow,
             tomsawyer_Twain_bow,
             lifeonthemississippi_Twain_bow,
             taleoftwocities_Dickens_bow,
             christmascarol_Dickens_bow,
             hardtimes_Dickens_bow,
             olivertwist_Dickens_bow,
             drjekyllmrhyde_Stevenson_bow,
             treasureisland_Stevenson_bow,
             blackarrow_Stevenson_bow,
             kidnapped_Stevenson_bow,
             merrymen_Stevenson_bow]

big_bow_df = pd.DataFrame(columns = ['text_sentence','author','title'])
big_bow_df = big_bow_df.append(big_bow_list, sort=False)
big_bow_df = big_bow_df.fillna(0)
os.system('say "all done."'); print('\a')  # this could take a while, let me know when it's done
big_combined_df.shape




(56960, 5916)

### BoW targeting author

In [242]:
Y = small_bow_df['author']
X = small_bow_df.drop(['author','title','text_sentence'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

In [243]:
rfc = ensemble.RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.4107929515418502
 
Classification Report:


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

      Austen       0.41      1.00      0.58      1483
     Dickens       0.67      0.00      0.01       426
   Stevenson       0.00      0.00      0.00       319
       Twain       0.71      0.01      0.01      1404

   micro avg       0.41      0.41      0.41      3632
   macro avg       0.45      0.25      0.15      3632
weighted avg       0.52      0.41      0.24      3632

 
Model cross-valuation:
[0.40934066 0.40715268 0.41127923 0.40909091 0.41022099]



In [244]:
gbc = ensemble.GradientBoostingClassifier()
gbc.fit(X_train, y_train)
print('GBC results:')
print(' ')
accuracy_report(X_test, y_test, gbc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

GBC results:
 
Model score:
0.4107929515418502
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.41      1.00      0.58      1483
     Dickens       0.67      0.00      0.01       426
   Stevenson       0.00      0.00      0.00       319
       Twain       0.82      0.01      0.01      1404

   micro avg       0.41      0.41      0.41      3632
   macro avg       0.47      0.25      0.15      3632
weighted avg       0.56      0.41      0.24      3632

 
Model cross-valuation:
[0.41071429 0.40715268 0.41127923 0.40909091 0.40883978]



In [224]:
neighbors = KNeighborsClassifier()
neighbors.fit(X_train, y_train)
print('KNN results:')
print(' ')
accuracy_report(X_test, y_test, neighbors, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

KNN results:
 
Model score:
0.4088656387665198
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.41      1.00      0.58      1483
     Dickens       0.00      0.00      0.00       426
   Stevenson       0.00      0.00      0.00       319
       Twain       0.40      0.00      0.00      1404

   micro avg       0.41      0.41      0.41      3632
   macro avg       0.20      0.25      0.15      3632
weighted avg       0.32      0.41      0.24      3632

 
Model cross-valuation:
[0.38324176 0.40577717 0.40990371 0.4077135  0.40883978]



In [245]:
Y = big_bow_df['author']
X = big_bow_df.drop(['author','title','text_sentence'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

In [246]:
rfc = ensemble.RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.3018530052943008
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.48      0.01      0.01      7944
     Dickens       0.31      0.00      0.01      6054
   Stevenson       0.47      0.00      0.00      3961
       Twain       0.30      0.99      0.46      7729

   micro avg       0.30      0.30      0.30     25688
   macro avg       0.39      0.25      0.12     25688
weighted avg       0.38      0.30      0.14     25688

 
Model cross-valuation:
[0.30920413 0.3106267  0.31004282 0.30945893 0.3090555 ]



### BoW targeting book title
(it does much worse in the big list)

In [237]:
Y = small_bow_df['title']
X = small_bow_df.drop(['author','title','text_sentence'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

In [238]:
rfc = ensemble.RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.41051762114537443
 
Classification Report:


  'precision', 'predicted', average, warn_for)


                                    precision    recall  f1-score   support

                 A Christmas Carol       0.50      0.00      0.00       426
               Pride and Prejudice       0.41      1.00      0.58      1483
The Adventures of Huckleberry Finn       0.67      0.01      0.01      1404
                   Treasure Island       0.00      0.00      0.00       319

                         micro avg       0.41      0.41      0.41      3632
                         macro avg       0.39      0.25      0.15      3632
                      weighted avg       0.48      0.41      0.24      3632

 
Model cross-valuation:
[0.40934066 0.40577717 0.41127923 0.4077135  0.41022099]



In [239]:
gbc = ensemble.GradientBoostingClassifier()
gbc.fit(X_train, y_train)
print('GBC results:')
print(' ')
accuracy_report(X_test, y_test, gbc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

GBC results:
 
Model score:
0.4107929515418502
 
Classification Report:
                                    precision    recall  f1-score   support

                 A Christmas Carol       0.67      0.00      0.01       426
               Pride and Prejudice       0.41      1.00      0.58      1483
The Adventures of Huckleberry Finn       0.82      0.01      0.01      1404
                   Treasure Island       0.00      0.00      0.00       319

                         micro avg       0.41      0.41      0.41      3632
                         macro avg       0.47      0.25      0.15      3632
                      weighted avg       0.56      0.41      0.24      3632

 
Model cross-valuation:
[0.41071429 0.40715268 0.41127923 0.40909091 0.40883978]



In [306]:
Y = big_bow_df['title']
X = big_bow_df.drop(['author','title','text_sentence'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

In [307]:
rfc = ensemble.RandomForestClassifier(n_estimators=200, random_state=23)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.08727810650887574
 
Classification Report:
                                            precision    recall  f1-score   support

                         A Christmas Carol       0.14      0.00      0.00       443
A Connecticut Yankee in King Arthurs Court       0.08      0.00      0.00      1269
                      A Tale of Two Cities       0.10      0.00      0.00      1760
                            A Tramp Abroad       0.00      0.00      0.00      1676
                                      Emma       0.09      0.99      0.16      2232
                                Hard Times       0.00      0.00      0.00      1734
                                 Kidnapped       0.22      0.00      0.00       858
                                Lady Susan       0.00      0.00      0.00      1280
                   Life on the Mississippi       0.14      0.00      0.00      1569
                            Mansfield Park       0.05      0.00      0.00      1607
  

Ick.  That did even worse.

# Improving tf-idf algorithm

First let's try tweaking the max_df and min_df variables, as well as which books we include in our test sample.  (One of the issues we ran into with the first run was that the Jekyll & Hyde book was so much shorter than the rest, so there was a sampling imbalance that affected performance.)

* max_df = .1, min_df = 7 --> RFC 91%, cross-val ~80%
* max_df = .25, min_df = 10 --> RFC 84%, cross-val ~78%
* max_df = .25, min_df = 7 --> RFC 83%, cross-val ~77%
* max_df = .4, min_df = 8 --> RFC 84.8%, cross-val ~79%
* max_df = .4, min_df = 15 --> RFC 85.5%, cross-val ~79%
* max_df = .6, min_df = 15 --> RFC 85.3%, cross-val ~79%
* max_df = .6, min_df = 10 --> RFC 85.4%, cross-val ~80.5%
* max_df = .4, min_df = 10 --> RFC 85.4%, cross-val ~80.5%
* max_df = .5, min_df = 10 --> RFC 85.4%, cross-val ~80.5%
* max_df = .7, min_df = 10 --> RFC 85.4%, cross-val ~80.5%
* max_df = .3, min_df = 10 --> RFC 84%, cross-val ~78.6%
* max_df = .5, min_df = 12 --> RFC 85.6%, cross-val ~79.7%

Looks like .5/10 is a reasonable balance between test result and cross-val result.

In [333]:
max_df = .5
min_df = 10

In [334]:
print('Sense and Sensibility:')
sensesensibility_Austen = grab_text_make_df('http://www.gutenberg.org/cache/epub/161/pg161.txt', 
                                            'Austen', 
                                            'Sense and Sensibility',
                                             max_df, min_df)
print('The Prince and the Pauper:')
princeandpauper_Twain = grab_text_make_df('http://www.gutenberg.org/files/1837/1837-0.txt', 
                                            'Twain', 
                                            'The Prince and the Pauper',
                                             max_df, min_df)
print('Hard Times:')
hardtimes_Dickens = grab_text_make_df('http://www.gutenberg.org/files/786/786-0.txt', 
                                            'Dickens', 
                                            'Hard Times',
                                             max_df, min_df)
print('Black Arrow:')
blackarrow_Stevenson = grab_text_make_df('http://www.gutenberg.org/cache/epub/32954/pg32954.txt', 
                                            'Stevenson', 
                                            'The Black Arrow',
                                             max_df, min_df)


Sense and Sensibility:
Number of features: 979

The Prince and the Pauper:
Number of features: 623

Hard Times:
Number of features: 731

Black Arrow:
Number of features: 720



In [411]:
small_book_list = [blackarrow_Stevenson, hardtimes_Dickens, princeandpauper_Twain, sensesensibility_Austen]

small_combined_df = pd.DataFrame(columns = ['author','title','paragraph'])
small_combined_df = small_combined_df.append(small_book_list, sort=False)
small_combined_df = small_combined_df.fillna(0)
os.system('say "all done."'); print('\a')  # this could take a while, let me know when it's done
small_combined_df.shape




(8143, 1766)

In [336]:
Y = small_combined_df['author']
X = small_combined_df.drop(['author','title','paragraph'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

In [337]:
rfc = ensemble.RandomForestClassifier(n_estimators=200, random_state=23)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.85451197053407
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.91      0.84      0.87       382
     Dickens       0.87      0.87      0.87       455
   Stevenson       0.88      0.87      0.87       430
       Twain       0.76      0.85      0.80       362

   micro avg       0.85      0.85      0.85      1629
   macro avg       0.86      0.85      0.85      1629
weighted avg       0.86      0.85      0.86      1629

 
Model cross-valuation:
[0.81957187 0.79816514 0.81846154 0.79384615 0.79692308]



In [338]:
print('Pride and Prejudice:')
prideandprejudice_Austen = grab_text_make_df('http://www.gutenberg.org/files/1342/1342-0.txt',
                                             'Austen',
                                             'Pride and Prejudice',
                                             max_df, min_df)
print('Lady Susan:')
ladysusan_Austen = grab_text_make_df('http://www.gutenberg.org/cache/epub/21839/pg21839.txt', 
                                            'Austen', 
                                            'Lady Susan',
                                             max_df, min_df)
print('Mansfield Park:')
mansfieldpark_Austen = grab_text_make_df('http://www.gutenberg.org/files/141/141-0.txt', 
                                            'Austen', 
                                            'Mansfield Park',
                                             max_df, min_df)
print('Emma:')
emma_Austen = grab_text_make_df('http://www.gutenberg.org/files/158/158-0.txt', 
                                            'Austen', 
                                            'Emma',
                                             max_df, min_df)
print('The Adventures of Huckleberry Finn:')
huckleberryfinn_Twain = grab_text_make_df('http://www.gutenberg.org/files/76/76-0.txt', 
                                            'Twain', 
                                            'The Adventures of Huckleberry Finn',
                                             max_df, min_df)
print('A Tramp Abroad:')
trampabroad_Twain = grab_text_make_df('http://www.gutenberg.org/files/119/119-0.txt', 
                                            'Twain', 
                                            'A Tramp Abroad',
                                             max_df, min_df)
print('A Connecticut Yankee in King Arthurs Court:')
yankeeinkingarthurscourt_Twain = grab_text_make_df('http://www.gutenberg.org/files/86/86-0.txt', 
                                            'Twain', 
                                            'A Connecticut Yankee in King Arthurs Court',
                                             max_df, min_df)
print('The Adventures of Tom Sawyer:')
tomsawyer_Twain = grab_text_make_df('http://www.gutenberg.org/files/74/74-0.txt', 
                                            'Twain', 
                                            'The Adventures of Tom Sawyer',
                                             max_df, min_df)
print('Life on the Mississippi:')
lifeonthemississippi_Twain = grab_text_make_df('http://www.gutenberg.org/files/245/245-0.txt', 
                                            'Twain', 
                                            'Life on the Mississippi',
                                             max_df, min_df)
print('Great Expectations:')
greatexpectations_Dickens = grab_text_make_df('http://www.gutenberg.org/files/1400/1400-0.txt',
                                             'Dickens',
                                             'Great Expectations',
                                             max_df, min_df)
print('A Tale of Two Cities:')
taleoftwocities_Dickens = grab_text_make_df('http://www.gutenberg.org/files/98/98-0.txt', 
                                            'Dickens', 
                                            'A Tale of Two Cities',
                                             max_df, min_df)
print('A Christmas Carol:')
christmascarol_Dickens = grab_text_make_df('http://www.gutenberg.org/files/46/46-0.txt', 
                                            'Dickens', 
                                            'A Christmas Carol',
                                             max_df, min_df)
print('Oliver Twist:')
olivertwist_Dickens = grab_text_make_df('http://www.gutenberg.org/cache/epub/730/pg730.txt', 
                                            'Dickens', 
                                            'Oliver Twist',
                                             max_df, min_df)
print('Dr. Jekyll and Mr. Hyde:')
drjekyllmrhyde_Stevenson = grab_text_make_df('http://www.gutenberg.org/files/43/43-0.txt', 
                                            'Stevenson', 
                                            'Treasure Island',
                                             max_df, min_df)
print('Treasure Island:')
treasureisland_Stevenson = grab_text_make_df('http://www.gutenberg.org/files/120/120-0.txt', 
                                            'Stevenson', 
                                            'Treasure Island',
                                             max_df, min_df)
print('Kidnapped:')
kidnapped_Stevenson = grab_text_make_df('http://www.gutenberg.org/files/421/421-0.txt', 
                                            'Stevenson', 
                                            'Kidnapped',
                                             max_df, min_df)
print('Merry Men:')
merrymen_Stevenson = grab_text_make_df('http://www.gutenberg.org/cache/epub/344/pg344.txt', 
                                            'Stevenson', 
                                            'The Merry Men',
                                             max_df, min_df)

Pride and Prejudice:
Number of features: 1011

Lady Susan:
Number of features: 1000

Mansfield Park:
Number of features: 1240

Emma:
Number of features: 1179

The Adventures of Huckleberry Finn:
Number of features: 807

A Tramp Abroad:
Number of features: 1358

A Connecticut Yankee in King Arthurs Court:
Number of features: 917

The Adventures of Tom Sawyer:
Number of features: 572

Life on the Mississippi:
Number of features: 1193

Great Expectations:
Number of features: 1314

A Tale of Two Cities:
Number of features: 1061

A Christmas Carol:
Number of features: 219

Oliver Twist:
Number of features: 1363

Dr. Jekyll and Mr. Hyde:
Number of features: 176

Treasure Island:
Number of features: 602

Kidnapped:
Number of features: 616

Merry Men:
Number of features: 621



In [339]:
big_book_list = [prideandprejudice_Austen,
             ladysusan_Austen,
             mansfieldpark_Austen,
             emma_Austen,
             sensesensibility_Austen,
             huckleberryfinn_Twain,
             princeandpauper_Twain,
             trampabroad_Twain,
             yankeeinkingarthurscourt_Twain,
             tomsawyer_Twain,
             lifeonthemississippi_Twain,
             greatexpectations_Dickens,
             taleoftwocities_Dickens,
             christmascarol_Dickens,
             hardtimes_Dickens,
             olivertwist_Dickens,
             drjekyllmrhyde_Stevenson,
             treasureisland_Stevenson,
             blackarrow_Stevenson,
             kidnapped_Stevenson,
             merrymen_Stevenson]

big_combined_df = pd.DataFrame(columns = ['author','title','paragraph'])
big_combined_df = combined_df.append(big_book_list, sort=False)
big_combined_df = big_combined_df.fillna(0)
os.system('say "all done."'); print('\a')  # this could take a while, let me know when it's done
big_combined_df.shape




(44898, 4204)

In [340]:
Y = big_combined_df['author']
X = big_combined_df.drop(['author','title','paragraph'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

In [342]:
rfc = ensemble.RandomForestClassifier(n_estimators=200, random_state=23)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.7894209354120267
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.92      0.83      0.87      2100
     Dickens       0.76      0.81      0.79      2869
   Stevenson       0.85      0.62      0.72      1399
       Twain       0.72      0.82      0.77      2612

   micro avg       0.79      0.79      0.79      8980
   macro avg       0.81      0.77      0.78      8980
weighted avg       0.80      0.79      0.79      8980

 
Model cross-valuation:
[0.70061213 0.72732332 0.73830735 0.74109131 0.72408027]



### Analysis
The first version got a model score of 0.7949, with cross-val scores of 0.7031, 0.6795, 0.6826, 0.6932, and 0.7085.

This got slightly _lower_ model score, but a better cross-val score, which fits with what we knew about the smaller test sample at this min/max_df setting.

Now to tinker with condensing the feature-set.

## SVD

In [343]:
small_book_list = [blackarrow_Stevenson, hardtimes_Dickens, princeandpauper_Twain, sensesensibility_Austen]

small_combined_df = pd.DataFrame(columns = ['author','title','paragraph'])
small_combined_df = combined_df.append(small_book_list, sort=False)
small_combined_df = small_combined_df.fillna(0)
os.system('say "all done."'); print('\a')  # this could take a while, let me know when it's done
small_combined_df.shape




(8143, 1767)

In [344]:
Y = small_combined_df['author']
X = small_combined_df.drop(['author','title','paragraph'], 1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

In [370]:
def svd_converter(svd_num, X):
    #Our SVD data reducer.  We are going to reduce the feature space to 300 features to start with.
    svd= TruncatedSVD(svd_num)
    lsa = make_pipeline(svd, Normalizer(copy=False))
    # Run SVD on the training data, then project the training data.
    X_lsa = lsa.fit_transform(X)

    variance_explained=svd.explained_variance_ratio_
    total_variance = variance_explained.sum()
    print("Percent variance captured by all components:",total_variance*100)

    #Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
    paras_by_component=pd.DataFrame(X_lsa)
    print('output shape:' + str(paras_by_component.shape))
    return paras_by_component


In [365]:
Y = small_combined_df['author']
X = svd_converter(300, small_combined_df.drop(['author','title','paragraph'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

Percent variance captured by all components: 58.96173068948395
output shape:(8143, 300)


In [366]:
rfc = ensemble.RandomForestClassifier(n_estimators=200, random_state=23)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.8115408225905464
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.87      0.82      0.84       382
     Dickens       0.77      0.83      0.80       455
   Stevenson       0.83      0.86      0.84       430
       Twain       0.79      0.72      0.75       362

   micro avg       0.81      0.81      0.81      1629
   macro avg       0.81      0.81      0.81      1629
weighted avg       0.81      0.81      0.81      1629

 
Model cross-valuation:
[0.80428135 0.79510703 0.77230769 0.70769231 0.77846154]



In [367]:
Y = big_combined_df['author']
X = svd_converter(300, big_combined_df.drop(['author','title','paragraph'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

Percent variance captured by all components: 50.162294031414575
output shape:(44898, 300)


In [368]:
rfc = ensemble.RandomForestClassifier(n_estimators=200, random_state=23)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.6606904231625835
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.81      0.75      0.78      2100
     Dickens       0.58      0.72      0.64      2869
   Stevenson       0.77      0.28      0.41      1399
       Twain       0.64      0.73      0.68      2612

   micro avg       0.66      0.66      0.66      8980
   macro avg       0.70      0.62      0.63      8980
weighted avg       0.68      0.66      0.65      8980

 
Model cross-valuation:
[0.58931553 0.59599332 0.6174833  0.61358575 0.61928651]



Well.  That made it _worse_.  Maybe 300 features & 50% variance is insufficient.  Let's try 600 and see what happens with the smaller sample.

* SVD = 300, model score = 81.15%
* SVD = 600, model score = 80.48%
* SVD = 900, model score = 79.68%

Okay, duly noted: increasing the SVD makes the score worse.  Let's try dropping it.

* SVD = 150, model score = 81.35%
* SVD = 100, model score = 81.77%
* SVD = 50, model score = 81.95%, 82.26%
* SVD = 25, model score = 81.4%
* SVD = 40, model score = 81.03%
* SVD = 15, model score = 79.87%
* SVD = 60, model score = 82.08%, 81.89%, 82.01%
* SVD = 70, model score = 81.65%
* SVD = 65, model score = 81.89%
* SVD = 55, model score = 81.4%



In [413]:
Y = small_combined_df['author']
X = svd_converter(60, small_combined_df.drop(['author','title','paragraph'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

Percent variance captured by all components: 25.117792645466647
output shape:(8143, 60)


In [414]:
rfc = ensemble.RandomForestClassifier(n_estimators=200, random_state=23)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.8201350521792511
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.87      0.82      0.84       382
     Dickens       0.80      0.82      0.81       455
   Stevenson       0.84      0.87      0.85       430
       Twain       0.77      0.76      0.77       362

   micro avg       0.82      0.82      0.82      1629
   macro avg       0.82      0.82      0.82      1629
weighted avg       0.82      0.82      0.82      1629

 
Model cross-valuation:
[0.81345566 0.75229358 0.77846154 0.74461538 0.78153846]



In [401]:
Y = big_combined_df['author']
X = svd_converter(50, big_combined_df.drop(['author','title','paragraph'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=23)

Percent variance captured by all components: 19.826417881455622
output shape:(44898, 50)


In [402]:
rfc = ensemble.RandomForestClassifier(n_estimators=200, random_state=23)
rfc.fit(X_train, y_train)
print('RFC results:')
print(' ')
accuracy_report(X_test, y_test, rfc, 1)
os.system('say "all done."'); print('\a')  # this is going to take a while, let me know when it's done

RFC results:
 
Model score:
0.6506681514476614
 
Classification Report:
              precision    recall  f1-score   support

      Austen       0.75      0.75      0.75      2100
     Dickens       0.60      0.66      0.63      2869
   Stevenson       0.68      0.34      0.45      1399
       Twain       0.63      0.72      0.67      2612

   micro avg       0.65      0.65      0.65      8980
   macro avg       0.66      0.62      0.63      8980
weighted avg       0.66      0.65      0.64      8980

 
Model cross-valuation:
[0.58987201 0.5998887  0.60244989 0.57349666 0.59253066]



Odd.  SVD seems to make things worse, in every test case.  Not what I was expecting.

In [194]:
# for reference, a full list of the urls used in this project

prideandprejudice_Austen = get_paragraphs('http://www.gutenberg.org/files/1342/1342-0.txt')
frankenstein_Shelley = get_paragraphs('http://www.gutenberg.org/files/84/84-0.txt')
taleoftwocities_Dickens = get_paragraphs('http://www.gutenberg.org/files/98/98-0.txt')
mobydick_Melville = get_paragraphs('http://www.gutenberg.org/files/2701/2701-0.txt')
modestproposal_Swift = get_paragraphs('http://www.gutenberg.org/cache/epub/1080/pg1080.txt')
print('5')
importanceofbeingearnest_Wilde = get_paragraphs('http://www.gutenberg.org/cache/epub/844/pg844.txt')
aliceinwonderland_Carroll = get_paragraphs('http://www.gutenberg.org/files/11/11-0.txt')
dollshouse_Ibsen = get_paragraphs('http://www.gutenberg.org/cache/epub/2542/pg2542.txt')
sherlockholmes_Doyle = get_paragraphs('http://www.gutenberg.org/cache/epub/1661/pg1661.txt')
heartofdarkness_Conrad = get_paragraphs('http://www.gutenberg.org/files/219/219-0.txt')
print('10')
warandpeace_Tolstoy = get_paragraphs('http://www.gutenberg.org/files/2600/2600-0.txt')
theawakening_Chopin = get_paragraphs('http://www.gutenberg.org/files/160/160-0.txt')
dracula_Stoker = get_paragraphs('http://www.gutenberg.org/cache/epub/345/pg345.txt')
crimeandpunishment_Dostoevsky = get_paragraphs('http://www.gutenberg.org/files/2554/2554-0.txt')
drjekyllmrhyde_Stevenson = get_paragraphs('http://www.gutenberg.org/files/43/43-0.txt')
print('15')
iliad_Homer = get_texts('http://www.gutenberg.org/cache/epub/6130/pg6130.txt')
greatexpectations_Dickens = get_paragraphs('http://www.gutenberg.org/files/1400/1400-0.txt')
metamorphosis_Kafka = get_paragraphs('http://www.gutenberg.org/cache/epub/5200/pg5200.txt')
huckleberryfinn_Twain = get_paragraphs('http://www.gutenberg.org/files/76/76-0.txt')
disappearanceofkimballwebb_Wright = get_paragraphs('http://www.gutenberg.org/files/59060/59060-0.txt')

os.system('say "all done."'); print('\a')  # this could take a while, let me know when it's done

#xxx = get_paragraphs('')


5
10
15

