In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools as it
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
% matplotlib inline

# Vectors and Models
We've more or less settled on TF-IDF as the vectorizer of record, because we are comparing term frequencies across a range of documents. To give the research a better experimental field, we went to the trouble of creating corpora of various lengths.

Edit of 4 Apr 2017: I'm still not sure that my method is the best method possible. I'd like to try doing the same thing with pipelines to perhaps expedite the process, or at least make it grid search ready.

Label codes:

- Twain = 1
- Wilde = 2
- Lincoln = 3

---

- D_Twain = 10
- D_Wilde = 20
- D_Lincoln = 30

---

- Modern = 100

1000-, 500-, and 100-record samples for each original writer - Twain, Wilde, Lincoln, and Modern

In [2]:
# Load the three 1000-observation corpora. The suffix marks the sampling level
# (H = character, W = word, S = sentence); the first CSV column becomes the index.
df_1k_H, df_1k_W, df_1k_S = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../assets/_CSVs/df_1k_H.csv',
        '../assets/_CSVs/df_1k_W.csv',
        '../assets/_CSVs/df_1k_S.csv',
    )
)

# The 500- and 100-observation corpora are currently not loaded.
# NOTE(review): these paths lack the '../assets/' prefix used above -- confirm
# the location before re-enabling.
# df_500_H = pd.read_csv('_CSVs/df_500_H.csv', index_col=0)
# df_500_W = pd.read_csv('_CSVs/df_500_W.csv', index_col=0)
# df_500_S = pd.read_csv('_CSVs/df_500_S.csv', index_col=0)

# df_100_H = pd.read_csv('_CSVs/df_100_H.csv', index_col=0)
# df_100_W = pd.read_csv('_CSVs/df_100_W.csv', index_col=0)
# df_100_S = pd.read_csv('_CSVs/df_100_S.csv', index_col=0)

## Pipelines
Since we ultimately use the TF-IDF transformer, we need multiple documents whose term frequencies can be compared. Because I had already combined everything into one document, I simulated 'multiple documents' by splitting the several authors into groups of different numbers of observations (1000, 500, and 100 rows) at different levels (cHaracter, Word, and Sentence, which are the H, W, and S suffixes in the file and variable names).

In [3]:
# Quotes that the fitted model will later be asked to classify. Unlike the
# corpus CSVs, no index column is specified for this file.
quotes_csv = '../assets/_CSVs/quotes.csv'
wild_data = pd.read_csv(quotes_csv)

In [4]:
# Character-level corpus (1000 observations): column '0' supplies the model
# inputs and column 'code' the target labels.
X_1k_H, y_1k_H = (df_1k_H[column].values for column in ('0', 'code'))

In [5]:
# Word-level corpus (1000 observations): inputs from column '0', labels from
# column 'code'.
X_1k_W = df_1k_W.loc[:, '0'].values
y_1k_W = df_1k_W.loc[:, 'code'].values

In [6]:
# Sentence-level corpus (1000 observations): inputs from column '0', labels
# from column 'code'.
X_1k_S, y_1k_S = df_1k_S['0'].values, df_1k_S['code'].values

In [7]:
# Classification pipeline: raw counts -> TF-IDF weighting -> multinomial naive
# Bayes. The step names ('vect', 'tfidf', 'clf') are the prefixes used by the
# grid-search parameter keys.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [8]:
# Search space for the grid search; every key follows the '<step>__<param>'
# convention of the pipeline steps 'vect' and 'clf'.
parameters = dict(
    vect__analyzer=('word', 'char'),
    vect__max_df=(0.5, 0.75, 1.0),
    # unigrams, uni+bigrams, then bi-, tri- and tetragrams on their own
    vect__ngram_range=((1, 1), (1, 2), (2, 2), (3, 3), (4, 4)),
    clf__alpha=(1e-05, 1e-06),
    clf__fit_prior=(True, False),
)
# Candidates currently left out of the search:
#   'vect__decode_error': ('ignore'),
#   'vect__max_features': (None, 5000, 10000, 50000),
#   'tfidf__use_idf': (True, False),
#   'tfidf__norm': ('l1', 'l2'),
#   'clf__n_iter': (10, 50, 80),

In [9]:
# Exhaustive search over `parameters`, scored by macro-averaged precision
# (precision is used to minimize false positives). n_jobs=-1 runs the search
# on all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='precision_macro',
    n_jobs=-1,
    verbose=1,
)

In [10]:
# NOTE(review): the fit is commented out, so a fresh Restart & Run All never
# trains `grid_search`; the best_params_ / best_score_ values shown below were
# presumably produced by an earlier run and are not reproducible from this
# notebook as it stands.
#grid_search.fit(X_1k_H, y_1k_H)

In [62]:
#grid_search.best_params_

{'clf__alpha': 1e-05,
 'clf__fit_prior': True,
 'vect__analyzer': 'char',
 'vect__max_df': 1.0,
 'vect__ngram_range': (3, 3)}

In [63]:
#grid_search.best_score_

0.93701123599435088

In [29]:
# Raw quote strings to be classified by the fitted pipeline.
X_wild = wild_data.loc[:, 'quote'].values

In [21]:
# A second, independent counts -> TF-IDF -> naive Bayes pipeline with the same
# step names as the grid-search pipeline; it is configured and fitted below.
wild_pipeline = Pipeline(list(zip(
    ('vect', 'tfidf', 'clf'),
    (CountVectorizer(), TfidfTransformer(), MultinomialNB()),
)))

In [28]:
# Hard-code the best hyper-parameters reported by the grid search above
# (character trigrams, max_df=1.0, alpha=1e-05, fitted class priors) and
# additionally ignore decoding errors.
wild_pipeline.set_params(
    vect__analyzer='char',
    vect__decode_error='ignore',
    vect__max_df=1.0,
    vect__ngram_range=(3, 3),
    clf__alpha=1e-05,
    clf__fit_prior=True,
)
# Train on the character-level corpus; fit() returns the pipeline, so the cell
# still displays its repr.
wild_pipeline.fit(X_1k_H, y_1k_H)

Pipeline(steps=[('vect', CountVectorizer(analyzer='char', binary=False, decode_error='ignore',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(3, 3), preprocessor=None, stop_words=None,
        stri...lse,
         use_idf=True)), ('clf', MultinomialNB(alpha=1e-05, class_prior=None, fit_prior=True))])

In [30]:
# Predict a label for every quote with the pipeline that was fitted on the
# character-level corpus.
wild_pred = wild_pipeline.predict(X_wild)

In [38]:
# Store the predictions next to each quote so they can be compared with the
# `code` column. This adds a column to wild_data in place.
wild_data['pred'] = wild_pred

In [50]:
# Inspect the predictions for rows whose code is '30' (D_Lincoln in the legend
# at the top of the notebook). The code is matched as a string here.
wild_data.loc[wild_data['code'] == '30']

Unnamed: 0,quote,code,pred
2085,The best way to predict the future is to creat...,30,3
2093,And in the end it's not the years in your life...,30,1
2498,Nearly all men can stand adversity but if you ...,30,3
2499,Labor is prior to and independent of capital. ...,30,1
2500,He bores me. He ought to have stuck to his fly...,30,1
2501,You cannot help men permanently by doing for t...,30,3
2502,America will never be destroyed from the outsi...,30,3
2503,Now I say to you my fellow-citizens that in...,30,3
2504,As a result of the war corporations have been...,30,3
2505,The money powers prey upon the nation in times...,30,2


## Ouch