In [76]:
import pandas as pd
import numpy as np
from pathlib import Path

# Set styling parameters
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
import seaborn as sns

rcParams['figure.figsize'] = 15, 6
plt.style.use('fivethirtyeight')
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score
from sklearn.cross_validation import KFold
from sklearn.pipeline import make_pipeline

In [16]:
sliced_df.head()

0    fell together as modest people will in the tai...
1    that he ran away as he were wood demented for ...
2    had ye not been therefore to yield us unto him...
3    the advantage so i judged it best to humor him...
4    girls were always naked but nobody seemed to k...
Name: text, dtype: object

### Getting & Processing Data

In [125]:
df = pd.read_csv(Path.cwd().joinpath('250Paragraphs_150Words.csv'), index_col = 0)

sliced_df = df.loc[:, 'text']
labels_sex = df.loc[:, 'sex']
labels_period = df.loc[:, 'period']
labels_author = df.loc[:, 'author']

#note that we can call all_labels and see our names, should we forget them later
all_labels = {'sex': labels_sex, 'period': labels_period, 'author': labels_author}

In [33]:
# Templates for various predictions 

# Predicting on Sex 

nb_X_train_sex, nb_X_test_sex, nb_y_train_sex, nb_y_test_sex = train_test_split(** transformed data **, nb_labels_sex)


# Predicting on period

nb_X_train_period, nb_X_test_period, nb_y_train_period, nb_y_test_period = train_test_split(** transformed data **, nb_labels_period)


# Predicting on author

nb_X_train_author, nb_X_test_author, nb_y_train_author, nb_y_test_author = train_test_split(** transformed data **, nb_labels_author)


### Bag of Words & TF-IDF

In [11]:
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# max_df and min_df work interestingly
# they can either be a float value between 0.0 and 1.0
# in which case they represent a proportion of the documents - i.e. .98 = 98% of the docs
# or they can be an integer
# in which case they can be an absolute count - i.e. 2 = 2 out of all documents

In [59]:
# I'm creating this dictionary so the transformed vectors can be easily referenced throughout the doc
transformed_vectors = {}

In [60]:
#Bag of Words without n-grams

count_vec = CountVectorizer(min_df = 2, max_df = .98, stop_words="english")
count_vec.fit(sliced_df)
count_vec_transformed = count_vec.transform(sliced_df)
transformed_vectors['count_vec_transformed'] = count_vec_transformed

In [15]:
# Taking a look at our transformed data - a sparse matrix 
count_vec_transformed

<3500x14125 sparse matrix of type '<class 'numpy.int64'>'
	with 184397 stored elements in Compressed Sparse Row format>

In [62]:
#Bag of Words - bigrams

count_vec_bigrams = CountVectorizer(min_df = 2, max_df = .98, stop_words="english", ngram_range=(1,2))
count_vec_bigrams.fit(sliced_df)
count_vec_bigrams_transformed = count_vec_bigrams.transform(sliced_df)
transformed_vectors['count_vec_bigrams_transformed'] = count_vec_bigrams_transformed

In [63]:
#Bag of Words - trigrams

count_vec_trigrams = CountVectorizer(min_df = 2, max_df = .98, stop_words="english", ngram_range=(1,3))
count_vec_trigrams.fit(sliced_df)
count_vec_trigrams_transformed = count_vec_trigrams.transform(sliced_df)
transformed_vectors['count_vec_trigrams_transformed'] = count_vec_trigrams_transformed

In [64]:
#TFIDF without n-grams

tfidf_vec = TfidfVectorizer(min_df = 2, max_df = .98, stop_words='english')
tfidf_vec.fit(sliced_df)
tfidf_vec_transformed = tfidf_vec.transform(sliced_df)
transformed_vectors['tfidf_vec_transformed'] = tfidf_vec_transformed

In [65]:
#TFIDF - bigrams

tfidf_vec_bigrams = TfidfVectorizer(min_df = 2, max_df = .98, stop_words='english', ngram_range=(1,2))
tfidf_vec_bigrams.fit(sliced_df)
tfidf_vec_bigrams_transformed = tfidf_vec.transform(sliced_df)
transformed_vectors['tfidf_vec_bigrams_transformed'] = tfidf_vec_bigrams_transformed

In [66]:
#TFIDF - trigrams

tfidf_vec_trigrams = TfidfVectorizer(min_df = 2, max_df = .98, stop_words='english', ngram_range=(1,3))
tfidf_vec_trigrams.fit(sliced_df)
tfidf_vec_trigrams_transformed = tfidf_vec.transform(sliced_df)
transformed_vectors['tfidf_vec_trigrams_transformed'] = tfidf_vec_trigrams_transformed

In [67]:
# worth noting that TfidfVectorizer first performs CountVectorizer and then TfidfTransfomer, so since we're
# already running countvectorizer, we could use the transformer on that output if we were so inclined

In [146]:
transformed_vectors_all = transformed_vectors
del transformed_vectors['tfidf_vec_trigrams_transformed']
del transformed_vectors['count_vec_trigrams_transformed']

In [147]:
transformed_vectors

{'count_vec_bigrams_transformed': <3500x23695 sparse matrix of type '<class 'numpy.int64'>'
 	with 209166 stored elements in Compressed Sparse Row format>,
 'count_vec_transformed': <3500x14125 sparse matrix of type '<class 'numpy.int64'>'
 	with 184397 stored elements in Compressed Sparse Row format>,
 'tfidf_vec_bigrams_transformed': <3500x14125 sparse matrix of type '<class 'numpy.float64'>'
 	with 184397 stored elements in Compressed Sparse Row format>,
 'tfidf_vec_transformed': <3500x14125 sparse matrix of type '<class 'numpy.float64'>'
 	with 184397 stored elements in Compressed Sparse Row format>}

### Naive Bayes

Ran multiple versions. All testing was classification on sex (binary). Multi-categorical classification (period and author) was done with ensemble methdods.

In [None]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

# we recognize GaussianNB isn't exaclty 'appropriate' here, but it was good practice to build one
# and it took about 10 seconds so we did it anyway for the first round, feel free to drop it

##### Bag of Words without n-grams

In [41]:
nb_X_train_sex, nb_X_test_sex, nb_y_train_sex, nb_y_test_sex = train_test_split(count_vec_transformed, 
                                                                                nb_labels_sex)

BernoulliNB

In [42]:
bernoulli_clf = BernoulliNB()
bernoulli_clf.fit(nb_X_train_sex, nb_y_train_sex)

bernoulli_train_preds = bernoulli_clf.predict(nb_X_train_sex)
bernoulli_test_preds = bernoulli_clf.predict(nb_X_test_sex)

bernoulli_train_accuracy = accuracy_score(nb_y_train_sex, bernoulli_train_preds)
bernoulli_test_accuracy = accuracy_score(nb_y_test_sex, bernoulli_test_preds)

print("Accuracy on Training Set from BernoulliNB Classifier: {:.4}%".format(bernoulli_train_accuracy * 100))
print("Accuracy on Testing Set from BernoulliNB Classifier: {:.4}%".format(bernoulli_test_accuracy * 100))

Accuracy on Training Set from BernoulliNB Classifier: 98.7%
Accuracy on Testing Set from BernoulliNB Classifier: 88.11%


MultinomialNB

In [43]:
mnb_clf = MultinomialNB()
mnb_clf.fit(nb_X_train_sex, nb_y_train_sex)

mnb_train_preds = mnb_clf.predict(nb_X_train_sex)
mnb_test_preds = mnb_clf.predict(nb_X_test_sex)

mnb_train_accuracy = accuracy_score(nb_y_train_sex, mnb_train_preds)
mnb_test_accuracy = accuracy_score(nb_y_test_sex, mnb_test_preds)

print("Accuracy on Training Set from MultinomialNB Classifier: {:.4}%".format(mnb_train_accuracy * 100))
print("Accuracy on Testing Set from MultinomialNB Classifier:: {:.4}%".format(mnb_test_accuracy * 100))

Accuracy on Training Set from MultinomialNB Classifier: 98.32%
Accuracy on Testing Set from MultinomialNB Classifier:: 87.66%


GaussianNB

In [44]:
gaussian_clf = GaussianNB()
gaussian_clf.fit(nb_X_train_sex.toarray(), nb_y_train_sex)

gaussian_train_preds = gaussian_clf.predict(nb_X_train_sex.toarray())
gaussian_test_preds = gaussian_clf.predict(nb_X_test_sex.toarray())

gaussian_train_accuracy = accuracy_score(nb_y_train_sex, gaussian_train_preds)
gaussian_test_accuracy = accuracy_score(nb_y_test_sex, gaussian_test_preds)

print("Accuracy on Training Set from GaussianNB Classifier: {:.4}%".format(gaussian_train_accuracy * 100))
print("Accuracy on Testing Set from GaussianNB Classifier::  {:.4}%".format(gaussian_test_accuracy * 100))

Accuracy on Training Set from GaussianNB Classifier: 99.24%
Accuracy on Testing Set from GaussianNB Classifier::  80.8%


##### Bag of Words - bigrams

In [45]:
nb_X_train_sex, nb_X_test_sex, nb_y_train_sex, nb_y_test_sex = train_test_split(count_vec_bigrams_transformed, 
                                                                                nb_labels_sex)

BernoulliNB

In [46]:
bernoulli_clf = BernoulliNB()
bernoulli_clf.fit(nb_X_train_sex, nb_y_train_sex)

bernoulli_train_preds = bernoulli_clf.predict(nb_X_train_sex)
bernoulli_test_preds = bernoulli_clf.predict(nb_X_test_sex)

bernoulli_train_accuracy = accuracy_score(nb_y_train_sex, bernoulli_train_preds)
bernoulli_test_accuracy = accuracy_score(nb_y_test_sex, bernoulli_test_preds)

print("Accuracy on Training Set from BernoulliNB Classifier: {:.4}%".format(bernoulli_train_accuracy * 100))
print("Accuracy on Testing Set from BernoulliNB Classifier: {:.4}%".format(bernoulli_test_accuracy * 100))

Accuracy on Training Set from BernoulliNB Classifier: 99.7%
Accuracy on Testing Set from BernoulliNB Classifier: 88.91%


MultinomialNB

In [47]:
mnb_clf = MultinomialNB()
mnb_clf.fit(nb_X_train_sex, nb_y_train_sex)

mnb_train_preds = mnb_clf.predict(nb_X_train_sex)
mnb_test_preds = mnb_clf.predict(nb_X_test_sex)

mnb_train_accuracy = accuracy_score(nb_y_train_sex, mnb_train_preds)
mnb_test_accuracy = accuracy_score(nb_y_test_sex, mnb_test_preds)

print("Accuracy on Training Set from MultinomialNB Classifier: {:.4}%".format(mnb_train_accuracy * 100))
print("Accuracy on Testing Set from MultinomialNB Classifier:: {:.4}%".format(mnb_test_accuracy * 100))

Accuracy on Training Set from MultinomialNB Classifier: 99.54%
Accuracy on Testing Set from MultinomialNB Classifier:: 89.14%


##### TFIDF without n-grams

In [48]:
nb_X_train_sex, nb_X_test_sex, nb_y_train_sex, nb_y_test_sex = train_test_split(tfidf_vec_transformed, 
                                                                                nb_labels_sex)

BernoulliNB

In [49]:
bernoulli_clf = BernoulliNB()
bernoulli_clf.fit(nb_X_train_sex, nb_y_train_sex)

bernoulli_train_preds = bernoulli_clf.predict(nb_X_train_sex)
bernoulli_test_preds = bernoulli_clf.predict(nb_X_test_sex)

bernoulli_train_accuracy = accuracy_score(nb_y_train_sex, bernoulli_train_preds)
bernoulli_test_accuracy = accuracy_score(nb_y_test_sex, bernoulli_test_preds)

print("Accuracy on Training Set from BernoulliNB Classifier: {:.4}%".format(bernoulli_train_accuracy * 100))
print("Accuracy on Testing Set from BernoulliNB Classifier: {:.4}%".format(bernoulli_test_accuracy * 100))

Accuracy on Training Set from BernoulliNB Classifier: 98.86%
Accuracy on Testing Set from BernoulliNB Classifier: 86.4%


MultinomialNB

In [50]:
mnb_clf = MultinomialNB()
mnb_clf.fit(nb_X_train_sex, nb_y_train_sex)

mnb_train_preds = mnb_clf.predict(nb_X_train_sex)
mnb_test_preds = mnb_clf.predict(nb_X_test_sex)

mnb_train_accuracy = accuracy_score(nb_y_train_sex, mnb_train_preds)
mnb_test_accuracy = accuracy_score(nb_y_test_sex, mnb_test_preds)

print("Accuracy on Training Set from MultinomialNB Classifier: {:.4}%".format(mnb_train_accuracy * 100))
print("Accuracy on Testing Set from MultinomialNB Classifier:: {:.4}%".format(mnb_test_accuracy * 100))

Accuracy on Training Set from MultinomialNB Classifier: 97.56%
Accuracy on Testing Set from MultinomialNB Classifier:: 84.46%


##### TFIDF - bigrams

In [51]:
nb_X_train_sex, nb_X_test_sex, nb_y_train_sex, nb_y_test_sex = train_test_split(tfidf_vec_bigrams_transformed, 
                                                                                nb_labels_sex)

BernoulliNB

In [52]:
bernoulli_clf = BernoulliNB()
bernoulli_clf.fit(nb_X_train_sex, nb_y_train_sex)

bernoulli_train_preds = bernoulli_clf.predict(nb_X_train_sex)
bernoulli_test_preds = bernoulli_clf.predict(nb_X_test_sex)

bernoulli_train_accuracy = accuracy_score(nb_y_train_sex, bernoulli_train_preds)
bernoulli_test_accuracy = accuracy_score(nb_y_test_sex, bernoulli_test_preds)

print("Accuracy on Training Set from BernoulliNB Classifier: {:.4}%".format(bernoulli_train_accuracy * 100))
print("Accuracy on Testing Set from BernoulliNB Classifier: {:.4}%".format(bernoulli_test_accuracy * 100))

Accuracy on Training Set from BernoulliNB Classifier: 98.74%
Accuracy on Testing Set from BernoulliNB Classifier: 88.23%


MultinomialNB

In [53]:
mnb_clf = MultinomialNB()
mnb_clf.fit(nb_X_train_sex, nb_y_train_sex)

mnb_train_preds = mnb_clf.predict(nb_X_train_sex)
mnb_test_preds = mnb_clf.predict(nb_X_test_sex)

mnb_train_accuracy = accuracy_score(nb_y_train_sex, mnb_train_preds)
mnb_test_accuracy = accuracy_score(nb_y_test_sex, mnb_test_preds)

print("Accuracy on Training Set from MultinomialNB Classifier: {:.4}%".format(mnb_train_accuracy * 100))
print("Accuracy on Testing Set from MultinomialNB Classifier:: {:.4}%".format(mnb_test_accuracy * 100))

Accuracy on Training Set from MultinomialNB Classifier: 97.52%
Accuracy on Testing Set from MultinomialNB Classifier:: 85.37%


### Pipeline Implementation - Classifiers for Author & Period

We've built functions here that will take in an intialized GridSearchCV, perform it on the given transformed vectors, and return the best results.

If you use the 'multi-grid' and the full transformation_vectors set, the functions will test multiple classifiers (with tuning) for all six vector transformations (BOW, BOW bigram, BOW trigram, TFIDF, TFIDF bigram, TFIDF trigram). As you might imagine, this take a very long time to run. We don't recommend running it that way unless you have some significant computing power or time. <br><br>
We limited it to only the unigram/bigram transformations, but we did run the multi pipeline. On a laptop, it took about (and things got pretty hot). You can also individually build classifiers below. 

In [134]:
#Functions 

def test_classifiers(transformed_vectors, grid, labels):
    results_dict = {}
    
    for named_matrix, sparse_matrix in transformed_vectors.items():
        X_train, X_test, y_train, y_test = train_test_split(sparse_matrix, labels)
        grid.fit(X_train, y_train)
        results_dict[named_matrix + ' results'] = [grid.best_score_, grid.best_params_, grid.score(X_test, y_test)]
    
    return results_dict
    

In [135]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

In [138]:
# Multi-classifier Pipeline 

# NOTE- currently not working 

multi_param_grid = [
    {'classifier':[RandomForestClassifier()], 'classifier__n_estimators': [50, 150, 250], 
    'classifier__max_depth': [2, 4, 6, 8, 10]},
    
    {'classifier':[AdaBoostClassifier()], 'classifier__n_estimators': [50, 150, 250], 
    'classifier__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1]},
    
    {'classifier':[KNeighborsClassifier()], 'classifier__n_neighbors': [5, 15, 25]}
    ]

multi_pipe = make_pipeline(RandomForestClassifier())

multi_grid = GridSearchCV(multi_pipe, multi_param_grid, cv = 5, scoring = "accuracy")

In [142]:
author_classifier_results = test_classifiers(transformed_vectors, multi_grid, all_labels['author'])
%time 

In [None]:
period_classifier_results = test_classifiers(transformed_vectors, multi_grid, all_labels['period'])
%time 

In [None]:
sex_classifier_results = test_classifiers(transformed_vectors, multi_grid, all_labels['sex'])
%time 

### Random Forest

In [72]:
from sklearn.ensemble import RandomForestClassifier 

In [150]:
rf_param_grid = {'randomforestclassifier__n_estimators': [150, 250, 350], 
                'randomforestclassifier__max_depth': [5, 10, 15]}

rf_pipe = make_pipeline(RandomForestClassifier())

rf_grid = GridSearchCV(rf_pipe, rf_param_grid, cv = 5, scoring = "accuracy")

In [None]:
rf_author_results = test_classifiers(transformed_vectors, rf_grid, all_labels['author'])
%time

In [None]:
rf_period_results = test_classifiers(transformed_vectors, rf_grid, all_labels['period'])
%time

In [None]:
rf_sex_results = test_classifiers(transformed_vectors, rf_grid, all_labels['sex'])
%time

### Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
ada = AdaBoostClassifier()

ada_param_grid = {'adaboostclassifier__n_estimators': [50, 150, 250], 
                 'adaboostclassifier__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1]}

ada_pipe = make_pipeline(ada)

ada_grid = GridSearchCV(ada_pipe, ada_param_grid, cv = 5, scoring = "accuracy")

In [None]:
ada_author_results = test_classifiers(transformed_vectors, ada_grid, all_labels['author'])
%time

In [None]:
ada_period_results = test_classifiers(transformed_vectors, ada_grid, all_labels['period'])
%time

In [None]:
ada_sex_results = test_classifiers(transformed_vectors, ada_grid, all_labels['sex'])
%time

### K-Nearest Neighbors

In [77]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
KNeighborsClassifier()