# Text Analysis
Analyzes texts from seemingly-similar authors, Rachel Swirsky, Cat Valente, and Carmen Maria Machado, on a sentence-by-sentence basis, to see if their writing styles can be predicted.

In [201]:
import numpy as np
import pandas as pd
import spacy
import re
from urllib.request import Request, urlopen
import requests
from bs4 import BeautifulSoup
from time import time
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 100)
%matplotlib inline

## Scraping Data

### Requesting and parsing html
* Using requests and BeautifulSoup
* The program first pulls stories directly from their published websites

In [224]:
# Pull stories from their websites

header={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}

def get_raw(url, no_strong=False, no_b=False, no_emoji=False, no_small_caps=False):
    response = requests.get(url, headers=header)
    html = response.content
    soup = BeautifulSoup(html, 'lxml-xml')
    if no_strong == True:
        for strong in soup('strong'):
            strong.decompose()
    if no_b == True:
        for b in soup('b'):
            b.decompose()
    if no_emoji == True:
        for emoji in soup.find_all('img', class_='emoji'):
            emoji.decompose()
    if no_small_caps == True:
        for caps in soup.find_all('p', {'class':'small-caps'}):
            caps.decompose()
    return soup.get_text()

t0 = time()

# Swirsky stories
dino_raw = get_raw('https://www.apex-magazine.com/if-you-were-a-dinosaur-my-love/')
eros_raw = get_raw('https://www.tor.com/2009/03/03/eros-philia-agape/')
still_raw = get_raw('http://uncannymagazine.com/article/love-is-never-still/', no_b=True)

# Valente stories
fade_raw = get_raw('http://clarkesworldmagazine.com/valente_08_12/')
silent1_raw = get_raw('http://clarkesworldmagazine.com/valente_10_11/', no_strong=True)
silent2_raw = get_raw('http://clarkesworldmagazine.com/valente_11_11/', no_strong=True)
silent3_raw = get_raw('http://clarkesworldmagazine.com/valente_12_11/', no_strong=True)
lion_raw = get_raw('http://uncannymagazine.com/article/planet-lion/')

# Machado stories
stitch_raw = get_raw('https://granta.com/The-Husband-Stitch/')
follow_raw = get_raw('http://www.lightspeedmagazine.com/fiction/help-follow-sister-land-dead/')
descent_raw = get_raw('http://www.nightmare-magazine.com/fiction/descent/')
bites_raw = get_raw('http://texas.gulfcoastmag.org/journal/29.2-summer/fall-2017/eight-bites/')
body_raw = get_raw('http://uncannymagazine.com/article/my-body-herself/')
heinous_raw = get_raw('http://theamericanreader.com/especially-heinous-272-views-of-law-order-svu/', no_emoji=True, no_small_caps=True)

print('Get raw in {:.2f} seconds'.format(time() - t0))

Get raw in 21.51 seconds


In [228]:
print(heinous_raw)








Especially Heinous: 272 Views of Law & Order SVU | The American Reader




			window._wpemojiSettings = {"baseUrl":"https:\/\/s.w.org\/images\/core\/emoji\/2.3\/72x72\/","ext":".png","svgUrl":"https:\/\/s.w.org\/images\/core\/emoji\/2.3\/svg\/","svgExt":".svg","source":{"concatemoji":"\/wp-includes\/js\/wp-emoji-release.min.js?ver=4.8.6"}};
			!function(a,b,c){function d(a){var b,c,d,e,f=String.fromCharCode;if(!k||!k.fillText)return!1;switch(k.clearRect(0,0,j.width,j.height),k.textBaseline="top",k.font="600 32px Arial",a){case"flag":return k.fillText(f(55356,56826,55356,56819),0,0),b=j.toDataURL(),k.clearRect(0,0,j.width,j.height),k.fillText(f(55356,56826,8203,55356,56819),0,0),c=j.toDataURL(),b!==c(k.clearRect(0,0,j.width,j.height),k.fillText(f(55356,57332,56128,56423,56128,56418,56128,56421,56128,56430,56128,56423,56128,56447),0,0),b=j.toDataURL(),k.clearRect(0,0,j.width,j.height),k.fillText(f(55356,57332,8203,56128,56423,8203,56128,56418,8203,56128,56421,8203,56128,56430,8203

### Finding story text in html
* Using find() from BeautifulSoup
* Identifies story text inside the total site text

In [204]:
# Select out story text

def select_out(raw, start_text, end_text):
    start = raw.find(start_text)
    end = raw.find(end_text)
    return raw[start:end+len(end_text)]

dinosaur = select_out(dino_raw, 
                      'If you were a dinosaur, my', 
                      ', and the stuttering of my broken heart.'
                     )
eros = select_out(eros_raw, 
                  'Lucian packed his possessions before he left.', 
                  'yet form the thoughts to wonder what will happen next.\nHe moves on.'
                 )
still = select_out(still_raw, 
                  'Through every moment of carving, I want her as one wants a woman. I want', 
                  'and decay, but it is also the early bloom that opens into winter’s cold. She is love, and she will always be reborn.'
                  )
fade = select_out(fade_raw, 
                 'ZOOM IN on a bright-eyed Betty in a crisp green dress,', 
                 'And over the black, a cheerful fat man giving the thumbs up to Sylvie, grinning:\n\nBuy Freedom Brand Film! It\'s A-OK!'
                 )
silent1 = select_out(silent1_raw, 
                    'Inanna was called Queen of Heaven and Earth,', 
                    'The sun breaks the mountain crests, hard and cold, a shaft of white spilling over the black lake.'
                    )
silent2 = select_out(silent2_raw, 
                    'Humanity lived many years and ruled the earth,', 
                    'The castle windows go dark, one by one.'
                    )
silent3 = select_out(silent3_raw, 
                    'Tell me a story about yourself, Elefsis.', 
                    'we walk up the long path out of the churning, honey-colored sea.'
                    )
silent = silent1 + '\n' + silent2 + '\n' + silent3

lion = select_out(lion_raw, 
                 'Initial Survey Report: Planet 6MQ441 (Bakeneko), Alaraph', 
                 'Szent Istvan. She longs to hear the first roar of her young.'
                 )
stitch = select_out(stitch_raw, 
                   '(If you read this story out loud, please use', 
                   'backwards off my neck and rolls off the bed, I feel as lonely as I have ever been.'
                   )
follow = select_out(follow_raw, 
                   'Help Me Follow My Sister\ninto the Land of the Dead\nby Ursula Ruiz', 
                   's—in a terrifying place where no mortal has any business treading.'
                   ) + '\n' + select_out(follow_raw, 
                                        'Home\nThis is the thing about my sister and I:', 
                                        'Lucille L\nJun 28, 2015\nUrsula?'
                                        )
descent = select_out(descent_raw, 
                    'We gathered for the last time in October,', 
                    ', drinks cocked in their hands.\nI looked down.'
                    )
bites = select_out(bites_raw, 
                  'As they put me to sleep, my mouth fills with the dust of the moon. I expect to choke on the silt but', 
                  'my charge.\"I\'m sorry,\" I will whisper into her as she walks me toward the front door.\"I\'m sorry,\" I will repeat. \"I didn\'t know.\"')
body = select_out(body_raw, 
                 'When the cave’s ceiling crumples, so do I. Through my body,', 
                 'the privilege to see: a young woman, born screaming.')

## Parsing Text

### Cleaning 
* Cleaning out un-parseable symbols
* Using re

In [205]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub(r'[\[\]]', '', text)
    text = ' '.join(text.split())
    return text

In [206]:
# Clean out bad symbols
dinosaur = text_cleaner(dinosaur)
eros = text_cleaner(eros)
fade = text_cleaner(fade)
silent = text_cleaner(silent)
lion = text_cleaner(lion)
still = text_cleaner(still)
stitch = text_cleaner(stitch)
follow = text_cleaner(follow)
descent = text_cleaner(descent)
bites = text_cleaner(bites)
body = text_cleaner(body)

### Parsing documents
* Using spaCy
* Identifies sentence structure in the stories

In [207]:
t0 = time()

# Parse using spaCy
nlp = spacy.load('en')

dinosaur_doc = nlp(dinosaur)
eros_doc = nlp(eros)
fade_doc = nlp(fade)
silent_doc = nlp(silent)
lion_doc = nlp(lion)
still_doc = nlp(still)
stitch_doc = nlp(stitch)
follow_doc = nlp(follow)
descent_doc = nlp(descent)
bites_doc = nlp(bites)
body_doc = nlp(body)

print('Parsed in {:.2f} seconds'.format(time() - t0))

Parsed in 25.00 seconds


### Grouping data into sentences
* We use spaCy's tokenization and sentence identification
* Creates one table of all the data, organized with one sentence for each line

In [208]:
# Group sentences with their authors. (Each sentence will be a row.)

dinosaur_sents = [["Swirsky", sent.string] for sent in dinosaur_doc.sents]
eros_sents = [["Swirsky", sent.string] for sent in eros_doc.sents]
still_sents = [["Swirsky", sent.string] for sent in still_doc.sents]
fade_sents = [["Valente", sent.string] for sent in fade_doc.sents]
silent_sents = [["Valente", sent.string] for sent in silent_doc.sents]
lion_sents = [["Valente", sent.string] for sent in lion_doc.sents]
stitch_sents = [["Machado", sent.string] for sent in stitch_doc.sents]
follow_sents = [["Machado", sent.string] for sent in follow_doc.sents]
descent_sents = [["Machado", sent.string] for sent in descent_doc.sents]
bites_sents = [["Machado", sent.string] for sent in bites_doc.sents]
body_sents = [["Machado", sent.string] for sent in body_doc.sents]

In [209]:
# Take part of speech of each word in each sentence.

dinosaur_pos = [[token.pos_ for token in sent] for sent in dinosaur_doc.sents]
eros_pos = [[token.pos_ for token in sent] for sent in eros_doc.sents]
still_pos = [[token.pos_ for token in sent] for sent in still_doc.sents]
fade_pos = [[token.pos_ for token in sent] for sent in fade_doc.sents]
silent_pos = [[token.pos_ for token in sent] for sent in silent_doc.sents]
lion_pos = [[token.pos_ for token in sent] for sent in lion_doc.sents]
stitch_pos = [[token.pos_ for token in sent] for sent in stitch_doc.sents]
follow_pos = [[token.pos_ for token in sent] for sent in follow_doc.sents]
descent_pos = [[token.pos_ for token in sent] for sent in descent_doc.sents]
bites_pos = [[token.pos_ for token in sent] for sent in bites_doc.sents]
body_pos = [[token.pos_ for token in sent] for sent in body_doc.sents]

In [210]:
# list(dinosaur_doc.print_tree())

In [211]:
# Combine the sentences from the stories into one data frame.
sentences = pd.DataFrame(dinosaur_sents + eros_sents + still_sents + fade_sents + silent_sents + lion_sents + stitch_sents + follow_sents + descent_sents + bites_sents + body_sents)
sentences.columns = ['author', 'text']

# Strip out unnecessary punctuation.
sentences['text'] = sentences['text'].str.lower().str.replace(r'[^a-zA-Z0-9 ]+', ' ').fillna('')

# Word count per sentence.
sentences['words'] = sentences['text'].str.split().apply(len)

# Parts of speech classification of each word in sentence.
sentences['pos'] = pd.Series(dinosaur_pos + eros_pos + still_pos + fade_pos + silent_pos + lion_pos + stitch_pos + follow_pos + descent_pos + bites_pos + body_pos)

# Remove blank sentences.
sentences = sentences.loc[sentences['words'] > 0]

# Remove proper nouns personal to each document.
propn_pat = r'\b(agogna|koetoi|adriana|lucian|galatea|aphrodite|hephaestus|fuoco|inanna|enki|erishkigal|salma|janet|lawrence|olive|luna|ursula|ceno|neva|elefsis|ravan|cassian|diane|ares|ben|seki|ilet)\b'
sentences['text'] = sentences['text'].str.replace(propn_pat, '')

# Sentence counts by author.
df_count = sentences.author.value_counts().to_frame(name='count')
df_count['ratio'] = (df_count['count'] / len(sentences.author)).round(2)
print('Full Data Set:')
display(df_count)
print('{} total sentences'.format(len(sentences.author)))
display(sentences.loc[sentences['author'] == 'Swirsky'].head(1))
display(sentences.loc[sentences['author'] == 'Valente'].head(1))
display(sentences.loc[sentences['author'] == 'Machado'].head(1))

Full Data Set:


Unnamed: 0,count,ratio
Valente,2269,0.38
Swirsky,1880,0.32
Machado,1758,0.3


5907 total sentences


Unnamed: 0,author,text,words,pos
0,Swirsky,if you were a dinosaur my love then you woul...,14,"[ADP, PRON, VERB, DET, NOUN, PUNCT, ADJ, NOUN,..."


Unnamed: 0,author,text,words,pos
2153,Valente,was called queen of heaven and earth queen o...,37,"[PROPN, VERB, VERB, PROPN, ADP, PROPN, CCONJ, ..."


Unnamed: 0,author,text,words,pos
4486,Machado,if you read this story out loud please use t...,24,"[PUNCT, ADP, PRON, VERB, DET, NOUN, ADV, ADV, ..."


## Splitting Test Set
This protects the vectorizer and regressor from over-fitting to the training set.

In [176]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(sentences.drop(['author', 'words'], axis=1), sentences['author'], test_size=0.2)
X_train = sentences.drop(['author', 'words'], axis=1)
y_train = sentences['author']

In [177]:
print('Training Set:')
print(y_train.value_counts().to_frame(name='count'))
print('\nTest Set:')
print(y_test.value_counts().to_frame(name='count'))

Training Set:
         count
Valente   1842
Swirsky   1496
Machado   1263

Test Set:
         count
Valente    427
Swirsky    384
Machado    340


## Pipeline: Selector, Transformers and Estimator

### Item Selector
* Makes it possible to run different transformers on different columns of data before concatenating them together with FeatureUnion and then running through the estimator

In [178]:
from sklearn.base import BaseEstimator, TransformerMixin
# The key corresponding to the desired value in a mappable.

class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

### TFIDF Vectorizer

* Try most common words across all texts
* Bigrams and trigrams are arrangements of two- and three-words, respectively

In [179]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer=str.split, 
                            stop_words='english', 
                            lowercase=True, 
                            use_idf=True, 
                            norm='l2', 
                            smooth_idf=True, 
                            max_features=None, 
                            ngram_range=(1,2), 
                            max_df=0.6, 
                            min_df=2)

### Text Statistics

In [180]:
class TextStats(BaseEstimator, TransformerMixin):
# Extract features from each document for DictVectorizer

    def fit(self, x, y=None):
        return self

    def transform(self, sents):
        return [{'LENGTH': len(sent), 
                 'ADJ': sent.count('ADJ'), 
                 'NOUN': sent.count('NOUN'), 
                 'VERB': sent.count('VERB'), 
                 'ADV': sent.count('ADV'), 
                 'ADP': sent.count('ADP'), 
                 'CCONJ': sent.count('CCONJ'), 
                 'INTJ': sent.count('INTJ'), 
                 'PART': sent.count('PART'), 
                 'SCONJ': sent.count('SCONJ'), 
                 'PRON': sent.count('PRON'), 
                 'PROPN': sent.count('PROPN'), 
                 'DET': sent.count('DET'), 
                 'NUM': sent.count('NUM'), 
                 'AUX': sent.count('AUX'), 
                 'SYM': sent.count('SYM'), 
                }
                for sent in sents]

### Dict Vectorizer

In [181]:
from sklearn.feature_extraction import DictVectorizer
# list of dicts -> feature matrix

### Stochastic Gradient Descent Classifier
* Combines support vector classifier with gradient descent

In [182]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(penalty='elasticnet', 
                    alpha=0.0001, 
                    class_weight=None, 
                    loss='log', 
                    tol=0.005, 
                    fit_intercept=True, 
                    power_t=0.5)

### Pipeline
* Chains transformers and estimator as input to grid search algorithm

In [183]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

pipe = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('text', Pipeline([
                ('selector', ItemSelector(key='text')), 
                ('vect', vectorizer), 
            ])), 
            ('pos', Pipeline([
                ('selector', ItemSelector(key='pos')), 
                ('stats', TextStats()), 
                ('dict', DictVectorizer()), 
            ])), 
        ], 
        transformer_weights={
            'text': 1.0, 
            'pos': 0.1, 
        }, 
    )), 
    ('model', sgd), 
])

## Grid Search
### Hyperparameters
* List parameters to search over for best cv fit

In [184]:
vect_features = [None, 20000, 10000, 5000, 1000]
vect_ngrams = [(1,1), (1,2), (1,3)]
vect_max = [0.6, 0.7, 0.8]
vect_min = [1, 2, 3]
vect_norm = ['l2', 'l1', None]
vect_stop = ['english', None]
model_alpha = [0.001, 0.0001, 0.00001]
model_loss = ['hinge', 'log']
model_tol = [0.01, 0.005, 0.001]
model_class_weight = ['balanced', None]
model_power_t = [0.4, 0.5, 0.6]
model_penalty = ['l2', 'elasticnet', 'l1', 'none']
model_intercept = [True, False]

parameters = {
#         'union__text__vect__max_features': vect_features, 
        'union__text__vect__ngram_range': vect_ngrams, 
#         'union__text__vect__max_df': vect_max, 
#         'union__text__vect__min_df': vect_min, 
#         'union__text__vect__norm': vect_norm, 
#         'union__text__vect__stop_words': vect_stop, 
#         'model__alpha': model_alpha, 
        'model__loss': model_loss, 
#         'model__tol': model_tol,
#         'model__class_weight': model_class_weight, 
#         'model__power_t': model_power_t, 
#         'model__penalty': model_penalty, 
#         'model__fit_intercept': model_intercept
    }

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, 
                    parameters, 
                    scoring='f1_micro', 
                    return_train_score=True)

### Fitting and Results

In [185]:
print('Performing grid search...')
t0 = time()
grid.fit(X_train, y_train)
print('\nCompleted in {:.2f} seconds'.format(time() - t0))

Performing grid search...

Completed in 6.13 seconds


In [186]:
print('\nBest Training Set Score: {:.2%}'.format(grid.cv_results_['mean_train_score'][grid.best_index_]))
print('\nBest Cross-Val Score: {:.2%}'.format(grid.best_score_))

print('\nBest Parameters:\n')
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('{}: {}'.format(param_name, best_parameters[param_name]))

from sklearn.metrics import confusion_matrix

test_score = grid.score(X_test, y_test)
y_test_pred = grid.predict(X_test)
test_cm = confusion_matrix(y_test, y_test_pred)

print('\nTest set score: {:.1%}'.format(test_score))

df_test_cm = pd.DataFrame(test_cm, index=grid.classes_, columns=grid.classes_)
df_test_cm.index.name = 'Actual Author'
df_test_cm.columns.name = 'Predicted Author'
display(df_test_cm)


Best Training Set Score: 88.87%

Best Cross-Val Score: 66.03%

Best Parameters:

model__loss: log
union__text__vect__ngram_range: (1, 2)

Test set score: 67.2%


Predicted Author,Machado,Swirsky,Valente
Actual Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Machado,215,36,89
Swirsky,72,210,102
Valente,53,26,348


In [187]:
# grid.best_estimator_.named_steps['union'].transformer_list[0][1].named_steps['vect']

In [188]:
# grid.best_estimator_.named_steps['union'].transformer_list[1][1].named_steps['dict'].get_feature_names()

In [189]:
cv_results = pd.DataFrame()
cv_results['mean_test_score'] = (grid.cv_results_['mean_test_score'] * 100).round(2)
cv_results['std_test_score'] = (grid.cv_results_['std_test_score'] * 100).round(2)
cv_results['mean_train_score'] = (grid.cv_results_['mean_train_score'] * 100).round(2)
cv_results['std_train_score'] = (grid.cv_results_['std_train_score'] * 100).round(2)
cv_results['mean_fit_time'] = grid.cv_results_['mean_fit_time'].round(2)
cv_results = cv_results.join(pd.DataFrame(grid.cv_results_).filter(like='param_'))
cv_results.sort_values('std_test_score', ascending=True, axis=0)

Unnamed: 0,mean_test_score,std_test_score,mean_train_score,std_train_score,mean_fit_time,param_model__loss,param_union__text__vect__ngram_range
2,64.33,0.65,90.37,1.04,0.16,hinge,"(1, 3)"
4,66.03,0.79,88.87,0.4,0.15,log,"(1, 2)"
1,63.55,1.14,89.34,0.42,0.15,hinge,"(1, 2)"
5,60.14,1.35,80.11,2.49,0.15,log,"(1, 3)"
0,62.51,1.65,87.83,2.23,0.15,hinge,"(1, 1)"
3,63.38,2.28,84.99,4.86,0.16,log,"(1, 1)"


In [198]:
feature_list1 = grid.best_estimator_.named_steps['union'].transformer_list[0][1].named_steps['vect'].get_feature_names()
feature_list2 = grid.best_estimator_.named_steps['union'].transformer_list[1][1].named_steps['dict'].get_feature_names()
feature_list = feature_list1 + feature_list2
feature_coefficients = pd.DataFrame(grid.best_estimator_.steps[-1][1].coef_, columns=feature_list, index=grid.classes_)
print('\n{} features'.format(len(feature_list)))
feature_coefficients = feature_coefficients.abs().round(2)
display(feature_coefficients.loc['Machado'].sort_values(ascending=False).head(20))
display(feature_coefficients.loc['Swirsky'].sort_values(ascending=False).head(20))
display(feature_coefficients.loc['Valente'].sort_values(ascending=False).head(20))
# display(feature_coefficients.sort_values('Valente', ascending=False, axis=1))


3556 features


PROPN         7.96
son           4.41
cal           4.34
ribbon        4.06
parents       4.03
doctor        3.90
girlfriend    3.80
rose          3.79
says          3.67
going         3.48
woman         3.44
2015          3.12
sister        3.09
slender       3.09
loved         3.02
my            2.97
students      2.87
lion          2.87
lions         2.83
i             2.81
Name: Machado, dtype: float64

rose        7.27
d           4.38
lion        4.27
love        3.92
says        3.86
bird        3.44
NUM         3.41
mother      3.38
we          3.37
i           3.25
lions       3.22
ivory       3.16
pedestal    3.05
must        3.03
mine        3.01
him         2.92
sculptor    2.85
story       2.74
across      2.58
statue      2.56
Name: Swirsky, dtype: float64

rose          5.39
lion          5.36
lions         4.38
PROPN         4.12
d             3.58
big           3.54
machine       3.53
sea           3.44
dormouse      3.42
woman         3.39
difference    3.34
inside        3.32
him           3.22
dreambody     2.98
means         2.86
jewel         2.86
smallgod      2.85
husband       2.84
black         2.83
mother        2.69
Name: Valente, dtype: float64