# Text Analysis
Analyzes texts from seemingly-similar authors, Rachel Swirsky, Cat Valente, and Carmen Maria Machado, on a sentence-by-sentence basis, to see if their writing styles can be predicted.

In [598]:
import numpy as np
import pandas as pd
import spacy
import re
from urllib.request import Request, urlopen
import requests
from bs4 import BeautifulSoup
from time import time
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 100)
%matplotlib inline

## Scraping Data

### Requesting and parsing html
* Using requests and BeautifulSoup
* The program first pulls stories directly from their published websites

In [599]:
# Pull stories from their websites

header={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}

def get_raw(url, no_strong=False, no_b=False):
    response = requests.get(url, headers=header)
#     req = Request(url, headers=header)
    html = response.content
#     html = urlopen(req).read().decode('utf8')
    soup = BeautifulSoup(html, 'lxml')
    if no_strong == True:
        for strong in soup('strong'):
            strong.decompose()
    if no_b == True:
        for b in soup('b'):
            b.decompose()
    return soup.get_text()

t0 = time()

# Swirsky stories
dino_raw = get_raw('https://www.apex-magazine.com/if-you-were-a-dinosaur-my-love/')

eros_raw = get_raw('https://www.tor.com/2009/03/03/eros-philia-agape/')

still_raw = get_raw('http://uncannymagazine.com/article/love-is-never-still/', no_b=True)

# Valente stories
fade_raw = get_raw('http://clarkesworldmagazine.com/valente_08_12/')

silent1_raw = get_raw('http://clarkesworldmagazine.com/valente_10_11/', no_strong=True)

silent2_raw = get_raw('http://clarkesworldmagazine.com/valente_11_11/', no_strong=True)

silent3_raw = get_raw('http://clarkesworldmagazine.com/valente_12_11/', no_strong=True)

lion_raw = get_raw('http://uncannymagazine.com/article/planet-lion/')

# Machado stories
stitch_raw = get_raw('https://granta.com/The-Husband-Stitch/')

follow_raw = get_raw('http://www.lightspeedmagazine.com/fiction/help-follow-sister-land-dead/')

descent_raw = get_raw('http://www.nightmare-magazine.com/fiction/descent/')

bites_raw = get_raw('http://texas.gulfcoastmag.org/journal/29.2-summer/fall-2017/eight-bites/')

print('Get raw in {:.2f} seconds'.format(time() - t0))

Get raw in 11.22 seconds


### Finding story text in html
* Using find() from BeautifulSoup
* Identifies story text inside the total site text

In [600]:
# Select out story text

def select_out(raw, start_text, end_text):
    start = raw.find(start_text)
    end = raw.find(end_text)
    return raw[start:end+len(end_text)]

dinosaur = select_out(dino_raw, 
                      'If you were a dinosaur, my', 
                      ', and the stuttering of my broken heart.'
                     )
eros = select_out(eros_raw, 
                  'Lucian packed his possessions before he left.', 
                  'yet form the thoughts to wonder what will happen next.\nHe moves on.'
                 )
still = select_out(still_raw, 
                  'Through every moment of carving, I want her as one wants a woman. I want', 
                  'and decay, but it is also the early bloom that opens into winter’s cold. She is love, and she will always be reborn.'
                  )
fade = select_out(fade_raw, 
                 'ZOOM IN on a bright-eyed Betty in a crisp green dress,', 
                 'And over the black, a cheerful fat man giving the thumbs up to Sylvie, grinning:\n\nBuy Freedom Brand Film! It\'s A-OK!'
                 )
silent1 = select_out(silent1_raw, 
                    'Inanna was called Queen of Heaven and Earth,', 
                    'The sun breaks the mountain crests, hard and cold, a shaft of white spilling over the black lake.'
                    )
silent2 = select_out(silent2_raw, 
                    'Humanity lived many years and ruled the earth,', 
                    'The castle windows go dark, one by one.'
                    )
silent3 = select_out(silent3_raw, 
                    'Tell me a story about yourself, Elefsis.', 
                    'we walk up the long path out of the churning, honey-colored sea.'
                    )
silent = silent1 + '\n' + silent2 + '\n' + silent3

lion = select_out(lion_raw, 
                 'Initial Survey Report: Planet 6MQ441 (Bakeneko), Alaraph', 
                 'Szent Istvan. She longs to hear the first roar of her young.'
                 )
stitch = select_out(stitch_raw, 
                   '(If you read this story out loud, please use', 
                   'backwards off my neck and rolls off the bed, I feel as lonely as I have ever been.'
                   )
follow = select_out(follow_raw, 
                   'Help Me Follow My Sister\ninto the Land of the Dead\nby Ursula Ruiz', 
                   's—in a terrifying place where no mortal has any business treading.'
                   ) + '\n' + select_out(follow_raw, 
                                        'Home\nThis is the thing about my sister and I:', 
                                        'Lucille L\nJun 28, 2015\nUrsula?'
                                        )
descent = select_out(descent_raw, 
                    'We gathered for the last time in October,', 
                    ', drinks cocked in their hands.\nI looked down.'
                    )
bites = select_out(bites_raw, 
                  'As they put me to sleep, my mouth fills with the dust of the moon. I expect to choke on the silt but', 
                  'my charge.\"I\'m sorry,\" I will whisper into her as she walks me toward the front door.\"I\'m sorry,\" I will repeat. \"I didn\'t know.\"')

## Parsing Text

### Cleaning 
* Cleaning out un-parseable symbols
* Using re

In [601]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub(r'[\[\]]', '', text)
    text = ' '.join(text.split())
    return text

In [602]:
# Clean out bad symbols
dinosaur = text_cleaner(dinosaur)
eros = text_cleaner(eros)
fade = text_cleaner(fade)
silent = text_cleaner(silent)
lion = text_cleaner(lion)
still = text_cleaner(still)
stitch = text_cleaner(stitch)
follow = text_cleaner(follow)
descent = text_cleaner(descent)
bites = text_cleaner(bites)

### Parsing documents
* Using spaCy
* Identifies sentence structure in the stories

In [603]:
t0 = time()

# Parse using spaCy
nlp = spacy.load('en')

dinosaur_doc = nlp(dinosaur)
eros_doc = nlp(eros)
fade_doc = nlp(fade)
silent_doc = nlp(silent)
lion_doc = nlp(lion)
still_doc = nlp(still)
stitch_doc = nlp(stitch)
follow_doc = nlp(follow)
descent_doc = nlp(descent)
bites_doc = nlp(bites)

print('Parsed in {:.2f} seconds'.format(time() - t0))

Parsed in 24.14 seconds


In [604]:
# print([sent.lemma_ for sent in eros_doc.sents])

### Grouping data into sentences
* We undo spaCy's tokenization, just keeping the data split into sentences
* Creates one table of all the data, organized with one sentence for each line

In [860]:
# Group sentences with their authors. (Each sentence will be a row.)

dinosaur_sents = [[sent.string, "Swirsky"] for sent in dinosaur_doc.sents]
eros_sents = [[sent.string, "Swirsky"] for sent in eros_doc.sents]
still_sents = [[sent.string, "Swirsky"] for sent in still_doc.sents]
fade_sents = [[sent.string, "Valente"] for sent in fade_doc.sents]
silent_sents = [[sent.string, "Valente"] for sent in silent_doc.sents]
lion_sents = [[sent.string, "Valente"] for sent in lion_doc.sents]
stitch_sents = [[sent.string, "Machado"] for sent in stitch_doc.sents]
follow_sents = [[sent.string, "Machado"] for sent in follow_doc.sents]
descent_sents = [[sent.string, "Machado"] for sent in descent_doc.sents]
bites_sents = [[sent.string, "Machado"] for sent in bites_doc.sents]

In [871]:
# Combine the sentences from the stories into one data frame.
sentences = pd.DataFrame(dinosaur_sents + eros_sents + still_sents + fade_sents + silent_sents + lion_sents + stitch_sents + follow_sents + descent_sents + bites_sents)
sentences.columns = ['text', 'author']

# Strip out unnecessary punctuation.
sentences['text'] = sentences['text'].str.lower().str.replace(r'[^a-zA-Z0-9 ]+', ' ').fillna('')

# Remove proper nouns personal to each document.
pron_pat = r'\b(agogna|koetoi|adriana|lucian|galatea|aphrodite|hephaestus|fuoco|inanna|enki|erishkigal|salma|janet|lawrence|olive|luna|ursula|ceno|neva|elefsis|ravan|cassian|diane|ares|ben|seki|ilet)\b'
sentences['text'] = sentences['text'].str.replace(pron_pat, '')

# Sentence counts by author.
df_count = sentences.author.value_counts().to_frame(name='count')
df_count['ratio'] = (df_count['count'] / len(sentences.author)).round(2)
print('Full Data Set:')
display(df_count)
print('{} total sentences'.format(len(sentences.author)))
display(sentences.loc[sentences['author'] == 'Swirsky'].head(1))
display(sentences.loc[sentences['author'] == 'Valente'].head(1))
display(sentences.loc[sentences['author'] == 'Machado'].head(1))

Full Data Set:


Unnamed: 0,count,ratio
Valente,2333,0.37
Swirsky,2153,0.34
Machado,1778,0.28


6264 total sentences


Unnamed: 0,text,author
0,if you were a dinosaur my love then you woul...,Swirsky


Unnamed: 0,text,author
2153,was called queen of heaven and earth queen o...,Valente


Unnamed: 0,text,author
4486,if you read this story out loud please use t...,Machado


## Splitting Test Set
This protects the vectorizer and regressor from over-fitting to the training set.

In [862]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(sentences.drop(['author'], axis=1), sentences['author'], test_size=0.2)

In [863]:
print('Training Set:')
print(y_train.value_counts().to_frame(name='count'))
print('\nTest Set:')
print(y_test.value_counts().to_frame(name='count'))

Training Set:
         count
Valente   1865
Swirsky   1720
Machado   1426

Test Set:
         count
Valente    468
Swirsky    433
Machado    352


## Pipeline: Vectorizer and Estimators

### TFIDF Vectorizer

* Try most common words across all texts
* Bigrams and trigrams are arrangements of two- and three-words, respectively

In [864]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer=str.split, 
                            stop_words='english', 
                            lowercase=True, 
                            use_idf=True, 
                            norm='l2', 
                            smooth_idf=True, 
                            max_features=None, 
                            ngram_range=(1,3), 
                            max_df=0.6, 
                            min_df=1)

### Stochastic Gradient Descent Classifier
* Combines support vector classifier with gradient descent

In [867]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(penalty='elasticnet', 
                    alpha=0.0001, 
                    class_weight=None, 
                    loss='log', 
                    tol=0.005, 
                    fit_intercept=True, 
                    power_t=0.5)

### Pipeline
* Chains vectorizer and estimator as input to grid search algorithm

In [868]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([('vect', vectorizer), ('model', sgd)])

## Grid Search
### Parameters
* List parameters to search over for best cv fit

In [869]:
vect_features = [None, 20000, 10000, 5000, 1000]
vect_ngrams = [(1,1), (1,2), (1,3)]
vect_max = [0.6, 0.7, 0.8]
vect_min = [1, 2, 3]
vect_norm = ['l2', 'l1', None]
lsa_n = [1000, 500, 100]
model_alpha = [0.001, 0.0001, 0.00001]
model_loss = ['hinge', 'log', 'squared_hinge', 'perceptron']
model_tol = [0.01, 0.005, 0.001]
model_class_weight = ['balanced', None]
model_power_t = [0.4, 0.5, 0.6]
model_penalty = ['l2', 'elasticnet', 'l1', 'none']
model_intercept = [True, False]

parameters = {
        'vect__max_features': vect_features, 
        'vect__ngram_range': vect_ngrams, 
#         'vect__max_df': vect_max, 
#         'vect__min_df': vect_min, 
#         'vect__norm': vect_norm, 
#         'lsa__n_components': lsa_n, 
#         'model__alpha': model_alpha, 
#         'model__loss': model_loss, 
#         'model__tol': model_tol,
#         'model__class_weight': model_class_weight, 
#         'model__power_t': model_power_t, 
#         'model__penalty': model_penalty, 
#         'model__fit_intercept': model_intercept
    }

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, 
                    parameters, 
                    scoring='f1_weighted', 
                    return_train_score=True)

### Fitting and Results

In [870]:
print('Performing grid search...')
t0 = time()
grid.fit(X_train, y_train)
print('\nCompleted in {:.2f} seconds'.format(time() - t0))

Performing grid search...


ValueError: Found input variables with inconsistent numbers of samples: [2, 3339]

In [None]:
print('\nBest Training Set Score: {:.2%}'.format(grid.cv_results_['mean_train_score'][grid.best_index_]))
print('\nBest Cross-Val Score: {:.2%}'.format(grid.best_score_))

print('\nBest Parameters:\n')
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('{}: {}'.format(param_name, best_parameters[param_name]))

print('\n{} features'.format(len(feature_list)))

from sklearn.metrics import confusion_matrix

test_score = grid.score(X_test, y_test)
y_test_pred = grid.predict(X_test)
test_cm = confusion_matrix(y_test, y_test_pred)

print('\nTest set score: {:.1%}'.format(test_score))

df_test_cm = pd.DataFrame(test_cm, index=grid.classes_, columns=grid.classes_)
df_test_cm.index.name = 'Actual Author'
df_test_cm.columns.name = 'Predicted Author'
display(df_test_cm)

feature_list = grid.best_estimator_.named_steps['vect'].get_feature_names()
feature_coefficients = pd.DataFrame(grid.best_estimator_.steps[-1][1].coef_, columns=feature_list, index=grid.classes_)
# columns=feature_list, 
display(feature_coefficients.sort_values('Machado', ascending=False, axis=1).head(10))
display(feature_coefficients.sort_values('Swirsky', ascending=False, axis=1).head(10))
display(feature_coefficients.sort_values('Valente', ascending=False, axis=1).head(10))

In [None]:
cv_results = pd.DataFrame()
cv_results['mean_test_score'] = (grid.cv_results_['mean_test_score'] * 100).round(2)
cv_results['std_test_score'] = (grid.cv_results_['std_test_score'] * 100).round(2)
cv_results['mean_train_score'] = (grid.cv_results_['mean_train_score'] * 100).round(2)
cv_results['std_train_score'] = (grid.cv_results_['std_train_score'] * 100).round(2)
cv_results['mean_fit_time'] = grid.cv_results_['mean_fit_time'].round(2)
cv_results = cv_results.join(pd.DataFrame(grid.cv_results_).filter(like='param_'))
cv_results['split0_test_score'] = (grid.cv_results_['split0_test_score'] * 100).round(2)
cv_results['split1_test_score'] = (grid.cv_results_['split1_test_score'] * 100).round(2)
cv_results['split2_test_score'] = (grid.cv_results_['split2_test_score'] * 100).round(2)
cv_results['split0_train_score'] = (grid.cv_results_['split0_train_score'] * 100).round(2)
cv_results['split1_train_score'] = (grid.cv_results_['split1_train_score'] * 100).round(2)
cv_results['split2_train_score'] = (grid.cv_results_['split2_train_score'] * 100).round(2)
cv_results.sort_values('std_test_score', ascending=True, axis=0)