# Text Analysis
Analyzes texts from seemingly-similar authors, Rachel Swirsky, Cat Valente, and Carmen Maria Machado, on a sentence-by-sentence basis, to see if their writing styles can be predicted.

In [103]:
import numpy as np
import pandas as pd
import spacy
import re
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from time import time
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
start_time = time()

## Scraping Data

### Requesting and parsing html
* Using urllib and BeautifulSoup
* The program first pulls stories directly from their published websites

In [104]:
# Pull stories from their websites

header={'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
        AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}

def get_raw(url, no_strong=False, no_b=False):
    req=Request(url, headers=header)
    html = urlopen(req).read().decode('utf8')
    soup = BeautifulSoup(html, 'lxml')
    if no_strong == True:
        for strong in soup('strong'):
            strong.decompose()
    if no_b == True:
        for b in soup('b'):
            b.decompose()
    return soup.get_text()

t0 = time()
# Swirsky stories
dino_raw = get_raw('https://www.apex-magazine.com/if-you-were-a-dinosaur-my-love/')

eros_raw = get_raw('https://www.tor.com/2009/03/03/eros-philia-agape/')

still_raw = get_raw('http://uncannymagazine.com/article/love-is-never-still/', no_b=True)

# Valente stories
fade_raw = get_raw('http://clarkesworldmagazine.com/valente_08_12/')

silent1_raw = get_raw('http://clarkesworldmagazine.com/valente_10_11/', no_strong=True)

silent2_raw = get_raw('http://clarkesworldmagazine.com/valente_11_11/', no_strong=True)

silent3_raw = get_raw('http://clarkesworldmagazine.com/valente_12_11/', no_strong=True)

lion_raw = get_raw('http://uncannymagazine.com/article/planet-lion/')

# Machado stories
stitch_raw = get_raw('https://granta.com/The-Husband-Stitch/')

follow_raw = get_raw('http://www.lightspeedmagazine.com/fiction/help-follow-sister-land-dead/')

descent_raw = get_raw('http://www.nightmare-magazine.com/fiction/descent/')
print('Get raw in {:.2f} seconds'.format(time() - t0))

### Finding story text in html
* Using find() from BeautifulSoup
* Identifies story text inside the total site text

In [106]:
# Select out story text

def select_out(raw, start_text, end_text):
    start = raw.find(start_text)
    end = raw.find(end_text)
    return raw[start:end+len(end_text)]

dinosaur = select_out(dino_raw, 
                      'If you were a dinosaur, my', 
                      ', and the stuttering of my broken heart.'
                     )
eros = select_out(eros_raw, 
                  'Lucian packed his possessions before he left.', 
                  'yet form the thoughts to wonder what will happen next.\nHe moves on.'
                 )
still = select_out(still_raw, 
                  'Through every moment of carving, I want her as one wants a woman. I want', 
                  'and decay, but it is also the early bloom that opens into winter’s cold. She is love, and she will always be reborn.'
                  )
fade = select_out(fade_raw, 
                 'ZOOM IN on a bright-eyed Betty in a crisp green dress,', 
                 'And over the black, a cheerful fat man giving the thumbs up to Sylvie, grinning:\n\nBuy Freedom Brand Film! It\'s A-OK!'
                 )
silent1 = select_out(silent1_raw, 
                    'Inanna was called Queen of Heaven and Earth,', 
                    'The sun breaks the mountain crests, hard and cold, a shaft of white spilling over the black lake.'
                    )
silent2 = select_out(silent2_raw, 
                    'Humanity lived many years and ruled the earth,', 
                    'The castle windows go dark, one by one.'
                    )
silent3 = select_out(silent3_raw, 
                    'Tell me a story about yourself, Elefsis.', 
                    'we walk up the long path out of the churning, honey-colored sea.'
                    )
silent = silent1 + '\n' + silent2 + '\n' + silent3

lion = select_out(lion_raw, 
                 'Initial Survey Report: Planet 6MQ441 (Bakeneko), Alaraph', 
                 'Szent Istvan. She longs to hear the first roar of her young.'
                 )
stitch = select_out(stitch_raw, 
                   '(If you read this story out loud, please use', 
                   'backwards off my neck and rolls off the bed, I feel as lonely as I have ever been.'
                   )
follow = select_out(follow_raw, 
                   'Help Me Follow My Sister\ninto the Land of the Dead\nby Ursula Ruiz', 
                   's—in a terrifying place where no mortal has any business treading.'
                   ) + '\n' + select_out(follow_raw, 
                                        'Home\nThis is the thing about my sister and I:', 
                                        'Lucille L\nJun 28, 2015\nUrsula?'
                                        )
descent = select_out(descent_raw, 
                    'We gathered for the last time in October,', 
                    ', drinks cocked in their hands.\nI looked down.'
                    )

## Parsing Text

### Cleaning 
* Cleaning out un-parseable symbols
* Using re

In [108]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub(r'[\[\]]', '', text)
    text = ' '.join(text.split())
    return text

In [109]:
# Clean out bad symbols
dinosaur = text_cleaner(dinosaur)
eros = text_cleaner(eros)
fade = text_cleaner(fade)
silent = text_cleaner(silent)
lion = text_cleaner(lion)
still = text_cleaner(still)
stitch = text_cleaner(stitch)
follow = text_cleaner(follow)
descent = text_cleaner(descent)

### Parsing documents
* Using spaCy
* Identifies sentence structure in the stories

In [110]:
t0 = time()
# Parse using spaCy
nlp = spacy.load('en')

dinosaur_doc = nlp(dinosaur)
eros_doc = nlp(eros)
fade_doc = nlp(fade)
silent_doc = nlp(silent)
lion_doc = nlp(lion)
still_doc = nlp(still)
stitch_doc = nlp(stitch)
follow_doc = nlp(follow)
descent_doc = nlp(descent)
print('Parsed in {:.2f} seconds'.format(time() - t0))

Parsed in 22.06 seconds


In [111]:
# print([sent.lemma_ for sent in eros_doc.sents])

### Grouping data into sentences
* We undo spaCy's tokenization, just keeping the data split into sentences
* Creates one table of all the data, organized with one sentence for each line

In [112]:
#Group into sentences. (Each sentence will be a row in pre-model data.)
dinosaur_sents = [[sent.string, "Swirsky"] for sent in dinosaur_doc.sents]
eros_sents = [[sent.string, "Swirsky"] for sent in eros_doc.sents]
still_sents = [[sent.string, "Swirsky"] for sent in still_doc.sents]
fade_sents = [[sent.string, "Valente"] for sent in fade_doc.sents]
silent_sents = [[sent.string, "Valente"] for sent in silent_doc.sents]
lion_sents = [[sent.string, "Valente"] for sent in lion_doc.sents]
stitch_sents = [[sent.string, "Machado"] for sent in stitch_doc.sents]
follow_sents = [[sent.string, "Machado"] for sent in follow_doc.sents]
descent_sents = [[sent.string, "Machado"] for sent in descent_doc.sents]

In [147]:
# Combine the sentences from the stories into one data frame.
sentences = pd.DataFrame(dinosaur_sents + eros_sents + still_sents + fade_sents + silent_sents + lion_sents + stitch_sents + follow_sents + descent_sents)
sentences.columns = ['text', 'author']
print('Full Data Set:')
display(sentences.author.value_counts().to_frame(name='count'))
print('{} total sentences'.format(len(sentences.author)))

Full Data Set:


Unnamed: 0,count
Valente,2333
Swirsky,2153
Machado,1336


5822 total sentences


## Splitting Test Set
This protects the vectorizer and regressor from over-fitting to the training set.

In [130]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(sentences['text'], sentences['author'], test_size=0.1)

In [149]:
print('Training Set:')
print(y_train.value_counts().to_frame(name='count'))
print('\nTest Set:')
print(y_test.value_counts().to_frame(name='count'))

Training Set:
         count
Valente   2096
Swirsky   1941
Machado   1202

Test Set:
         count
Valente    237
Swirsky    212
Machado    134


## Bag of Words

### TFIDF Vectorizer

* Try most common words across all texts
* Bigrams and trigrams are arrangements of two- and three-words, respectively

In [132]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    analyzer=str.split,
    max_features=10000,
    ngram_range=(1,1),
    max_df=0.5,
    min_df=2,
    stop_words='english', 
    lowercase=True,
    use_idf=True,
    norm='l2',
    smooth_idf=True)

In [133]:
t0 = time()
bag_train = vectorizer.fit_transform(X_train)
bag_test = vectorizer.transform(X_test)
X_train = pd.DataFrame(bag_train.toarray(), columns=vectorizer.get_feature_names())
X_test = pd.DataFrame(bag_test.toarray(), columns=vectorizer.get_feature_names())
print('{} sentences and {} features vectorized in {:.2f} seconds'.format(X_train.shape[0], X_train.shape[1], time() - t0))

5239 sentences and 3377 features vectorized in 0.38 seconds


## Model Training

### Fitting
* Using sklearn's logistic regression, with L2 normalization
* This creates a formula to calculate the probability of each sentence being by each author

In [151]:
t0 = time()
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    penalty='l2',
    class_weight='balanced',
    solver='liblinear',
    multi_class='ovr',
    C=1.0)

model.fit(X_train, y_train)

feature_names = X_train.columns
feature_coefficients = pd.DataFrame(model.coef_.T, X_train.columns, model.classes_)

display(feature_coefficients.sort_values('Swirsky', ascending=False).head())

print('Fit model in {:.2f} seconds'.format(time() - t0))

Unnamed: 0,Machado,Swirsky,Valente
adriana,-4.037911,5.906727,-3.919323
lucian,-3.193048,4.906562,-3.293061
rose,-3.380132,4.47083,-2.65575
galatea,-1.705214,2.831347,-1.780435
ares,-1.677103,2.699812,-1.580654


Fit model in 0.12 seconds


In [121]:
print(vectorizer.get_feature_names())

['229 is', '26 2015', '27 2015', '6mq441 bakeneko', 'abandoned my', 'ability to', 'able to', 'about days', 'about fifteen', 'about girl', 'about her', 'about him', 'about hours', 'about it', 'about me', 'about my', 'about ravan', 'about the', 'about this', 'about today', 'about woman', 'about yourself', 'according to', 'accustomed to', 'ache and', 'across her', 'across his', 'across my', 'across the', 'act or', 'adrenaline and', 'adriana and', 'adriana approached', 'adriana as', 'adriana could', 'adriana couldn', 'adriana father', 'adriana felt', 'adriana gave', 'adriana grabbed', 'adriana had', 'adriana hand', 'adriana held', 'adriana house', 'adriana looked', 'adriana sat', 'adriana sisters', 'adriana stood', 'adriana the', 'adriana took', 'adriana was', 'adriana went', 'adriana will', 'adriana with', 'afraid of', 'afraid to', 'after all', 'after class', 'after her', 'after his', 'after it', 'after my', 'after that', 'after the', 'after we', 'again and', 'again in', 'again the', 'aga




### Validation
* Splits model into five parts, fitting the model to four-fifths of the data and testing it on the remaining one-fifth each time
* Evaluates model for over-fitting to noise in the input data

In [122]:
t0 = time()

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix

train_score = model.score(X_train, y_train)

y_train_pred = model.predict(X_train)

cv_score = cross_val_score(model, X_train, y_train, cv=5)

y_train_cv_pred = cross_val_predict(model, X_train, y_train, cv=5)

cm = confusion_matrix(y_train, y_train_pred)
cv_cm = confusion_matrix(y_train, y_train_cv_pred)

print('Validated in {:.2f} seconds'.format(time() - t0))

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Validated in 3.22 seconds


In [123]:
plusminus = u"\u00B1"

print('Training set score: {:.1%}'.format(train_score))
print('\nCross validation scores: {:.1%} {} {:.1%}'.format(cv_score.mean(), plusminus, cv_score.std()*2))

df_cm = pd.DataFrame(cm, index=model.classes_, columns=model.classes_)
df_cm.index.name = 'Actual Author'
df_cm.columns.name = 'Predicted Author'

df_cv_cm = pd.DataFrame(cv_cm, index=model.classes_, columns=model.classes_)
df_cv_cm.index.name = 'Actual Author'
df_cv_cm.columns.name = 'Predicted Author'

print('Training Confusion Matrix')
display(df_cm)

print('Cross Validation Confusion Matrix')
display(df_cv_cm)

Training set score: 83.8%

Cross validation scores: 62.4% ± 4.3%
Training Confusion Matrix


Predicted Author,Machado,Swirsky,Valente
Actual Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Machado,835,323,52
Swirsky,39,1826,73
Valente,39,322,1730


Cross Validation Confusion Matrix


Predicted Author,Machado,Swirsky,Valente
Actual Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Machado,477,508,225
Swirsky,162,1460,316
Valente,146,613,1332


## Prediction
### Test sentences
Finally uses the vectorized sentences from the test set.

In [None]:
t0 = time()
test_score = model.score(X_test, y_test)
y_test_pred = model.predict(X_test)
test_cm = confusion_matrix(y_test, y_test_pred)
print('Predicted in {:.2f} seconds'.format(time() - t0))

In [None]:
print('Test set score: {:.1%}'.format(test_score))

df_test_cm = pd.DataFrame(test_cm, index=model.classes_, columns=model.classes_)
df_test_cm.index.name = 'Actual Author'
df_test_cm.columns.name = 'Predicted Author'
display(df_test_cm)

In [None]:
print('Total time: {:.2f} seconds'.format(time() - start_time))

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# model = RandomForestClassifier(
#     max_depth=10,
#     min_samples_split=50,
#     n_estimators=1000,
#     verbose=1, 
#     n_jobs=-1)

# model.fit(X_train, y_train)

# feature_names = X_train.columns

# forest_importances = pd.DataFrame(model.feature_importances_, feature_names, columns=["Importances"])

# display(forest_importances.sort_values('Importances', ascending=False).head(20))

# print('{:.2f} seconds'.format(time() - start_time))