In [1]:
from stylometry import *
import pymongo
import time
import multiprocessing as mp
import pandas as pd
from pandas.io.json import json_normalize
import warnings
warnings.filterwarnings('ignore')

import spacy
#import en_core_web_lg
import en_core_web_md

In [2]:
## Connect to the MongoDB server on localhost (the virtual machine) and take
## handles to the Gutenberg database and the collection used below.
client = pymongo.MongoClient('mongodb://localhost/')
db = client.get_database('gutenberg_db')
collection = db.get_collection('gutenberg_collection')

In [3]:
# Display the collection handle to confirm which host, database and collection are selected.
collection

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'gutenberg_db'), 'gutenberg_collection')

In [5]:
# Fetch only the 'text' field of one sample book; the cells below use it to
# demonstrate each metric. find_one returns None when nothing matches, so the
# missing-document case is reported explicitly instead of failing on a cursor index.
sample_file = 'project_gutenberg/51185.txt'
sample_record = collection.find_one({'file': sample_file}, {'text': 1})
if sample_record is None:
    raise LookupError(f'No document with file == {sample_file!r} in the collection')
text = sample_record['text']

In [6]:
# Parse the sample text once with spaCy and derive the two views that the
# metric cells below consume.
# NOTE(review): `nlp` is not defined in this notebook; presumably it is
# provided by `from stylometry import *` -- confirm.
start_time = time.perf_counter()  # monotonic clock, suited to measuring durations
doc = nlp(text)
# Lower-cased tokens with punctuation and stop words removed. The original
# comprehension also carried a stray `and token` clause, which only tested
# that the token was non-empty.
tokens = [token.lower_ for token in doc if not (token.is_punct or token.is_stop)]
# Span.text is available on spaCy 2 and 3 (Span.string was removed in v3);
# after strip() the resulting strings are the same.
sentences = [sent.text.strip() for sent in doc.sents]
print(f'Took {time.perf_counter() - start_time}s to process')

Took 1.6460483074188232s to process


In [7]:
# Input is `tokens`: lower-cased, with punctuation and stop words already removed.
sttr(tokens)

0.627

In [8]:
# Computed on the filtered `tokens` list (no punctuation, no stop words), not on the raw text.
hapax_legomenon(tokens)

0.486

In [9]:
# Takes the filtered `tokens` list, so stop-word frequencies do not enter the result.
yules_k(tokens)

1703.7019456860444

In [10]:
# Takes the full spaCy `doc`, so stop words and punctuation are still present in its input.
function_words(doc)

0.32435685271551074

In [11]:
# `sentences` holds the whitespace-stripped sentence strings taken from doc.sents.
avg_sentence_length_word(sentences)

8.565289256198348

In [12]:
# Operates on the stripped sentence strings in `sentences`, not on the spaCy doc.
avg_sentence_length_chars(sentences)

49.239669421487605

In [13]:
# Receives the filtered `tokens` list, so stop words do not contribute to the result.
avg_syllables_per_word(tokens)

1.7941299790356393

In [14]:
# Receives the sentence strings in `sentences`, which still contain their punctuation characters.
punctuation_sentence(sentences)

2.6743801652892563

In [15]:
# Computed over the filtered, lower-cased `tokens` list.
shannon_entropy(tokens)

9.51165465635602

In [16]:
# Same input as the other lexical metrics: `tokens` without punctuation or stop words.
simpsons_d(tokens)

0.9968982595359701

In [17]:
# Takes the parsed spaCy `doc` rather than the filtered token list.
average_nps(doc)

221.16666666666666

In [18]:
# Applied to the parsed `doc`; presumably relies on spaCy part-of-speech tags -- confirm in stylometry.
noun_to_verb(doc)

0.586852207293666

In [19]:
# Applied to the parsed `doc`; presumably relies on spaCy part-of-speech tags -- confirm in stylometry.
noun_to_adj(doc)

0.8235690235690236

In [20]:
# Applied to the parsed `doc`; presumably relies on spaCy part-of-speech tags -- confirm in stylometry.
verb_to_adv(doc)

0.6926790024135157

In [21]:
# Applied to the parsed `doc`; presumably relies on spaCy's dependency parse -- confirm in stylometry.
avg_dependency_distance(doc)

2.0988422371735185

In [22]:
# Run the whole pipeline for one file path. The call prints its own timing
# line, and the cell displays no return value.
get_style_metrics('project_gutenberg/51185.txt')

Took 0.8566100597381592s to process project_gutenberg/51185.txt


### Mongo Upload

In [32]:
# Gather the 'file' path of every document in the collection; the projection
# asks Mongo for the 'file' field only.
filelist = [
    record['file']
    for record in collection.find({}, {'file': 1})
]
print(len(filelist))

23294


In [7]:
# Narrow the work list to files whose yules_k value is still missing.
# NOTE: `df` is built by the cell that flattens the Mongo documents, which
# appears later in this notebook -- that cell must have been run first.
filelist = df.loc[df['yules_k'].isnull(), 'file'].tolist()
print(len(filelist))

4898


In [None]:
# Run get_style_metrics on the work list one file at a time.
# NOTE(review): the [1:] slice skips filelist[0]; nothing in this notebook
# explains why -- confirm that it is intentional.
failed_files = []
for file in filelist[1:]:
    try:
        get_style_metrics(file)
    except (ZeroDivisionError, ValueError) as e:
        # One bad text should not stop the batch, but keep a record of what
        # was skipped and why instead of dropping it silently.
        failed_files.append((file, repr(e)))
print(f'Skipped {len(failed_files)} file(s); details are in failed_files')

Took 21.433281421661377s to process project_gutenberg/42386.txt
Took 10.250550270080566s to process project_gutenberg/37528.txt
Took 1.3608670234680176s to process project_gutenberg/40204.txt
Took 1.9739973545074463s to process project_gutenberg/42383.txt
Took 26.571258544921875s to process project_gutenberg/37523.txt
Took 10.340190649032593s to process project_gutenberg/40203.txt
Took 0.35340046882629395s to process project_gutenberg/37521.txt
Took 3.9193222522735596s to process project_gutenberg/40201-0.txt
Took 15.039881706237793s to process project_gutenberg/44900.txt
Took 0.867194414138794s to process project_gutenberg/40200.txt
Took 19.6598002910614s to process project_gutenberg/37519.txt
Took 9.821786880493164s to process project_gutenberg/4020.txt
Took 111.7565770149231s to process project_gutenberg/44851.txt
Took 5.7292160987854s to process project_gutenberg/37467.txt
Took 1.582223653793335s to process project_gutenberg/44850.txt
Took 29.69178032875061s to process project_gute

Took 11.031864643096924s to process project_gutenberg/44724.txt
Took 0.5215117931365967s to process project_gutenberg/37356.txt
Took 5.059277057647705s to process project_gutenberg/4004.txt
Took 4.93714451789856s to process project_gutenberg/37355.txt
Took 9.241450309753418s to process project_gutenberg/40038.txt
Took 42.14813303947449s to process project_gutenberg/44721.txt
Took 1.7069156169891357s to process project_gutenberg/40037.txt
Took 4.4872612953186035s to process project_gutenberg/37353-0.txt
Took 2.979051113128662s to process project_gutenberg/44720.txt
Took 3.60111141204834s to process project_gutenberg/4472.txt
Took 1.619999647140503s to process project_gutenberg/42188.txt
Took 3.1327617168426514s to process project_gutenberg/42187.txt
Took 4.584988832473755s to process project_gutenberg/40035.txt
Took 8.045928716659546s to process project_gutenberg/44716.txt
Took 1.4230000972747803s to process project_gutenberg/42185.txt
Took 13.141314029693604s to process project_gutenbe

Took 1.7125282287597656s to process project_gutenberg/42118.txt
Took 13.865393877029419s to process project_gutenberg/42117.txt
Took 14.160008668899536s to process project_gutenberg/44632.txt
Took 11.019001960754395s to process project_gutenberg/3728.txt
Took 4.695101499557495s to process project_gutenberg/44631.txt
Took 16.27394413948059s to process project_gutenberg/42115.txt
Took 1.532393217086792s to process project_gutenberg/37278.txt
Took 20.46473741531372s to process project_gutenberg/39969.txt
Took 1.6091506481170654s to process project_gutenberg/42114.txt
Took 4.248319149017334s to process project_gutenberg/39968.txt
Took 2.6902108192443848s to process project_gutenberg/37276.txt
Took 13.591778755187988s to process project_gutenberg/44628.txt
Took 16.48083209991455s to process project_gutenberg/42112.txt
Took 19.55794858932495s to process project_gutenberg/37274.txt
Took 5.694271564483643s to process project_gutenberg/42111.txt
Took 8.260510921478271s to process project_gutenb

In [27]:
## Run get_style_metrics over filelist in parallel, one worker process per CPU.
# NOTE(review): unlike the sequential loop in this notebook, nothing here
# catches ZeroDivisionError/ValueError, so an exception raised for any file
# propagates out of map().
n_cpu = mp.cpu_count()
with mp.Pool(processes=n_cpu) as pool:
    pool.map(get_style_metrics, filelist)

In [4]:
# Load every document from Mongo without its _id and text fields, then
# flatten the nested 'metrics' sub-document into one column per metric.
datapoints = list(collection.find({}, {'_id': 0, 'text': 0}))
# json_normalize already returns a DataFrame, so it is not wrapped in pd.DataFrame().
df = json_normalize(datapoints)
# Flattening names the nested fields 'metrics.<name>'; drop that prefix for
# each known metric so the columns carry the bare metric name.
metric_names = [
    'sttr', 'hapax_legomenon', 'yules_k', 'function_words',
    'avg_sentence_length_word', 'avg_sentence_length_chars',
    'avg_syllables_per_word', 'punctuation_sentence', 'shannon_entropy',
    'simpsons_d', 'average_nps', 'noun_to_verb', 'noun_to_adj',
    'verb_to_adv', 'avg_dependency_distance',
]
df = df.rename(columns={f'metrics.{name}': name for name in metric_names})
print(df.shape)

(23294, 19)


In [5]:
# Preview the first ten rows of the flattened metrics table.
df.head(10)

Unnamed: 0,file,author,title,year,sttr,hapax_legomenon,yules_k,function_words,avg_sentence_length_word,avg_sentence_length_chars,avg_syllables_per_word,punctuation_sentence,shannon_entropy,simpsons_d,average_nps,noun_to_verb,noun_to_adj,verb_to_adv,avg_dependency_distance
0,project_gutenberg/9999.txt,Sarah H. Bradford,"Harriet, The Moses of Her People",1822,0.624214,0.452214,528.796852,0.39884,20.889773,111.791794,1.735272,4.017759,10.845536,0.998584,232.7,0.640382,0.79291,0.717273,2.338342
1,project_gutenberg/56195-0.txt,Kenneth Ward,The Boy Volunteers on the Belgian Front,1941,0.585125,0.415,378.878385,0.39712,14.213115,77.20765,1.752356,2.763206,10.506952,0.997945,232.130435,0.603311,0.811363,0.718462,2.134627
2,project_gutenberg/51187.txt,John Wilson,A Visit to the Mammoth Cave of Kentucky,1972,0.62,0.444,1298.919399,0.428477,27.3003,146.405405,1.720949,4.384384,10.003001,0.99836,222.8,0.656081,0.777569,0.638938,2.403897
3,project_gutenberg/51185.txt,Daniel F. Galouye,All Jackson's Children,1970,0.627,0.486,1703.701946,0.324357,8.565289,49.239669,1.79413,2.67438,9.511655,0.996898,221.166667,0.586852,0.823569,0.692679,2.098842
4,project_gutenberg/9997.txt,Francis Parkman,"France and England in North America, Part Third",2013,0.686821,0.54175,269.095482,0.372508,21.56223,122.567266,1.868661,4.204856,11.781284,0.998751,241.057143,0.716877,0.837261,0.751203,2.331626
5,project_gutenberg/9996.txt,Charles Francis Adams,'Tis Sixty Years Since,1814,0.689,0.525429,1123.982792,0.412379,22.716826,135.894665,2.360974,4.124487,10.880695,0.999149,214.0,0.671823,0.718001,0.598605,2.411298
6,project_gutenberg/56190-0.txt,Warren Commission,Warren Commission (14 of 26): Hearings Vol. XI...,1963,0.332608,0.195665,212.461619,0.382709,7.211034,37.233989,1.824845,1.806544,9.316746,0.98119,257.088583,0.685171,0.900294,0.686401,2.003238
7,project_gutenberg/9995.txt,Marian M. George,A Little Journey to Puerto Rico,2007,0.644,0.471333,837.998946,0.409068,14.52459,80.180328,1.75401,2.71846,10.896416,0.998974,246.869565,0.723873,0.794841,0.726814,2.150615
8,project_gutenberg/51183-0.txt,George Grote,"History of Greece, Volume 10 (of 12)",2018,0.678575,0.524642,180.798344,0.35097,15.716207,95.380218,2.14312,2.82718,12.209676,0.999341,245.503968,0.736571,0.807329,0.674299,2.284798
9,project_gutenberg/5619-0.txt,William Petty,Essays on Mankind and Political Arithmetic,1899,0.533643,0.367429,506.56469,0.407927,24.473765,136.352623,1.819653,4.45679,10.645643,0.998158,205.694444,0.707252,0.77892,0.696943,2.465841


In [6]:
# Count missing values per column to see how many documents lack each metric.
df.isnull().sum()

file                            0
author                          0
title                           0
year                            0
sttr                         5253
hapax_legomenon              5253
yules_k                      4898
function_words               4898
avg_sentence_length_word     4898
avg_sentence_length_chars    4898
avg_syllables_per_word       4898
punctuation_sentence         4898
shannon_entropy              4898
simpsons_d                   4898
average_nps                  5024
noun_to_verb                 4898
noun_to_adj                  4898
verb_to_adv                  4898
avg_dependency_distance      4898
dtype: int64

In [37]:
# Write the table to metrics.csv in the working directory, leaving out the row index.
df.to_csv('metrics.csv', index=False)