In [1]:
from stylometry import *
import pymongo
import time
import multiprocessing as mp
import pandas as pd
from pandas.io.json import json_normalize
import warnings
warnings.filterwarnings('ignore')

import spacy
#import en_core_web_lg
import en_core_web_md

In [2]:
# Open a client against the MongoDB instance running on localhost, then bind
# the database and the collection that the rest of the notebook reads from.
client = pymongo.MongoClient('mongodb://localhost/')
db = client.get_database('gutenberg_db')
collection = db.get_collection('gutenberg_collection')

In [3]:
# Echo the collection handle to confirm which host, database and collection it is bound to.
collection

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'gutenberg_db'), 'gutenberg_collection')

In [5]:
# Fetch the raw text of one sample book. The projection limits the payload to
# the text field (plus _id). find_one returns the first match, or None when
# nothing matches.
sample_query = {'file': 'project_gutenberg/51185.txt'}
sample_record = collection.find_one(sample_query, {'text': 1})
if sample_record is None:
    # Report the query that failed instead of an opaque cursor IndexError.
    raise LookupError(f'no document in the collection matches {sample_query}')
text = sample_record['text']

In [6]:
# Parse the sample text once and derive the two views (filtered tokens and
# sentence strings) that the metric cells below consume.
# NOTE(review): `nlp` is not defined anywhere in this notebook; presumably it is
# exported by `from stylometry import *` -- confirm before a fresh-kernel run.
start = time.perf_counter()  # monotonic clock, intended for measuring elapsed time
doc = nlp(text)
# Lowercased surface forms, skipping punctuation and stop words.
tokens = [
    token.orth_.lower()
    for token in doc
    if not (token.is_punct or token.is_stop)
]
# Span.text instead of the deprecated Span.string (removed in spaCy 3); the two
# differ only in surrounding whitespace, which strip() discards anyway.
sentences = [sent.text.strip() for sent in doc.sents]
print(f'Took {time.perf_counter() - start}s to process')

Took 1.6460483074188232s to process


In [7]:
# Runs on the filtered token list (lowercased, punctuation and stop words removed).
# Presumably the standardized type-token ratio -- confirm in stylometry.
sttr(tokens)

0.627

In [8]:
# Same filtered token list as above.
# Presumably the share of words occurring exactly once -- confirm in stylometry.
hapax_legomenon(tokens)

0.486

In [9]:
# Filtered token list again.
# Presumably Yule's K vocabulary-richness statistic -- confirm in stylometry.
yules_k(tokens)

1703.7019456860444

In [10]:
# Takes the full parsed doc rather than `tokens`: stop words were filtered out
# of `tokens` but are still present in `doc`.
function_words(doc)

0.32435685271551074

In [11]:
# Works from the whitespace-stripped sentence strings.
# Presumably mean words per sentence -- confirm in stylometry.
avg_sentence_length_word(sentences)

8.565289256198348

In [12]:
# Same sentence strings as above.
# Presumably mean characters per sentence -- confirm in stylometry.
avg_sentence_length_chars(sentences)

49.239669421487605

In [13]:
# Input is the filtered token list, so punctuation and stop words do not
# contribute to this average.
avg_syllables_per_word(tokens)

1.7941299790356393

In [14]:
# Uses the raw sentence strings, which still contain punctuation
# (only `tokens` had punctuation filtered out).
punctuation_sentence(sentences)

2.6743801652892563

In [15]:
# Filtered token list.
# Presumably the entropy of the token frequency distribution -- confirm in stylometry.
shannon_entropy(tokens)

9.51165465635602

In [16]:
# Filtered token list.
# Presumably Simpson's diversity index -- confirm in stylometry.
simpsons_d(tokens)

0.9968982595359701

In [17]:
# Takes the parsed doc. What "nps" measures is not visible from this notebook
# (noun phrases?) -- TODO confirm in stylometry.
average_nps(doc)

221.16666666666666

In [18]:
# Takes the parsed doc.
# Presumably a ratio derived from part-of-speech tags -- confirm in stylometry.
noun_to_verb(doc)

0.586852207293666

In [19]:
# Takes the parsed doc.
# Presumably a noun/adjective ratio from part-of-speech tags -- confirm in stylometry.
noun_to_adj(doc)

0.8235690235690236

In [20]:
# Takes the parsed doc.
# Presumably a verb/adverb ratio from part-of-speech tags -- confirm in stylometry.
verb_to_adv(doc)

0.6926790024135157

In [21]:
# Takes the parsed doc.
# Presumably computed from the dependency parse -- confirm in stylometry.
avg_dependency_distance(doc)

2.0988422371735185

In [22]:
# End-to-end run for a single book. The argument is the document's `file` key,
# not its text, and the call prints its own timing line.
# NOTE(review): later cells read `metrics.*` fields back from Mongo, so this
# presumably writes the results onto the document -- confirm in stylometry.
get_style_metrics('project_gutenberg/51185.txt')

Took 0.8566100597381592s to process project_gutenberg/51185.txt


### Mongo Upload

In [32]:
# Gather the `file` key of every document in the collection. The projection
# keeps the query from pulling each book's text.
file_cursor = collection.find({}, {'file': 1})
filelist = [record['file'] for record in file_cursor]
print(len(filelist))

23294


In [22]:
# Narrow filelist to the files that still have no yules_k value, i.e. the ones
# whose metrics have not been computed yet.
# NOTE: `df` is created by the DataFrame-building cell further down; on a fresh
# kernel that cell has to run before this one.
missing_metrics = df['yules_k'].isnull()
filelist = list(df[missing_metrics].file.values)
print(len(filelist))

6005


In [None]:
# Compute metrics for every file in filelist, one at a time. Files that raise
# ZeroDivisionError or ValueError are skipped on purpose (best effort), but they
# are recorded so the gaps in the resulting table can be explained afterwards.
failed_files = []
for file in filelist:
    try:
        get_style_metrics(file)
    except (ZeroDivisionError, ValueError) as exc:
        failed_files.append((file, repr(exc)))
print(f'{len(failed_files)} of {len(filelist)} files skipped')

In [27]:
## parallelizing get_style_metrics function
# One task per file instead of a single Pool.map call: map re-raises the first
# worker exception and abandons the remaining files, whereas here a
# ZeroDivisionError/ValueError for one file is recorded and skipped, matching
# the serial loop. Any other exception still propagates.
n_cpu = mp.cpu_count()
parallel_failures = []
with mp.Pool(processes=n_cpu) as p:
    tasks = [(file, p.apply_async(get_style_metrics, (file,))) for file in filelist]
    # Results must be collected inside the `with` block: leaving it terminates the pool.
    for file, task in tasks:
        try:
            task.get()  # re-raises the worker's exception in the parent process
        except (ZeroDivisionError, ValueError) as exc:
            parallel_failures.append((file, repr(exc)))
print(f'{len(parallel_failures)} of {len(filelist)} files skipped')

In [16]:
# Metric names as stored under each document's nested `metrics` field.
# json_normalize flattens them into 'metrics.<name>' columns, which are renamed
# back to the bare metric name below.
metric_names = [
    'sttr', 'hapax_legomenon', 'yules_k', 'function_words',
    'avg_sentence_length_word', 'avg_sentence_length_chars',
    'avg_syllables_per_word', 'punctuation_sentence', 'shannon_entropy',
    'simpsons_d', 'average_nps', 'noun_to_verb', 'noun_to_adj',
    'verb_to_adv', 'avg_dependency_distance',
]

# Pull every document without its _id and its (large) text field.
datapoints = list(collection.find({}, {'_id': 0, 'text': 0}))

# json_normalize already returns a DataFrame, so no extra wrapping is needed.
df = json_normalize(datapoints).rename(
    columns={f'metrics.{name}': name for name in metric_names}
)
print(df.shape)

(23294, 19)


In [17]:
# Preview the first ten rows of the flattened metadata + metrics table.
df.head(10)

Unnamed: 0,file,author,title,year,sttr,hapax_legomenon,yules_k,function_words,avg_sentence_length_word,avg_sentence_length_chars,avg_syllables_per_word,punctuation_sentence,shannon_entropy,simpsons_d,average_nps,noun_to_verb,noun_to_adj,verb_to_adv,avg_dependency_distance
0,project_gutenberg/9999.txt,Sarah H. Bradford,"Harriet, The Moses of Her People",1822,0.624214,0.452214,528.796852,0.39884,20.889773,111.791794,1.735272,4.017759,10.845536,0.998584,232.7,0.640382,0.79291,0.717273,2.338342
1,project_gutenberg/56195-0.txt,Kenneth Ward,The Boy Volunteers on the Belgian Front,1941,0.585125,0.415,378.878385,0.39712,14.213115,77.20765,1.752356,2.763206,10.506952,0.997945,232.130435,0.603311,0.811363,0.718462,2.134627
2,project_gutenberg/51187.txt,John Wilson,A Visit to the Mammoth Cave of Kentucky,1972,0.62,0.444,1298.919399,0.428477,27.3003,146.405405,1.720949,4.384384,10.003001,0.99836,222.8,0.656081,0.777569,0.638938,2.403897
3,project_gutenberg/51185.txt,Daniel F. Galouye,All Jackson's Children,1970,0.627,0.486,1703.701946,0.324357,8.565289,49.239669,1.79413,2.67438,9.511655,0.996898,221.166667,0.586852,0.823569,0.692679,2.098842
4,project_gutenberg/9997.txt,Francis Parkman,"France and England in North America, Part Third",2013,0.686821,0.54175,269.095482,0.372508,21.56223,122.567266,1.868661,4.204856,11.781284,0.998751,241.057143,0.716877,0.837261,0.751203,2.331626
5,project_gutenberg/9996.txt,Charles Francis Adams,'Tis Sixty Years Since,1814,0.689,0.525429,1123.982792,0.412379,22.716826,135.894665,2.360974,4.124487,10.880695,0.999149,214.0,0.671823,0.718001,0.598605,2.411298
6,project_gutenberg/56190-0.txt,Warren Commission,Warren Commission (14 of 26): Hearings Vol. XI...,1963,0.332608,0.195665,212.461619,0.382709,7.211034,37.233989,1.824845,1.806544,9.316746,0.98119,257.088583,0.685171,0.900294,0.686401,2.003238
7,project_gutenberg/9995.txt,Marian M. George,A Little Journey to Puerto Rico,2007,0.644,0.471333,837.998946,0.409068,14.52459,80.180328,1.75401,2.71846,10.896416,0.998974,246.869565,0.723873,0.794841,0.726814,2.150615
8,project_gutenberg/51183-0.txt,George Grote,"History of Greece, Volume 10 (of 12)",2018,0.678575,0.524642,180.798344,0.35097,15.716207,95.380218,2.14312,2.82718,12.209676,0.999341,245.503968,0.736571,0.807329,0.674299,2.284798
9,project_gutenberg/5619-0.txt,William Petty,Essays on Mankind and Political Arithmetic,1899,0.533643,0.367429,506.56469,0.407927,24.473765,136.352623,1.819653,4.45679,10.645643,0.998158,205.694444,0.707252,0.77892,0.696943,2.465841


In [18]:
# Missing-value count per column; a non-zero count on a metric column means
# that metric is absent for that many files.
df.isna().sum()

file                            0
author                          0
title                           0
year                            0
sttr                         6345
hapax_legomenon              6345
yules_k                      6005
function_words               6005
avg_sentence_length_word     6005
avg_sentence_length_chars    6005
avg_syllables_per_word       6005
punctuation_sentence         6005
shannon_entropy              6005
simpsons_d                   6005
average_nps                  6124
noun_to_verb                 6005
noun_to_adj                  6005
verb_to_adv                  6005
avg_dependency_distance      6005
dtype: int64

In [24]:
# Print the stored fields (everything except the bulky text) for one file, to
# see which fields are actually present on its document.
# The loop variable is deliberately not named `doc`: a top-level for loop
# rebinds its variable in the notebook namespace, which would replace the
# parsed `doc` that the metric cells above depend on.
for record in collection.find({'file': 'project_gutenberg/54492.txt'}, {'text': 0}):
    print(record)

{'_id': ObjectId('5e1842319736a0f9dcdc91e2'), 'file': 'project_gutenberg/54492.txt', 'author': 'Joseph Pike', 'title': 'Chester; A Sketch-Book', 'year': '1915'}


In [37]:
# Write the table to metrics.csv in the current working directory;
# index=False leaves the row index out of the file.
df.to_csv('metrics.csv', index=False)