In [1]:
from stylometry import *
import pymongo
import time
import multiprocessing as mp
import pandas as pd
from pandas.io.json import json_normalize
import warnings
warnings.filterwarnings('ignore')

import spacy
#import en_core_web_lg
import en_core_web_md

In [None]:
## Open a client to the MongoDB server on localhost and grab handles to the
## Gutenberg database and the collection that holds one document per book.
client = pymongo.MongoClient('mongodb://localhost/')
db = client.get_database('gutenberg_db')
collection = db.get_collection('gutenberg_collection')

In [3]:
# Display the collection handle's repr to confirm which host/database/collection it points at.
collection

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'gutenberg_db'), 'gutenberg_collection')

In [5]:
# Fetch the raw text of one sample book to exercise the metric helpers below.
# `find_one` returns the first matching document (or None) directly, instead of
# building a cursor and indexing into it.
sample_record = collection.find_one({'file': 'project_gutenberg/51185.txt'}, {'text': 1})
if sample_record is None:
    # Fail with a readable message rather than an opaque cursor IndexError.
    raise LookupError("No document with file 'project_gutenberg/51185.txt' found in the collection")
text = sample_record['text']

In [6]:
# Time one end-to-end NLP pass over the sample text and derive the two inputs
# (token list, sentence list) consumed by the metric helpers below.
# NOTE(review): `nlp` is not defined in any visible cell — presumably it comes from the
# `stylometry` star import (or a since-deleted cell); confirm so Restart & Run All works.
start = time.perf_counter()  # monotonic clock, meant for measuring elapsed time
doc = nlp(text)
# Lower-cased surface forms, skipping punctuation and stop words.
# (The original's extra `and token` truthiness check was redundant and is dropped.)
tokens = [token.text.lower() for token in doc if not (token.is_punct or token.is_stop)]
# `Span.string` no longer exists in spaCy v3; `.text` gives the same result once stripped.
sentences = [sent.text.strip() for sent in doc.sents]
print(f'Took {time.perf_counter() - start}s to process')

Took 1.6460483074188232s to process


In [7]:
# Presumably the standardized type-token ratio; helper is not defined in this notebook
# (likely from the `stylometry` star import — confirm). Input excludes punctuation and stop words.
sttr(tokens)

0.627

In [8]:
# Presumably the share of words that occur exactly once (hapax legomena) — confirm in `stylometry`.
# Input is the lower-cased token list without punctuation or stop words.
hapax_legomenon(tokens)

0.486

In [9]:
# Presumably Yule's K lexical-repetition measure — confirm the exact formula in `stylometry`.
# Computed on the filtered token list (no punctuation, no stop words).
yules_k(tokens)

1703.7019456860444

In [10]:
# Takes the full parsed `doc` rather than the filtered tokens; presumably the proportion
# of function words — confirm the definition in `stylometry`.
function_words(doc)

0.32435685271551074

In [11]:
# Presumably the mean number of words per sentence — confirm in `stylometry`.
# Input is the list of whitespace-stripped sentence strings.
avg_sentence_length_word(sentences)

8.565289256198348

In [12]:
# Presumably the mean number of characters per sentence — confirm in `stylometry`.
# Input is the list of whitespace-stripped sentence strings.
avg_sentence_length_chars(sentences)

49.239669421487605

In [13]:
# Presumably the mean syllable count per word — confirm in `stylometry`.
# NOTE: `tokens` excludes stop words, so the average is over the remaining words only.
avg_syllables_per_word(tokens)

1.7941299790356393

In [14]:
# Presumably the mean number of punctuation marks per sentence — confirm in `stylometry`.
# Operates on the sentence strings, which still contain punctuation (unlike `tokens`).
punctuation_sentence(sentences)

2.6743801652892563

In [15]:
# Presumably the Shannon entropy of the token frequency distribution — confirm the log base in `stylometry`.
# Computed on the filtered token list (no punctuation, no stop words).
shannon_entropy(tokens)

9.51165465635602

In [16]:
# Presumably Simpson's diversity index over token frequencies — confirm which variant in `stylometry`.
# Computed on the filtered token list (no punctuation, no stop words).
simpsons_d(tokens)

0.9968982595359701

In [17]:
# NOTE(review): the meaning of "nps" is not visible here (noun phrases per some unit?) —
# confirm the definition and the unit of the result in `stylometry`.
average_nps(doc)

221.16666666666666

In [18]:
# Presumably a noun/verb ratio derived from part-of-speech tags on `doc` — confirm the
# exact normalisation in `stylometry`.
noun_to_verb(doc)

0.586852207293666

In [19]:
# Presumably a noun/adjective ratio derived from part-of-speech tags on `doc` — confirm the
# exact normalisation in `stylometry`.
noun_to_adj(doc)

0.8235690235690236

In [20]:
# Presumably a verb/adverb ratio derived from part-of-speech tags on `doc` — confirm the
# exact normalisation in `stylometry`.
verb_to_adv(doc)

0.6926790024135157

In [21]:
# Presumably the mean distance between each token and its syntactic head in the
# dependency parse of `doc` — confirm in `stylometry`.
avg_dependency_distance(doc)

2.0988422371735185

In [22]:
# Run the all-in-one helper on the sample file; it prints its own timing line.
# Presumably it computes every metric above and stores them on the file's Mongo document
# (later cells read `metrics.*` fields) — confirm in `stylometry`.
get_style_metrics('project_gutenberg/51185.txt')

Took 0.8566100597381592s to process project_gutenberg/51185.txt


### Mongo Upload

In [32]:
# Collect the `file` field of every document in the collection, then report how many there are.
file_cursor = collection.find({}, {'file': 1})
filelist = [record['file'] for record in file_cursor]
print(len(filelist))

23294


In [10]:
## Run get_style_metrics over every file, using as many worker processes
## as mp.cpu_count() reports.
n_cpu = mp.cpu_count()
with mp.Pool(n_cpu) as pool:
    # map() blocks until every file has been processed; its result list is discarded.
    pool.map(get_style_metrics, filelist)

In [11]:
# Load every document without Mongo's `_id` and the raw `text`, leaving the
# top-level metadata fields and the nested `metrics` sub-document.
datapoints = list(collection.find({}, {'_id': 0, 'text': 0}))

# Metric names as they appear under `metrics.` after flattening.
metric_names = [
    'sttr', 'hapax_legomenon', 'yules_k', 'function_words',
    'avg_sentence_length_word', 'avg_sentence_length_chars',
    'avg_syllables_per_word', 'punctuation_sentence', 'shannon_entropy',
    'simpsons_d', 'average_nps', 'noun_to_verb', 'noun_to_adj',
    'verb_to_adv', 'avg_dependency_distance',
]

# `pd.json_normalize` replaces the deprecated `pandas.io.json.json_normalize` and already
# returns a DataFrame, so no extra `pd.DataFrame(...)` wrapper is needed.
# The rename strips the `metrics.` prefix from exactly the columns listed above.
df = pd.json_normalize(datapoints).rename(
    columns={f'metrics.{name}': name for name in metric_names}
)
# df.dropna(inplace=True)
print(df.shape)
df.head(5)

(23294, 19)


Unnamed: 0,file,author,title,year,sttr,hapax_legomenon,yules_k,function_words,avg_sentence_length_word,avg_sentence_length_chars,avg_syllables_per_word,punctuation_sentence,shannon_entropy,simpsons_d,average_nps,noun_to_verb,noun_to_adj,verb_to_adv,avg_dependency_distance
0,project_gutenberg/9999.txt,Sarah H. Bradford,"Harriet, The Moses of Her People",1822,0.624214,0.452214,528.796852,0.39884,20.889773,111.791794,1.735272,4.017759,10.845536,0.998584,232.7,0.640382,0.79291,0.717273,2.338342
1,project_gutenberg/56195-0.txt,Kenneth Ward,The Boy Volunteers on the Belgian Front,1941,0.585125,0.415,378.878385,0.39712,14.213115,77.20765,1.752356,2.763206,10.506952,0.997945,232.130435,0.603311,0.811363,0.718462,2.134627
2,project_gutenberg/51187.txt,John Wilson,A Visit to the Mammoth Cave of Kentucky,1972,0.62,0.444,1298.919399,0.428477,27.3003,146.405405,1.720949,4.384384,10.003001,0.99836,222.8,0.656081,0.777569,0.638938,2.403897
3,project_gutenberg/51185.txt,Daniel F. Galouye,All Jackson's Children,1970,0.627,0.486,1703.701946,0.324357,8.565289,49.239669,1.79413,2.67438,9.511655,0.996898,221.166667,0.586852,0.823569,0.692679,2.098842
4,project_gutenberg/9997.txt,Francis Parkman,"France and England in North America, Part Third",2013,0.686821,0.54175,269.095482,0.372508,21.56223,122.567266,1.868661,4.204856,11.781284,0.998751,241.057143,0.716877,0.837261,0.751203,2.331626


In [12]:
# Count missing values per column to see which metrics are absent for some books.
df.isna().sum()

file                           0
author                         0
title                          0
year                           0
sttr                         490
hapax_legomenon              490
yules_k                        7
function_words                 7
avg_sentence_length_word       7
avg_sentence_length_chars      7
avg_syllables_per_word         7
punctuation_sentence           7
shannon_entropy                7
simpsons_d                     7
average_nps                  185
noun_to_verb                   7
noun_to_adj                    7
verb_to_adv                    7
avg_dependency_distance        7
dtype: int64

In [15]:
# Keep only rows with no missing values, cast `year` to int, and restrict to the
# inclusive year window below (the notebook does not state why these bounds were chosen).
earliest_year, latest_year = 1050, 2020
complete_rows = df.dropna()
complete_rows = complete_rows.assign(year=complete_rows['year'].astype('int'))
df = complete_rows[complete_rows['year'].between(earliest_year, latest_year)]
print(df.shape)

(22481, 19)


In [18]:
# Persist `df` to metrics.csv (path relative to the working directory), omitting the index column.
df.to_csv('metrics.csv', index=False)