In [7]:
import csv, nltk, glob
# Read the dictionary.com values into a Python dictionary.
# Each row of the tab-separated file is stored as word -> int(second column);
# later cells compare that integer against the year 1100.
dictionary_com = {}
# newline='' is how the csv module documentation says to open a file for csv.reader.
with open('Etymologies.txt', newline='') as myfile:
    for row in csv.reader(myfile, dialect="excel-tab"):
        if not row:
            # csv.reader yields an empty list for a blank line; skip it rather
            # than crash with IndexError on row[0].
            continue
        dictionary_com[row[0]] = int(row[1])
dictionary_com

{'house': 900,
 'secretary': 1350,
 'delaware': 0,
 'dwells': 900,
 'sects': 1300,
 'compose': 1375,
 'corporate': 1350,
 'tories': 1640,
 'cheek': 900,
 'eighty': 850,
 'pathology': 1590,
 'romans': 900,
 'rouse': 1480,
 'absence': 1350,
 'explanations': 1350,
 'posts': 1000,
 'deliverance': 1250,
 'avail': 1250,
 'compliment': 1570,
 'intuition': 1400,
 'richly': 900,
 'strongly': 900,
 'friars': 1250,
 'across': 1,
 'layer': 1350,
 'charged': 1275,
 'bone': 900,
 'persecutions': 1300,
 'on': 1,
 'night': 900,
 'workmen': 900,
 'odious': 1350,
 'richness': 900,
 'killing': 1400,
 'lords': 900,
 'reap': 900,
 'acquired': 1400,
 'obtained': 1375,
 'marching': 1375,
 'abstain': 1350,
 'habitual': 1520,
 'neighbours': 900,
 'kansas': 0,
 'canaan': 0,
 'gordon': 0,
 'deluge': 1325,
 'figs': 1175,
 'metres': 0,
 'murderers': 1300,
 'aware': 1100,
 'solitary': 1300,
 'substituted': 1350,
 'apostolic': 1540,
 'railway': 1770,
 'changed': 1175,
 'trip': 1350,
 'narrative': 1555,
 'armies': 13

In [8]:
# Part-of-speech training data for the nltk unigram tagger: tagged words taken
# from the masc_tagged files whose id contains 'written'.
from nltk.corpus import masc_tagged
tags = []
for fileid in masc_tagged.fileids():
    # The size check happens before each file is added, so the final list can
    # end up somewhat longer than 150000 entries.
    if len(tags) > 150000:
        break
    if 'written' not in fileid:
        continue
    tags.extend(masc_tagged.tagged_words(fileid))

# The whole tagged-word list is wrapped in an outer list before being handed
# to the tagger in the next cell.
t = [tags]

# If running this notebook in Binder, comment out the training-data code above
# and remove the triple quotes around the block below to load the pickled
# tagger instead. Only unpickle a tagger.p file that comes from a trusted source.
"""
import pickle
t = pickle.load(open('tagger.p', 'rb'))
"""

"\nimport pickle\nt = pickle.load(open('tagger.p', 'rb'))\n"

In [10]:
# Train the unigram tagger on the tagged sample built in the previous cell.
# The training data is large, so this step can take a long time.
unigram_tagger = nltk.UnigramTagger(train=t)

In [19]:
def alpha_tokens(raw_text):
    """Split raw text into lowercase, letters-only tokens.

    Every character for which ``str.isalpha()`` is false (punctuation, digits,
    newlines, tabs, ...) is treated as a separator; the remaining characters
    are lowercased.

    Args:
        raw_text: the full contents of one text file, as a string.

    Returns:
        A list of non-empty lowercase tokens, in document order.
    """
    letters_only = ''.join(ch.lower() if ch.isalpha() else ' ' for ch in raw_text)
    # split() with no argument collapses runs of spaces and never yields '',
    # so no separate "drop empty items" pass is needed.
    return letters_only.split()


# Tokenize and POS-tag every text file under ./full_text.
# sorted() because glob.glob() returns paths in arbitrary order; a fixed order
# keeps the per-text results reproducible from run to run.
text = sorted(glob.glob('./full_text/*.txt'))
all_texts_pos = []
for path in text:
    with open(path) as f:
        md_text = f.read()
    md_tokens = alpha_tokens(md_text)
    # pos tag with the unigram tagger trained in the earlier cell
    uni_pos = unigram_tagger.tag(md_tokens)
    all_texts_pos.append(uni_pos)

# Flatten the per-text (token, tag) lists into one list for corpus-wide counts.
all_pos_one_list = [pair for tagged_text in all_texts_pos for pair in tagged_text]
len(all_pos_one_list)

946281

In [24]:
from collections import Counter
# Tags this notebook treats as function words (determiners, prepositions,
# conjunctions, and pronouns).
function_symbols = ['DT', 'IN', 'CC', 'PRP', 'PRP$', 'WP', 'WP$']
# Count how often each token occurs with one of those tags, across all texts.
all_pos_one_counter = Counter(
    word for word, tag in all_pos_one_list if tag in function_symbols
)

all_pos_one_counter

Counter({'a': 19839,
         'about': 1354,
         'above': 281,
         'across': 162,
         'after': 1160,
         'against': 432,
         'all': 4293,
         'along': 251,
         'alongside': 27,
         'although': 196,
         'among': 460,
         'an': 2997,
         'and': 28622,
         'another': 532,
         'any': 1874,
         'around': 204,
         'as': 7206,
         'at': 6258,
         'because': 410,
         'before': 1317,
         'behind': 223,
         'below': 181,
         'beneath': 168,
         'beside': 101,
         'besides': 132,
         'between': 421,
         'beyond': 197,
         'both': 446,
         'but': 6720,
         'by': 4733,
         'cha': 19,
         'de': 2208,
         'despite': 21,
         'during': 256,
         'each': 397,
         'either': 209,
         'en': 2042,
         'et': 21,
         'every': 1057,
         'except': 183,
         'for': 6856,
         'from': 4147,
         'he': 11029,
       

In [25]:

# One vocabulary per text: drop every token tagged as a function word
# (determiners, prepositions, conjunctions, and pronouns), then keep each
# remaining term only once by collecting them into a set.
token_sets = [
    {word for word, tag in tagged_text if tag not in function_symbols}
    for tagged_text in all_texts_pos
]

In [26]:
def pre_post_ratio(token_set, etymology_years, cutoff=1100):
    """Return the ratio of pre-cutoff to post-cutoff words in one term set.

    Args:
        token_set: iterable of distinct tokens from one text.
        etymology_years: mapping of token -> integer year.
        cutoff: year separating "pre" (year < cutoff) from "post"
            (year > cutoff).

    Returns:
        pre / post as a float, or NaN when no post-cutoff word was found
        (the ratio is undefined in that case).
    """
    pre = 0
    post = 0
    for token in token_set:
        # Tokens missing from the dictionary are skipped; .get() replaces the
        # former bare ``except: pass`` so unrelated errors are no longer hidden.
        year = etymology_years.get(token)
        if year is None:
            continue
        # NOTE(review): a year exactly equal to the cutoff is counted in
        # neither bucket -- confirm this is intended.
        # NOTE(review): values such as 0 and 1 look like placeholders rather
        # than real dates, yet they count as "pre" -- confirm.
        if year < cutoff:
            pre += 1
        elif year > cutoff:
            post += 1
    if post == 0:
        return float('nan')
    return pre / post


# loop through each term set, derive "the ratio of pre- and post-twelfth-century words"
ratios = [pre_post_ratio(token_set, dictionary_com) for token_set in token_sets]

In [27]:
# Per-text report: title, number of tagged tokens, number of distinct
# non-function terms, distinct-terms / tokens, and the pre/post ratio.
titles = [path.replace('./full_text/', '').replace('.txt', '') for path in text]
orig_lengths = [len(tagged_text) for tagged_text in all_texts_pos]
length_of_sets = [len(vocab) for vocab in token_sets]
set_to_len_ratio = [
    (distinct / total) * 1.0
    for distinct, total in zip(length_of_sets, orig_lengths)
]
report_rows = zip(titles, orig_lengths, length_of_sets, set_to_len_ratio, ratios)
for row in report_rows:
    print(row)

('bell_in_the_fog', 63821, 7291, 0.11424139389856003, 0.48482849604221634)
('conjure_woman', 62505, 5393, 0.08628109751219902, 0.5954997383568812)
('decoverly', 35303, 4963, 0.14058295329008866, 0.4930584770719394)
('dunwich_horror', 21037, 4235, 0.20131197414079954, 0.5733890214797136)
('frankenstein', 78329, 7155, 0.09134547868605497, 0.4020792357403765)
('king_in_yellow', 76129, 8155, 0.10712080810203733, 0.5177281680892974)
('moby_dick', 221564, 17032, 0.07687169395750212, 0.4223169314219295)
('poe_collected_vol_1', 94796, 9436, 0.09954006498164479, 0.3919902912621359)
('sense_and_sensibility', 123807, 6435, 0.05197605951198236, 0.3806060606060606)
('turn_of_the_screw', 46516, 4680, 0.10061054260899475, 0.45303626037570993)
('wuthering_heights', 122474, 9365, 0.07646520894230613, 0.43854084060269627)
