# Preprocess the Federalist papers to provide counts for a few important keywords

The keywords are taken from Mosteller and Wallace, "Inference in an Authorship Problem", where they are identified as the most "important" words by their methods.

In [1]:
# Words whose per-paper occurrence counts are recorded in the dataset.
# The order here fixes the order of each paper's 'counts' list.
keywords = [
    'on',
    'upon',
    'there',
    'whilst',
]

In [2]:
import glob
import re
import torchtext
import json

In [23]:
# Collect the candidate paper files from the current directory.
# The pattern matches any name with a '.' followed later by a '-',
# e.g. "<number>. <title> - <authors>".
paper_glob = "*.*-*"
fnames = glob.glob(paper_glob)

In [24]:
def contents(fin):
    '''Return the full text contents of a file.

    Args:
        fin: Name (path) of the file to read.

    Returns:
        The entire file as a single string, read in text mode with the
        platform's default encoding.

    Raises:
        OSError: If the file cannot be opened or read.
    '''
    # The handle is deliberately not called `input`, so the builtin of
    # that name is not shadowed inside this function.
    with open(fin, 'r') as infile:
        return infile.read()

In [30]:
# Build the dataset: one element for each paper, ordered by paper number.

# File names are expected to look like "<number>. <title> - <authors>".
# Raw string so that \d and \s reach the regex engine as written instead of
# being treated as (invalid) string escape sequences.
fname_pattern = re.compile(r'^(?P<number>\d+)\.\s+(?P<title>.+) - (?P<authors>.*)$')

# The tokenizer does not depend on the file, so create it once up front.
tokenize = torchtext.data.get_tokenizer("basic_english")

# Collect items keyed by paper number rather than writing into a list
# preallocated from len(fnames): a file name that does not match the pattern
# would otherwise leave a None hole, and a paper number larger than the
# number of globbed files would raise IndexError.
papers_by_number = {}

for fname in fnames:
    # Extract metadata from the file name; skip files that are not papers
    m = fname_pattern.match(fname)
    if not m:
        continue
    number = m.group('number')
    title = m.group('title')
    authors = m.group('authors')
    # Tokenize the contents
    tokens = tokenize(contents(fname))
    # Get counts for the keywords (same order as `keywords`)
    counts = [tokens.count(key) for key in keywords]
    # Construct dataset item for this paper; 'number' stays a string, as
    # captured from the file name
    papers_by_number[int(number)] = {
        'number': number,
        'title': title,
        'authors': authors,
        'counts': counts,
    }

# For a complete set of papers numbered 1..N this is the same list as
# indexing by (number - 1).
dataset = [papers_by_number[n] for n in sorted(papers_by_number)]

In [31]:
# Serialize the dataset to JSON text (keys sorted, 2-space indentation)
# and write it to federalist_data.json in the current directory.
serialized = json.dumps(dataset, sort_keys=True, indent=2)
with open('federalist_data.json', 'w') as fout:
    fout.write(serialized)