# Comparison of Federalist Papers

Can we argue for the authorship of the disputed Federalist Papers?

Background: Read [Jenny Bryan's rules for naming things](https://speakerdeck.com/jennybc/how-to-name-files).

In [None]:
import csv, sys, os
import spacy
from collections import Counter
import numpy

from matplotlib import pyplot

# Load the "en_core_web_sm" spaCy pipeline once; later cells call `nlp(text)` to
# parse each document's text.
nlp = spacy.load("en_core_web_sm")

Part 1: Why are we missing about ten documents? Modify the code to fix this. Hint: String formatting.

In [None]:
# Metadata rows (one dict per paper) whose text file was found on disk.
documents = []

# Filename patterns tried in order for each paper number. The unpadded form is
# the one this cell originally used; the zero-padded form (e.g. "federalist_01.txt")
# covers single-digit papers when the files use fixed-width numbering.
# NOTE(review): the zero-padded layout is inferred from the Part 1 hint --
# confirm against the actual contents of ../data/FederalistPapers/.
filename_patterns = [
    "../data/FederalistPapers/federalist_{}.txt",
    "../data/FederalistPapers/federalist_{:02d}.txt",
]

with open("../data/FederalistPapers/metadata_federalist.csv", encoding="utf-8") as reader:
    csv_reader = csv.DictReader(reader)
    for row in csv_reader:
        ## convert string to int
        row["Number"] = int(row["Number"])

        for pattern in filename_patterns:
            candidate = pattern.format(row["Number"])
            if os.path.exists(candidate):
                row["Filename"] = candidate
                documents.append(row)
                break
        else:
            # Report the miss instead of silently dropping the paper.
            print("No text file found for paper {}".format(row["Number"]))

In [None]:
# Number of metadata rows whose text file was found on disk.
len(documents)

In [None]:
# Parse each paper with spaCy and store the parsed result under document["Spacy"].
for document in documents:
    try:
        with open(document["Filename"], encoding="utf-8") as reader:
            print(document["Title"])

            # Drop trailing whitespace/newlines from each line and join the
            # lines with single spaces so the whole paper is one string.
            text = " ".join(line.rstrip() for line in reader)

        document["Spacy"] = nlp(text)
    except Exception as error:
        # Best-effort: keep going with the other papers, but show what went
        # wrong. Catching Exception (not a bare except) lets KeyboardInterrupt
        # still stop a long-running parse.
        print("Problem with {}: {!r}".format(document["Number"], error))

In [None]:
# Corpus-wide token frequencies, accumulated one document at a time.
all_counts = Counter()

for document in documents:
    # Count each token's surface text within this document.
    doc_counter = Counter(token.text for token in document["Spacy"])
    document["TokenCounts"] = doc_counter
    all_counts.update(doc_counter)

# Show the 30 most frequent tokens across all documents.
all_counts.most_common(30)

In [None]:
# Vocabulary: the most frequent tokens in the whole corpus.
num_top_words = 150
top_words = [token for token, _count in all_counts.most_common(num_top_words)]

# Documents-by-words matrix of raw counts (rows: documents, columns: top words).
doc_word_counts = numpy.zeros(shape=(len(documents), num_top_words))

for doc_id, document in enumerate(documents):
    token_counts = document["TokenCounts"]
    for word_id, word in enumerate(top_words):
        # A Counter returns 0 for a word the document never uses.
        doc_word_counts[doc_id, word_id] = token_counts[word]

# Preview: first five documents, first ten words.
doc_word_counts[:5, :10]

Part 2: Divide the matrix appropriately so that each document sums to 1.0

In [None]:
# Total count of the tracked top words in each document (one value per row).
doc_lengths = doc_word_counts.sum(axis=1)
print(doc_lengths.shape)

# doc_lengths is 1-D (one entry per document). Adding a trailing axis turns it
# into a column so the division broadcasts across each row, making every row of
# doc_word_probs sum to 1.0. A document containing none of the top words would
# produce NaNs here (0 / 0).
doc_word_probs = doc_word_counts / doc_lengths[:, numpy.newaxis]
doc_word_probs[:5,:10]

Part 3: Include code to calculate mean and standard deviation

In [None]:
# Column-wise statistics: one mean and one standard deviation per word, taken
# across documents. numpy's std defaults to the population form (ddof=0).
word_means = doc_word_probs.mean(axis=0)
word_sds = doc_word_probs.std(axis=0)

# Standardize each word column: subtract its mean, divide by its standard
# deviation. The 1-D statistics broadcast across the rows of doc_word_probs.
doc_word_zscores = (doc_word_probs - word_means) / word_sds
doc_word_zscores[:5,:10]

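Here we create a low-dimensional projection of the z-scored word frequencies using the singular value decomposition (SVD), and later plot the documents along its first two dimensions.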

In [None]:
# Singular value decomposition of the documents-by-words z-score matrix.
# U has one row per document; its first two columns are used as plot
# coordinates in the scatter plot further down.
U,S,Vt = numpy.linalg.svd(doc_word_zscores)

Part 4: What is going on with the Author field? How should we fix this?

In [None]:
# Tally how many documents carry each distinct Author label.
Counter(doc["Author"] for doc in documents)

In [None]:
# Plot colour for each authorship label that appears in the metadata.
colormap = {"Alexander Hamilton": "red", "James Madison": "blue",
            "John Jay": "green", "Alexander Hamilton and James Madison": "purple",
            "Alexander Hamilton or James Madison": "gray",}

# Colour for any Author label missing from `colormap`, so an unexpected value
# shows up as a distinct point instead of raising KeyError.
unknown_author_color = "black"

# One colour per document, in the same order as the rows of U.
authors = [colormap.get(doc["Author"], unknown_author_color) for doc in documents]

# Each document is placed at its coordinates in the first two columns of U.
pyplot.scatter(U[:,0], U[:,1], c=authors)
pyplot.xlabel("First left singular vector")
pyplot.ylabel("Second left singular vector")
pyplot.title("Federalist Papers coloured by author label")
pyplot.show()

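Part 5: Nearest neighbors. The cells below rank all documents by the cosine similarity of their z-score vectors to a chosen query document; the question for this part follows them.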

In [None]:
# Human-readable label for each document: its position in `documents`, its
# author, and the first 30 characters of its title. The metadata keys are
# capitalized ("Author", "Title"), matching how the other cells read them.
descriptors = ["{} {}, {}".format(i, doc["Author"], doc["Title"][:30]) for i, doc in enumerate(documents)]

# Euclidean length of each document's z-score vector (one value per row).
zscore_norms = numpy.linalg.norm(doc_word_zscores, axis=1)

def nearest(query_id):
    """Rank every document by cosine similarity to the document at `query_id`.

    `query_id` is a row index into `doc_word_zscores` (and so into
    `descriptors`). Returns a list of (cosine, descriptor) tuples sorted
    from most to least similar.
    """
    query_vector = doc_word_zscores[query_id, :]

    # Cosine similarity: dot product divided by the product of vector lengths.
    dot_products = doc_word_zscores.dot(query_vector)
    length_products = zscore_norms * zscore_norms[query_id]
    cosines = dot_products / length_products

    ranked = list(zip(cosines, descriptors))
    ranked.sort(reverse=True)
    return ranked

In [None]:
# Show the query document's label, then all documents ranked by similarity to it.
# Note: 62 is a position in `documents` (descriptors are numbered by enumerate),
# not a Federalist paper number.
print(descriptors[62])
nearest(62)

Part 5: Modify the number of top words. Try different ranges, including non-top words. Are the results different, and if so, how?

[Answer below]



### More advanced NLP

The following demonstrates how to access part-of-speech and syntax trees.

We will explore these more on Wednesday.

In [None]:
# Use the parsed text of the last loaded paper as the example. The leftover
# loop variable `document` is that paper's metadata row (a dict), which has no
# `.sents`; the parsed spaCy object is stored under its "Spacy" key.
sentences = list(documents[-1]["Spacy"].sents)

for sentence in sentences:
    # Coarse part-of-speech tag for every token in the sentence.
    parts_of_speech = [token.pos_ for token in sentence]
    print(sentence)
    print(Counter(parts_of_speech))

In [None]:
# Render a displaCy visualization of the fourth sentence inline in the notebook.
# NOTE(review): assumes `sentences` has at least four entries -- otherwise
# sentences[3] raises IndexError.
spacy.displacy.render(sentences[3], jupyter=True)