In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pprint import pprint

In [2]:
fedArray = []
for x in open('federalist.json', 'r'):
    fedArray.append(json.loads(x))

In [3]:
fed = pd.DataFrame(fedArray)
fed.info()
fed.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85 entries, 0 to 84
Data columns (total 6 columns):
author      85 non-null object
date        50 non-null object
paper_id    85 non-null int64
text        85 non-null object
title       85 non-null object
venue       84 non-null object
dtypes: int64(1), object(5)
memory usage: 4.1+ KB


Unnamed: 0,author,date,paper_id,text,title,venue
0,HAMILTON,,1,To the People of the State of New York:\n\nAFT...,General Introduction,For the Independent Journal
1,JAY,,2,To the People of the State of New York:\n\nWHE...,Concerning Dangers from Foreign Force and Infl...,For the Independent Journal
2,JAY,,3,To the People of the State of New York:\n\nIT ...,The Same Subject Continued (Concerning Dangers...,For the Independent Journal
3,JAY,,4,To the People of the State of New York:\n\nMY ...,The Same Subject Continued (Concerning Dangers...,For the Independent Journal
4,JAY,,5,To the People of the State of New York:\n\nQUE...,The Same Subject Continued (Concerning Dangers...,For the Independent Journal


In [4]:
#checking # of papars by each author 
federalist_counts = fed.groupby('author').count()['paper_id']
federalist_counts

author
HAMILTON                51
HAMILTON AND MADISON     3
HAMILTON OR MADISON     11
JAY                      5
MADISON                 15
Name: paper_id, dtype: int64

In [5]:
#changing author of paper ID 58
fed['author'][fed['paper_id'] == 58] = "HAMILTON OR MADISON"
fed['author'][fed['paper_id'] == 58]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


57    HAMILTON OR MADISON
Name: author, dtype: object

cleaning, stemming, removing stopwords and creating TDM from the text 

In [6]:
#libraries required to clean text in the document and pre-process
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to C:\Users\NIRANJAN
[nltk_data]     SOMASANI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
#processing the text column 
processed_text = []
for i in range(0,85):
    text = re.sub('[^a-zA-Z]', ' ', fed['text'][i]) #removing special characters except for keeping alphabets
    text = text.lower() #making all the letters in text to lower case
    text = text.split() #converting into list to filter stopwords and stem words
    ps = PorterStemmer()
    #removing non-significant words like prepositions and stemming - keeping only root words
    text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
    text = ' '.join(text) #converting list of words to a sentence (as an original text column format)
    processed_text.append(text)

creating bag of words model - creating sparse matrix and tokenization

In [45]:
#creating TDM with processed text column
cv = CountVectorizer() # we can pass arguments that we used as part of cleaning texts here
X = cv.fit_transform(processed_text).toarray()
#creating dataframe out of TDM
xdf = pd.DataFrame(X, columns=cv.get_feature_names())
xdf.head()

Unnamed: 0,abandon,abat,abb,abet,abhorr,abil,abject,abl,ablest,abolish,...,yeomanri,yet,yield,yoke,york,young,zaleucu,zeal,zealand,zealou
0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,3,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,3,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,2,0,0,...,0,3,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,3,0,0,1,1,0,0,0,0


From now, all the below code is applying ML model to predict the ownership of dispute papers (not appropriate results need to dig into statistics to provide more evidence for the prediction). This is just a try.

In [52]:
#taking only author column
df1 = pd.DataFrame(fed['author'])
df1 = df1.rename(columns = {'author':'authorname'})
df1.head()

Unnamed: 0,authorname
0,HAMILTON
1,JAY
2,JAY
3,JAY
4,JAY


In [55]:
# place the DataFrames side by side
cleaned_df = pd.concat([xdf, df1], axis=1)
cleaned_df.head()

Unnamed: 0,abandon,abat,abb,abet,abhorr,abil,abject,abl,ablest,abolish,...,yet,yield,yoke,york,young,zaleucu,zeal,zealand,zealou,authorname
0,0,0,0,0,0,0,0,1,0,0,...,1,0,0,1,0,0,3,0,0,HAMILTON
1,0,0,0,0,0,1,0,0,0,0,...,3,0,0,1,0,0,0,0,0,JAY
2,0,0,0,0,0,0,0,2,0,0,...,3,0,0,1,0,0,0,0,0,JAY
3,0,0,0,0,0,0,0,1,1,0,...,0,0,0,1,0,0,0,0,0,JAY
4,0,0,0,0,0,0,0,0,0,0,...,3,0,0,1,1,0,0,0,0,JAY


In [62]:
#separating records with Hamilton or Madison as author as we need to predict this records we keep them as a part of test
data = cleaned_df
train = cleaned_df.loc[cleaned_df['authorname'] != 'HAMILTON OR MADISON']
test = data[data.authorname == 'HAMILTON OR MADISON']
train.head()
test.head()

Unnamed: 0,abandon,abat,abb,abet,abhorr,abil,abject,abl,ablest,abolish,...,yet,yield,yoke,york,young,zaleucu,zeal,zealand,zealou,authorname
48,0,0,0,0,0,0,0,2,0,0,...,0,0,0,1,0,0,0,0,0,HAMILTON OR MADISON
49,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,HAMILTON OR MADISON
50,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,HAMILTON OR MADISON
51,0,0,0,0,0,1,0,1,0,0,...,0,1,0,1,1,0,1,0,0,HAMILTON OR MADISON
52,0,1,0,0,0,0,0,0,0,0,...,3,0,0,1,0,0,1,0,0,HAMILTON OR MADISON


In [63]:
#creating training and testing tables for independent variables/TDM
x_train = train[train.columns.difference(['authorname'])]
x_test = test[test.columns.difference(['authorname'])]

In [64]:
#creating training and testing tables for dependent/target variable
y_train = train['authorname']
y_test = test['authorname']
y_train.head()
y_train.count()
y_test.count()

12

In [65]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

GaussianNB(priors=None)

In [66]:
# Predicting the Test set results
y_pred = classifier.predict(x_test)

In [71]:
y_test

48    HAMILTON OR MADISON
49    HAMILTON OR MADISON
50    HAMILTON OR MADISON
51    HAMILTON OR MADISON
52    HAMILTON OR MADISON
53    HAMILTON OR MADISON
54    HAMILTON OR MADISON
55    HAMILTON OR MADISON
56    HAMILTON OR MADISON
57    HAMILTON OR MADISON
61    HAMILTON OR MADISON
62    HAMILTON OR MADISON
Name: authorname, dtype: object

In [70]:
y_pred

array(['HAMILTON', 'HAMILTON', 'HAMILTON', 'HAMILTON', 'HAMILTON',
       'MADISON', 'HAMILTON', 'HAMILTON', 'HAMILTON', 'HAMILTON',
       'HAMILTON', 'HAMILTON'], dtype='<U20')

In [395]:
#just want to check the owner of paper_id 55 as there was controversy on this
fed[fed['paper_id'] == 55]

Unnamed: 0,author,date,paper_id,text,title,venue
54,HAMILTON OR MADISON,"Friday, February 15, 1788",55,To the People of the State of New York:\n\nTHE...,The Total Number of the House of Representatives,From the New York Packet


The above model isn't best, it can be made better and the predictions can be even more close by calculating statistics to provide more evidence for each prediction.