In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
vectorizer = TfidfVectorizer(stop_words=stopwords)

df = pd.read_csv('federalist.csv',dtype={'author':'category'},)
y = df.author
X = df.text


print(df[:15])



      author                                               text
0   HAMILTON  FEDERALIST. No. 1 General Introduction For the...
1        JAY  FEDERALIST No. 2 Concerning Dangers from Forei...
2        JAY  FEDERALIST No. 3 The Same Subject Continued (C...
3        JAY  FEDERALIST No. 4 The Same Subject Continued (C...
4        JAY  FEDERALIST No. 5 The Same Subject Continued (C...
5   HAMILTON  FEDERALIST No. 6 Concerning Dangers from Disse...
6   HAMILTON  FEDERALIST. No. 7 The Same Subject Continued (...
7   HAMILTON  FEDERALIST No. 8 The Consequences of Hostiliti...
8   HAMILTON  FEDERALIST No. 9 The Union as a Safeguard Agai...
9    MADISON  FEDERALIST No. 10 The Same Subject Continued (...
10  HAMILTON  FEDERALIST No. 11 The Utility of the Union in ...
11  HAMILTON  FEDERALIST No. 12 The Utility of the Union In ...
12  HAMILTON  FEDERALIST No. 13 Advantage of the Union in Re...
13   MADISON  FEDERALIST No. 14 Objections to the Proposed C...
14  HAMILTON  FEDERALIST No. 15 The Insu

In [2]:
counts = {}
for auth in df["author"].cat.categories:
    counts[auth] =0

for auth in y:
    if auth in counts:
        counts[auth] += 1

for auth in counts:
    print(auth,"OCCURENCE:",counts[auth])
        

HAMILTON OCCURENCE: 49
HAMILTON AND MADISON OCCURENCE: 3
HAMILTON OR MADISON OCCURENCE: 11
JAY OCCURENCE: 5
MADISON OCCURENCE: 15


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=1234)

print("Shape of Train:",end=" ")
print(X_train.shape)
print("Shape of Test:",end=" ")
print(X_test.shape)



Shape of Train: (66,)
Shape of Test: (17,)


In [4]:


X_train = vectorizer.fit_transform(X_train)  
X_test = vectorizer.transform(X_test)    

print("Shape of Train:",end=" ")
print(X_train.shape)
print("Shape of Test:",end=" ")
print(X_test.shape)

Shape of Train: (66, 7876)
Shape of Test: (17, 7876)


In [5]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(X_train, y_train)

pred = bnb.predict(X_test)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, pred)
from sklearn.metrics import *
print('Bernoulli Naive Bayes\naccuracy score: ', round(100*accuracy_score(y_test, pred),4),"%")


Bernoulli Naive Bayes
accuracy score:  58.8235 %


In [6]:
vectorizer = TfidfVectorizer(stop_words=stopwords,ngram_range=(1,2),max_features=1000)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=1234)
X_train = vectorizer.fit_transform(X_train)  
X_test = vectorizer.transform(X_test)    

In [7]:
print("Shape of Train:",end=" ")
print(X_train.shape)
print("Shape of Test:",end=" ")
print(X_test.shape)
bnb = BernoulliNB()
bnb.fit(X_train, y_train)

pred2 = bnb.predict(X_test)

print('Bernoulli Naive Bayes with MAX_FEATURES and NGRAM_RANGE\naccuracy score: ', round(100*accuracy_score(y_test, pred2),4),"%")



Shape of Train: (66, 1000)
Shape of Test: (17, 1000)
Bernoulli Naive Bayes with MAX_FEATURES and NGRAM_RANGE
accuracy score:  94.1176 %


In [8]:
from sklearn.linear_model import LogisticRegression

regDefault = LogisticRegression()
 
regDefault.fit(X_train, y_train)

pred3 = regDefault.predict(X_test)
confusion_matrix(y_test, pred)

print('Default logistic Regresssion\naccuracy score: ', round(100*accuracy_score(y_test, pred3),4),"%")

Default logistic Regresssion
accuracy score:  58.8235 %


In [9]:
regLoaded= LogisticRegression(multi_class='multinomial', solver='lbfgs',class_weight='balanced')
 
regLoaded.fit(X_train, y_train)
pred4 = regLoaded.predict(X_test)

print('Balanced, Multinomial, LBFGS Logistic Regression\naccuracy score: ', round(100*accuracy_score(y_test, pred4),4),"%")

Balanced, Multinomial, LBFGS Logistic Regression
accuracy score:  76.4706 %


In [16]:
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(max_iter=300)
nn.fit(X_train, y_train)

pred5 = nn.predict(X_test)
print('max 300 iterations Neural Network\naccuracy score: ', round(100*accuracy_score(y_test, pred5),4),"%")

max 300 iterations Neural Network
accuracy score:  76.4706 %


In [11]:
nn2 = MLPClassifier(max_iter=2000,activation='relu',solver="adam",learning_rate='adaptive')
nn2.fit(X_train, y_train)

pred6 = nn2.predict(X_test)
print('2000 max iterations, Adam, adaptive LR, Neural Network\naccuracy score: ', round(100*accuracy_score(y_test, pred6),4),"%")

2000 max iterations, Adam, adaptive LR, Neural Network
accuracy score:  82.3529 %


In [12]:
nn3 = MLPClassifier(solver='adam', alpha=1e-5,activation='relu',learning_rate='adaptive',
                   hidden_layer_sizes=(10,9,8,7,6,5,4,3,2,), random_state=1,max_iter=10000)
nn3.fit(X_train, y_train)

pred7 = nn3.predict(X_test)
print('varied topology Neural Network\naccuracy score: ', round(100*accuracy_score(y_test, pred7),4),"%")

varied topology Neural Network
accuracy score:  58.8235 %


In [13]:
nn4 = MLPClassifier(solver='adam', alpha=1e-5,activation='relu',learning_rate='adaptive',
                   hidden_layer_sizes=(66,132,264,132,66,), random_state=1,max_iter=10000)
nn4.fit(X_train, y_train)

pred8 = nn4.predict(X_test)
print('varied topology Neural Network\naccuracy score: ', round(100*accuracy_score(y_test, pred8),4),"%")

varied topology Neural Network
accuracy score:  76.4706 %


In [14]:
nn5 = MLPClassifier(solver='adam', alpha=1e-5,activation='relu',learning_rate='adaptive',
                   hidden_layer_sizes=(66,132,66,), random_state=1,max_iter=10000)
nn5.fit(X_train, y_train)

pred9 = nn5.predict(X_test)
print('varied topology Neural Network\naccuracy score: ', round(100*accuracy_score(y_test, pred9),4),"%")

varied topology Neural Network
accuracy score:  82.3529 %


In [15]:
nn6 = MLPClassifier(solver='adam',learning_rate='adaptive',
                   hidden_layer_sizes=(33,66,33,), random_state=1,max_iter=10000)
nn6.fit(X_train, y_train)

pred10 = nn6.predict(X_test)
print('varied topology Neural Network\naccuracy score: ', round(100*accuracy_score(y_test, pred10),4),"%")

varied topology Neural Network
accuracy score:  88.2353 %
