In [1]:
import nltk
import string
import pandas as pd 
import csv

from nltk.corpus import gutenberg
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

#For converting all the words to stem words in lowercase
snowball_stemmer = SnowballStemmer('english', ignore_stopwords=True)
punctRemover=str.maketrans('','',string.punctuation)

X = [] #To store actual text of the author
Y = [] #To store author name

innerStr = " "
strToAppend= " "

austen = gutenberg.sents('austen-emma.txt') # can choose another author name from the available choices
chester = gutenberg.sents('chesterton-thursday.txt')

#To remove the punctuation
punctRemover=str.maketrans('','',string.punctuation)

for row in austen:
    if row:
        innerStr = ' '.join([str(elem) for elem in row]) #convert the list to single string
        X.append(innerStr.translate(punctRemover))
        Y.append('austen')
        
for row in chester:
    if row:
        innerStr = ' '.join([str(elem) for elem in row]) #convert the list to single string
        X.append(innerStr.translate(punctRemover))
        Y.append('chester') 

X_train, X_test = train_test_split(X, test_size = 0.2)#split into training and test set
Y_train, Y_test = train_test_split(Y, test_size = 0.2)# to make sure the label size remains same

In [2]:
#To obtain binary document-term matrix
binary_vect = CountVectorizer(stop_words='english', binary=True)
X_train_binary = binary_vect.fit_transform(X_train)

X_train_array = X_train_binary.toarray()
df = pd.DataFrame(X_train_array[0:10])
df.columns = binary_vect.get_feature_names()

#To export binary document-term matrix as csv file
df.to_csv('binary_document_term_matrix.csv')

#Training logistic classifier SVM with the author as target stored in Y_Train.
binary_clf = Pipeline([('vect', binary_vect), 
                       ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=50, random_state=42))])
binary_clf = binary_clf.fit(X_train, Y_train)

In [3]:
#To obtain document-term matrix of counts
count_vect = CountVectorizer(stop_words='english')
X_train_count = count_vect.fit_transform(X_train)
X_train_count_array = X_train_count.toarray()

df = pd.DataFrame(X_train_count_array[0:10])
df.columns = count_vect.get_feature_names()

#To export document-term matrix of counts as csv file
df.to_csv('document_term_matrix_of_counts.csv')

#Training logistic classifier SVM with the author as target stored in Y_Train.
count_clf = Pipeline([('vect', count_vect), ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=50, random_state=42))])
count_clf = count_clf.fit(X_train, Y_train)

In [4]:
#To obtain tfidf matrix of counts
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_vect = CountVectorizer()
X_train_counts = tfidf_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

X_train_tfidf_array = X_train_tfidf.toarray()
df = pd.DataFrame(X_train_tfidf_array[0:10])
df.columns = tfidf_vect.get_feature_names()

#To export document-tfidf matrix as csv file
df.to_csv('document_tfidf_matrix.csv')

#Training logistic classifier SVM with the author as target stored in Y_Train.
tfidf_clf = Pipeline([('vect', tfidf_vect), ('tf-idf', tfidf_transformer), ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=50, random_state=42))])
tfidf_clf = tfidf_clf.fit(X_train, Y_train)

In [5]:
#To predict the classification with the three models on the test data
import numpy as np
#binary document term matrix predictor from question 5a
y_binaryPredictor = binary_clf.predict(X_test)
print('Accuracy from binary document-term matix: ', 100* np.mean(y_binaryPredictor == Y_test), "%")

#document term matrix predictor with count from question 5b
y_countPredictor = count_clf.predict(X_test)
print('Accuracy from document-term matrix of counts: ', 100* np.mean(y_countPredictor == Y_test), "%")

#document matrix predictor with tdidf from question 5c
y_tfidfPredictor = tfidf_clf.predict(X_test)
print('Accuracy from tf-idf scores document-term matrix: ', 100* np.mean(y_tfidfPredictor == Y_test), "%")

Accuracy from binary document-term matix:  66.68116572422792 %
Accuracy from document-term matrix of counts:  66.42018268812528 %
Accuracy from tf-idf scores document-term matrix:  67.11613745106568 %


In [6]:
#To obtaining the confusion matrices for the three different models.
from sklearn.metrics import confusion_matrix
print('Confusion Matrix for Document Term-Matrix Binary model:')
results = confusion_matrix(Y_test, y_binaryPredictor) 
print(results) 

print('\nConfusion Matrix for Document Term-Matrix Count model:')
results = confusion_matrix(Y_test, y_countPredictor) 
print(results) 

print('\nConfusion Matrix for Document_Term-Matrix Tfidf model:')
results = confusion_matrix(Y_test, y_tfidfPredictor) 
print(results) 

Confusion Matrix for Document Term-Matrix Binary model:
[[1523   20]
 [ 746   10]]

Confusion Matrix for Document Term-Matrix Count model:
[[1518   25]
 [ 747    9]]

Confusion Matrix for Document_Term-Matrix Tfidf model:
[[1543    0]
 [ 756    0]]


In [7]:
#To obtaining accuracy and f1-score for the three different models.
from sklearn.metrics import accuracy_score, f1_score

print('Binary document-term matix logistic classifier:')
print(' Testing accuracy %s' % accuracy_score(Y_test, y_binaryPredictor))
print(' Testing F1 score: {}'.format(f1_score(Y_test, y_binaryPredictor, average='weighted')))

print('\nDocument-term matrix of counts logistic classifier:')
print(' Testing accuracy %s' % accuracy_score(Y_test, y_countPredictor))
print(' Testing F1 score: {}'.format(f1_score(Y_test, y_countPredictor, average='weighted')))

print('\nTdidf score Document-term matrix logistic classifier:')
print(' Testing accuracy %s' % accuracy_score(Y_test, y_tfidfPredictor))
print(' Testing F1 score: {}'.format(f1_score(Y_test, y_tfidfPredictor, average='weighted')))

Binary document-term matix logistic classifier:
 Testing accuracy 0.6668116572422792
 Testing F1 score: 0.5446626591533111

Document-term matrix of counts logistic classifier:
 Testing accuracy 0.6642018268812527
 Testing F1 score: 0.5425886214986708

Tdidf score Document-term matrix logistic classifier:
 Testing accuracy 0.6711613745106568
 Testing F1 score: 0.5390952633367743
