In [1]:
import nltk
import string

from nltk.corpus import gutenberg
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split

#For converting all the words to stem words in lowercase
snowball_stemmer = SnowballStemmer('english', ignore_stopwords=True)
punctRemover=str.maketrans('','',string.punctuation)

X = [] #To store actual text of the author
Y = [] #To store author name

innerStr = " "
strToAppend= " "

austen = gutenberg.sents('austen-emma.txt') # can choose another author name from the available choices
chester = gutenberg.sents('chesterton-thursday.txt')

#To remove the punctuation
punctRemover=str.maketrans('','',string.punctuation)

for row in austen:
    if row:
        innerStr = ' '.join([str(elem) for elem in row]) #convert the list to single string
        X.append(innerStr.translate(punctRemover))
        Y.append('austen')
        
for row in chester:
    if row:
        innerStr = ' '.join([str(elem) for elem in row]) #convert the list to single string
        X.append(innerStr.translate(punctRemover))
        Y.append('chester') 

X_train, X_test = train_test_split(X, test_size = 0.2)#split into training and test set
Y_train, Y_test = train_test_split(Y, test_size = 0.2)# to make sure the label size remains same

In [2]:
import gensim
import multiprocessing
from gensim.models import Doc2Vec
from tqdm import tqdm
from sklearn import utils
from sklearn import svm

train_tagged = []
test_tagged = []

for i in range (0, len(X_train)):
    train_tagged.append(gensim.models.doc2vec.TaggedDocument(words=gensim.utils.simple_preprocess(X_train[i]), tags=[Y_train[i]]))

for i in range (0, len(X_test)):
    test_tagged.append(gensim.models.doc2vec.TaggedDocument(words=gensim.utils.simple_preprocess(X_test[i]), tags=[Y_test[i]]))

cores = multiprocessing.cpu_count()

# dm = 1 will use distributed memory model and it will preserves the word order in a document
doc2vec_model = Doc2Vec(dm=1, vector_size=300, min_count=2, workers=cores)
doc2vec_model.build_vocab([x for x in tqdm(train_tagged)])

for epoch in range(30):
    doc2vec_model.train(utils.shuffle([x for x in (train_tagged)]), total_examples=len(train_tagged), epochs=1)
    doc2vec_model.alpha -= 0.002
    doc2vec_model.min_alpha = doc2vec_model.alpha
    
# Building the feature vector for the classifier
def feature_vector(model, docs):
    doc2vec_vectors = [model.infer_vector(doc.words) for doc in docs]
    targets = [doc.tags[0] for doc in docs]
    return doc2vec_vectors, targets

# Translating docs into vectors for training and test set
X_doc2vec_train, y_doc2vec_train = feature_vector(doc2vec_model, train_tagged)
X_doc2vec_test, y_doc2vec_test = feature_vector(doc2vec_model, test_tagged)



100%|██████████| 9195/9195 [00:00<?, ?it/s]


In [3]:
# Training a classification model

#C: 1e5, Since we can not visualize a linear relationalship in 300 dimensional vector to the target, We take small value of C
#gamma:'1'. We have trained the model on 300-dimension of the document vector 
#           If it is chosen too small, the area of influence of any point will increase.
log_doc2vec = svm.SVC(C=0.1, gamma = 1)
log_doc2vec.fit(X_doc2vec_train, y_doc2vec_train)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [7]:
#doc2vec predictor Accuracy and confusion matrix
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

y_doc2vecPred = log_doc2vec.predict(X_doc2vec_test)
print('Accuracy from doc2vec: ', 100* np.mean(y_doc2vecPred == y_doc2vec_test), "%")

print('\nConfusion Matrix for doc2vec model:')
results = confusion_matrix(y_doc2vec_test, y_doc2vecPred) 
print(results) 

print('\nDoc2vec logistic classifier:')
print(' Testing accuracy %s' % accuracy_score(Y_test, y_doc2vecPred))
print(' Testing F1 score: {}'.format(f1_score(Y_test, y_doc2vecPred, average='weighted')))

Accuracy from doc2vec:  68.37755545889517 %

Confusion Matrix for doc2vec model:
[[1572    0]
 [ 727    0]]

Doc2vec logistic classifier:
 Testing accuracy 0.6837755545889517
 Testing F1 score: 0.5553578774548343
