In [None]:
# Run in terminal or command prompt
# python3 -m spacy download en

import numpy as np
import pandas as pd
import re, nltk, spacy, gensim
import glob, os
from numpy import linalg as LA

# nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from sklearn.externals import joblib

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the custom stop-word list: one entry per line of stopwords.txt,
# with the trailing newline removed from each entry.
with open('stopwords.txt', 'r') as stop_file:
    en_stop = [line.replace('\n', '') for line in stop_file]

In [None]:
# Read every .txt file in the corpus directory.
#   data     -- full text of each file
#   doc_name -- file name without directory or extension, index-aligned with data
data = []
doc_name = []
# glob returns matches in arbitrary order; sorting keeps the document order
# (and therefore every downstream index) reproducible between runs.
for file in sorted(glob.glob("data/Hamlet-Othello/*.txt")):
    # The context manager closes each handle instead of leaking it.
    with open(file, "r") as f:
        data.append(f.read())
    # basename/splitext work with any path separator and directory depth,
    # unlike splitting on '/' and taking a fixed position.
    doc_name.append(os.path.splitext(os.path.basename(file))[0])

In [None]:
# Clean each document in a single pass:
#   1. replace new line characters with a space
#   2. strip the distracting single quotes
data = [re.sub("\'", "", re.sub('\n', ' ', doc)) for doc in data]

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Each item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One result is
    yielded per input item, in input order.
    """
    for raw_doc in sentences:
        as_text = str(raw_doc)
        yield gensim.utils.simple_preprocess(as_text, deacc=True)


# Materialise the generator into a list with one entry per document
data_words = [*sent_to_words(data)]

In [None]:
# Stemmer instance; it is not used by the stop-word filtering below.
p_stemmer = PorterStemmer()

# Testing membership against the en_stop list scans it for every token.
# A set gives the same answer with a constant-time lookup per token.
_en_stop_set = set(en_stop)

# One list per document holding the tokens that are not stop words,
# in their original order.
texts = [[tok for tok in tokens if tok not in _en_stop_set]
         for tokens in data_words]

In [None]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Tag reference: https://spacy.io/api/annotation

    Parameters
    ----------
    texts : iterable of token sequences; each one is joined with single
        spaces and passed to the module-level ``nlp`` callable.
    allowed_postags : container of ``pos_`` values to keep.

    Returns
    -------
    list of str, one per input document: the lemmas of the kept tokens
    joined by single spaces. A kept token whose lemma is ``'-PRON-'``
    contributes an empty string instead of its lemma.
    """
    lemmatized_docs = []
    for tokens in texts:
        parsed = nlp(" ".join(tokens))
        kept = []
        for tok in parsed:
            # Skip tokens whose part of speech was not requested
            if tok.pos_ not in allowed_postags:
                continue
            lemma = tok.lemma_
            kept.append('' if lemma == '-PRON-' else lemma)
        lemmatized_docs.append(" ".join(kept))
    return lemmatized_docs

# Initialize the spaCy English model with the parser and NER components
# disabled (for efficiency); lemmatization() only reads pos_ and lemma_.
# Run in terminal: python3 -m spacy download en_core_web_sm
# spaCy 3 removed shortcut links such as 'en', so load the model by its full
# package name first and fall back to the legacy shortcut for installations
# that only have the 'en' link.
try:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
except OSError:
    nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [None]:
# Bag-of-words counts over the lemmatized documents.
vectorizer = CountVectorizer(analyzer='word',
                             min_df=10,                        # drop terms found in fewer than 10 documents
                             stop_words='english',             # remove stop words
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # tokens of at least 3 alphanumeric chars
                             # max_features=50000,             # max number of uniq words
                            )

# Sparse document-term matrix: one row per document, one column per term
data_vectorized = vectorizer.fit_transform(data_lemmatized)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and keep the old call for older installations. The result is
# a plain list in both cases.
if hasattr(vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(vectorizer.get_feature_names_out())
else:
    tf_feature_names = vectorizer.get_feature_names()

In [None]:
# Convert the sparse document-term counts into a dense matrix
data_dense = data_vectorized.todense()

# Sparsicity = share of cells holding a non-zero count, as a percentage
nonzero_cells = (data_dense > 0).sum()
total_cells = data_dense.size
print("Sparsicity: ", (nonzero_cells / total_cells) * 100, "%")

In [None]:
# 2-D coordinates for the documents from the document-by-document covariance.
# np.cov treats each row (here: each document) as a variable, so the result is
# an n_docs x n_docs symmetric matrix.
doc_cov = np.cov(data_dense)

# eigh is the solver for symmetric matrices and is called once instead of
# decomposing the same matrix twice.
eig_vals, eig_vecs = LA.eigh(doc_cov)

# Eigenvectors are the COLUMNS of eig_vecs (column k pairs with eig_vals[k]);
# indexing rows, as eig(...)[1][0] did, mixes components of different
# eigenvectors. eigh returns eigenvalues in ascending order, so reorder to
# put the largest first.
order = np.argsort(eig_vals)[::-1]

# Entry i of each array is document i's loading on that component.
x_coords = eig_vecs[:, order[0]]
y_coords = eig_vecs[:, order[1]]

In [None]:
from adjustText import adjust_text

# Scatter the documents at (x_coords[i], y_coords[i]) and label every point.
plt.figure(figsize=(12, 12))

# NOTE: this rebinds `texts`, which earlier held the stop-word-filtered token
# lists passed to lemmatization(). Re-run that earlier cell before calling
# lemmatization() again after this one.
texts = []
for i, doc in enumerate(doc_name):
    x = x_coords[i]
    y = y_coords[i]
    plt.scatter(x, y, marker='o', color='black', alpha=0.3, s=200)
    # NOTE(review): the label is the list produced by split("-"), so it is
    # drawn in its list form; confirm whether a joined string was intended.
    texts.append(plt.text(x, y, doc.split("-"), fontsize=10, color="black"))

plt.show()