In [1]:
import base64
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from textblob import TextBlob
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
import functools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Load the training data; the steps below expect it to have a 'text' column.
train = pd.read_csv("xid-34549157_1")
# Remove commas, full stops, exclamation marks and question marks, then
# lower-case the result.  The pattern is a raw string so that `\.` is not
# parsed as an (invalid) string escape, and the vectorised `.str` accessors
# replace the two per-row lambdas.  For a column of strings the result is the
# same; missing values now propagate as NaN in this step instead of raising.
train['text_processed'] = (
    train['text']
    .str.replace(r'[,\.!?]', '', regex=True)
    .str.lower()
)
def cleanText(input_string):
    """Collapse every run of non-alphanumeric characters into one space.

    Anything outside ``A-Z``, ``a-z`` and ``0-9`` counts as a separator, so
    punctuation, whitespace and non-ASCII letters are all replaced.  Leading
    and trailing runs become a single space as well; nothing is stripped.
    """
    separators = re.compile('[^A-Za-z0-9]+')
    return separators.sub(' ', input_string)
# Run the alphanumeric clean-up over every row of the processed text.
train['text_processed'] = train['text_processed'].map(cleanText)

In [3]:
# Make sure the NLTK stop-word corpus is present locally before reading it.
nltk.download('stopwords')
# English stop words from NLTK followed by additional hand-picked words.
stopWords = stopwords.words('english')
stopWords += [
    "make", "mr", "de", "without", "let", "rather", "upon", "within",
    "made", "must", "much", "yet", "thought", "see", "said", "us",
    "say", "whose", "though", "every", "know", "many", "will", "never",
    "even", "found", "might", "almost", "although", "indeed", "thus", "still",
    "this", "me", "of", "may", "would", "ever", "could", "shall",
    "come", "go", "soon", "however", "become", "give", "take", "well",
]
def removeStopWords(stopWords, rvw_txt):
    """Return ``rvw_txt`` with every token found in ``stopWords`` removed.

    The text is split on whitespace, tokens are compared exactly as written
    (case-sensitive), and the surviving tokens are re-joined with single
    spaces.
    """
    kept = []
    for token in rvw_txt.split():
        if token in stopWords:
            continue
        kept.append(token)
    return ' '.join(kept)
# Drop stop words from every processed document.  The word list is frozen into
# a set once, so each membership test is a hash lookup rather than a scan of
# the whole list for every token of every row; the output is unchanged.
train['text_processed'] = train['text_processed'].map(
    functools.partial(removeStopWords, frozenset(stopWords))
)

[nltk_data] Downloading package stopwords to /Users/Rup/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# One shared WordNet lemmatizer instance, used by the analyzer below.
lemm = WordNetLemmatizer()


class LemmaTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer lemmatizes each token it produces."""

    def build_analyzer(self):
        """Wrap the inherited analyzer so its tokens pass through ``lemm``.

        Returns a callable that maps a document to a lazy generator of
        lemmatized tokens.
        """
        base_analyzer = super().build_analyzer()

        def lemmatized_tokens(doc):
            return (lemm.lemmatize(token) for token in base_analyzer(doc))

        return lemmatized_tokens

In [5]:
# Keep the processed documents in a plain Python list as well.
text = train['text_processed'].tolist()
# Build the document-term matrix with the lemmatizing TF-IDF vectorizer
# (LemmaTfidfVectorizer subclasses TfidfVectorizer, not CountVectorizer).
tfidf = LemmaTfidfVectorizer(
    decode_error='ignore',
    stop_words='english',
    norm='l1',
    min_df=2,
    max_df=0.95,
)
dtm = tfidf.fit_transform(train['text_processed'])
dtm

<19579x13759 sparse matrix of type '<class 'numpy.float64'>'
	with 207043 stored elements in Compressed Sparse Row format>

In [32]:
# Token frequencies over the processed text of every row whose author is "HPL".
words_HPL = (
    train.loc[train.author == "HPL", "text_processed"]
    .str.split(expand=True)
    .unstack()
    .value_counts()
)
# Disabled plotly bar chart, kept for reference:
# data = [go.Bar(x = words_HPL.index.values[2:100],y = words_HPL.values[2:100],marker= dict(colorscale='earth',color = words_HPL.values[2:100]),text="Frequency")]
# layout = go.Layout(title='Top 50 words of HPL')
# fig = go.Figure(data=data, layout=layout)
# py.iplot(fig, filename='bargraph')
# value_counts sorts by descending count, so the tail holds the rarest tokens.
print(words_HPL.tail(60))

wreath            1
daytime           1
primeval          1
watchin           1
hindoo            1
implicitly        1
inundating        1
animalistic       1
sterner           1
battlefield       1
congealed         1
lifelessness      1
choosing          1
textbook          1
circulatory       1
aimless           1
rottin            1
dandy             1
ploughed          1
farms             1
sex               1
sellin            1
fred              1
zone              1
deathless         1
deceived          1
caving            1
identification    1
coral             1
kindness          1
loser             1
virtue            1
detestation       1
rejuvenated       1
bears             1
forefather        1
nuther            1
sided             1
mightily          1
delighted         1
lustrous          1
drawback          1
construe          1
harold            1
tardily           1
chasing           1
islander          1
despising         1
receipt           1
investigated      1


In [37]:
# Factorise the TF-IDF document-term matrix into three topics (fixed seed) and
# list the highest-weighted vocabulary terms of each topic.
n_top_words = 60
nmf_model = NMF(n_components=3,random_state=46)
nmf_model.fit(dtm)
# Look the vocabulary up once instead of once per printed word.
feature_names = tfidf.get_feature_names_out()
for index,topic in enumerate(nmf_model.components_):
    # The heading is built from n_top_words so it always matches the number of
    # words in the slice below (a hard-coded "50" disagreed with the 60 listed).
    print(f'THE TOP {n_top_words} WORDS FOR TOPIC #{index}')
    # argsort is ascending, so the strongest term of the topic is printed last.
    print([feature_names[i] for i in topic.argsort()[-n_top_words:]])
    print('\n')

THE TOP 50 WORDS FOR TOPIC #0
['resolved', 'living', 'chance', 'save', 'poetry', 'attempting', 'far', 'impossible', 'behold', 'speak', 'fell', 'intuition', 'world', 'endured', 'observe', 'old', 'like', 'peculiar', 'absolutely', 'sight', 'similar', 'change', 'somewhat', 'pretty', 'bad', 'suspected', 'hell', 'men', 'little', 'happy', 'annoyed', 'sensation', 'told', 'read', 'tell', 'sort', 'strange', 'state', 'heard', 'unknown', 'seen', 'going', 'speaks', 'great', 'funny', 'earth', 'style', 'stand', 'investigation', 'proved', 'method', 'profound', 'line', 'curious', 'particular', 'easily', 'simple', 'try', 'yes', 'thing']


THE TOP 50 WORDS FOR TOPIC #1
['believe', 'course', 'strange', 'door', 'great', 'sound', 'voice', 'men', 'object', 'length', 'looked', 'told', 'better', 'room', 'dream', 'hope', 'moment', 'went', 'far', 'father', 'light', 'return', 'perdita', 'mind', 'think', 'change', 'matter', 'heart', 'hand', 'raymond', 'tell', 'world', 'left', 'good', 'hour', 'old', 'mean', 'seen',

In [35]:
# Token frequencies over the processed text of every row whose author is "EAP".
words_EAP = (
    train.loc[train.author == "EAP", "text_processed"]
    .str.split(expand=True)
    .unstack()
    .value_counts()
)
# Disabled plotly bar chart, kept for reference (retargeted at the EAP counts;
# the pasted snippet still referred to HPL):
# data = [go.Bar(x = words_EAP.index.values[2:100],y = words_EAP.values[2:100],marker= dict(colorscale='earth',color = words_EAP.values[2:100]),text="Frequency")]
# layout = go.Layout(title='Top 50 words of EAP')
# fig = go.Figure(data=data, layout=layout)
# py.iplot(fig, filename='bargraph')
# value_counts sorts by descending count, so the head holds the commonest tokens.
print(words_EAP.head(60))

one          655
little       275
time         260
man          257
first        249
great        227
two          213
long         212
length       178
whole        176
day          176
like         173
eyes         168
head         164
far          162
nothing      153
seemed       150
three        150
way          149
night        146
good         141
left         140
old          139
matter       139
thing        135
point        134
hand         133
mind         131
came         128
body         126
course       126
back         120
seen         120
fact         119
feet         118
saw          117
room         116
words        115
idea         115
door         115
manner       114
death        113
general      113
full         113
means        111
less         108
light        108
took         107
water        106
doubt        106
person       106
life         105
moment       104
character    103
earth        103
air          103
nature       103
place        101
nearly       1