### Experimenting with various text analysis techniques for the "A Song of Ice and Fire" Reddit data ###

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pickle
import pandas as pd
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Load the pickled corpus. The context manager closes the file handle as soon
# as loading finishes (the bare open() call previously leaked it).
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open("./Data/Other/final_asoiaf_monday.pkl", "rb") as pkl_file:
    asoiaf = pickle.load(pkl_file)

In [10]:
# Collapse every entry of `asoiaf` into a single string (its pieces are
# concatenated with no separator) so the vectorizers receive one text per entry.
final_asoiaf = [''.join(pieces) for pieces in asoiaf]

### Count Vectorizer ###

In [12]:
# Learn the vocabulary first (English stop words removed), then encode the
# same documents as a sparse document-by-term count matrix.
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(final_asoiaf)
doc_word = vectorizer.transform(final_asoiaf)
doc_word.shape

(28280, 4511)

### LSA/PCA/SVD ###

In [19]:
# Acronyms: Latent Semantic Analysis (LSA) is just another name for
#  Singular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
# TruncatedSVD's default solver is randomized, so the seed is pinned to keep
# the components (and everything derived from them) reproducible across runs.
lsa = TruncatedSVD(n_components=5, random_state=42)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

array([0.17272546, 0.03895342, 0.03210509, 0.02706039, 0.02503342])

In [20]:
# Tabulate the LSA loadings: one row per component, one column per vocabulary term.
topic_word = pd.DataFrame(
    data=lsa.components_.round(5),
    index=["component_{}".format(n) for n in range(1, 6)],
    columns=vectorizer.get_feature_names(),
)
topic_word

Unnamed: 0,abandon,abandoned,ability,able,abomination,abrupt,absence,absent,absolute,absolutely,...,yo,yohn,young,younger,youth,youtube,yunkai,zero,zeus,zombie
component_1,0.00248,0.00194,0.00595,0.02465,0.00084,0.00072,0.0011,0.00127,0.00315,0.00924,...,0.00023,0.00187,0.01403,0.00437,0.00183,0.00035,0.0029,0.00277,0.0004,0.00316
component_2,0.00307,0.00115,-0.00631,-0.01062,-0.00093,0.00022,4e-05,0.00027,0.00242,0.00417,...,5e-05,0.00065,0.00675,0.00541,0.0017,0.00038,0.00675,-6e-05,-0.00044,-0.00018
component_3,-0.00156,-0.00052,0.0041,0.00053,-0.0,-0.00047,-0.00041,0.00137,-0.00059,0.00077,...,-0.0,-0.00062,0.01902,0.0161,0.00172,-0.00026,-0.00032,-0.00071,0.00028,-0.00068
component_4,0.00166,0.00135,0.01099,0.01807,6e-05,0.00095,-0.00116,-0.00207,0.00308,0.01097,...,0.00027,-0.00181,0.0133,0.00017,0.00225,0.00042,0.01165,0.00251,0.0001,0.00108
component_5,-0.00146,0.00152,0.00731,0.00575,0.0027,-0.00131,-0.00214,-0.00201,-0.00035,-0.00119,...,0.00012,-0.00025,0.01919,0.00372,0.00238,-0.00038,-0.00725,-0.00103,-0.00137,-0.00029


In [21]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row must support ``argsort()``.
    feature_names : sequence
        Maps a column position in ``components_`` to its printable term.
    no_top_words : int
        How many of the largest-weight terms to print per topic.
    topic_names : sequence, optional
        Labels looked up by topic position. A missing/empty sequence or a
        falsy entry falls back to printing the numeric topic index.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Only index into topic_names when it was actually supplied.
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)

        # argsort is ascending, so walk backwards from the end to take the
        # `no_top_words` largest weights in descending order.
        top_positions = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[pos] for pos in top_positions]
        print(", ".join(top_terms))

In [22]:
# Print the 10 highest-weighted terms of each LSA component.
display_topics(
    model=lsa,
    feature_names=vectorizer.get_feature_names(),
    no_top_words=10,
)


Topic  0
jon, king, night, dany, bran, time, arya, jaime, cersei, think

Topic  1
dany, jon, sansa, jaime, cersei, daenerys, tyrion, hand, people, queen

Topic  2
jaime, hand, cersei, tyrion, lannister, ser, tywin, brienne, man, aery

Topic  3
dany, bran, time, people, euron, crow, daenerys, think, story, eyed

Topic  4
jon, bran, sansa, crow, mud, time, eyed, euron, arya, ned


In [43]:
# Per-document component scores (`doc_topic`), indexed by the full document text.
# NOTE(review): the name `Vt` suggests right singular vectors, but this frame is
# built from the fit_transform output, not from `components_` -- confirm naming.
Vt = pd.DataFrame(
    data=doc_topic.round(5),
    index=final_asoiaf,
    columns=["component_{}".format(n) for n in range(1, 6)],
)


In [44]:
# Short, readable row labels: the first 30 characters of each document.
ex_label = [e[:30]+"..." for e in final_asoiaf]
# Same vectorizer configuration and corpus as in the Count Vectorizer section;
# re-fit here so this cell produces `doc_word` for the NMF section on its own.
vectorizer = CountVectorizer(stop_words = 'english')
doc_word = vectorizer.fit_transform(final_asoiaf)
# Preview only: slice the sparse matrix down to its first 10 rows *before*
# densifying, instead of materialising the whole matrix just to call .head(10).
pd.DataFrame(doc_word[:10].toarray(), index=ex_label[:10], columns=vectorizer.get_feature_names())

Unnamed: 0,abandon,abandoned,abandoning,ability,able,abomination,absence,absent,absolute,absolutely,...,young,younger,youngest,youth,youtu,youtube,yunkai,zero,zeus,zombie
main castle black importance h...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main craster king carry king b...,0,0,0,3,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
main living able defeat enemy ...,0,0,0,0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
main game thrones’ final pull ...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
inconsistency castle black sig...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
da bran e rhaegar e jon snow b...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
much time make prediction thin...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
upon adaptation main rewatchin...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
random post...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
grrm expressed desire make pro...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### NMF ###

In [13]:
# Factorise the count matrix into 5 non-negative topics. The seed is pinned so
# the initialisation/solver give the same factors on every run.
nmf_model = NMF(n_components=5, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)

In [14]:
# Tabulate the NMF loadings: one row per component, one column per vocabulary term.
topic_word = pd.DataFrame(
    data=nmf_model.components_.round(3),
    index=["component_{}".format(n) for n in range(1, 6)],
    columns=vectorizer.get_feature_names(),
)

In [17]:
# Print the 10 highest-weighted terms of each NMF component.
display_topics(
    model=nmf_model,
    feature_names=vectorizer.get_feature_names(),
    no_top_words=10,
)


Topic  0
king, night, white, battle, dead, army, walker, arya, winterfell, dragon

Topic  1
bran, time, eyed, crow, euron, know, think, raven, men, story

Topic  2
jaime, hand, cersei, tyrion, king, brienne, lannister, ser, tywin, man

Topic  3
dany, daenerys, people, think, way, cersei, time, dragon, throne, tyrion

Topic  4
jon, sansa, arya, winterfell, snow, sam, battle, north, lord, stark


In [48]:
# Per-document component scores (`doc_topic`), labelled with the truncated
# document previews from `ex_label`.
H = pd.DataFrame(
    data=doc_topic.round(5),
    index=ex_label,
    columns=["component_{}".format(n) for n in range(1, 6)],
)

### LDA ###

In [50]:
# Word counter for the LDA input: unigrams and bigrams, English stop words
# removed, and only tokens made of two or more lowercase a-z letters.
count_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    token_pattern="\\b[a-z][a-z]+\\b",
)

# Learn the n-gram vocabulary; fit() returns the estimator, which is displayed.
count_vectorizer.fit(final_asoiaf)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [51]:
# Count the n-grams per document, then flip to term-by-document orientation
# (rows = terms, columns = documents) for the gensim conversion further down.
doc_word = count_vectorizer.transform(final_asoiaf).T

In [52]:
# Peek at 10 random terms. Pick the row positions first and densify only those
# rows: calling toarray() on the full term-by-document matrix would allocate a
# dense array of every term x every document just to display ten of them.
feature_names = count_vectorizer.get_feature_names()
sample_rows = np.random.choice(doc_word.shape[0], size=10, replace=False)
pd.DataFrame(
    doc_word.tocsr()[sample_rows].toarray(),  # CSR supports efficient row selection
    index=[feature_names[row] for row in sample_rows],
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24733,24734,24735,24736,24737,24738,24739,24740,24741,24742
choice little,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
learn guy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
prophecy bastard,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
power skill,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
believable fallout,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dragonstone nuclear,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
complex fantasy,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
final sit,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
isn joke,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
great assuming,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# After the transpose above this is (number of n-gram terms, number of documents).
doc_word.shape

(503565, 24743)

In [54]:
# gensim provides the LDA model and the sparse-matrix-to-corpus helper used below
from gensim import corpora, models, similarities, matutils

# logging for gensim (set to INFO) so training progress is emitted while fitting
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [55]:
# Wrap the sparse term-by-document counts as a gensim corpus; each column of
# `doc_word` is read as one document (spelled out here, it is also the default).
corpus = matutils.Sparse2Corpus(doc_word, documents_columns=True)

In [56]:
# Invert the vectorizer's term -> column-index mapping so gensim can translate
# feature ids back into readable terms.
id2word = {idx: term for term, idx in count_vectorizer.vocabulary_.items()}

In [59]:
# Create lda model (equivalent to "fit" in sklearn)
# LDA training is stochastic; pin the seed so the discovered topics are the
# same each time the notebook is re-run.
lda = models.LdaModel(
    corpus=corpus,
    num_topics=5,
    id2word=id2word,
    passes=1,
    random_state=42,
)  # train model

2019-05-19 11:55:48,097 : INFO : using symmetric alpha at 0.2
2019-05-19 11:55:48,098 : INFO : using symmetric eta at 0.2
2019-05-19 11:55:48,196 : INFO : using serial LDA version on this node
2019-05-19 11:55:48,427 : INFO : running online (single-pass) LDA training, 5 topics, 1 passes over the supplied corpus of 24743 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2019-05-19 11:55:48,478 : INFO : PROGRESS: pass 0, at document #2000/24743
2019-05-19 11:55:50,008 : INFO : merging changes from 2000 documents into a model of 24743 documents
2019-05-19 11:55:50,232 : INFO : topic #0 (0.200): 0.007*"king" + 0.004*"jon" + 0.003*"think" + 0.003*"time" + 0.003*"throne" + 0.003*"dany" + 0.002*"game" + 0.002*"night" + 0.002*"jaime" + 0.002*"main"
2019-05-19 11:55:50,235 : INFO : topic #1 (0.200): 0.004*"jon" + 0.004*"king" + 0.004*"night" + 0.003*"dragon" + 0.003*"think" + 0.003*"dany" + 0.

In [60]:
# Show the highest-weighted terms of each fitted topic (the topics are also
# echoed through the INFO logger configured earlier).
lda.print_topics()

2019-05-19 11:56:49,657 : INFO : topic #0 (0.200): 0.005*"really" + 0.004*"people" + 0.004*"think" + 0.004*"good" + 0.004*"throne" + 0.004*"thing" + 0.003*"make" + 0.003*"got" + 0.003*"way" + 0.003*"king"
2019-05-19 11:56:49,661 : INFO : topic #1 (0.200): 0.009*"night" + 0.007*"king" + 0.006*"night king" + 0.005*"jon" + 0.004*"walker" + 0.004*"white" + 0.004*"white walker" + 0.003*"kill" + 0.003*"dragon" + 0.003*"arya"
2019-05-19 11:56:49,668 : INFO : topic #2 (0.200): 0.007*"cersei" + 0.005*"tyrion" + 0.004*"dany" + 0.004*"arya" + 0.004*"jaime" + 0.003*"jon" + 0.003*"kill" + 0.002*"people" + 0.002*"sansa" + 0.002*"make"
2019-05-19 11:56:49,671 : INFO : topic #3 (0.200): 0.004*"think" + 0.003*"time" + 0.003*"grrm" + 0.003*"story" + 0.003*"dany" + 0.002*"know" + 0.002*"dragon" + 0.002*"euron" + 0.002*"really" + 0.002*"people"
2019-05-19 11:56:49,675 : INFO : topic #4 (0.200): 0.007*"jon" + 0.006*"bran" + 0.004*"dany" + 0.003*"dragon" + 0.003*"winterfell" + 0.003*"king" + 0.003*"battle" 

[(0,
  '0.005*"really" + 0.004*"people" + 0.004*"think" + 0.004*"good" + 0.004*"throne" + 0.004*"thing" + 0.003*"make" + 0.003*"got" + 0.003*"way" + 0.003*"king"'),
 (1,
  '0.009*"night" + 0.007*"king" + 0.006*"night king" + 0.005*"jon" + 0.004*"walker" + 0.004*"white" + 0.004*"white walker" + 0.003*"kill" + 0.003*"dragon" + 0.003*"arya"'),
 (2,
  '0.007*"cersei" + 0.005*"tyrion" + 0.004*"dany" + 0.004*"arya" + 0.004*"jaime" + 0.003*"jon" + 0.003*"kill" + 0.002*"people" + 0.002*"sansa" + 0.002*"make"'),
 (3,
  '0.004*"think" + 0.003*"time" + 0.003*"grrm" + 0.003*"story" + 0.003*"dany" + 0.002*"know" + 0.002*"dragon" + 0.002*"euron" + 0.002*"really" + 0.002*"people"'),
 (4,
  '0.007*"jon" + 0.006*"bran" + 0.004*"dany" + 0.003*"dragon" + 0.003*"winterfell" + 0.003*"king" + 0.003*"battle" + 0.003*"army" + 0.003*"dead" + 0.002*"wight"')]