### Experimenting with various text analysis techniques for the Game of Thrones reddit data ###

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pickle
import pandas as pd
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
got = pickle.load( open( "./Data/final_got_monday.pkl", "rb" ) )

### Count Vectorizer ###

In [74]:
count_vectorizer = CountVectorizer(analyzer = 'word', encoding = 'string',  strip_accents = 'unicode',
                                   stop_words='english', token_pattern="\\b[a-z][a-z]+\\b")
count_got = count_vectorizer.fit_transform(got)
count_got.shape

(153179, 7231)

In [127]:
count_got_pd = pd.DataFrame(count_got)

### LSA/PCA/SVD ###

In [177]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
lsa_count = TruncatedSVD(3)
count_doc_topic = lsa_count.fit_transform(count_got)
lsa_count.explained_variance_ratio_

array([0.17006132, 0.10962433, 0.10199264])

In [178]:
got_count_topic_word = pd.DataFrame(lsa_count.components_.round(3),
             index = ["component_1","component_2", 'component_3'],
             columns = count_vectorizer.get_feature_names())
got_count_topic_word

Unnamed: 0,abandon,abandoned,abandonment,abdicate,abdomen,ability,ablaze,able,aboard,abolish,...,youtube,yunkai,yup,zero,zig,zimmer,zombie,zone,zoned,zoom
component_1,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.002,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.002,0.002,0.0,0.0,0.0,0.007,0.001,0.025,0.0,0.0,...,0.0,0.002,0.0,0.003,0.0,0.0,0.004,0.0,0.0,0.0
component_3,-0.0,-0.0,-0.0,-0.0,-0.0,-0.001,-0.0,-0.002,-0.0,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0


In [179]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [180]:
display_topics(lsa_count, count_vectorizer.get_feature_names(), 15)


Topic  0
plot, armor, jon, night, dany, bran, like, arya, cersei, people, think, battle, time, way, throne

Topic  1
jon, night, dany, bran, arya, like, cersei, think, people, battle, time, throne, dead, dragon, way

Topic  2
hold, door, armor, plot, hodor, tumblin, wylis, beer, vi, bus, stargaryen, rope, oak, furdik, reconnaissance


In [85]:
got_label = [e[:30]+"..." for e in got]
pd.DataFrame(count_got.toarray(), index=got_label, columns=count_vectorizer.get_feature_names()).head(10)

Unnamed: 0,abandon,abandoned,abandonment,abdicate,abdomen,ability,ablaze,able,aboard,abolish,...,youtube,yunkai,yup,zero,zig,zimmer,zombie,zone,zoned,zoom
watch wait curious see watch c...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
picked dragon glass dagger fou...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
book question man without face...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
got...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
enough time binge list must wa...,0,0,0,0,0,0,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0
book question faceless man old...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
need subscription watch new go...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
watching six wonder time robin...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
samwell tarly daenerys sure so...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
got death bingo made death bin...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### NMF ###

In [181]:
count_nmf_model = NMF(3)
count_nmf_topic = count_nmf_model.fit_transform(count_got)

In [182]:
count_topic_word_got = pd.DataFrame(count_nmf_model.components_.round(3),
             index = ["component_1","component_2", 'component_3'],
             columns = count_vectorizer.get_feature_names())

In [183]:
display_topics(count_nmf_model, count_vectorizer.get_feature_names(), 15)


Topic  0
plot, armor, like, story, got, battle, writing, series, people, main, way, really, game, time, point

Topic  1
jon, night, dany, bran, arya, like, cersei, think, people, battle, time, throne, dead, dragon, way

Topic  2
hold, door, hodor, bran, dead, time, past, jaime, dothraki, undead, castle, wildfire, meera, unsullied, winterfell


In [90]:
H = pd.DataFrame(count_doc_topic.round(10),
             index = got_label,
             columns = ["component_1","component_2", 'component_3', 'component_4', 'component_5'])

### LDA ###

In [150]:
from gensim import corpora, models, similarities, matutils

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [151]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(count_got)

In [152]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [27]:
lda_count.print_topics()

2019-05-20 12:31:30,953 : INFO : topic #0 (0.200): 0.003*"assume sex" + 0.002*"cersei shelter" + 0.002*"arya chase" + 0.002*"certainly justify" + 0.002*"bell audience" + 0.002*"bell favorite" + 0.002*"bell concentrated" + 0.002*"bell braid" + 0.002*"briene south" + 0.002*"car dothraki"
2019-05-20 12:31:30,966 : INFO : topic #1 (0.200): 0.003*"assassin darth" + 0.003*"better choice" + 0.002*"bring martha" + 0.002*"added related" + 0.002*"body burnt" + 0.002*"burning conquering" + 0.002*"cersei board" + 0.001*"believe happening" + 0.001*"bulk wight" + 0.001*"based individual"
2019-05-20 12:31:30,978 : INFO : topic #2 (0.200): 0.003*"cersei tragically" + 0.003*"cersei layer" + 0.003*"big fleet" + 0.003*"big filler" + 0.002*"bran pussy" + 0.002*"cersei hollywood" + 0.002*"bigger course" + 0.002*"addition happy" + 0.002*"alternative like" + 0.002*"believe ala"
2019-05-20 12:31:30,989 : INFO : topic #3 (0.200): 0.003*"better analysis" + 0.002*"actor cool" + 0.002*"broad secret" + 0.002*"chan

[(0,
  '0.003*"assume sex" + 0.002*"cersei shelter" + 0.002*"arya chase" + 0.002*"certainly justify" + 0.002*"bell audience" + 0.002*"bell favorite" + 0.002*"bell concentrated" + 0.002*"bell braid" + 0.002*"briene south" + 0.002*"car dothraki"'),
 (1,
  '0.003*"assassin darth" + 0.003*"better choice" + 0.002*"bring martha" + 0.002*"added related" + 0.002*"body burnt" + 0.002*"burning conquering" + 0.002*"cersei board" + 0.001*"believe happening" + 0.001*"bulk wight" + 0.001*"based individual"'),
 (2,
  '0.003*"cersei tragically" + 0.003*"cersei layer" + 0.003*"big fleet" + 0.003*"big filler" + 0.002*"bran pussy" + 0.002*"cersei hollywood" + 0.002*"bigger course" + 0.002*"addition happy" + 0.002*"alternative like" + 0.002*"believe ala"'),
 (3,
  '0.003*"better analysis" + 0.002*"actor cool" + 0.002*"broad secret" + 0.002*"change completely" + 0.002*"bean watch" + 0.002*"charisma make" + 0.002*"care forgetting" + 0.001*"absolute nose" + 0.001*"absolute snack" + 0.001*"absence"'),
 (4,
  

### TF IDF Vectorizer ###

In [155]:
# For sklearn, it's VERY similar to how we did CountVectorizer
tf_idf_vectorizer = TfidfVectorizer(  
                                   stop_words='english', token_pattern="\\b[a-z][a-z]+\\b")
got_tfidf = tf_idf_vectorizer.fit_transform(got)

### LSA/PCA/SVD ###

In [156]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
tf_idf_lsa = TruncatedSVD(2)
tf_idf_doc_topic = tf_idf_lsa.fit_transform(got_tfidf)
tf_idf_lsa.explained_variance_ratio_

array([0.00799442, 0.01226075])

In [157]:
tf_idf_topic_word = pd.DataFrame(tf_idf_lsa.components_.round(2),
             index = ["component_1","component_2"],
             columns = tf_idf_vectorizer.get_feature_names())
tf_idf_topic_word

Unnamed: 0,abandon,abandoned,abandonment,abdicate,abdomen,ability,ablaze,able,aboard,abolish,...,youtube,yunkai,yup,zero,zig,zimmer,zombie,zone,zoned,zoom
component_1,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
component_2,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.01,-0.0,0.0,...,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0


In [158]:
tran = tf_idf_topic_word.T

In [161]:
tran['component_2'].sort_values(ascending = False)

game          0.68
throne        0.59
iron          0.05
watch         0.02
theme         0.02
petition      0.01
premiere      0.01
cover         0.01
style         0.01
sit           0.01
song          0.01
edition       0.01
fan           0.01
official      0.01
pool          0.01
inspired      0.01
remake        0.01
trailer       0.01
video         0.01
review        0.01
promo         0.01
win           0.01
finale        0.01
recap         0.01
fireproof    -0.00
fleshed      -0.00
flawed       -0.00
flawless     -0.00
flea         -0.00
fleabottom    0.00
              ... 
sansa        -0.03
landing      -0.03
got          -0.03
jaime        -0.03
maybe        -0.03
thought      -0.03
snow         -0.03
way          -0.03
stark        -0.03
theory       -0.03
time         -0.04
dragon       -0.04
white        -0.04
tyrion       -0.04
people       -0.04
really       -0.04
going        -0.04
dead         -0.04
winterfell   -0.04
know         -0.04
battle       -0.05
kill        

In [162]:
display_topics(tf_idf_lsa, tf_idf_vectorizer.get_feature_names(), 15)


Topic  0
throne, night, game, jon, bran, like, arya, dany, think, got, cersei, people, know, end, battle

Topic  1
game, throne, iron, watch, theme, review, recap, sit, video, premiere, cover, remake, finale, petition, win


 ### NMF ###

In [163]:
tf_idf_nmf_model = NMF(2)
nmf_doc_topic = tf_idf_nmf_model.fit_transform(got_tfidf)

In [164]:
tf_idf_topic_word_got = pd.DataFrame(tf_idf_nmf_model.components_.round(2),
             index = ["component_1","component_2"],
             columns = tf_idf_vectorizer.get_feature_names())

In [165]:
display_topics(tf_idf_nmf_model, tf_idf_vectorizer.get_feature_names(), 40)


Topic  0
night, jon, bran, arya, like, dany, think, got, cersei, people, know, battle, time, kill, theory, end, dragon, really, going, tyrion, dead, daenerys, way, winterfell, scene, snow, sansa, white, die, thought, good, jaime, army, death, landing, make, stark, long, ending, queen

Topic  1
game, throne, iron, watch, end, ending, theme, final, series, finale, new, watching, win, sit, video, fan, recap, review, best, got, theory, premiere, song, prediction, cover, remake, petition, trailer, tonight, day, pool, want, daenerys, music, play, live, claim, real, week, favorite


### LDA ###

In [167]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(got_tfidf)

In [168]:
id2word = dict((v, k) for k, v in tf_idf_vectorizer.vocabulary_.items())

In [105]:
lda_count.print_topics()

2019-05-19 16:38:25,914 : INFO : topic #0 (0.143): 0.005*"blah foreshadowing" + 0.004*"burnt brandon" + 0.004*"belong saying" + 0.004*"basically attempted" + 0.004*"beat make" + 0.003*"army sub" + 0.003*"additional safety" + 0.003*"alot shitting" + 0.002*"broken seen" + 0.002*"away writing"
2019-05-19 16:38:25,939 : INFO : topic #1 (0.143): 0.004*"bran tasked" + 0.003*"beric actual" + 0.003*"adding lord" + 0.002*"blessing kind" + 0.002*"burning chanting" + 0.002*"baratheon chaos" + 0.002*"accepted watch" + 0.002*"brother exactly" + 0.002*"brother come" + 0.002*"bed opposite"
2019-05-19 16:38:25,965 : INFO : topic #2 (0.143): 0.005*"arya wandering" + 0.005*"believe greenseer" + 0.004*"believe leader" + 0.003*"bran behaving" + 0.003*"blame burning" + 0.002*"bran game" + 0.002*"army strongest" + 0.002*"army stay" + 0.002*"abandon belief" + 0.002*"birthday huge"
2019-05-19 16:38:25,994 : INFO : topic #3 (0.143): 0.004*"buried forever" + 0.004*"burn building" + 0.004*"bran quiet" + 0.003*"b

[(0,
  '0.005*"blah foreshadowing" + 0.004*"burnt brandon" + 0.004*"belong saying" + 0.004*"basically attempted" + 0.004*"beat make" + 0.003*"army sub" + 0.003*"additional safety" + 0.003*"alot shitting" + 0.002*"broken seen" + 0.002*"away writing"'),
 (1,
  '0.004*"bran tasked" + 0.003*"beric actual" + 0.003*"adding lord" + 0.002*"blessing kind" + 0.002*"burning chanting" + 0.002*"baratheon chaos" + 0.002*"accepted watch" + 0.002*"brother exactly" + 0.002*"brother come" + 0.002*"bed opposite"'),
 (2,
  '0.005*"arya wandering" + 0.005*"believe greenseer" + 0.004*"believe leader" + 0.003*"bran behaving" + 0.003*"blame burning" + 0.002*"bran game" + 0.002*"army strongest" + 0.002*"army stay" + 0.002*"abandon belief" + 0.002*"birthday huge"'),
 (3,
  '0.004*"buried forever" + 0.004*"burn building" + 0.004*"bran quiet" + 0.003*"bad stark" + 0.003*"bad pretty" + 0.003*"attracted watching" + 0.003*"attention jamie" + 0.003*"book general" + 0.003*"book explained" + 0.003*"believe sex"'),
 (4,