In [2]:
# Import other packages for examples
import pandas as pd
import numpy as np
import scipy.sparse as ss

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets

from corextopic import corextopic as ct
from corextopic import vis_topic as vt

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [57]:
# Load the lemmatized text frame (judging by the displayed output: one row per
# subreddit, with all of its text joined into a single `text` string).
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
lemmatized_path = './lemmatized_bigstrings.pickle'
data_df = pd.read_pickle(lemmatized_path)
data_df

Unnamed: 0,text
wsb,future apple share gran kid ask not buy appl...
science,child motivate achieve maximum reward extent...
ama,tldr fuck want tell people internet lie funn...
askreddit,victorias secret originally suppose place man ...
worldnews,post leave purge ok context facebook estimat...
funny,want robot ferret haha suck fool get robotic f...
dankmemes,meme month seed nomination july nomination...
memes,redditmc open staff position builder helper mo...
nosleep,look like story click reminder check later g...
psychology,decision mental health care canada study ...


In [17]:
# Load the cleaned (not lemmatized) text for side-by-side inspection.
# It is bound to its own name so that it does not overwrite `data_df` (the
# lemmatized frame loaded earlier): the cells below slice `data_df`, and their
# recorded outputs show lemmatized text, so rebinding `data_df` here would
# change their results on a top-to-bottom run.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
data_clean_df = pd.read_pickle('./data_clean.pickle')
data_clean_df

Unnamed: 0,text,subreddit
wsb,in the future when apple is at a share your g...,wsb
science,the children were not motivated by achieving ...,science
ama,tldr do whatever the fuck you want so youre te...,ama
askreddit,victorias secret was originally supposed to be...,askreddit
worldnews,so was there any posts left after the purge ok...,worldnews
funny,but i wanted the robot ferret haha suck it you...,funny
dankmemes,meme of the month seeding and nominations for ...,dankmemes
memes,redditmc is opening staff positions builders h...,memes
nosleep,it looks like there may be more to this story ...,nosleep
psychology,decisions regarding mental health care canada...,psychology


In [58]:
# Pull out single-row frames for individual subreddits by row position.
# Slices (rather than scalar positions) are used so each result stays a
# DataFrame and `.text` is a Series that can be fed to a vectorizer.
wsb = data_df.iloc[:1]
science = data_df.iloc[1:2]

# NOTE(review): in the 10-row frame displayed above, position -3 is the
# `memes` row and there is no politics row -- confirm this slice selects
# the intended subreddit.
politics = data_df.iloc[-3:-2]

worldnews = data_df.iloc[4:5]

# Only the last expression of a cell is displayed.
worldnews

Unnamed: 0,text
worldnews,post leave purge ok context facebook estimat...


In [61]:
# Build a binary (presence/absence) document-term matrix over unigrams,
# bigrams and trigrams, with English stop words removed.
vectorizer = CountVectorizer(
    stop_words='english',
    binary=True,
    ngram_range=(1, 3),
)

# NOTE(review): `worldnews` is a one-row slice of data_df, so doc_word has a
# single row. With only one "document" every n-gram is simply present, which
# leaves a topic model nothing to contrast -- consider vectorizing one row per
# post/comment instead. TODO confirm the intended document granularity.
doc_word = vectorizer.fit_transform(worldnews.text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = list(vectorizer.get_feature_names())

In [62]:
# Fit an unanchored 6-topic CorEx model on the document-term matrix.
# The vocabulary is handed to fit(), as in the later cells of this notebook.
topic_model = ct.Corex(n_hidden=6, max_iter=200, verbose=False, seed=1)

# `docs` labels the rows of doc_word, so it must be the same text that was
# vectorized (worldnews.text), not a different subreddit's slice.
topic_model.fit(doc_word, words=words, docs=worldnews.text)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Take the first element (the word) of every entry instead of unpacking
    # zip(*topic): that unpacking raises on an empty topic and on releases
    # of corextopic whose entries carry more than two fields.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: aa missile,pretend virus mean,pretend virus real,pretense,pretty absolutely beat,pretty abuse short,pretty admire obvious,pretty bad asthma,pretty bad ptsd,pretty bad wave
1: aa,preliminary voting,prelude,premba mcol city,premise argument rely,premise chinese entity,premium luxury water,premium storage,premptively destroy,prep include
2: aaaaaaaaaaaa motherland,preobama administration,prep commonly,prep commonly include,prep think hurricane,prepackage,prepaid card wire,prepaid home,prepandemic,preparation different
3: aa weapon shoot,president jair bolsonaro,president literal pussy,president look straight,president looney,president make,president power thing,president reason different,president stop,president strong
4: aa missile ballistic,pretty fast citizen,pretty fucking brutal,pretty fully,pretty generally,pretty generally rich,pretty good hamtown,pretty great reference,pretty grow,pretty holy
5: aback understatement,pretty fully reopen,pretty generic term,pretty god,pretty good

In [43]:
# Fit a 6-topic CorEx model with two anchored topics: the first is anchored
# to both 'biden' and 'kamala', the second to 'trump'.
# The vocabulary is handed to fit(), as in the later cells of this notebook.
topic_model = ct.Corex(n_hidden=6, max_iter=200, verbose=False, seed=1)

# `docs` labels the rows of doc_word, so it must be the text that doc_word was
# built from. In this notebook's cell order that is worldnews.text.
# NOTE(review): the recorded output below looks like it came from an earlier
# run on a different corpus -- re-run to refresh it.
topic_model.fit(
    doc_word,
    words=words,
    docs=worldnews.text,
    anchors=[['biden', 'kamala'], 'trump'],
    anchor_strength=10,
)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Take the first element (the word) of every entry instead of unpacking
    # zip(*topic): that unpacking raises on an empty topic and on releases
    # of corextopic whose entries carry more than two fields.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: aaa eee,pretty high,pretty huge,pretty ideal,pretty idiocracy,pretty impossible,pretty ingrained,pretty ironic,pretty joe,pretty legit
1: aa gyyou,pretty spreading,pretty stances,pretty straightforward,pretty stupid,pretty telling,pretty think,pretty twofer,pretty verbatim,pretty virtually
2: aaaannnndddd went,pretext throw,pretransit,pretty absurd,pretty belt,pretty coherent,pretty common,pretty concerning,pretty convinced,pretty da
3: aa second,president worse,president wtf,president yup,presidentelect theres,presidentelect trump,presidenti,presidenti dont,presidential,presidential campaigns
4: aa,press saying,press sitting,press slip,press unfunny,pressed employees,pressed outside,presser just,presser listening,pressers added
5: aaa,pretty unsure,pretty wa,pretty wonder,pretty woods,prevail hate,prevailing,prevalent stupidity,prevent attending,prevent car


In [53]:
# Fit a 10-topic CorEx model in which the first three topics are anchored to
# 'election', 'biden' and 'trump' respectively (one anchor word per topic).
topic_model = ct.Corex(n_hidden=10, max_iter=200, verbose=False, seed=1)

topic_model.fit(
    doc_word,
    words=words,
    docs=worldnews.text,
    anchors=['election', 'biden', 'trump'],
    anchor_strength=10,
)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Take the first element (the word) of every entry instead of unpacking
    # zip(*topic): that unpacking raises on an empty topic and on releases
    # of corextopic whose entries carry more than two fields.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: aaaand,probably insiders,probably killed,probably killing,probably kind,probably known,probably knows,probably lean,probably main,probably meant
1: aa,pretty openly,pretty republican,pretty spectacularly,pretty stark,pretty straightforward,pretty sure,pretty typical,pretty underhanded,pretty unreliable
2: aaa,presidents held,presidents son,presidents think,presidents want,presidents win,press ask,press asks,press boom,press face
3: aaaandonto enemies,princeton university,principal,principals unique,principle republican,principle turn,principles thats,print prepared,primed ready,printed charts
4: aaand sue,presidency shown,presidency time,presidency trump,presidency want,president accountable,president actually,president admitted,president allows,president americans
5: aaa bonds,presumably taken,presumed cared,presumption seconds,pretea,pretend biden,pretend choice,pretend does,pretend ignorant,pretend mccain
6: aaaand going,pretty kamala,pretty laughable,pretty leftwing,pretty longr

In [63]:
# Fit a 10-topic CorEx model in which the first three topics are anchored to
# 'election', 'biden' and 'trump' respectively (one anchor word per topic).
# NOTE(review): this cell repeats the code of the preceding cell verbatim; the
# recorded outputs differ, presumably because doc_word/words were rebuilt
# between the two runs. Consider keeping only one of them.
topic_model = ct.Corex(n_hidden=10, max_iter=200, verbose=False, seed=1)

topic_model.fit(
    doc_word,
    words=words,
    docs=worldnews.text,
    anchors=['election', 'biden', 'trump'],
    anchor_strength=10,
)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Take the first element (the word) of every entry instead of unpacking
    # zip(*topic): that unpacking raises on an empty topic and on releases
    # of corextopic whose entries carry more than two fields.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: aaaaaaaaaaaa,preemptively quell,prefer fair matter,prefer hitler xi,prefer little,preferable,preferable actually short,preemptive nuclear,preferable china away,preindication act war
1: abandon deterrent reason,precedent feel dumb,precedent nation follow,precedent set folk,precedent thank,precidence surprised,precipice nuclear,precedent feel,precipitate,precise description situation
2: aa weapon,president literal pussy,president lose,president lose popular,president nuke,president plainly,president power thing,president httpswwwelectionscienceorgadvancingtheenvironmentalmovementthroughapprovalvote,president putin agent,president seehear
3: abandon deterrent,prevent interference,prevent kind,prevent nationality chime,prevent outside,prevent people flee,prevent plane fly,prevent entirely reduce,prevent spread sickness,prevent stock real
4: abandon spouse child,prison major,prison lot people,prison corner,prision island,priority water,priority solution run,priority feed rarely,priority 

In [64]:
# Fit a 10-topic CorEx model with two anchored topics: the first is anchored
# to both 'election' and 'biden', the second to 'trump'.
topic_model = ct.Corex(n_hidden=10, max_iter=200, verbose=False, seed=1)

topic_model.fit(
    doc_word,
    words=words,
    docs=worldnews.text,
    anchors=[['election', 'biden'], 'trump'],
    anchor_strength=10,
)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Take the first element (the word) of every entry instead of unpacking
    # zip(*topic): that unpacking raises on an empty topic and on releases
    # of corextopic whose entries carry more than two fields.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: aaaaaaaaaaaa,preemptively quell,prefer fair matter,prefer hitler xi,prefer little,preferable,preferable actually short,preemptive nuclear,preferable china away,preindication act war
1: abandon deterrent reason,precedent nation follow,precedent set folk,precedent thank,precidence surprised,precipice nuclear,precipitate,precedent feel dumb,precipitate genocide myanmar,precise sure possible
2: aa weapon,president httpswwwelectionscienceorgadvancingtheenvironmentalmovementthroughapprovalvote,president literal pussy,president lose,president lose popular,president nuke,president plainly,president george,president power thing,president sausage phone
3: abandon deterrent,prevent interference,prevent kind,prevent nationality chime,prevent outside,prevent people flee,prevent plane fly,prevent entirely reduce,prevent spread sickness,prevent stock real
4: abandon spouse child,prison major,prison lot people,prison corner,prision island,priority water,priority solution run,priority feed rarely,pr