In [2]:
# Import other packages for examples
import pandas as pd
import numpy as np
import scipy.sparse as ss
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import datasets

from corextopic import corextopic as ct
from corextopic import vis_topic as vt

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [57]:
# Load the pickled frame of lemmatized text and show it as the cell result.
# The path is relative, so the notebook must be started from the directory
# that contains the pickle.
# NOTE(security): read_pickle unpickles arbitrary Python objects; only load
# files from a trusted source.
LEMMATIZED_PICKLE_PATH = './lemmatized_bigstrings.pickle'
data_df = pd.read_pickle(LEMMATIZED_PICKLE_PATH)
data_df

Unnamed: 0,text
wsb,future apple share gran kid ask not buy appl...
science,child motivate achieve maximum reward extent...
ama,tldr fuck want tell people internet lie funn...
askreddit,victorias secret originally suppose place man ...
worldnews,post leave purge ok context facebook estimat...
funny,want robot ferret haha suck fool get robotic f...
dankmemes,meme month seed nomination july nomination...
memes,redditmc open staff position builder helper mo...
nosleep,look like story click reminder check later g...
psychology,decision mental health care canada study ...


In [3]:
# Load the pre-lemmatized AskReddit corpus from disk.
# NOTE(review): this is an absolute, machine-specific path; the cell will fail
# on any other machine. Consider moving the file next to the notebook and
# using a relative path.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only open pickles you created or otherwise trust.
ASKREDDIT_PICKLE_PATH = '/home/nick/Documents/askreddit_lemmatized_dict.pickle'
with open(ASKREDDIT_PICKLE_PATH, mode='rb') as pickle_file:
    askreddit = pickle.load(pickle_file)

In [58]:
# Carve single-row frames out of data_df by *position* (each slice keeps the
# DataFrame shape rather than collapsing to a Series).
wsb = data_df.iloc[0:1]
science = data_df.iloc[1:2]

# NOTE(review): position -3 is the third row from the end. In the 10-row frame
# displayed by the loading cell that row is 'memes', and no 'politics' row is
# visible there -- confirm this slice really selects the intended subreddit.
politics = data_df.iloc[-3:-2]

worldnews = data_df.iloc[4:5]

# Only the last expression of a cell is rendered, so display just this one.
worldnews

Unnamed: 0,text
worldnews,post leave purge ok context facebook estimat...


In [5]:
# Build the document-term count matrix (doc_word) and the matching vocabulary
# list (words) that the CorEx cells consume.
vectorizer = CountVectorizer()

# NOTE(review): the pickle this corpus was loaded from is named "..._dict";
# if `askreddit` is a dict, iterating it yields its keys -- confirm that the
# documents themselves (not dict keys) are what gets vectorized here.
doc_word = vectorizer.fit_transform(askreddit)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on old installations so
# the cell runs on both.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Column i of doc_word corresponds to words[i].
words = list(np.asarray(feature_names))

In [8]:
# Unanchored CorEx topic model with 20 latent topics, fit on doc_word.
# seed=1 fixes the random initialisation so reruns give the same topics.
topic_model = ct.Corex(n_hidden=20, words=words,
                       max_iter=200, verbose=False, seed=1)

topic_model.fit(doc_word, words=words, docs=askreddit)

# Print all topics from the CorEx topic model
# Each topic is a list of tuples whose first element is the word. Index the
# word instead of unpacking with zip(*topic): newer corextopic releases return
# 3-tuples (word, mutual information, sign), and a topic with no words makes
# the zip-unpacking raise ValueError.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: {}'.format(n, ','.join(topic_words)))

0: not,year,like,know,thing,start,want,work,life,way
1: people,need,person,bad,care,problem,actually,job,health,issue
2: time,tell,friend,girl,day,sex,say,night,later,ask
3: pay,money,buy,credit,bank,save,card,company,account,loan
4: president,world,leader,country,war,george,roosevelt,washington,power,political
5: have,feel,get,try,good,come,help,talk,make,week
6: harry,magic,potter,wizard,muggle,speak,english,office,hogwart,teacher
7: home,old,kid,car,parent,family,house,away,drive,young
8: be,go,think,sure,to,pretty,sorry,shit,remember,read
9: jesus,united,states,west,jed,bartlet,wing,vote,christ,joe
10: wear,heel,game,high,play,man,woman,pink,video,shoe
11: eat,use,water,clean,body,originally,weight,ride,blood,healthy
12: bed,break,tooth,drink,brush,morning,smoke,floor,fuck,pull
13: place,big,see,social,deal,kind,call,cost,especially,drug
14: new,run,set,completely,plan,bring,order,course,public,education
15: darth,park,terry,crew,vader,van,mark,der,trailer,assimilate
16: hard,relat

In [9]:
# Anchored CorEx topic model with 10 latent topics, fit on doc_word.
topic_model = ct.Corex(n_hidden=10, words=words,
                       max_iter=200, verbose=False, seed=1)

# anchors is positional: entry k seeds topic k. Here 'biden' is anchored to
# topic 0 and 'election' to topic 1; anchor_strength controls how strongly
# the anchor words are tied to their topic.
# NOTE(review): neither anchor word appears in the printed topics 0 and 1 --
# check that both words exist in `words`.
topic_model.fit(doc_word, words=words, docs=askreddit,
                anchors=[['biden'], 'election'], anchor_strength=3)

# Print all topics from the CorEx topic model
# Each topic is a list of tuples whose first element is the word. Index the
# word instead of unpacking with zip(*topic): newer corextopic releases return
# 3-tuples (word, mutual information, sign), and a topic with no words makes
# the zip-unpacking raise ValueError.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: {}'.format(n, ','.join(topic_words)))

0: go,come,school,try,home,later,leave,ask,family,away
1: people,thing,work,want,lot,life,way,help,take,pay
2: good,right,read,shit,probably,fall,write,maybe,book,hear
3: not,like,year,be,have,feel,know,get,think,start
4: president,world,country,leader,government,money,war,bank,political,power
5: play,game,watch,video,head,super,hand,throw,music,grab
6: time,girl,tell,friend,sex,orgasm,night,ex,hour,girlfriend
7: wear,heel,man,woman,originally,high,pink,shoe,color,hair
8: drink,food,eat,smoke,water,drug,use,alcohol,jed,bartlet
9: say,happen,mom,child,dad,turn,sister,love,guy,fuck


In [53]:
# Anchored CorEx topic model with 10 latent topics for the worldnews text.
topic_model = ct.Corex(n_hidden=10,
                       max_iter=200, verbose=False, seed=1)

# Three separate string anchors: 'election' -> topic 0, 'biden' -> topic 1,
# 'trump' -> topic 2.
# NOTE(review): doc_word and words must have been built from the worldnews
# text by a vectorizer cell that is not visible in this notebook (the printed
# topics contain multi-word features that a default CountVectorizer does not
# produce). `worldnews` is a one-row frame, so docs holds a single document;
# confirm doc_word has matching rows -- a single document gives CorEx nothing
# to correlate, which would explain the alphabetical-looking topics.
topic_model.fit(doc_word, words=words, docs=worldnews.text,
                anchors=['election', 'biden', 'trump'], anchor_strength=10)

# Print all topics from the CorEx topic model
# Each topic is a list of tuples whose first element is the word. Index the
# word instead of unpacking with zip(*topic): newer corextopic releases return
# 3-tuples (word, mutual information, sign), and a topic with no words makes
# the zip-unpacking raise ValueError.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: {}'.format(n, ','.join(topic_words)))

0: aaaand,probably insiders,probably killed,probably killing,probably kind,probably known,probably knows,probably lean,probably main,probably meant
1: aa,pretty openly,pretty republican,pretty spectacularly,pretty stark,pretty straightforward,pretty sure,pretty typical,pretty underhanded,pretty unreliable
2: aaa,presidents held,presidents son,presidents think,presidents want,presidents win,press ask,press asks,press boom,press face
3: aaaandonto enemies,princeton university,principal,principals unique,principle republican,principle turn,principles thats,print prepared,primed ready,printed charts
4: aaand sue,presidency shown,presidency time,presidency trump,presidency want,president accountable,president actually,president admitted,president allows,president americans
5: aaa bonds,presumably taken,presumed cared,presumption seconds,pretea,pretend biden,pretend choice,pretend does,pretend ignorant,pretend mccain
6: aaaand going,pretty kamala,pretty laughable,pretty leftwing,pretty longr

In [63]:
# Anchored CorEx topic model with 10 latent topics for the worldnews text.
# NOTE(review): this cell's code is a verbatim repeat of another anchored
# worldnews cell in this notebook; its different output implies doc_word/words
# were rebuilt in between by a cell that is not shown. Consider turning the
# repeated fit-and-print steps into one function and making the vectorizer
# step explicit.
topic_model = ct.Corex(n_hidden=10,
                       max_iter=200, verbose=False, seed=1)

# Three separate string anchors: 'election' -> topic 0, 'biden' -> topic 1,
# 'trump' -> topic 2.
# NOTE(review): `worldnews` is a one-row frame, so docs holds a single
# document; confirm doc_word has matching rows.
topic_model.fit(doc_word, words=words, docs=worldnews.text,
                anchors=['election', 'biden', 'trump'], anchor_strength=10)

# Print all topics from the CorEx topic model
# Each topic is a list of tuples whose first element is the word. Index the
# word instead of unpacking with zip(*topic): newer corextopic releases return
# 3-tuples (word, mutual information, sign), and a topic with no words makes
# the zip-unpacking raise ValueError.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: {}'.format(n, ','.join(topic_words)))

0: aaaaaaaaaaaa,preemptively quell,prefer fair matter,prefer hitler xi,prefer little,preferable,preferable actually short,preemptive nuclear,preferable china away,preindication act war
1: abandon deterrent reason,precedent feel dumb,precedent nation follow,precedent set folk,precedent thank,precidence surprised,precipice nuclear,precedent feel,precipitate,precise description situation
2: aa weapon,president literal pussy,president lose,president lose popular,president nuke,president plainly,president power thing,president httpswwwelectionscienceorgadvancingtheenvironmentalmovementthroughapprovalvote,president putin agent,president seehear
3: abandon deterrent,prevent interference,prevent kind,prevent nationality chime,prevent outside,prevent people flee,prevent plane fly,prevent entirely reduce,prevent spread sickness,prevent stock real
4: abandon spouse child,prison major,prison lot people,prison corner,prision island,priority water,priority solution run,priority feed rarely,priority 

In [64]:
# Anchored CorEx topic model with 10 latent topics for the worldnews text,
# this time grouping two anchor words into one topic.
topic_model = ct.Corex(n_hidden=10,
                       max_iter=200, verbose=False, seed=1)

# A nested list anchors several words to the same topic: 'election' and
# 'biden' both seed topic 0, while 'trump' seeds topic 1.
# NOTE(review): doc_word and words must come from a vectorizer fitted on the
# worldnews text in a cell not visible here; `worldnews` is a one-row frame,
# so docs holds a single document -- confirm doc_word has matching rows.
topic_model.fit(doc_word, words=words, docs=worldnews.text,
                anchors=[['election', 'biden'], 'trump'], anchor_strength=10)

# Print all topics from the CorEx topic model
# Each topic is a list of tuples whose first element is the word. Index the
# word instead of unpacking with zip(*topic): newer corextopic releases return
# 3-tuples (word, mutual information, sign), and a topic with no words makes
# the zip-unpacking raise ValueError.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: {}'.format(n, ','.join(topic_words)))

0: aaaaaaaaaaaa,preemptively quell,prefer fair matter,prefer hitler xi,prefer little,preferable,preferable actually short,preemptive nuclear,preferable china away,preindication act war
1: abandon deterrent reason,precedent nation follow,precedent set folk,precedent thank,precidence surprised,precipice nuclear,precipitate,precedent feel dumb,precipitate genocide myanmar,precise sure possible
2: aa weapon,president httpswwwelectionscienceorgadvancingtheenvironmentalmovementthroughapprovalvote,president literal pussy,president lose,president lose popular,president nuke,president plainly,president george,president power thing,president sausage phone
3: abandon deterrent,prevent interference,prevent kind,prevent nationality chime,prevent outside,prevent people flee,prevent plane fly,prevent entirely reduce,prevent spread sickness,prevent stock real
4: abandon spouse child,prison major,prison lot people,prison corner,prision island,priority water,priority solution run,priority feed rarely,pr