In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import nltk
import string
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim_models

from gensim import corpora, models
from gensim.utils import effective_n_jobs
import collections
import re
import gc
import os

In [4]:
# Load every CSV in the CNN data folder into a dict of DataFrames keyed by topic.
# The topic key is the second '_'-separated field of the file name (extension
# removed), so a file name without an underscore raises IndexError here.
cnn_dir = '../data/CNN/'
csvs = [x for x in os.listdir(cnn_dir) if x.endswith('.csv')]
fns = [os.path.splitext(os.path.basename(x))[0] for x in csvs]
topics = [fn.split('_')[1] for fn in fns]
# Read each file from the same folder that was listed above; listing one folder
# and reading from another only works when both happen to hold identical files.
d = {topic: pd.read_csv(os.path.join(cnn_dir, csv_name))
     for topic, csv_name in zip(topics, csvs)}

In [6]:
# Helpers for lemmatizing news texts.
# STOP is the set of tokens dropped before lemmatization: NLTK's English stop
# words, every ASCII punctuation character, and a few extra tokens listed below
# (tokenizer quote marks, curly quotes, en dash, possessive 's).
STOP = (
    set(nltk.corpus.stopwords.words('english'))
    | set(string.punctuation)
    | {'``', "''", "’", "“", "”", "–", "\'s"}
)

def get_lemmas(text):
    '''
    Gets lemmas for a string input, excluding stop words, punctuation, as well
    as a set of study-specific stop-words (everything contained in STOP).

    The input is passed through str() and lower-cased before tokenizing, so
    non-string values (e.g. NaN cells) are lemmatized as their string form.

    Returns a list of lemma strings in token order.
    '''
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokens = nltk.word_tokenize(str(text).lower())
    return [lemmatizer.lemmatize(t) for t in tokens if t not in STOP]

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=2):
    '''
    Fits one LDA model per candidate topic count and scores each of them with
    the 'u_mass' coherence measure.

    Candidate topic counts are range(start, limit, step). `texts` is accepted
    as an argument but is not read anywhere in this function.

    Returns two parallel lists: the fitted models and their coherence values
    (use the coherence values to decide which model to keep).
    '''
    fitted_models = []
    scores = []
    for k in range(start, limit, step):
        lda = models.ldamulticore.LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=k,
            workers=effective_n_jobs(-1),
        )
        fitted_models.append(lda)
        scorer = models.coherencemodel.CoherenceModel(
            model=lda,
            corpus=corpus,
            dictionary=dictionary,
            coherence='u_mass',
        )
        scores.append(scorer.get_coherence())

    return fitted_models, scores

In [7]:
def train_lda(topic_df, model_name, num_topics, workers):
    '''
    Trains LDA model on a dataframe of news articles, saves model to disk
    Inputs:
        topic_df: dataframe of news articles; only its 'text' column is read
        model_name: name of model to be saved; the file written is
            '<model_name>.model' in the current working directory
        num_topics: number of topics to be trained
        workers: number of workers to be used in training
    Returns nothing; the trained model is only persisted to disk.
    '''
    # Get lemmas for each article
    lemmas = topic_df['text'].apply(get_lemmas)
    # Drop the local reference to the dataframe. This can only free memory when
    # the caller holds no other reference to the same object.
    del topic_df
    gc.collect()
    # Initialize Series of lemmas as Gensim Dictionary for further processing
    dictionary = corpora.Dictionary(lemmas)
    # Convert dictionary into bag of words format: list of (token_id, token_count) tuples
    bow_corpus = [dictionary.doc2bow(text) for text in lemmas]
    # No coherence sweep is run here: num_topics is supplied by the caller, so
    # extra candidate models would be trained only to be thrown away. Call
    # compute_coherence_values separately when choosing num_topics.
    ldamodel = models.ldamulticore.LdaMulticore(bow_corpus,
                                                num_topics=num_topics,
                                                id2word=dictionary,
                                                workers=workers,
                                                passes=20,
                                                iterations=400)
    ldamodel.save('{}.model'.format(model_name))

In [9]:
# train all topic models for CNN, commented out because it takes a long time to run

# for i in topics:
#     train_lda(d[i], i+'CNN', 5, 10)
#     print(i+'done')

In [25]:
# get pyLDAvis visualizations for all topic models under CNN
# Switch pyLDAvis to notebook rendering once, before the loop.
pyLDAvis.enable_notebook()
# Topic names come from the loaded CNN frames (keys of d) rather than from
# `fns`, because the NY Post section further down rebinds `fns`.
cnn_topics = list(d)
ps = []
for i in cnn_topics:
    model = models.ldamodel.LdaModel.load('{}CNN.model'.format(i))
    # Stored as topic_terms so the global `topics` list built in the loading
    # cell is not overwritten.
    topic_terms = model.print_topics(num_words=20)
    for topic in topic_terms:
        print(topic)
    # Rebuild the bag-of-words corpus for this topic's CNN articles using the
    # dictionary file stored next to the model.
    lemmas = d[i]['text'].apply(get_lemmas)
    dictionary = corpora.Dictionary.load('{}CNN.model.id2word'.format(i))
    corpus = [dictionary.doc2bow(text) for text in lemmas]
    p = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
    ps.append(p)
    print(i+'done')

(0, '0.019*"’" + 0.017*"“" + 0.017*"”" + 0.016*"heart" + 0.010*"said" + 0.010*"attack" + 0.007*"health" + 0.006*"patient" + 0.006*"study" + 0.006*"risk" + 0.005*"–" + 0.005*"people" + 0.005*"doctor" + 0.004*"hospital" + 0.004*"medical" + 0.004*"year" + 0.004*"disease" + 0.004*"one" + 0.003*"woman" + 0.003*"dr."')
(1, '0.022*"“" + 0.022*"’" + 0.022*"”" + 0.020*"said" + 0.013*"attack" + 0.012*"police" + 0.006*"people" + 0.006*"cnn" + 0.005*"one" + 0.004*"told" + 0.004*"two" + 0.004*"–" + 0.004*"according" + 0.004*"officer" + 0.003*"man" + 0.003*"say" + 0.003*"new" + 0.003*"suspect" + 0.003*"—" + 0.003*"victim"')
(2, '0.016*"’" + 0.012*"attack" + 0.011*"said" + 0.009*"”" + 0.009*"“" + 0.007*"ad" + 0.007*"video" + 0.006*"\'s" + 0.005*"security" + 0.005*"company" + 0.005*"content" + 0.005*"cnn" + 0.004*"government" + 0.004*"u" + 0.003*"system" + 0.003*"also" + 0.003*"new" + 0.003*"year" + 0.003*"one" + 0.003*"–"')
(3, '0.035*"’" + 0.026*"“" + 0.025*"”" + 0.015*"trump" + 0.010*"said" + 0.008

  default_term_info = default_term_info.sort_values(


attackdone
(0, '0.034*"’" + 0.010*"cnn" + 0.009*"video" + 0.009*"source" + 0.008*"ad" + 0.008*"feedback" + 0.008*"”" + 0.008*"“" + 0.007*"u" + 0.007*"today" + 0.006*"trump" + 0.006*"new" + 0.005*"get" + 0.005*"said" + 0.004*"thing" + 0.004*"day" + 0.004*"–" + 0.004*"president" + 0.004*"people" + 0.004*"5"')
(1, '0.013*"’" + 0.006*"u" + 0.006*"“" + 0.006*"”" + 0.006*"first" + 0.004*"president" + 0.004*"nichols" + 0.004*"said" + 0.004*"new" + 0.003*"january" + 0.003*"state" + 0.003*"february" + 0.003*"may" + 0.003*"court" + 0.003*"cnn" + 0.003*"obama" + 0.003*"official" + 0.003*"announces" + 0.003*"woman" + 0.003*"biden"')
(2, '0.028*"’" + 0.024*"“" + 0.024*"”" + 0.018*"said" + 0.006*"people" + 0.006*"state" + 0.005*"cnn" + 0.005*"police" + 0.005*"–" + 0.004*"say" + 0.004*"year" + 0.004*"told" + 0.004*"family" + 0.004*"u" + 0.003*"one" + 0.003*"new" + 0.003*"biden" + 0.003*"school" + 0.003*"law" + 0.003*"gun"')
(3, '0.016*"’" + 0.014*"said" + 0.013*"“" + 0.013*"”" + 0.007*"fire" + 0.007*

  default_term_info = default_term_info.sort_values(


Bidendone
(0, '0.031*"’" + 0.028*"“" + 0.028*"”" + 0.019*"said" + 0.013*"police" + 0.009*"black" + 0.007*"officer" + 0.006*"cnn" + 0.005*"people" + 0.005*"say" + 0.004*"one" + 0.004*"told" + 0.004*"–" + 0.003*"white" + 0.003*"family" + 0.003*"city" + 0.003*"video" + 0.003*"according" + 0.003*"death" + 0.003*"department"')
(1, '0.034*"’" + 0.023*"“" + 0.023*"”" + 0.015*"black" + 0.008*"said" + 0.007*"–" + 0.007*"people" + 0.007*"white" + 0.005*"american" + 0.005*"trump" + 0.004*"cnn" + 0.004*"president" + 0.004*"woman" + 0.004*"state" + 0.004*"right" + 0.004*"u" + 0.004*"one" + 0.003*"say" + 0.003*"would" + 0.003*"year"')
(2, '0.025*"“" + 0.024*"’" + 0.024*"”" + 0.012*"black" + 0.007*"\'s" + 0.005*"said" + 0.005*"–" + 0.004*"one" + 0.004*"year" + 0.004*"image" + 0.004*"first" + 0.004*"like" + 0.003*"cnn" + 0.003*"time" + 0.003*"new" + 0.003*"people" + 0.003*"film" + 0.003*"world" + 0.003*"player" + 0.003*"—"')
(3, '0.025*"’" + 0.011*"“" + 0.011*"”" + 0.011*"voter" + 0.010*"said" + 0.008

  default_term_info = default_term_info.sort_values(


blackdone
(0, '0.035*"’" + 0.011*"cnn" + 0.010*"video" + 0.009*"source" + 0.009*"ad" + 0.009*"feedback" + 0.008*"”" + 0.008*"“" + 0.007*"u" + 0.007*"trump" + 0.006*"new" + 0.005*"today" + 0.004*"get" + 0.004*"day" + 0.004*"people" + 0.004*"thing" + 0.004*"president" + 0.004*"first" + 0.004*"said" + 0.003*"–"')
(1, '0.024*"’" + 0.020*"“" + 0.020*"”" + 0.012*"said" + 0.004*"china" + 0.004*"fire" + 0.004*"–" + 0.004*"people" + 0.004*"state" + 0.004*"cnn" + 0.003*"u" + 0.003*"chinese" + 0.003*"one" + 0.003*"say" + 0.003*"year" + 0.003*"california" + 0.003*"new" + 0.003*"police" + 0.002*"two" + 0.002*"would"')
(2, '0.017*"’" + 0.016*"”" + 0.016*"“" + 0.008*"said" + 0.005*"–" + 0.004*"year" + 0.004*"u" + 0.004*"u.s." + 0.004*"state" + 0.004*"china" + 0.003*"north" + 0.003*"cnn" + 0.003*"new" + 0.003*"one" + 0.003*"united" + 0.003*"president" + 0.003*"would" + 0.003*"say" + 0.003*"military" + 0.002*"korea"')
(3, '0.029*"’" + 0.022*"“" + 0.021*"”" + 0.016*"said" + 0.006*"people" + 0.005*"–" + 

  default_term_info = default_term_info.sort_values(


Chinadone
(0, '0.027*"’" + 0.018*"”" + 0.018*"“" + 0.014*"trump" + 0.011*"said" + 0.009*"president" + 0.009*"u" + 0.006*"state" + 0.005*"–" + 0.005*"would" + 0.005*"china" + 0.004*"cnn" + 0.004*"house" + 0.004*"administration" + 0.003*"new" + 0.003*"also" + 0.003*"conflict" + 0.003*"\'s" + 0.003*"year" + 0.003*"iran"')
(1, '0.014*"’" + 0.013*"“" + 0.013*"”" + 0.009*"said" + 0.008*"people" + 0.006*"country" + 0.005*"year" + 0.005*"child" + 0.005*"–" + 0.004*"world" + 0.004*"one" + 0.004*"cnn" + 0.004*"\'s" + 0.004*"conflict" + 0.003*"say" + 0.003*"image" + 0.003*"also" + 0.003*"war" + 0.003*"according" + 0.003*"many"')
(2, '0.031*"’" + 0.022*"“" + 0.021*"”" + 0.008*"said" + 0.006*"–" + 0.005*"cnn" + 0.004*"people" + 0.004*"say" + 0.004*"one" + 0.004*"year" + 0.003*"time" + 0.003*"conflict" + 0.003*"woman" + 0.003*"like" + 0.003*"also" + 0.003*"new" + 0.003*"would" + 0.003*"—" + 0.002*"right" + 0.002*"world"')
(3, '0.019*"’" + 0.018*"“" + 0.018*"”" + 0.016*"said" + 0.009*"syria" + 0.007*

  default_term_info = default_term_info.sort_values(


conflictdone
(0, '0.022*"’" + 0.021*"“" + 0.020*"”" + 0.013*"said" + 0.009*"law" + 0.007*"crime" + 0.007*"police" + 0.005*"state" + 0.005*"case" + 0.005*"cnn" + 0.005*"new" + 0.004*"year" + 0.004*"department" + 0.004*"officer" + 0.004*"woman" + 0.004*"sexual" + 0.004*"say" + 0.004*"court" + 0.004*"–" + 0.003*"also"')
(1, '0.033*"’" + 0.020*"“" + 0.020*"”" + 0.018*"trump" + 0.010*"president" + 0.009*"said" + 0.007*"–" + 0.005*"cnn" + 0.005*"would" + 0.005*"crime" + 0.005*"house" + 0.004*"justice" + 0.004*"state" + 0.004*"former" + 0.004*"u" + 0.004*"republican" + 0.004*"investigation" + 0.004*"federal" + 0.004*"mueller" + 0.003*"election"')
(2, '0.021*"said" + 0.017*"“" + 0.017*"”" + 0.017*"’" + 0.012*"police" + 0.006*"cnn" + 0.006*"crime" + 0.005*"according" + 0.004*"people" + 0.004*"two" + 0.004*"authority" + 0.004*"one" + 0.003*"told" + 0.003*"attack" + 0.003*"state" + 0.003*"say" + 0.003*"–" + 0.003*"—" + 0.003*"year" + 0.003*"suspect"')
(3, '0.032*"’" + 0.020*"“" + 0.020*"”" + 0.00

  default_term_info = default_term_info.sort_values(


crimedone
(0, '0.029*"’" + 0.024*"“" + 0.024*"”" + 0.014*"said" + 0.010*"house" + 0.008*"trump" + 0.008*"democratic" + 0.008*"democrat" + 0.007*"president" + 0.006*"cnn" + 0.006*"would" + 0.006*"republican" + 0.005*"senate" + 0.005*"–" + 0.004*"bill" + 0.004*"committee" + 0.004*"vote" + 0.004*"told" + 0.004*"u" + 0.004*"member"')
(1, '0.023*"’" + 0.016*"voter" + 0.015*"poll" + 0.012*"democratic" + 0.011*"trump" + 0.011*"republican" + 0.011*"democrat" + 0.009*"among" + 0.008*"point" + 0.007*"clinton" + 0.006*"state" + 0.006*"candidate" + 0.006*"say" + 0.006*"party" + 0.006*"cnn" + 0.005*"likely" + 0.005*"election" + 0.005*"vote" + 0.005*"biden" + 0.005*"woman"')
(2, '0.030*"’" + 0.015*"republican" + 0.014*"state" + 0.014*"“" + 0.014*"”" + 0.011*"democrat" + 0.010*"democratic" + 0.009*"election" + 0.009*"–" + 0.007*"trump" + 0.007*"race" + 0.007*"senate" + 0.006*"said" + 0.006*"party" + 0.005*"district" + 0.005*"house" + 0.005*"seat" + 0.005*"voter" + 0.005*"year" + 0.004*"candidate"')
(

  default_term_info = default_term_info.sort_values(


democraticdone
(0, '0.024*"“" + 0.024*"”" + 0.022*"’" + 0.019*"said" + 0.006*"cnn" + 0.005*"police" + 0.005*"people" + 0.004*"state" + 0.004*"fair" + 0.004*"school" + 0.003*"say" + 0.003*"–" + 0.003*"according" + 0.003*"student" + 0.003*"one" + 0.003*"u" + 0.003*"government" + 0.003*"year" + 0.003*"told" + 0.003*"new"')
(1, '0.013*"\'s" + 0.010*"’" + 0.007*"said" + 0.007*"company" + 0.006*"year" + 0.005*"tax" + 0.005*"new" + 0.004*"“" + 0.004*"”" + 0.004*"also" + 0.004*"fair" + 0.004*"would" + 0.003*"one" + 0.003*"--" + 0.003*"business" + 0.003*"million" + 0.003*"pay" + 0.003*"worker" + 0.003*"n\'t" + 0.003*"like"')
(2, '0.033*"’" + 0.016*"“" + 0.013*"”" + 0.007*"image" + 0.006*"player" + 0.006*"team" + 0.006*"–" + 0.005*"game" + 0.005*"club" + 0.005*"league" + 0.005*"world" + 0.004*"\'s" + 0.004*"said" + 0.004*"year" + 0.004*"one" + 0.004*"first" + 0.003*"football" + 0.003*"cnn" + 0.003*"season" + 0.003*"sport"')
(3, '0.038*"’" + 0.023*"“" + 0.023*"”" + 0.013*"trump" + 0.009*"said" + 

  default_term_info = default_term_info.sort_values(


fairdone
(0, '0.034*"’" + 0.015*"”" + 0.015*"“" + 0.009*"trump" + 0.008*"–" + 0.007*"said" + 0.006*"president" + 0.006*"gun" + 0.005*"republican" + 0.005*"cnn" + 0.004*"clinton" + 0.004*"would" + 0.004*"new" + 0.004*"house" + 0.004*"state" + 0.003*"year" + 0.003*"democrat" + 0.003*"one" + 0.003*"people" + 0.003*"u"')
(1, '0.027*"’" + 0.025*"said" + 0.020*"“" + 0.020*"”" + 0.018*"police" + 0.012*"officer" + 0.007*"shooting" + 0.006*"shot" + 0.006*"gun" + 0.006*"cnn" + 0.005*"told" + 0.005*"according" + 0.004*"two" + 0.004*"one" + 0.004*"say" + 0.004*"man" + 0.004*"attorney" + 0.003*"–" + 0.003*"department" + 0.003*"family"')
(2, '0.018*"image" + 0.016*"\'s" + 0.012*"zealand" + 0.009*"minister" + 0.007*"prime" + 0.007*"mosque" + 0.006*"new" + 0.006*"christchurch" + 0.006*"abe" + 0.005*"attack" + 0.004*"japan" + 0.004*"ardern" + 0.004*"march" + 0.004*"auction" + 0.003*"simpson" + 0.003*"australian" + 0.003*"japanese" + 0.003*"australia" + 0.003*"friday" + 0.002*"--"')
(3, '0.031*"gun" + 0

  default_term_info = default_term_info.sort_values(


gundone
(0, '0.029*"’" + 0.019*"“" + 0.019*"”" + 0.012*"immigration" + 0.012*"said" + 0.011*"republican" + 0.011*"house" + 0.009*"president" + 0.009*"trump" + 0.009*"would" + 0.008*"democrat" + 0.007*"–" + 0.007*"bill" + 0.006*"senate" + 0.006*"obama" + 0.005*"congress" + 0.005*"border" + 0.004*"immigrant" + 0.004*"daca" + 0.004*"white"')
(1, '0.026*"’" + 0.020*"“" + 0.020*"”" + 0.013*"border" + 0.013*"said" + 0.009*"child" + 0.009*"u" + 0.008*"migrant" + 0.007*"immigration" + 0.007*"family" + 0.006*"cnn" + 0.006*"trump" + 0.006*"administration" + 0.005*"–" + 0.005*"people" + 0.005*"state" + 0.004*"mexico" + 0.004*"country" + 0.004*"official" + 0.004*"year"')
(2, '0.039*"’" + 0.029*"“" + 0.029*"”" + 0.018*"trump" + 0.012*"said" + 0.007*"–" + 0.006*"president" + 0.005*"cnn" + 0.005*"people" + 0.004*"immigration" + 0.004*"state" + 0.004*"campaign" + 0.004*"country" + 0.004*"one" + 0.003*"party" + 0.003*"would" + 0.003*"republican" + 0.003*"new" + 0.003*"say" + 0.003*"also"')
(3, '0.017*"

  default_term_info = default_term_info.sort_values(


immigrationdone
(0, '0.030*"’" + 0.020*"“" + 0.020*"”" + 0.016*"russia" + 0.013*"u" + 0.013*"said" + 0.012*"ukraine" + 0.010*"russian" + 0.008*"putin" + 0.007*"biden" + 0.007*"president" + 0.007*"–" + 0.006*"would" + 0.006*"official" + 0.005*"china" + 0.005*"war" + 0.005*"state" + 0.005*"country" + 0.004*"military" + 0.004*"cnn"')
(1, '0.030*"’" + 0.014*"“" + 0.014*"”" + 0.007*"said" + 0.007*"year" + 0.006*"price" + 0.005*"oil" + 0.004*"–" + 0.004*"gas" + 0.004*"—" + 0.004*"cnn" + 0.004*"u" + 0.003*"new" + 0.003*"energy" + 0.003*"world" + 0.003*"also" + 0.003*"one" + 0.003*"market" + 0.003*"company" + 0.003*"first"')
(2, '0.018*"“" + 0.018*"’" + 0.018*"”" + 0.013*"said" + 0.010*"russian" + 0.010*"ukraine" + 0.009*"ukrainian" + 0.006*"cnn" + 0.005*"force" + 0.005*"city" + 0.005*"people" + 0.005*"war" + 0.005*"–" + 0.005*"military" + 0.005*"russia" + 0.004*"country" + 0.004*"one" + 0.004*"say" + 0.003*"day" + 0.003*"attack"')
(3, '0.027*"’" + 0.017*"“" + 0.017*"”" + 0.012*"trump" + 0.009

  default_term_info = default_term_info.sort_values(


invasiondone
(0, '0.031*"’" + 0.028*"“" + 0.027*"”" + 0.012*"said" + 0.008*"people" + 0.007*"gay" + 0.005*"–" + 0.005*"lgbt" + 0.005*"say" + 0.005*"right" + 0.005*"cnn" + 0.004*"one" + 0.004*"year" + 0.003*"community" + 0.003*"also" + 0.003*"country" + 0.003*"would" + 0.003*"woman" + 0.003*"world" + 0.003*"like"')
(1, '0.033*"’" + 0.022*"“" + 0.022*"”" + 0.011*"said" + 0.008*"state" + 0.007*"–" + 0.006*"trump" + 0.006*"court" + 0.006*"law" + 0.005*"right" + 0.005*"would" + 0.004*"president" + 0.004*"people" + 0.004*"republican" + 0.004*"cnn" + 0.004*"one" + 0.003*"bill" + 0.003*"issue" + 0.003*"year" + 0.003*"say"')
(2, '0.018*"’" + 0.017*"trump" + 0.013*"clinton" + 0.012*"”" + 0.012*"“" + 0.008*"said" + 0.006*"president" + 0.006*"russia" + 0.006*"image" + 0.005*"russian" + 0.004*"cnn" + 0.004*"–" + 0.004*"campaign" + 0.004*"obama" + 0.004*"political" + 0.004*"election" + 0.004*"country" + 0.004*"party" + 0.003*"new" + 0.003*"republican"')
(3, '0.023*"’" + 0.015*"“" + 0.015*"”" + 0.010

  default_term_info = default_term_info.sort_values(


LGBTdone
(0, '0.029*"said" + 0.026*"police" + 0.024*"’" + 0.022*"“" + 0.022*"”" + 0.007*"cnn" + 0.007*"officer" + 0.006*"told" + 0.005*"according" + 0.005*"suspect" + 0.005*"two" + 0.005*"one" + 0.004*"man" + 0.004*"say" + 0.004*"found" + 0.004*"shooting" + 0.004*"—" + 0.003*"authority" + 0.003*"people" + 0.003*"victim"')
(1, '0.023*"police" + 0.015*"said" + 0.013*"’" + 0.012*"“" + 0.012*"”" + 0.008*"protester" + 0.008*"people" + 0.006*"attack" + 0.005*"image" + 0.005*"cnn" + 0.005*"protest" + 0.004*"government" + 0.004*"group" + 0.004*"–" + 0.004*"officer" + 0.003*"city" + 0.003*"hong" + 0.003*"arrested" + 0.003*"kong" + 0.003*"authority"')
(2, '0.029*"’" + 0.027*"police" + 0.026*"“" + 0.026*"”" + 0.020*"officer" + 0.019*"said" + 0.007*"department" + 0.006*"city" + 0.005*"cnn" + 0.005*"shooting" + 0.004*"video" + 0.004*"people" + 0.004*"–" + 0.004*"black" + 0.004*"one" + 0.003*"community" + 0.003*"say" + 0.003*"law" + 0.003*"according" + 0.003*"attorney"')
(3, '0.025*"’" + 0.015*"“" +

  default_term_info = default_term_info.sort_values(


policedone
(0, '0.023*"’" + 0.023*"“" + 0.023*"”" + 0.017*"said" + 0.014*"police" + 0.008*"officer" + 0.006*"protest" + 0.006*"cnn" + 0.005*"state" + 0.005*"people" + 0.004*"protester" + 0.004*"department" + 0.004*"city" + 0.004*"–" + 0.004*"law" + 0.003*"one" + 0.003*"told" + 0.003*"according" + 0.003*"two" + 0.003*"new"')
(1, '0.023*"’" + 0.020*"“" + 0.020*"”" + 0.014*"said" + 0.008*"protest" + 0.006*"government" + 0.006*"people" + 0.005*"cnn" + 0.005*"country" + 0.005*"–" + 0.005*"police" + 0.004*"protester" + 0.003*"also" + 0.003*"say" + 0.003*"year" + 0.003*"one" + 0.003*"new" + 0.003*"city" + 0.003*"group" + 0.003*"state"')
(2, '0.029*"hong" + 0.028*"kong" + 0.024*"protester" + 0.021*"image" + 0.014*"police" + 0.009*"protest" + 0.008*"’" + 0.007*"\'s" + 0.006*"”" + 0.006*"november" + 0.006*"“" + 0.006*"pro-democracy" + 0.005*"october" + 0.005*"city" + 0.005*"anthony" + 0.005*"china" + 0.004*"people" + 0.004*"university" + 0.004*"government" + 0.004*"said"')
(3, '0.028*"’" + 0.022

  default_term_info = default_term_info.sort_values(


protestdone
(0, '0.035*"’" + 0.020*"“" + 0.020*"”" + 0.019*"trump" + 0.010*"said" + 0.009*"president" + 0.008*"–" + 0.007*"republican" + 0.007*"house" + 0.006*"election" + 0.006*"state" + 0.006*"would" + 0.005*"cnn" + 0.004*"court" + 0.004*"democrat" + 0.004*"senate" + 0.004*"campaign" + 0.003*"vote" + 0.003*"white" + 0.003*"committee"')
(1, '0.020*"’" + 0.016*"”" + 0.016*"“" + 0.013*"said" + 0.007*"trump" + 0.006*"u" + 0.006*"\'s" + 0.005*"would" + 0.005*"china" + 0.005*"president" + 0.004*"cnn" + 0.004*"official" + 0.004*"government" + 0.004*"state" + 0.004*"year" + 0.004*"new" + 0.004*"also" + 0.003*"company" + 0.003*"–" + 0.003*"country"')
(2, '0.021*"’" + 0.015*"“" + 0.015*"”" + 0.012*"said" + 0.005*"cnn" + 0.005*"–" + 0.005*"government" + 0.004*"country" + 0.004*"people" + 0.004*"image" + 0.003*"year" + 0.003*"one" + 0.003*"minister" + 0.003*"two" + 0.003*"say" + 0.003*"state" + 0.003*"\'s" + 0.003*"—" + 0.003*"united" + 0.003*"world"')
(3, '0.030*"’" + 0.024*"“" + 0.024*"”" + 0.

  default_term_info = default_term_info.sort_values(


refuseddone
(0, '0.027*"’" + 0.010*"“" + 0.009*"”" + 0.006*"world" + 0.005*"–" + 0.005*"\'s" + 0.005*"year" + 0.005*"team" + 0.005*"game" + 0.005*"first" + 0.004*"image" + 0.003*"one" + 0.003*"player" + 0.003*"match" + 0.003*"time" + 0.003*"said" + 0.003*"cup" + 0.003*"win" + 0.003*"—" + 0.003*"republic"')
(1, '0.033*"’" + 0.021*"“" + 0.021*"”" + 0.008*"trump" + 0.007*"–" + 0.007*"said" + 0.006*"president" + 0.004*"cnn" + 0.004*"one" + 0.004*"people" + 0.004*"year" + 0.003*"state" + 0.003*"would" + 0.003*"new" + 0.003*"election" + 0.003*"time" + 0.003*"say" + 0.003*"u" + 0.003*"house" + 0.003*"republican"')
(2, '0.018*"’" + 0.011*"“" + 0.011*"”" + 0.007*"country" + 0.006*"said" + 0.006*"–" + 0.005*"health" + 0.005*"people" + 0.005*"world" + 0.004*"new" + 0.004*"case" + 0.004*"one" + 0.004*"year" + 0.004*"also" + 0.003*"africa" + 0.003*"say" + 0.003*"cnn" + 0.003*"travel" + 0.003*"vaccine" + 0.003*"according"')
(3, '0.026*"’" + 0.021*"“" + 0.021*"”" + 0.013*"said" + 0.008*"u" + 0.007*"i

  default_term_info = default_term_info.sort_values(


republicdone
(0, '0.030*"’" + 0.023*"“" + 0.022*"”" + 0.017*"said" + 0.006*"–" + 0.005*"cnn" + 0.005*"tsarnaev" + 0.005*"russia" + 0.005*"state" + 0.005*"say" + 0.004*"one" + 0.004*"russian" + 0.004*"told" + 0.004*"people" + 0.004*"would" + 0.003*"official" + 0.003*"u.s." + 0.003*"year" + 0.003*"two" + 0.003*"united"')
(1, '0.033*"’" + 0.009*"”" + 0.009*"“" + 0.008*"cnn" + 0.007*"video" + 0.007*"u" + 0.007*"source" + 0.007*"today" + 0.006*"ad" + 0.006*"said" + 0.006*"new" + 0.006*"feedback" + 0.006*"trump" + 0.005*"–" + 0.005*"get" + 0.004*"people" + 0.004*"thing" + 0.004*"day" + 0.004*"5" + 0.004*"president"')
(2, '0.038*"’" + 0.014*"cnn" + 0.012*"video" + 0.012*"source" + 0.012*"ad" + 0.011*"feedback" + 0.009*"trump" + 0.008*"”" + 0.008*"“" + 0.006*"–" + 0.005*"president" + 0.005*"new" + 0.005*"say" + 0.004*"day" + 0.004*"people" + 0.004*"u" + 0.004*"first" + 0.003*"one" + 0.003*"year" + 0.003*"know"')
(3, '0.020*"’" + 0.010*"”" + 0.009*"“" + 0.005*"–" + 0.004*"people" + 0.004*"said"

  default_term_info = default_term_info.sort_values(


Russiadone
(0, '0.014*"’" + 0.013*"isi" + 0.012*"said" + 0.012*"“" + 0.011*"”" + 0.009*"force" + 0.008*"syria" + 0.006*"group" + 0.006*"syrian" + 0.006*"image" + 0.005*"military" + 0.005*"israel" + 0.005*"israeli" + 0.005*"city" + 0.005*"turkey" + 0.005*"iraq" + 0.004*"–" + 0.004*"iraqi" + 0.004*"cnn" + 0.004*"government"')
(1, '0.013*"’" + 0.011*"said" + 0.010*"“" + 0.010*"”" + 0.007*"\'s" + 0.006*"police" + 0.005*"london" + 0.005*"attack" + 0.005*"year" + 0.004*"people" + 0.004*"cnn" + 0.004*"image" + 0.003*"terror" + 0.003*"country" + 0.003*"new" + 0.003*"also" + 0.003*"government" + 0.003*"–" + 0.003*"one" + 0.003*"saudi"')
(2, '0.031*"’" + 0.021*"“" + 0.021*"”" + 0.010*"said" + 0.007*"–" + 0.006*"cnn" + 0.005*"people" + 0.005*"one" + 0.005*"say" + 0.004*"isi" + 0.004*"attack" + 0.004*"year" + 0.003*"time" + 0.003*"like" + 0.003*"group" + 0.003*"family" + 0.003*"new" + 0.003*"video" + 0.003*"life" + 0.003*"terror"')
(3, '0.029*"’" + 0.024*"“" + 0.024*"”" + 0.013*"said" + 0.010*"tru

  default_term_info = default_term_info.sort_values(


terrordone
(0, '0.012*"oil" + 0.011*"cuban" + 0.008*"cuba" + 0.007*"venezuela" + 0.006*"air" + 0.006*"force" + 0.004*"maduro" + 0.004*"venezuelan" + 0.004*"opec" + 0.003*"image" + 0.003*"castro" + 0.003*"aircraft" + 0.003*"—" + 0.003*"jet" + 0.003*"carrier" + 0.003*"chavez" + 0.003*"price" + 0.002*"mosul" + 0.002*"u.s." + 0.002*"turkey"')
(1, '0.027*"trump" + 0.023*"’" + 0.018*"“" + 0.017*"”" + 0.013*"u" + 0.012*"said" + 0.012*"president" + 0.007*"administration" + 0.006*"state" + 0.006*"would" + 0.005*"north" + 0.005*"official" + 0.005*"china" + 0.004*"–" + 0.004*"country" + 0.004*"leader" + 0.004*"korea" + 0.004*"trade" + 0.004*"also" + 0.004*"united"')
(2, '0.041*"’" + 0.037*"trump" + 0.027*"“" + 0.027*"”" + 0.012*"said" + 0.009*"president" + 0.008*"–" + 0.006*"republican" + 0.006*"cnn" + 0.005*"campaign" + 0.005*"house" + 0.005*"donald" + 0.004*"would" + 0.004*"new" + 0.004*"white" + 0.004*"one" + 0.003*"clinton" + 0.003*"—" + 0.003*"state" + 0.003*"time"')
(3, '0.036*"trump" + 0.0

  default_term_info = default_term_info.sort_values(


Trumpdone
(0, '0.030*"’" + 0.019*"”" + 0.019*"“" + 0.014*"russia" + 0.014*"u" + 0.010*"said" + 0.010*"ukraine" + 0.009*"putin" + 0.008*"russian" + 0.007*"biden" + 0.007*"president" + 0.007*"–" + 0.006*"state" + 0.005*"would" + 0.005*"official" + 0.005*"war" + 0.005*"country" + 0.004*"nato" + 0.004*"cnn" + 0.004*"also"')
(1, '0.037*"’" + 0.022*"trump" + 0.021*"“" + 0.021*"”" + 0.013*"president" + 0.010*"said" + 0.009*"house" + 0.008*"–" + 0.008*"ukraine" + 0.006*"impeachment" + 0.005*"cnn" + 0.005*"republican" + 0.005*"biden" + 0.005*"democrat" + 0.005*"u" + 0.004*"would" + 0.004*"white" + 0.004*"former" + 0.004*"call" + 0.004*"investigation"')
(2, '0.020*"’" + 0.010*"“" + 0.010*"”" + 0.009*"said" + 0.008*"price" + 0.008*"year" + 0.007*"russia" + 0.007*"\'s" + 0.005*"oil" + 0.005*"company" + 0.005*"gas" + 0.005*"ukraine" + 0.005*"market" + 0.004*"energy" + 0.004*"—" + 0.004*"world" + 0.004*"economy" + 0.004*"country" + 0.004*"also" + 0.004*"russian"')
(3, '0.023*"“" + 0.023*"”" + 0.022*

  default_term_info = default_term_info.sort_values(


ukrainedone
(0, '0.034*"’" + 0.011*"cnn" + 0.010*"video" + 0.010*"source" + 0.009*"ad" + 0.009*"feedback" + 0.008*"”" + 0.008*"“" + 0.007*"u" + 0.006*"trump" + 0.006*"new" + 0.006*"today" + 0.005*"get" + 0.004*"thing" + 0.004*"said" + 0.004*"first" + 0.004*"day" + 0.004*"people" + 0.004*"president" + 0.004*"5"')
(1, '0.017*"“" + 0.017*"”" + 0.016*"’" + 0.011*"said" + 0.006*"image" + 0.004*"people" + 0.004*"–" + 0.004*"u" + 0.004*"\'s" + 0.003*"one" + 0.003*"cnn" + 0.003*"september" + 0.003*"say" + 0.003*"told" + 0.003*"time" + 0.003*"2020." + 0.003*"akram" + 0.002*"fire" + 0.002*"year" + 0.002*"two"')
(2, '0.021*"’" + 0.017*"“" + 0.017*"”" + 0.013*"said" + 0.006*"people" + 0.005*"vaccine" + 0.005*"state" + 0.005*"u" + 0.005*"–" + 0.004*"cnn" + 0.004*"health" + 0.004*"new" + 0.003*"uk" + 0.003*"covid-19" + 0.003*"year" + 0.003*"case" + 0.003*"temperature" + 0.003*"day" + 0.003*"according" + 0.003*"also"')
(3, '0.023*"“" + 0.023*"”" + 0.022*"’" + 0.012*"said" + 0.005*"cnn" + 0.004*"story

  default_term_info = default_term_info.sort_values(


UKdone
(0, '0.025*"’" + 0.018*"“" + 0.018*"”" + 0.015*"said" + 0.009*"u" + 0.008*"state" + 0.006*"people" + 0.005*"–" + 0.005*"cnn" + 0.004*"new" + 0.004*"health" + 0.004*"year" + 0.004*"border" + 0.004*"case" + 0.004*"according" + 0.003*"say" + 0.003*"covid-19" + 0.003*"week" + 0.003*"also" + 0.003*"child"')
(1, '0.044*"’" + 0.026*"“" + 0.025*"”" + 0.013*"said" + 0.006*"u" + 0.006*"one" + 0.005*"cnn" + 0.005*"family" + 0.005*"–" + 0.005*"like" + 0.005*"people" + 0.005*"say" + 0.004*"year" + 0.004*"time" + 0.003*"day" + 0.003*"told" + 0.003*"would" + 0.003*"get" + 0.003*"life" + 0.003*"know"')
(2, '0.021*"said" + 0.018*"’" + 0.017*"“" + 0.017*"”" + 0.009*"police" + 0.007*"cnn" + 0.006*"u" + 0.006*"according" + 0.005*"officer" + 0.004*"people" + 0.004*"told" + 0.004*"attorney" + 0.004*"one" + 0.003*"two" + 0.003*"state" + 0.003*"county" + 0.003*"department" + 0.003*"—" + 0.003*"shooting" + 0.003*"new"')
(3, '0.027*"“" + 0.027*"”" + 0.026*"’" + 0.014*"said" + 0.007*"school" + 0.006*"u" +

  default_term_info = default_term_info.sort_values(


USdone
(0, '0.028*"’" + 0.022*"“" + 0.021*"”" + 0.011*"said" + 0.011*"gun" + 0.008*"police" + 0.007*"violence" + 0.006*"–" + 0.006*"people" + 0.005*"trump" + 0.005*"shooting" + 0.005*"cnn" + 0.004*"law" + 0.004*"one" + 0.004*"white" + 0.003*"officer" + 0.003*"state" + 0.003*"new" + 0.003*"city" + 0.003*"president"')
(1, '0.018*"’" + 0.015*"said" + 0.011*"“" + 0.011*"”" + 0.009*"people" + 0.008*"violence" + 0.006*"cnn" + 0.006*"attack" + 0.006*"killed" + 0.005*"police" + 0.005*"–" + 0.005*"country" + 0.005*"child" + 0.004*"year" + 0.004*"one" + 0.004*"state" + 0.004*"two" + 0.004*"group" + 0.004*"according" + 0.003*"say"')
(2, '0.028*"’" + 0.025*"“" + 0.024*"”" + 0.015*"said" + 0.007*"violence" + 0.005*"cnn" + 0.005*"–" + 0.004*"people" + 0.004*"one" + 0.004*"woman" + 0.004*"say" + 0.003*"year" + 0.003*"police" + 0.003*"also" + 0.003*"told" + 0.003*"time" + 0.003*"—" + 0.003*"israeli" + 0.003*"would" + 0.003*"new"')
(3, '0.023*"“" + 0.023*"”" + 0.023*"’" + 0.016*"said" + 0.007*"presiden

  default_term_info = default_term_info.sort_values(


violencedone
(0, '0.018*"’" + 0.014*"“" + 0.014*"”" + 0.009*"said" + 0.008*"war" + 0.006*"people" + 0.005*"cnn" + 0.004*"child" + 0.004*"–" + 0.004*"image" + 0.004*"say" + 0.004*"one" + 0.004*"year" + 0.003*"crime" + 0.003*"country" + 0.003*"government" + 0.003*"world" + 0.003*"refugee" + 0.003*"police" + 0.003*"city"')
(1, '0.022*"’" + 0.018*"“" + 0.017*"”" + 0.010*"war" + 0.006*"said" + 0.006*"–" + 0.005*"one" + 0.005*"year" + 0.005*"\'s" + 0.004*"world" + 0.004*"time" + 0.004*"cnn" + 0.003*"first" + 0.003*"people" + 0.003*"day" + 0.003*"life" + 0.003*"say" + 0.003*"family" + 0.003*"new" + 0.003*"story"')
(2, '0.021*"’" + 0.017*"“" + 0.017*"”" + 0.013*"said" + 0.011*"syria" + 0.008*"war" + 0.008*"north" + 0.008*"syrian" + 0.007*"korea" + 0.007*"isi" + 0.006*"force" + 0.006*"military" + 0.006*"–" + 0.005*"attack" + 0.005*"state" + 0.005*"group" + 0.005*"country" + 0.005*"government" + 0.005*"korean" + 0.005*"u"')
(3, '0.029*"’" + 0.020*"”" + 0.020*"“" + 0.011*"trump" + 0.010*"said" + 

  default_term_info = default_term_info.sort_values(


wardone


In [44]:
# enable_notebook has to be called; a bare attribute reference does nothing.
pyLDAvis.enable_notebook()
# Display the pyLDAvis panel prepared for cnn_topics[0] (ps is filled in cnn_topics order).
ps[0]

In [45]:
# Display the pyLDAvis panel prepared for cnn_topics[1] (ps is filled in cnn_topics order).
ps[1]

In [46]:
# Display the pyLDAvis panel prepared for cnn_topics[2] (ps is filled in cnn_topics order).
ps[2]

In [47]:
# Display the pyLDAvis panel prepared for cnn_topics[3] (ps is filled in cnn_topics order).
ps[3]

In [48]:
# Display the pyLDAvis panel prepared for cnn_topics[4] (ps is filled in cnn_topics order).
ps[4]

In [49]:
# Display the pyLDAvis panel prepared for cnn_topics[5] (ps is filled in cnn_topics order).
ps[5]

In [50]:
# Display the pyLDAvis panel prepared for cnn_topics[6] (ps is filled in cnn_topics order).
ps[6]

In [51]:
# Display the pyLDAvis panel prepared for cnn_topics[7] (ps is filled in cnn_topics order).
ps[7]

In [52]:
# Display the pyLDAvis panel prepared for cnn_topics[8] (ps is filled in cnn_topics order).
ps[8]

In [53]:
# Display the pyLDAvis panel prepared for cnn_topics[9] (ps is filled in cnn_topics order).
ps[9]

In [54]:
# Display the pyLDAvis panel prepared for cnn_topics[10] (ps is filled in cnn_topics order).
ps[10]

In [55]:
# Display the pyLDAvis panel prepared for cnn_topics[11] (ps is filled in cnn_topics order).
ps[11]

In [56]:
# Display the pyLDAvis panel prepared for cnn_topics[12] (ps is filled in cnn_topics order).
ps[12]

In [57]:
# Display the pyLDAvis panel prepared for cnn_topics[13] (ps is filled in cnn_topics order).
ps[13]

In [58]:
# Display the pyLDAvis panel prepared for cnn_topics[14] (ps is filled in cnn_topics order).
ps[14]

In [59]:
# Display the pyLDAvis panel prepared for cnn_topics[15] (ps is filled in cnn_topics order).
ps[15]

In [60]:
# Display the pyLDAvis panel prepared for cnn_topics[16] (ps is filled in cnn_topics order).
ps[16]

In [61]:
# Display the pyLDAvis panel prepared for cnn_topics[17] (ps is filled in cnn_topics order).
ps[17]

In [62]:
# Display the pyLDAvis panel prepared for cnn_topics[18] (ps is filled in cnn_topics order).
ps[18]

In [63]:
# Display the pyLDAvis panel prepared for cnn_topics[19] (ps is filled in cnn_topics order).
ps[19]

In [64]:
# Display the pyLDAvis panel prepared for cnn_topics[20] (ps is filled in cnn_topics order).
ps[20]

In [65]:
# Display the pyLDAvis panel prepared for cnn_topics[21] (ps is filled in cnn_topics order).
ps[21]

In [66]:
# Display the pyLDAvis panel prepared for cnn_topics[22] (ps is filled in cnn_topics order).
ps[22]

In [67]:
# Display the pyLDAvis panel prepared for cnn_topics[23] (ps is filled in cnn_topics order).
ps[23]

### Train NyPost LDA

In [84]:
# Import nypost data: one DataFrame per topic, keyed by the second
# '_'-separated field of each CSV file name (extension removed).
ny_dir = '../data/nypost/'
ny_csvs = [x for x in os.listdir(ny_dir) if x.endswith('.csv')]
# NOTE: this rebinds `fns`, which the CNN loading cell also assigns; it now
# holds the nypost file stems.
fns = [os.path.splitext(os.path.basename(x))[0] for x in ny_csvs]
ny_topics = [fn.split('_')[1] for fn in fns]
# Read each file from the same folder that was listed above; listing one folder
# and reading from another only works when both happen to hold identical files.
ny_d = {topic: pd.read_csv(os.path.join(ny_dir, csv_name))
        for topic, csv_name in zip(ny_topics, ny_csvs)}

In [75]:
#train all topic models for nypost, commented out because it takes a long time to run
# for i in ny_topics:
#     train_lda(ny_d[i], i+'nypost', 5, 10)
#     print(i+'done')

attackdone
Bidendone
blackdone
Chinadone
conflictdone
crimedone
democraticdone
fairdone
gundone
immigrationdone
invasiondone
LGBTdone
policedone
protestdone
refuseddone
republicdone
Russiadone
terrordone
Trumpdone


KeyError: 'UK'

In [87]:
# Retrain the models below one at a time: the training loop above stopped with KeyError: 'UK' before reaching them
#train_lda(ny_d['UK'], 'UK'+'nypost', 5, 10)

In [88]:
#train_lda(ny_d['ukraine'], 'ukraine'+'nypost', 5, 10)

In [89]:
#train_lda(ny_d['US'], 'US'+'nypost', 5, 10)

In [90]:
#train_lda(ny_d['violence'], 'violence'+'nypost', 5, 10)

In [91]:
#train_lda(ny_d['war'], 'war'+'nypost', 5, 10)

In [None]:
# get pyLDAvis visualizations for all topic models under nypost
# Rebuild the nypost topic names from the file stems in `fns`.
ny_topics = [fn.split('_')[1] for fn in fns]
ps_ny = []
# Enabling the notebook renderer does not depend on the loop variable, so do
# it once instead of on every iteration.
pyLDAvis.enable_notebook()
# Iterate over cnn_topics so that ps_ny[k] refers to the same topic position
# the cells below use when they pair ps[k] with ps_ny[k].
# NOTE(review): presumably `ps` was built in cnn_topics order too - confirm.
for i in cnn_topics:
    if i not in ny_d:
        # No nypost data for this topic: keep the slot so the positional
        # indices used by the display cells stay valid, and say so explicitly.
        ps_ny.append(None)
        print(i+'missing')
        continue
    model = models.ldamodel.LdaModel.load('{}nypost.model'.format(i))
    # Use a local name here so the notebook-level `topics` list is not
    # overwritten by the printed topic tuples.
    for topic in model.print_topics(num_words=20):
        print(topic)
    # The '<topic>nypost' models are trained on ny_d[<topic>] (see the
    # train_lda calls), so the corpus handed to pyLDAvis must be built from
    # the nypost frame, not from the CNN frames in `d`.
    # Assumes the nypost frames have a 'text' column like the CNN ones - TODO confirm.
    lemmas = ny_d[i]['text'].apply(get_lemmas)
    dictionary = corpora.Dictionary.load('{}nypost.model.id2word'.format(i))
    corpus = [dictionary.doc2bow(text) for text in lemmas]
    p = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
    ps_ny.append(p)
    print(i+'done')

(0, '0.013*"new" + 0.009*"—" + 0.009*"city" + 0.008*"said" + 0.007*"york" + 0.007*"state" + 0.007*"crime" + 0.006*"adam" + 0.006*"hochul" + 0.005*"year" + 0.005*"people" + 0.005*"mayor" + 0.004*"law" + 0.003*"would" + 0.003*"one" + 0.003*"democrat" + 0.003*"also" + 0.003*"republican" + 0.003*"like" + 0.003*"zeldin"')
(1, '0.014*"said" + 0.007*"—" + 0.005*"one" + 0.004*"told" + 0.004*"time" + 0.004*"year" + 0.003*"like" + 0.003*"people" + 0.003*"new" + 0.003*"police" + 0.003*"two" + 0.003*"crime" + 0.003*"family" + 0.003*"also" + 0.003*"post" + 0.003*"home" + 0.003*"‘" + 0.002*"would" + 0.002*"get" + 0.002*"man"')
(2, '0.019*"said" + 0.012*"police" + 0.006*"—" + 0.006*"cop" + 0.005*"officer" + 0.005*"crime" + 0.005*"victim" + 0.005*"year" + 0.005*"nypd" + 0.004*"court" + 0.004*"case" + 0.004*"two" + 0.004*"according" + 0.004*"shooting" + 0.004*"gun" + 0.004*"new" + 0.004*"woman" + 0.004*"one" + 0.004*"man" + 0.004*"told"')
(3, '0.016*"said" + 0.007*"police" + 0.005*"told" + 0.005*"—" + 

  default_term_info = default_term_info.sort_values(


attackdone
(0, '0.008*"—" + 0.008*"said" + 0.006*"year" + 0.006*"tax" + 0.006*"bill" + 0.006*"new" + 0.005*"would" + 0.005*"inflation" + 0.004*"state" + 0.004*"billion" + 0.004*"price" + 0.004*"biden" + 0.004*"million" + 0.004*"also" + 0.004*"company" + 0.003*"federal" + 0.003*"last" + 0.003*"spending" + 0.003*"cost" + 0.003*"american"')
(1, '0.011*"said" + 0.010*"border" + 0.009*"migrant" + 0.008*"biden" + 0.007*"—" + 0.007*"u" + 0.005*"president" + 0.005*"state" + 0.005*"new" + 0.004*"people" + 0.004*"city" + 0.004*"administration" + 0.004*"percent" + 0.004*"texas" + 0.004*"cuomo" + 0.003*"american" + 0.003*"year" + 0.003*"one" + 0.003*"last" + 0.003*"country"')
(2, '0.017*"biden" + 0.012*"president" + 0.010*"trump" + 0.009*"said" + 0.008*"house" + 0.007*"—" + 0.007*"republican" + 0.006*"democrat" + 0.005*"would" + 0.005*"former" + 0.004*"election" + 0.004*"senate" + 0.004*"white" + 0.003*"vote" + 0.003*"time" + 0.003*"one" + 0.003*"american" + 0.003*"year" + 0.003*"image" + 0.003*"j

  default_term_info = default_term_info.sort_values(


Bidendone
(0, '0.014*"said" + 0.007*"—" + 0.006*"court" + 0.005*"police" + 0.005*"case" + 0.004*"one" + 0.004*"year" + 0.004*"told" + 0.004*"judge" + 0.003*"attorney" + 0.003*"time" + 0.003*"trial" + 0.003*"also" + 0.003*"new" + 0.003*"two" + 0.003*"would" + 0.003*"according" + 0.003*"officer" + 0.003*"fair" + 0.002*"lawyer"')
(1, '0.008*"get" + 0.008*"bet" + 0.008*"new" + 0.006*"one" + 0.005*"—" + 0.005*"best" + 0.005*"bonus" + 0.004*"first" + 0.004*"amazon" + 0.004*"betting" + 0.004*"sport" + 0.003*"game" + 0.003*"available" + 0.003*"make" + 0.003*"giant" + 0.003*"look" + 0.003*"full" + 0.003*"21+" + 0.003*"also" + 0.003*"apply"')
(2, '0.009*"—" + 0.009*"said" + 0.005*"like" + 0.005*"new" + 0.005*"one" + 0.005*"year" + 0.004*"time" + 0.004*"people" + 0.003*"also" + 0.003*"told" + 0.003*"‘" + 0.003*"show" + 0.003*"say" + 0.003*"would" + 0.003*"get" + 0.003*"first" + 0.003*"life" + 0.003*"image" + 0.002*"day" + 0.002*"know"')
(3, '0.010*"said" + 0.008*"—" + 0.006*"new" + 0.006*"state" 

  default_term_info = default_term_info.sort_values(


blackdone
(0, '0.009*"—" + 0.005*"said" + 0.005*"like" + 0.005*"new" + 0.005*"one" + 0.004*"year" + 0.004*"time" + 0.003*"also" + 0.003*"gun" + 0.003*"‘" + 0.003*"star" + 0.003*"first" + 0.003*"image" + 0.003*"get" + 0.003*"fox" + 0.003*"would" + 0.002*"love" + 0.002*"getty" + 0.002*"show" + 0.002*"day"')
(1, '0.007*"said" + 0.006*"—" + 0.006*"game" + 0.005*"one" + 0.004*"year" + 0.004*"team" + 0.003*"time" + 0.003*"first" + 0.003*"school" + 0.003*"season" + 0.003*"image" + 0.003*"two" + 0.003*"player" + 0.003*"last" + 0.003*"getty" + 0.003*"also" + 0.002*"new" + 0.002*"would" + 0.002*"family" + 0.002*"student"')
(2, '0.023*"said" + 0.016*"police" + 0.010*"shooting" + 0.010*"gun" + 0.008*"shot" + 0.007*"cop" + 0.007*"—" + 0.006*"two" + 0.006*"officer" + 0.006*"according" + 0.005*"man" + 0.005*"one" + 0.005*"suspect" + 0.005*"victim" + 0.004*"school" + 0.004*"street" + 0.004*"nypd" + 0.004*"told" + 0.004*"source" + 0.004*"people"')
(3, '0.018*"said" + 0.006*"—" + 0.006*"police" + 0.006*

  default_term_info = default_term_info.sort_values(


Chinadone
(0, '0.008*"—" + 0.008*"said" + 0.005*"year" + 0.005*"new" + 0.005*"one" + 0.004*"people" + 0.004*"like" + 0.004*"american" + 0.004*"time" + 0.003*"first" + 0.003*"also" + 0.003*"world" + 0.003*"say" + 0.003*"u" + 0.003*"would" + 0.002*"country" + 0.002*"‘" + 0.002*"immigrant" + 0.002*"family" + 0.002*"day"')
(1, '0.012*"trump" + 0.009*"—" + 0.007*"president" + 0.005*"democrat" + 0.005*"republican" + 0.005*"percent" + 0.005*"new" + 0.004*"party" + 0.004*"said" + 0.004*"one" + 0.004*"biden" + 0.004*"would" + 0.004*"american" + 0.004*"year" + 0.004*"election" + 0.004*"time" + 0.003*"democratic" + 0.003*"voter" + 0.003*"also" + 0.003*"political"')
(2, '0.013*"said" + 0.009*"—" + 0.004*"new" + 0.004*"one" + 0.004*"year" + 0.004*"told" + 0.004*"police" + 0.004*"two" + 0.003*"time" + 0.003*"family" + 0.003*"also" + 0.003*"say" + 0.003*"according" + 0.003*"woman" + 0.003*"home" + 0.003*"u" + 0.003*"people" + 0.002*"day" + 0.002*"post" + 0.002*"life"')
(3, '0.014*"said" + 0.013*"city

  default_term_info = default_term_info.sort_values(


conflictdone
(0, '0.012*"said" + 0.008*"—" + 0.005*"one" + 0.005*"new" + 0.004*"home" + 0.004*"year" + 0.004*"like" + 0.003*"time" + 0.003*"two" + 0.003*"police" + 0.003*"say" + 0.003*"woman" + 0.003*"get" + 0.003*"also" + 0.003*"people" + 0.003*"told" + 0.003*"would" + 0.002*"invasion" + 0.002*"day" + 0.002*"first"')
(1, '0.008*"said" + 0.006*"—" + 0.006*"war" + 0.005*"iraq" + 0.005*"u" + 0.005*"president" + 0.004*"would" + 0.004*"state" + 0.004*"one" + 0.004*"american" + 0.004*"year" + 0.004*"obama" + 0.003*"north" + 0.003*"u.s." + 0.003*"military" + 0.003*"people" + 0.003*"korea" + 0.003*"world" + 0.003*"also" + 0.003*"time"')
(2, '0.017*"ukraine" + 0.015*"russian" + 0.015*"said" + 0.013*"russia" + 0.008*"putin" + 0.008*"ukrainian" + 0.008*"president" + 0.007*"u" + 0.005*"—" + 0.005*"war" + 0.005*"biden" + 0.005*"military" + 0.005*"invasion" + 0.005*"force" + 0.005*"country" + 0.004*"would" + 0.004*"people" + 0.003*"also" + 0.003*"official" + 0.003*"troop"')
(3, '0.010*"said" + 0.00

  default_term_info = default_term_info.sort_values(


crimedone
(0, '0.012*"said" + 0.007*"people" + 0.007*"—" + 0.006*"right" + 0.005*"gay" + 0.005*"woman" + 0.005*"transgender" + 0.005*"say" + 0.005*"lgbt" + 0.004*"law" + 0.004*"year" + 0.004*"also" + 0.004*"state" + 0.003*"one" + 0.003*"gender" + 0.003*"would" + 0.003*"court" + 0.003*"group" + 0.003*"new" + 0.003*"child"')
(1, '0.014*"said" + 0.008*"new" + 0.007*"—" + 0.005*"gay" + 0.005*"state" + 0.005*"city" + 0.004*"lgbt" + 0.004*"people" + 0.004*"york" + 0.004*"year" + 0.004*"would" + 0.004*"one" + 0.004*"community" + 0.003*"right" + 0.003*"cuomo" + 0.003*"also" + 0.003*"time" + 0.003*"group" + 0.003*"marriage" + 0.003*"told"')
(2, '0.010*"said" + 0.009*"—" + 0.009*"gay" + 0.005*"people" + 0.004*"one" + 0.004*"pride" + 0.004*"parade" + 0.004*"lgbt" + 0.004*"year" + 0.004*"community" + 0.004*"city" + 0.004*"new" + 0.004*"police" + 0.003*"also" + 0.003*"day" + 0.003*"trump" + 0.003*"time" + 0.003*"march" + 0.003*"would" + 0.003*"like"')
(3, '0.008*"—" + 0.005*"said" + 0.005*"trump" +

  default_term_info = default_term_info.sort_values(


democraticdone
(0, '0.026*"said" + 0.019*"police" + 0.009*"cop" + 0.007*"man" + 0.006*"officer" + 0.006*"victim" + 0.005*"two" + 0.005*"suspect" + 0.005*"according" + 0.005*"—" + 0.005*"shot" + 0.005*"street" + 0.005*"one" + 0.004*"car" + 0.004*"around" + 0.004*"woman" + 0.004*"shooting" + 0.004*"advertisement" + 0.004*"hospital" + 0.004*"nypd"')
(1, '0.008*"said" + 0.006*"—" + 0.004*"police" + 0.004*"told" + 0.004*"one" + 0.004*"family" + 0.004*"time" + 0.004*"also" + 0.003*"year" + 0.003*"advertisement" + 0.003*"game" + 0.003*"new" + 0.003*"image" + 0.003*"two" + 0.003*"friend" + 0.003*"getty" + 0.003*"home" + 0.003*"star" + 0.003*"fan" + 0.003*"day"')
(2, '0.007*"said" + 0.006*"police" + 0.005*"—" + 0.004*"one" + 0.004*"told" + 0.003*"video" + 0.003*"new" + 0.003*"time" + 0.003*"woman" + 0.003*"charge" + 0.003*"according" + 0.003*"year" + 0.003*"also" + 0.003*"two" + 0.003*"allegedly" + 0.002*"family" + 0.002*"court" + 0.002*"post" + 0.002*"officer" + 0.002*"incident"')
(3, '0.019*"

In [None]:
# display pyLDAvis visualizations for topic models under nypost
ps_ny[0]

In [None]:
ps_ny[1]

In [None]:
# war
ps[23]

In [None]:
# war nypost - same index as the 'war' panel shown from ps in the previous cell
ps_ny[23]

In [None]:
# nypost terror
ps_ny[17]

In [None]:
# CNN terror
ps[17]

In [None]:
#ukraine CNN
# NOTE(review): the nypost training log prints 'Trumpdone' and then fails on
# 'UK', and 'ukraine' is retrained after 'UK'; if ps follows the same order,
# index 19 is 'UK' and 'ukraine' would be index 20 - confirm this index.
ps[19]

In [None]:
# ukraine nypost
# NOTE(review): the nypost training log prints 'Trumpdone' and then fails on
# 'UK', and 'ukraine' is retrained after 'UK'; if ps_ny follows the same
# order, index 19 is 'UK' and 'ukraine' would be index 20 - confirm this index.
ps_ny[19]

In [None]:
#violence CNN
ps[22]

In [None]:
#violence nypost
ps_ny[22]

In [None]:
# gun CNN 
ps[8]

In [None]:
# gun nypost - same index as the 'gun CNN' panel shown from ps in the previous cell
ps_ny[8]

In [None]:
# republic CNN
ps[15]

In [None]:
# republic nypost
ps_ny[15]

In [None]:
# democrat CNN
ps[6]

In [None]:
# democrat nypost
ps_ny[6]