In [None]:
import pandas as pd
import nltk
import string
import pyLDAvis.gensim_models

from gensim import corpora, models
from gensim.utils import effective_n_jobs
import gc
import os

In [None]:
# Load every CSV in the CNN folder into a dict of DataFrames keyed by topic.
# The topic is the second '_'-separated token of each file stem.
# os.listdir returns entries in arbitrary order, so the listing is sorted
# (case-insensitively) to keep the topic order -- and every positional index
# derived from it in later cells -- stable across runs and machines.
csvs = sorted((x for x in os.listdir('../data/CNN/') if x.endswith('.csv')),
              key=str.lower)
fns = [os.path.splitext(os.path.basename(x))[0] for x in csvs]
topics = [fn.split('_')[1] for fn in fns]
d = {topic: pd.read_csv(os.path.join('../data/CNN', csv))
     for topic, csv in zip(topics, csvs)}

In [None]:
# functions to lemmatize news texts
# STOP holds every token that get_lemmas drops: NLTK's English stop words,
# each character of string.punctuation, and a few extra quote, dash and
# possessive tokens.
STOP = (
    set(nltk.corpus.stopwords.words('english'))
    | set(string.punctuation)
    | {'``', "''", "’", "“", "”", "–", "\'s"}
)

def get_lemmas(text):
    '''
    Gets lemmas for a string input, excluding stop words, punctuation, as well
    as a set of study-specific stop-words

    Inputs:
        text: article text; non-string values are converted with str().
            Missing values (None or a float NaN) yield an empty list.
    Returns:
        list of lemmatized, lower-cased tokens that are not in STOP
    '''
    # A missing article would otherwise be stringified and leak a bogus token
    # (e.g. 'nan') into the vocabulary. NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(t)
            for t in nltk.word_tokenize(str(text).lower()) if t not in STOP]

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=2):
    '''
    Trains one LDA model per candidate number of topics and scores each one
    with the u_mass coherence measure.

    Inputs:
        dictionary: gensim dictionary passed as id2word / dictionary
        corpus: bag-of-words corpus the models are trained and scored on
        texts: accepted for interface compatibility; not read by this function
        limit: exclusive upper bound on the number of topics
        start: first number of topics to try
        step: increment between successive numbers of topics

    Returns (model_list, coherence_values): two parallel lists with one entry
    per value in range(start, limit, step) (pick models with the highest
    coherence)
    '''
    n_workers = effective_n_jobs(-1)
    scored_models = []
    for k in range(start, limit, step):
        lda = models.ldamulticore.LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=k,
            workers=n_workers,
        )
        scorer = models.coherencemodel.CoherenceModel(
            model=lda,
            corpus=corpus,
            dictionary=dictionary,
            coherence='u_mass',
        )
        scored_models.append((lda, scorer.get_coherence()))

    model_list = [lda for lda, _ in scored_models]
    coherence_values = [score for _, score in scored_models]
    return model_list, coherence_values

In [None]:
def train_lda(topic_df, model_name, num_topics, workers, random_state=None):
    '''
    Trains LDA model on a dataframe of news articles, saves model to disk
    Inputs:
        topic_df: dataframe of news articles; its 'text' column is lemmatized
        model_name: name of model to be saved; the model is written to
            '<model_name>.model' relative to the current working directory
        num_topics: number of topics to be trained
        workers: number of workers to be used in training
        random_state: optional seed forwarded to LdaMulticore so a training
            run can be reproduced; None (default) leaves training unseeded
    Returns:
        None
    '''
    # Get lemmas for each article
    lemmas = topic_df['text'].apply(get_lemmas)
    # Drop this function's reference to the frame. Memory is only reclaimed
    # if the caller holds no other reference to it.
    del topic_df
    gc.collect()
    # Initialize Series of lemmas as Gensim Dictionary for further processing
    dictionary = corpora.Dictionary(lemmas)
    # Convert dictionary into bag of words format: list of (token_id, token_count) tuples
    bow_corpus = [dictionary.doc2bow(text) for text in lemmas]
    # No coherence sweep here: choosing num_topics is the caller's job (use
    # compute_coherence_values for that). Running a sweep on every call would
    # train several extra models whose scores this function has no use for.
    # train LDA model
    ldamodel = models.ldamulticore.LdaMulticore(bow_corpus,
                                                num_topics=num_topics,
                                                id2word=dictionary,
                                                workers=workers,
                                                passes=20,
                                                iterations=400,
                                                random_state=random_state)
    ldamodel.save('{}.model'.format(model_name))

In [None]:
# train all topic models for CNN, commented out because it takes a long time to run

# for i in topics:
#     train_lda(d[i], i+'CNN', 5, 10)
#     print(i+'done')

In [None]:
# get pyLDAvis visualizations for all topic models under CNN
# Topic names are taken from the keys of d (insertion order = load order)
# rather than from fns, which the NYPost loading cell rebinds.
cnn_topics = list(d)
ps = []
# Notebook rendering only needs to be enabled once, not once per model.
pyLDAvis.enable_notebook()
for i in cnn_topics:
    model = models.ldamodel.LdaModel.load('{}CNN.model'.format(i))
    # Loop directly over print_topics() so the global `topics` list of topic
    # names (used by the training loop) is not overwritten.
    for topic_terms in model.print_topics(num_words=20):
        print(topic_terms)
    # Rebuild the bag-of-words corpus with the dictionary saved with the model.
    lemmas = d[i]['text'].apply(get_lemmas)
    dictionary = corpora.Dictionary.load('{}CNN.model.id2word'.format(i))
    corpus = [dictionary.doc2bow(text) for text in lemmas]
    p = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
    ps.append(p)
    print(i+'done')

In [None]:
# enable_notebook has to be called; a bare attribute reference does nothing.
pyLDAvis.enable_notebook()
# CNN pyLDAvis panel for cnn_topics[0]
ps[0]

In [None]:
# CNN pyLDAvis panel for cnn_topics[1]
ps[1]

In [None]:
# CNN pyLDAvis panel for cnn_topics[2]
ps[2]

In [None]:
# CNN pyLDAvis panel for cnn_topics[3]
ps[3]

In [None]:
# CNN pyLDAvis panel for cnn_topics[4]
ps[4]

In [None]:
# CNN pyLDAvis panel for cnn_topics[5]
ps[5]

In [None]:
# CNN pyLDAvis panel for cnn_topics[6]
ps[6]

In [None]:
# CNN pyLDAvis panel for cnn_topics[7]
ps[7]

In [None]:
# CNN pyLDAvis panel for cnn_topics[8]
ps[8]

In [None]:
# CNN pyLDAvis panel for cnn_topics[9]
ps[9]

In [None]:
# CNN pyLDAvis panel for cnn_topics[10]
ps[10]

In [None]:
# CNN pyLDAvis panel for cnn_topics[11]
ps[11]

In [None]:
# CNN pyLDAvis panel for cnn_topics[12]
ps[12]

In [None]:
# CNN pyLDAvis panel for cnn_topics[13]
ps[13]

In [None]:
# CNN pyLDAvis panel for cnn_topics[14]
ps[14]

In [None]:
# CNN pyLDAvis panel for cnn_topics[15]
ps[15]

In [None]:
# CNN pyLDAvis panel for cnn_topics[16]
ps[16]

In [None]:
# CNN pyLDAvis panel for cnn_topics[17]
ps[17]

In [None]:
# CNN pyLDAvis panel for cnn_topics[18]
ps[18]

In [None]:
# CNN pyLDAvis panel for cnn_topics[19]
ps[19]

In [None]:
# CNN pyLDAvis panel for cnn_topics[20]
ps[20]

In [None]:
# CNN pyLDAvis panel for cnn_topics[21]
ps[21]

In [None]:
# CNN pyLDAvis panel for cnn_topics[22]
ps[22]

In [None]:
# CNN pyLDAvis panel for cnn_topics[23]
ps[23]

### Train NyPost LDA

In [None]:
#import nypost data
# List and read from one directory so the two paths cannot drift apart.
ny_dir = '../data/nypost/'
# os.listdir returns entries in arbitrary order; sort (case-insensitively) so
# the topic order is stable across runs and machines.
ny_csvs = sorted((x for x in os.listdir(ny_dir) if x.endswith('.csv')),
                 key=str.lower)
fns = [os.path.splitext(os.path.basename(x))[0] for x in ny_csvs]
# The topic is the second '_'-separated token of each file stem.
ny_topics = [fn.split('_')[1] for fn in fns]
ny_d = {topic: pd.read_csv(os.path.join(ny_dir, csv))
        for topic, csv in zip(ny_topics, ny_csvs)}

In [None]:
#train all topic models for nypost, commented out because it takes a long time to run
# for i in ny_topics:
#     train_lda(ny_d[i], i+'nypost', 5, 10)
#     print(i+'done')

In [None]:
# retrain the following model because the previous training run raised an error
#train_lda(ny_d['UK'], 'UK'+'nypost', 5, 10)

In [None]:
#train_lda(ny_d['ukraine'], 'ukraine'+'nypost', 5, 10)

In [None]:
#train_lda(ny_d['US'], 'US'+'nypost', 5, 10)

In [None]:
#train_lda(ny_d['violence'], 'violence'+'nypost', 5, 10)

In [None]:
#train_lda(ny_d['war'], 'war'+'nypost', 5, 10)

In [None]:
# get pyLDAvis visualizations for all topic models under nypost
# Iterate over the NYPost topics and lemmatize the NYPost articles (ny_d), so
# each NYPost model is visualized against NYPost text rather than CNN text.
ny_topics = list(ny_d)
ps_ny = []
# Notebook rendering only needs to be enabled once, not once per model.
pyLDAvis.enable_notebook()
# NOTE(review): ps_ny follows ny_d's order; the side-by-side cells further
# down index ps and ps_ny with the same numbers, which assumes the CNN and
# NYPost topic lists are in the same order -- verify.
for i in ny_topics:
    model = models.ldamodel.LdaModel.load('{}nypost.model'.format(i))
    # Loop directly over print_topics() so the global `topics` list of CNN
    # topic names is not overwritten.
    for topic_terms in model.print_topics(num_words=20):
        print(topic_terms)
    lemmas = ny_d[i]['text'].apply(get_lemmas)
    dictionary = corpora.Dictionary.load('{}nypost.model.id2word'.format(i))
    corpus = [dictionary.doc2bow(text) for text in lemmas]
    p = pyLDAvis.gensim_models.prepare(model, corpus, dictionary)
    ps_ny.append(p)
    print(i+'done')

In [None]:
# display pyLDAvis visualizations for topic models under nypost
# (first panel collected in ps_ny)
ps_ny[0]

In [None]:
# second panel collected in ps_ny
ps_ny[1]

In [None]:
# war CNN
ps[23]

In [None]:
# war nypost (same index as the CNN war panel; assumes ps_ny shares ps's
# topic order -- TODO confirm)
ps_ny[23]

In [None]:
# terror nypost: pyLDAvis panel from the NYPost model list
ps_ny[17]

In [None]:
# terror CNN: pyLDAvis panel from the CNN model list
ps[17]

In [None]:
# ukraine CNN: pyLDAvis panel from the CNN model list
ps[19]

In [None]:
# ukraine nypost: pyLDAvis panel from the NYPost model list
ps_ny[19]

In [None]:
# violence CNN: pyLDAvis panel from the CNN model list
ps[22]

In [None]:
# violence nypost: pyLDAvis panel from the NYPost model list
ps_ny[22]

In [None]:
# gun CNN: pyLDAvis panel from the CNN model list
ps[8]

In [None]:
# gun nypost (same index as the CNN gun panel; assumes ps_ny shares ps's
# topic order -- TODO confirm)
ps_ny[8]

In [None]:
# republic CNN: pyLDAvis panel from the CNN model list
ps[15]

In [None]:
# republic nypost: pyLDAvis panel from the NYPost model list
ps_ny[15]

In [None]:
# democrat CNN: pyLDAvis panel from the CNN model list
ps[6]

In [None]:
# democrat nypost: pyLDAvis panel from the NYPost model list
ps_ny[6]