--- 

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Reference:**

> https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#2prerequisitesdownloadnltkstopwordsandspacymodelforlemmatization

 --- 

__Import the modules__

In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
import os
import glob
from datetime import datetime
import string
from collections import Counter
from tqdm import tqdm, notebook
tqdm.pandas()

# don't display warnings info
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Gensim
# !pip install --upgrade gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet
# set the environment for mallet
os.environ.update({'MALLET_HOME':r'C:\Users\Willa\OneDrive\Desktop\mallet-2.0.8'})
mallet_path = r'C:\Users\Willa\OneDrive\Desktop\mallet-2.0.8\bin\mallet'# update this path

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Nltk for text processing
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

# spacy for lemmatization
import spacy


# model storage
import pickle
import joblib


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Data Cleaning 

## Import Datasets

__Read the scraped FOX News__

In [4]:
fox_coronav = pd.read_excel('/content/drive/My Drive/News_Fox_CNN/foxnews_article_form.xlsx')
fox_coronav.head()

Unnamed: 0,true_time,headline,link,summary,content
0,2020-04-09 00:00:00,Wife of Edmonton Oilers' Colby Cave gives upda...,https://www.foxnews.com/sports/wife-of-edmonto...,The wife of Edmonton Oilers forward Colby Cave...,The wife of Edmonton Oilers forward Colby Cave...
1,2020-04-09 00:00:00,"Body of RFK granddaughter’s son, 8, recovered ...",https://www.foxnews.com/us/body-of-rfk-grandda...,The body of 8-year-old Gideon McKean was recov...,The body of 8-year-old Gideon McKean was recov...
2,2020-04-09 00:00:00,Hundreds of inmates at Washington state correc...,https://www.foxnews.com/us/hundreds-of-inmates...,Authorities in Washington state responded late...,Get all the latest news on coronavirus and mor...
3,2020-04-09 00:00:00,"Nikki Haley, in op-ed, says coronavirus respon...",https://www.foxnews.com/politics/nikki-haley-i...,Responsibility for combating the coronavirus o...,Get all the latest news on coronavirus and mor...
4,2020-04-09 00:00:00,Barr disappointed by partisan attacks leveled ...,https://www.foxnews.com/media/ag-william-barr-...,"Attorney General William Barr told ""The Ingrah...",Get all the latest news on coronavirus and mor...


In [5]:
# drop the nulls, transform the datatime type
fox_coronav = fox_coronav.dropna()
date = datetime.strptime('2020-1-22', '%Y-%m-%d')
df1 = fox_coronav.loc[fox_coronav['true_time']> date,:]

In [6]:
# rearrange the columns 
col = ['true_time', 'headline', 'content', 'link']
df1 = df1[col]
df1['platform'] = 'fox'
df1.head()

Unnamed: 0,true_time,headline,content,link,platform
0,2020-04-09 00:00:00,Wife of Edmonton Oilers' Colby Cave gives upda...,The wife of Edmonton Oilers forward Colby Cave...,https://www.foxnews.com/sports/wife-of-edmonto...,fox
1,2020-04-09 00:00:00,"Body of RFK granddaughter’s son, 8, recovered ...",The body of 8-year-old Gideon McKean was recov...,https://www.foxnews.com/us/body-of-rfk-grandda...,fox
2,2020-04-09 00:00:00,Hundreds of inmates at Washington state correc...,Get all the latest news on coronavirus and mor...,https://www.foxnews.com/us/hundreds-of-inmates...,fox
3,2020-04-09 00:00:00,"Nikki Haley, in op-ed, says coronavirus respon...",Get all the latest news on coronavirus and mor...,https://www.foxnews.com/politics/nikki-haley-i...,fox
4,2020-04-09 00:00:00,Barr disappointed by partisan attacks leveled ...,Get all the latest news on coronavirus and mor...,https://www.foxnews.com/media/ag-william-barr-...,fox


__Read the scraped CNN News__

In [11]:
cnn_coronav = pd.read_excel('CNN_covid-19_0408.xlsx').drop(['Unnamed: 0'], axis = 1)
cnn_coronav = cnn_coronav.dropna()
date = datetime.strptime('2020-1-22', '%Y-%m-%d')
cnn_coronav['published'] = cnn_coronav.published_date.apply(lambda x : datetime.strptime(x, '%b %d, %Y'))
df2 = cnn_coronav.loc[cnn_coronav['published']> date,:]
col2 = ['published', 'headline', 'content', 'link']
df2 = df2[col2]
df2.columns = col
df2['platform'] = 'cnn'
df2.head()

Unnamed: 0,true_time,headline,content,link,platform
0,2020-04-08,In photos: Trump vents his frustrations at cor...,"Trump points as he speaks on Tuesday, March 31...",https://www.cnn.com/2020/04/08/politics/galler...,cnn
1,2020-04-08,Coping with disease and disability in the time...,Sassy Outwater-Wright has fought off cancer th...,https://www.cnn.com/2020/04/08/health/coronavi...,cnn
2,2020-04-08,Why stock markets are stuck in limbo,"First came the initial shock as countries, sta...",https://www.cnn.com/2020/04/08/investing/prema...,cnn
3,2020-04-06,April 7 coronavirus news,Our live coverage of the coronavirus pandemic ...,https://www.cnn.com/world/live-news/coronaviru...,cnn
4,2020-04-05,Coronavirus pandemic upends daily life,Our live coverage of the coronavirus pandemic ...,https://www.cnn.com/world/live-news/coronaviru...,cnn


__Combine CNN News and FOX News__

In [8]:
df = pd.concat([df1, df2], axis = 0)
df = df.sort_values('true_time').reset_index(drop = False)
df.head()

NameError: ignored

## Clean the texts in 'content' column

Since the scraped news contain advertising 'stop sentences' like __'Sign up here'__ and __'Check out for latest hot headlines'__. We should first delete these sentences, then head to the domain of text cleaning.

In [None]:
# check for duplicated rows 
df[df.duplicated(subset = 'content')]

Unnamed: 0,index,true_time,headline,content,link,platform


In [None]:
# drop the duplicated rows 
df.drop_duplicates(subset = 'content', inplace = True)

__Check the 'stop sentences'__

In [None]:
all_news = ' '.join(''.join(df['content']).split('\xa0')) #exclude \xa0
all_news_sentences = sent_tokenize(all_news) # get all the sentences
Counter(all_news_sentences).most_common()

[('All rights reserved.', 1734),
 ('©2020 FOX News Network, LLC.', 1725),
 ('This material may not be published, broadcast, rewritten, or redistributed.',
  1636),
 ('Sign up here.', 1346),
 ('All market data delayed 20 minutes.Get all the latest news on coronavirus and more delivered daily to your inbox.',
  1285),
 ("Get all the stories you need-to-know from the most powerful name in news delivered first thing every morning to your inbox Subscribed You've successfully subscribed to this newsletter!",
  785),
 ("Check out what's clicking on Foxnews.com.", 685),
 ("Subscribed You've successfully subscribed to this newsletter!", 402),
 ("Check out what's clicking today in entertainment.", 379),
 ('The Associated Press contributed to this report.', 252),
 ('Gov.', 124),
 ('GUTFELD: Yes.', 117),
 ('...', 111),
 ('Fun stories about food, relationships, the great outdoors and more.', 106),
 ('All market data delayed 20 minutes.', 103),
 ('Fox News Flash top headlines are here.', 85),
 ('Tha

In [None]:
stop_sentences = ['contributed to this', 
                  'This material may not be published, broadcast, rewritten, or redistributed', 
                  'FOX News Network, LLC', 
                  'All rights reserved',
                  'All market data delayed',
                  'Sign up here',
                  "Get all the stories you need-to-know from the most powerful name in news delivered first thing every morning to your inbox Subscribed You've successfully subscribed to this newsletter! This material may not be published, broadcast, rewritten, or redistributed",
                  "LIMITED TIME OFFER, ",
                  r"Learn about all the*",
                  r'Get[ ]*latest[ ]*news',
                  r"delivered[ ]*daily[ ]*inbox",
                  r"Check out what's clicking[ ,\w]*",
                  r"successfully subscribed to[ ,\w]*",
                  r"Thank you for making us your first choice[ ,\w]*",
                  r"Flash top[ \w,]*headlines",
                  r"CLICK HERE*",
                  "Mobile users click here",
                  r"Fun stories about [\w,]* and more",
                  r"Stay up-to-date on the biggest [\w,]* news with our weekly recap",
                  r"originally appeared on [\w,]*",
                  r"originally published on [\w,]*",
                  r"Get a daily look at[ \w,]*",
                  "Fox Nation",
                  "The FOX NEWS RUNDOWN",
                  "subscribe and download",
                  "FOX platforms",
                  "FOX NOW",
                  "FOX NEWS APPFox News",
                  "Fox News First",
                  "copyright",
                  "Follow below on the Fox News live blog",
                  "Kim Komando Show",
                  # stop sentences from CNN
                  'Watch the latest videos on Covid-19.',
                  r'live[ ]*coverage[ ]*of[ \w]*',
                  "Note: The prices above reflect the retailer's listed price at the time of publication.",
                  "Read the full story here.",
                  r"CNN Coronavirus",
                  r"A version of this article first appeared",
                  "You can sign up for free right here",
                  "At CNN, we start with the facts.",
                  "Visit CNN's home for Facts First.",
                  "delivered to your inbox daily.",
                  "Sign up here."
                  ]
stop_sentences = [*map(lambda x: x.lower(), stop_sentences)]

__Delete stop sentences__

In [None]:
def contain_stop_sentences(sentence):
    '''
    extract the sentences that contain stop sentences
    '''
    check_status = [*map(lambda x: bool(re.search(x, sentence)), stop_sentences)]
    return(any(check_status)) 

In [None]:
def extract_no_stop_sentences(text_data):
    '''
    Applied on dataframe's column level,
    to delete the stop sentences from a whole news article
    '''
    single_news = ' '.join(''.join(text_data).lower().split('\xa0')) #exclude \xa0
    single_news_sentences = sent_tokenize(single_news)
    new_sentences = []
    for x in single_news_sentences:
        if contain_stop_sentences(x) is False:
            new_sentences.append(x)
    single_new_news = ' '.join(new_sentences)
    
    return(single_new_news)

In [None]:
new_news_list = df['content'].progress_apply(extract_no_stop_sentences)
df.loc[:,'true_content'] = new_news_list

100%|█████████████████████████████████████████████████████████████████████████████| 8342/8342 [00:20<00:00, 415.67it/s]


## Pre-process the text before stemming

In [None]:
# remove punctuations
def make_lower(text):
    return text.lower()

def remove_punctuation(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text) 
    text = re.sub("\'", "", text)
    return re.sub(r'[^\w\s]', ' ', text)

def strip_extraspace(text):
    return ' '.join(text.split())

def remove_digits(text):
    return re.sub('\d', ' ', text)

def replace_word(text,word,replacement):
    return text.replace(word,replacement)

def remove_words(text,wordlist):
    for word in wordlist:
        if word in text.split():
            text = re.sub(r'\b{}\b'.format(word), '', text)  
    return text

In [None]:
stop_words = stopwords.words('english')
stop_words.append('coronavirus')
stop_words.append('fox')

In [None]:
def clean_text(text):
    text = make_lower(text)
    text = replace_word(text,'covid-19','covid') 
    text = replace_word(text,'corona virus','coronavirus') 
    text = replace_word(text,'covid','coronavirus') 
    text = replace_word(text,'fox news','fox') 
    text = replace_word(text,'new york','newyork')
    text = replace_word(text, 'begin video clip', '')
    text = replace_word(text, 'commercial break', '')
    text = remove_punctuation(text)
    text = remove_digits(text)
    text = remove_words(text,stop_words)
    
    return text

In [None]:
df['clean_content'] = df['true_content'].progress_apply(lambda x:clean_text(x))
data = df.clean_content.values.tolist()

100%|█████████████████████████████████████████████████████████████████████████████| 8342/8342 [01:14<00:00, 111.70it/s]


## Tokenize and lemmatize words

__Tokenize the pre-processed words__

In [None]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations


In [None]:
data_words = list(sent_to_words(data))
data_words[:2]

[['early',
  'testing',
  'coronavirus',
  'vaccine',
  'likely',
  'begin',
  'three',
  'months',
  'says',
  'dr',
  'anthony',
  'fauci',
  'director',
  'national',
  'institute',
  'allergy',
  'infectious',
  'diseases',
  'national',
  'institutes',
  'health',
  'dr',
  'horacio',
  'arruda',
  'quebec',
  'chief',
  'public',
  'health',
  'officer',
  'told',
  'reporters',
  'wednesday',
  'five',
  'people',
  'recently',
  'traveled',
  'country',
  'held',
  'montreal',
  'hospitals',
  'though',
  'not',
  'clear',
  'ones',
  'sixth',
  'person',
  'initially',
  'group',
  'reportedly',
  'discharged',
  'testing',
  'negative',
  'virus',
  'montreal',
  'gazette',
  'reported',
  'confirmed',
  'cases',
  'arruda',
  'assured',
  'often',
  'people',
  'cold',
  'like',
  'went',
  'china',
  'went',
  'specific',
  'region',
  'careful',
  'majority',
  'cases',
  'negative',
  'would',
  'rather',
  'act',
  'abundance',
  'caution',
  'allow',
  'case',
  'slip',

__Lemmatize the tokenized words, keep only adj, noun, verb, adv__

In [None]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [None]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
all_news = []
for one_news_list in notebook.tqdm(data_words):
    one_news = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in one_news_list]
    all_news.append(one_news)
    
data_lemmatized = all_news

HBox(children=(FloatProgress(value=0.0, max=8342.0), HTML(value='')))




## Create Bigrams for lemmatized texts

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_lemmatized, min_count=5, threshold=100) # higher threshold fewer phrases.

# Faster way to get a sentence clubbed as a bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)


['early', 'testing', 'coronavirus', 'vaccine', 'likely', 'begin', 'three', 'months', 'says', 'dr_anthony', 'fauci', 'director', 'national_institute', 'allergy_infectious', 'diseases', 'national', 'institutes', 'health', 'dr', 'horacio', 'arruda', 'quebec', 'chief', 'public', 'health', 'officer', 'told', 'reporters', 'wednesday', 'five', 'people', 'recently', 'traveled', 'country', 'held', 'montreal', 'hospitals', 'though', 'not', 'clear', 'ones', 'sixth', 'person', 'initially', 'group', 'reportedly', 'discharged', 'testing', 'negative', 'virus', 'montreal', 'gazette', 'reported', 'confirmed', 'cases', 'arruda', 'assured', 'often', 'people', 'cold', 'like', 'went', 'china', 'went', 'specific', 'region', 'careful', 'majority', 'cases', 'negative', 'would', 'rather', 'act', 'abundance_caution', 'allow', 'case', 'slip', 'community', 'added', 'news', 'week', 'confirmed', 'first', 'case', 'coronavirus', 'seattle', 'man', 'patient', 'recently', 'traveled', 'wuhan', 'chinese', 'city', 'outbrea

In [None]:
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [None]:
data_words_bigrams = make_bigrams(data_lemmatized)

## Create the Dictionary and Corpus needed for Topic Modeling

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)

# Create Corpus
texts = data_words_bigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (6, 1), (7, 2), (8, 1), (9, 4), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 3), (19, 3), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 2), (36, 1), (37, 3), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 2), (53, 2), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 2), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 2), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 2), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 2), (93, 1), (94, 2), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1)]]


In [None]:
# # Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('abundance_caution', 1),
  ('act', 1),
  ('add', 1),
  ('allergy_infectious', 1),
  ('allow', 1),
  ('arruda', 2),
  ('assure', 1),
  ('begin', 2),
  ('careful', 1),
  ('case', 4),
  ('center', 1),
  ('chief', 1),
  ('china', 2),
  ('chinese', 1),
  ('city', 1),
  ('clear', 1),
  ('cold', 1),
  ('community', 1),
  ('confirm', 3),
  ('coronavirus', 3),
  ('country', 1),
  ('currently', 1),
  ('director', 1),
  ('discharge', 1),
  ('disease', 2),
  ('disease_control', 1),
  ('dr', 1),
  ('dr_anthony', 1),
  ('early', 2),
  ('experience', 1),
  ('fauci', 1),
  ('first', 2),
  ('five', 1),
  ('focus', 1),
  ('gazette', 1),
  ('go', 2),
  ('group', 1),
  ('health', 3),
  ('held', 1),
  ('horacio', 1),
  ('hospital', 1),
  ('however', 1),
  ('indeed', 1),
  ('initially', 1),
  ('like', 1),
  ('likely', 1),
  ('low', 1),
  ('maintains', 1),
  ('majority', 1),
  ('man', 1),
  ('month', 1),
  ('montreal', 2),
  ('national_institute', 2),
  ('negative', 2),
  ('news', 1),
  ('not', 1),
  ('of

# Building LDA Mallet Model

In [None]:
# train the ldamallet model
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=5, id2word=id2word)

In [None]:
# Show Topics
print(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(0,
  [('time', 0.011850236277717193),
   ('coronavirus', 0.010972633328140417),
   ('home', 0.010087241003271538),
   ('aposs', 0.01004829412681103),
   ('family', 0.009048657630991329),
   ('year', 0.007745235498779665),
   ('people', 0.007444046320818404),
   ('day', 0.00658202212182583),
   ('show', 0.005673261671080646),
   ('make', 0.005662875837357844)]),
 (1,
  [('coronavirus', 0.03750557699721475),
   ('health', 0.017926526581159927),
   ('test', 0.015632918794588904),
   ('people', 0.01497450824962423),
   ('virus', 0.014747096712712088),
   ('hospital', 0.01210912288453125),
   ('patient', 0.011907701237551926),
   ('care', 0.0101490520187647),
   ('newyork', 0.009943298723463239),
   ('medical', 0.00991947465769149)]),
 (2,
  [('coronavirus', 0.014050741204390675),
   ('company', 0.00837389375184274),
   ('week', 0.007996887279898305),
   ('business', 0.007450711237209572),
   ('work', 0.007397543657832793),
   ('pandemic', 0.007025370602195337),
   ('make', 0.006283441199

## Find the optimal number of topics for LDA

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_words_bigrams, start=3, limit=21, step=3)

In [None]:
# Show elbow graph
import matplotlib.pyplot as plt
%matplotlib inline

limit=21; start=3; step=3;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# select the model based on the coherence-score elbow-plot 
optimal_model = model_list[4]
filename = 'foxcnn_model.sav'
pickle.dump(optimal_model, open(filename, 'wb'))
optimal_model = joblib.load(filename)

In [None]:
# Select the model and print the topics
model_topics = optimal_model.show_topics(formatted=False)
print("Topics for the chosen LDA model:\n")
pprint(optimal_model.print_topics(num_words=10))

Topics for the chosen LDA model:

[(0,
  '0.008*"call" + 0.008*"report" + 0.007*"medium" + 0.007*"write" + '
  '0.007*"year" + 0.007*"claim" + 0.006*"charge" + 0.006*"news" + '
  '0.006*"tweet" + 0.006*"law"'),
 (1,
  '0.039*"hospital" + 0.039*"newyork" + 0.025*"medical" + 0.021*"care" + '
  '0.018*"state" + 0.018*"patient" + 0.017*"city" + 0.017*"mask" + '
  '0.016*"health" + 0.012*"case"'),
 (2,
  '0.028*"trump" + 0.023*"biden" + 0.021*"president" + 0.021*"state" + '
  '0.016*"campaign" + 0.013*"sander" + 0.013*"democratic" + 0.012*"election" + '
  '0.011*"primary" + 0.010*"vote"'),
 (3,
  '0.056*"china" + 0.019*"travel" + 0.019*"chinese" + 0.018*"outbreak" + '
  '0.017*"report" + 0.016*"case" + 0.014*"virus" + 0.013*"health" + '
  '0.013*"official" + 0.013*"wuhan"'),
 (4,
  '0.027*"country" + 0.024*"people" + 0.017*"case" + 0.014*"italy" + '
  '0.012*"death" + 0.012*"virus" + 0.010*"government" + 0.010*"number" + '
  '0.009*"report" + 0.009*"iran"'),
 (5,
  '0.042*"aposs" + 0.022*"t

## Finding the dominant topic in each sentence

In [None]:
def format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel = optimal_model, corpus = corpus, texts = data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.sample(10)
df_dominant_topic.to_excel('LDA_foxcnn_colab.xlsx', index = False)

In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

In [None]:
# Show
df_dominant_topic.sample(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
4550,4550,14.0,0.5363,"virus, test, health, people, patient, spread, ...",acting homeland security secretary chad wolf j...
1693,1693,14.0,0.6054,"virus, test, health, people, patient, spread, ...",get latest news delivered daily inbox ...
4099,4099,12.0,0.3026,"show, write, star, home, video, post, share, l...",top theater chains u shut doors ...
541,541,8.0,0.4935,"test_positive, test, day, told, member, family...",get latest news delivered daily inbox ...
6011,6011,6.0,0.4136,"people, thing, lot, make, president, talk, tim...",moderate democrat neal urwitz says not want ...
4104,4104,1.0,0.34,"hospital, newyork, medical, care, state, patie...",veterans affairs secretary robert wilkie weigh...
3785,3785,13.0,0.2698,"team, season, game, player, event, year, cance...",nba suspended season last week pla...
3929,3929,5.0,0.4221,"aposs, time, life, pandemic, family, child, ap...",thousands schools closed across country due...
4180,4180,8.0,0.2755,"test_positive, test, day, told, member, family...",lot folks talked quarantine situation...
4450,4450,11.0,0.203,"bill, house, senate, american, business, billi...",president trump announces expansion medicare...


In [None]:
df_dominant_topic.to_excel('LDA_foxcnn_colab.xlsx', index = False)

## Find the most representative document for each topic

In [None]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.5656,"call, report, medium, write, year, claim, char...",new findings justice department inspector ge...
1,1.0,0.6287,"hospital, newyork, medical, care, state, patie...",u navy hospital ship docked port los angel...
2,2.0,0.7436,"trump, biden, president, state, campaign, sand...",former vice president joe biden says america n...
3,3.0,0.7096,"china, travel, chinese, outbreak, report, case...",disease spread countries global financ...
4,4.0,0.6155,"country, people, case, italy, death, virus, go...",country bans outside activities including e...
5,5.0,0.6271,"aposs, time, life, pandemic, family, child, ap...",psychotherapist author dr robi ludwig shares...
6,6.0,0.6489,"people, thing, lot, make, president, talk, tim...",joe biden benefits democrat party political...
7,7.0,0.4701,"trump, president, american, response, state, p...",get latest news delivered daily inbox ...
8,8.0,0.7656,"test_positive, test, day, told, member, family...",buckingham palace says queen elizabeth good ...
9,9.0,0.6646,"state, order, people, home, health, close, sta...",participating social distancing pandem...


In [None]:
sent_topics_sorteddf_mallet.to_excel('LDA_foxcnn_colab_rep.xlsx', index = False)

In [None]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']].drop_duplicates()
topic_num_keywords = topic_num_keywords.set_index('Dominant_Topic')

In [None]:
# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics

Unnamed: 0,Topic_Keywords,Num_Documents,Perc_Documents
0.0,"call, report, medium, write, year, claim, char...",290,0.0436
1.0,"hospital, newyork, medical, care, state, patie...",428,0.0644
2.0,"trump, biden, president, state, campaign, sand...",298,0.0448
3.0,"china, travel, chinese, outbreak, report, case...",597,0.0898
4.0,"country, people, case, italy, death, virus, go...",480,0.0722
5.0,"aposs, time, life, pandemic, family, child, ap...",301,0.0453
6.0,"people, thing, lot, make, president, talk, tim...",172,0.0259
7.0,"trump, president, american, response, state, p...",491,0.0739
8.0,"test_positive, test, day, told, member, family...",302,0.0454
9.0,"state, order, people, home, health, close, sta...",474,0.0713


In [None]:
df_dominant_topics.sort_values('Num_Documents', ascending = False).to_excel('LDA_foxcnn_colab_topics.xlsx', index = True)

In [None]:
pprint(df_dominant_topics.Topic_Keywords)

0.0     call, report, medium, write, year, claim, char...
1.0     hospital, newyork, medical, care, state, patie...
2.0     trump, biden, president, state, campaign, sand...
3.0     china, travel, chinese, outbreak, report, case...
4.0     country, people, case, italy, death, virus, go...
5.0     aposs, time, life, pandemic, family, child, ap...
6.0     people, thing, lot, make, president, talk, tim...
7.0     trump, president, american, response, state, p...
8.0     test_positive, test, day, told, member, family...
9.0     state, order, people, home, health, close, sta...
10.0    company, make, food, work, employee, worker, r...
11.0    bill, house, senate, american, business, billi...
12.0    show, write, star, home, video, post, share, l...
13.0    team, season, game, player, event, year, cance...
14.0    virus, test, health, people, patient, spread, ...
Name: Topic_Keywords, dtype: object


---

# Topic Trending Viz

In [None]:
df.head()

Unnamed: 0,true_time,headline,link,summary,content,true_content,clean_content
0,2020-04-09 00:00:00,Wife of Edmonton Oilers' Colby Cave gives upda...,https://www.foxnews.com/sports/wife-of-edmonto...,The wife of Edmonton Oilers forward Colby Cave...,The wife of Edmonton Oilers forward Colby Cave...,the wife of edmonton oilers forward colby cave...,wife edmonton oilers forward colby cave wedn...
1,2020-04-09 00:00:00,"Body of RFK granddaughter’s son, 8, recovered ...",https://www.foxnews.com/us/body-of-rfk-grandda...,The body of 8-year-old Gideon McKean was recov...,The body of 8-year-old Gideon McKean was recov...,the body of 8-year-old gideon mckean was recov...,body year old gideon mckean recovered wed...
2,2020-04-09 00:00:00,Hundreds of inmates at Washington state correc...,https://www.foxnews.com/us/hundreds-of-inmates...,Authorities in Washington state responded late...,Get all the latest news on coronavirus and mor...,get all the latest news on coronavirus and mor...,get latest news delivered daily inbox ...
3,2020-04-09 00:00:00,"Nikki Haley, in op-ed, says coronavirus respon...",https://www.foxnews.com/politics/nikki-haley-i...,Responsibility for combating the coronavirus o...,Get all the latest news on coronavirus and mor...,get all the latest news on coronavirus and mor...,get latest news delivered daily inbox ...
4,2020-04-09 00:00:00,Barr disappointed by partisan attacks leveled ...,https://www.foxnews.com/media/ag-william-barr-...,"Attorney General William Barr told ""The Ingrah...",Get all the latest news on coronavirus and mor...,get all the latest news on coronavirus and mor...,get latest news delivered daily inbox ...


In [None]:
df_dominant_topic_date = pd.concat([df['true_time'].reset_index(), df_dominant_topic], axis = 1)
df_dominant_topic_date.head()

Unnamed: 0,index,true_time,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,2020-04-09 00:00:00,0,13.0,0.2454,"team, season, game, player, event, year, cance...",wife edmonton oilers forward colby cave wedn...
1,1,2020-04-09 00:00:00,1,5.0,0.2825,"aposs, time, life, pandemic, family, child, ap...",body year old gideon mckean recovered wed...
2,2,2020-04-09 00:00:00,2,0.0,0.1826,"call, report, medium, write, year, claim, char...",get latest news delivered daily inbox ...
3,3,2020-04-09 00:00:00,3,7.0,0.3467,"trump, president, american, response, state, p...",get latest news delivered daily inbox ...
4,4,2020-04-09 00:00:00,4,7.0,0.1651,"trump, president, american, response, state, p...",get latest news delivered daily inbox ...


In [None]:
df_dominant_topic_date

Unnamed: 0,index,true_time,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,2020-04-09 00:00:00,0,13.0,0.2454,"team, season, game, player, event, year, cance...",wife edmonton oilers forward colby cave wedn...
1,1,2020-04-09 00:00:00,1,5.0,0.2825,"aposs, time, life, pandemic, family, child, ap...",body year old gideon mckean recovered wed...
2,2,2020-04-09 00:00:00,2,0.0,0.1826,"call, report, medium, write, year, claim, char...",get latest news delivered daily inbox ...
3,3,2020-04-09 00:00:00,3,7.0,0.3467,"trump, president, american, response, state, p...",get latest news delivered daily inbox ...
4,4,2020-04-09 00:00:00,4,7.0,0.1651,"trump, president, american, response, state, p...",get latest news delivered daily inbox ...
...,...,...,...,...,...,...,...
6643,6899,2020-01-23 00:00:00,6643,3.0,0.2335,"china, travel, chinese, outbreak, report, case...",dr mehmet oz reacts china quarantining epic...
6644,6900,2020-01-23 00:00:00,6644,3.0,0.2840,"china, travel, chinese, outbreak, report, case...",infectious disease expert dr amesh adalja ...
6645,6901,2020-01-23 00:00:00,6645,0.0,0.2596,"call, report, medium, write, year, claim, char...",two days debate twitter exploded blisterin...
6646,6902,2020-01-23 00:00:00,6646,3.0,0.3642,"china, travel, chinese, outbreak, report, case...",public transit suspended public venues cl...


In [None]:
df_dominant_topic_date.to_excel('foxcnn_dominant_topic_date.xlsx', index = False)

In [None]:
# df_dominant_topic_date.groupby(['true_time','Dominant_Topic']).agg('count')
agg_topicandtime = pd.pivot_table(df_dominant_topic_date, index=['true_time'], values = ['Document_No'], columns = ['Dominant_Topic'], aggfunc = 'count')
agg_topicandtime.to_excel('foxcnn_agg_topicandtime.xlsx')