In [28]:
# import dependencies
import re
import nltk
import math
import gensim
import string
import pyLDAvis
import numpy as np
import pandas as pd
import pyLDAvis.gensim  
from nltk.stem.porter import *
import matplotlib.pyplot as plt
from newsapi import NewsApiClient
from nltk.corpus import stopwords
from gensim import corpora, models
from gensim.models import CoherenceModel
from pandas.io.json import json_normalize
from gensim.utils import simple_preprocess
from nltk.stem import WordNetLemmatizer, SnowballStemmer

%matplotlib inline

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# setting parameters
API_KEY = 'b38baf2305f34037a08e1b0e04be6536'
newsapi = NewsApiClient(api_key=API_KEY)

# Load stemmer and punctuation symbols
punctuations = string.punctuation
stemmer = SnowballStemmer('english')

START_DATE = '2019-09-26'
END_DATE = '2019-09-26'
LANGUAGE = 'en'
COUNTRY = 'us'
STOP_WORDS =  set(stopwords.words('english'))
MAX_ARTICLES_LIMIT = 100
PAGE_SIZE = 100

In [0]:
# get list of all sources
all_sources = newsapi.get_sources()
sources = json_normalize(all_sources['sources'])

# filter all sources for which country is US
us_sources = sources[sources['country'] == 'us']
lst_us_sources = us_sources['id'].tolist()

In [0]:
# get first 100 articles from newsapi based on the above filters
first_100_articles = newsapi.get_everything(sources = ', '.join(lst_us_sources),
                                      from_param=START_DATE,
                                      to=END_DATE,
                                      language=LANGUAGE,
                                      page = 1,
                                      page_size=PAGE_SIZE)

# create dataframes from the articles
articles = json_normalize(first_100_articles['articles'])

In [0]:
# set articles to fetch (put MAX_ARTICLES_LIMIT to 0 above to fetch all articles)
total_articles = first_100_articles['totalResults']

if MAX_ARTICLES_LIMIT > 0 :
  articles_to_fetch = MAX_ARTICLES_LIMIT
else :
  articles_to_fetch = total_articles
  
  
# set the pages to iterate - for each page a separate request is issued 
# as maximum of 100 results can be obtained in each individual request
page_to_iterate = math.ceil(articles_to_fetch/PAGE_SIZE)

In [0]:
# iterate and get the articles
for _ in range(page_to_iterate - 1):
  next_100_articles = newsapi.get_everything(sources = ', '.join(lst_us_sources),
                                      from_param=START_DATE,
                                      to=END_DATE,
                                      language=LANGUAGE,
                                      page = page_to_iterate + 1,
                                      page_size=PAGE_SIZE)
  
  
  articles.append(json_normalize(next_100_articles['articles']))

In [0]:
# remove all none with empty string and join the relevant three text columns - content, description & title
articles = articles.fillna('')
articles['text'] = articles['content'] + " " + articles['description'] + " " + articles["title"]

In [0]:
# lowercase and remove punctuation
def tokenize(sent):
   tokens = gensim.utils.simple_preprocess(sent)
   return [lemmatize_stemming(token.lower()) for token in tokens if (token not in punctuations)]

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def clean(text):
  text = text.lower()
  patterns = [
      "&amp;",
      "&lt;",
      "&gt",
      r"[^a-zA-Z ]",
      r"\b[a-zA-z]\b"
      ," +"
  ]
  
  for pattern in patterns:
    clean_regex = re.compile(pattern)
    text = re.sub(clean_regex,' ', text)
  
  return text

# covert numerals to their text equivalent
def subsitute_numerals(text):
   return text.strip().replace('0', ' zero') \
                       .replace('1',' one') \
                       .replace('2',' two') \
                       .replace('3',' three') \
                       .replace('4',' four') \
                       .replace('5',' five') \
                       .replace('6',' six') \
                       .replace('7',' seven') \
                       .replace('8',' eight') \
                       .replace('9',' nine')
  
  
# remove empty token generated from inserting blank spaces
def remove_empty_token(tokens):
   result = []
   for token in tokens:
       if not token.strip() == '':
           result.append(token)
   return result


# optional - remove other common stop words
# get the stop words from NLTK package
def remove_stop_words(tokens):
   result = []
   for token in tokens:
       if not token in STOP_WORDS:
           result.append(token)
   return result


# optional - remove words less than 3 character long
def remove_short_words(tokens):
   result = []
   for token in tokens:
       if len(token) >= 3:
           result.append(token)
   return result

# get back the cleaned text from the tokens
def join_tokens(tokens):
  return ' '.join(tokens)

In [0]:
# tokenize, lemmatize and do stemming on the text column containing title, content & description
cleaned_articles = pd.DataFrame(articles['text'].apply(clean), columns=['text'])
cleaned_articles['tokens'] = cleaned_articles['text'].apply(tokenize)
cleaned_articles['tokens'] = cleaned_articles['tokens'].apply(remove_empty_token).apply(remove_stop_words).apply(remove_short_words)

In [0]:
# build the dictionary and remove the words which were used in less than 5 articles 
# and in over 90% of all articles. Also limit the dictionary

MAX_VOCAB_SIZE = 100000
dictionary = gensim.corpora.Dictionary(cleaned_articles['tokens'].values)
dictionary.filter_extremes(no_below=5, no_above=0.90, keep_n=MAX_VOCAB_SIZE)

In [26]:
# generate bag of words
bow_corpus = [dictionary.doc2bow(doc) for doc in cleaned_articles['tokens'].values]

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=7, id2word=dictionary, passes=2, workers=2)

# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(bow_corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=cleaned_articles['tokens'].values, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -4.630380266773945

Coherence Score:  0.3699789471245369


In [30]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [27]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.054*"team" + 0.053*"first" + 0.052*"announc" + 0.052*"char" + 0.051*"day" + 0.042*"report" + 0.038*"post" + 0.038*"entir" + 0.036*"view" + 0.036*"former"
Topic: 1 
Words: 0.062*"impeach" + 0.045*"team" + 0.045*"char" + 0.044*"trump" + 0.040*"hous" + 0.039*"democrat" + 0.033*"presid" + 0.031*"thursday" + 0.028*"say" + 0.028*"time"
Topic: 2 
Words: 0.096*"trump" + 0.092*"presid" + 0.051*"char" + 0.038*"thursday" + 0.038*"new" + 0.035*"star" + 0.035*"whistleblow" + 0.035*"complaint" + 0.029*"impeach" + 0.028*"biden"
Topic: 3 
Words: 0.065*"char" + 0.050*"end" + 0.042*"cut" + 0.041*"nation" + 0.039*"call" + 0.036*"show" + 0.036*"court" + 0.035*"trump" + 0.032*"world" + 0.031*"pressur"
Topic: 4 
Words: 0.067*"char" + 0.065*"thursday" + 0.054*"say" + 0.042*"peopl" + 0.038*"week" + 0.033*"run" + 0.032*"giant" + 0.030*"need" + 0.030*"outsid" + 0.029*"trump"
Topic: 5 
Words: 0.079*"saudi" + 0.072*"iran" + 0.063*"way" + 0.057*"sourc" + 0.056*"cut" + 0.051*"strike" + 0.049*"con

In [18]:
# generate tf-idf to try alternate model
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.042*"leader" + 0.039*"trump" + 0.033*"week" + 0.033*"open" + 0.030*"presid" + 0.027*"democrat" + 0.026*"one" + 0.024*"impeach" + 0.023*"pressur" + 0.023*"view"
Topic: 1 Word: 0.046*"show" + 0.034*"accus" + 0.033*"run" + 0.032*"say" + 0.032*"giant" + 0.025*"whistleblow" + 0.025*"thursday" + 0.024*"need" + 0.023*"new" + 0.022*"complaint"
Topic: 2 Word: 0.043*"democrat" + 0.034*"accord" + 0.033*"trump" + 0.031*"impeach" + 0.031*"control" + 0.030*"republican" + 0.029*"york" + 0.029*"hous" + 0.025*"time" + 0.023*"new"
Topic: 3 Word: 0.077*"outsid" + 0.072*"first" + 0.039*"week" + 0.037*"peopl" + 0.037*"take" + 0.036*"may" + 0.029*"way" + 0.027*"court" + 0.025*"could" + 0.023*"reuter"
Topic: 4 Word: 0.082*"second" + 0.076*"make" + 0.055*"question" + 0.047*"presid" + 0.039*"money" + 0.033*"trump" + 0.031*"republican" + 0.030*"biden" + 0.029*"manag" + 0.022*"say"
Topic: 5 Word: 0.044*"sourc" + 0.040*"iran" + 0.038*"saudi" + 0.037*"world" + 0.036*"citi" + 0.032*"news" + 0.032*"

Topic: 0 Word: 0.057*"may" + 0.055*"two" + 0.038*"republican" + 0.035*"control" + 0.033*"team" + 0.033*"new" + 0.028*"impeach" + 0.027*"help" + 0.026*"york" + 0.025*"time"
Topic: 1 Word: 0.057*"second" + 0.051*"trump" + 0.048*"presid" + 0.042*"releas" + 0.037*"donald" + 0.033*"whistleblow" + 0.032*"complaint" + 0.030*"biden" + 0.028*"hous" + 0.024*"ukrain"
Topic: 2 Word: 0.046*"report" + 0.043*"say" + 0.039*"senat" + 0.034*"washington" + 0.034*"giant" + 0.031*"open" + 0.029*"run" + 0.028*"accus" + 0.025*"trump" + 0.024*"thursday"
Topic: 3 Word: 0.043*"star" + 0.042*"cut" + 0.036*"latest" + 0.035*"democrat" + 0.034*"trump" + 0.033*"global" + 0.032*"new" + 0.025*"franchis" + 0.024*"way" + 0.023*"inquiri"
Topic: 4 Word: 0.041*"keep" + 0.040*"investig" + 0.038*"question" + 0.031*"trump" + 0.030*"take" + 0.030*"presid" + 0.030*"control" + 0.029*"democrat" + 0.029*"money" + 0.027*"impeach"
Topic: 5 Word: 0.057*"saudi" + 0.048*"world" + 0.043*"iran" + 0.040*"end" + 0.039*"news" + 0.039*"natio

In [19]:
# test to see document association with the topic
for index, score in sorted(lda_model[bow_corpus[10]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.6365354657173157	 
Topic: 0.094*"presid" + 0.083*"trump" + 0.068*"say" + 0.062*"char" + 0.037*"biden" + 0.032*"former" + 0.032*"year" + 0.032*"one" + 0.029*"thursday" + 0.029*"donald"

Score: 0.28344687819480896	 
Topic: 0.094*"trump" + 0.054*"presid" + 0.053*"hous" + 0.044*"whistleblow" + 0.043*"say" + 0.042*"like" + 0.041*"complaint" + 0.038*"money" + 0.036*"char" + 0.034*"impeach"

Score: 0.010003603994846344	 
Topic: 0.065*"char" + 0.053*"way" + 0.049*"sourc" + 0.046*"team" + 0.046*"saudi" + 0.042*"say" + 0.041*"nation" + 0.041*"cut" + 0.040*"iran" + 0.039*"control"

Score: 0.010002781637012959	 
Topic: 0.091*"thursday" + 0.065*"char" + 0.059*"first" + 0.058*"new" + 0.057*"franchis" + 0.052*"team" + 0.049*"star" + 0.041*"outsid" + 0.033*"latest" + 0.033*"global"

Score: 0.01000253763049841	 
Topic: 0.065*"former" + 0.049*"state" + 0.048*"char" + 0.043*"like" + 0.035*"view" + 0.033*"accus" + 0.033*"ukrain" + 0.033*"accord" + 0.033*"money" + 0.033*"reuter"

Score: 0.0100022