# Preprocessing NYT Article Content and Using Topic Modeling to Identify topics throughout the corpus using gensim LDA. 

In [29]:
import pandas as pd
import numpy as np
import sklearn.feature_extraction.text as text
from sklearn import decomposition
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import nltk
from nltk.tokenize import wordpunct_tokenize
import pickle

In [30]:
with open('nyt_markets_complete_df.pickle', 'rb') as dff:
    nyt_markets_df = pickle.load(dff)

## First step will be to rename the dataframe columns and extract the date from the URL


In [31]:
nyt_markets_df.columns = ['index','date', 'url','article']

In [32]:
import re, datetime
def regex_date(string):
    try:
        match = re.search('\d{4}/\d{2}/\d{2}',string)
        date = datetime.datetime.strptime(match.group(), '%Y/%m/%d').date()
        return date
    except:
        return None

In [33]:
nyt_markets_df['url_date'] = nyt_markets_df['url'].apply(regex_date)

In [34]:
nyt_markets_df.head()

Unnamed: 0,index,date,url,article,url_date
0,0,,https://www.nytimes.com/2019/05/29/business/de...,Good Wednesday. (Want this by email? Sign up h...,2019-05-29
1,0,,https://www.nytimes.com/2017/12/03/business/me...,Dozens of people were gathered on a Manhattan ...,2017-12-03
2,0,,https://www.nytimes.com/2016/02/05/business/en...,LONDON — Royal Dutch Shell became the latest b...,2016-02-05
3,0,,https://www.nytimes.com/2016/01/31/nytnow/your...,Here are the week’s top stories.1. The preside...,2016-01-31
4,0,"UPDATED December 2, 2016Dec. 1, 2016",https://www.nytimes.com/interactive/2016/12/02...,,2016-12-02


### We will go ahead and drop the 'index' and 'date' column. Index column is unnecessary, while extracting the date of the article from the URL proved more fruitful than scraping the date. 

In [35]:
columns = ['index','date']
nyt_markets_df.drop(columns, axis=1, inplace=True)

### Given a cleaner dataframe, we will now remove rows in which the scraper was unable to pull article content

In [36]:
nyt_markets_df_clean = nyt_markets_df.dropna(subset=['article'])

In [37]:
print("Corpus is now",nyt_markets_df_clean.shape[0],"documents long")

Corpus is now 13496 documents long


# We will begin with some preprocessing of the text, as defined by the Text_cleanup function below

In [38]:
import pandas as pd
import matplotlib
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
from datetime import datetime
import nltk, re


In [39]:
stopwords = set(stopwords.words('english'))
punctuation = set(string.punctuation) 

def text_cleanup(input_text):
    one = " ".join([i for i in input_text.lower().split() if i not in stopwords])
    two = "".join(re.sub(r'[^a-zA-Z ]', '', i) for i in one if i not in punctuation)
    three = [WordNetLemmatizer().lemmatize(i) for i in two.split()]
    return three

In [40]:
article_content = nyt_markets_df_clean[['article']].applymap(text_cleanup)['article']


# Bigrams and Tri Grams

In [41]:
from gensim.models import Phrases
# Add bigrams and trigrams to docs,minimum count 10 means only that appear 10 times or more.
bigram = Phrases(article_content, min_count=10)
#trigram = Phrases(bigram[docs])

In [42]:
trigram = Phrases(bigram[article_content])

In [43]:
for idx in range(len(article_content)):
    try:
        for token in bigram[article_content[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                article_content[idx].append(token)
        for token in trigram[article_content[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                article_content[idx].append(token)
    except:
        pass

# Following text cleaning, we use the gensim Dictionary function to create a dictionary containing the number of times a word appears in the corpus. 

In [46]:
from gensim.corpora.dictionary import Dictionary

dictionary = Dictionary(article_content)


## Dictionary is filtered according to the below: 

### less than 15 documents (absolute number) or
### more than 0.5 documents (fraction of total corpus size, not absolute number).
### after the above two steps, keep only the first 100000 most frequent tokens.

In [47]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


# Using the below doc2bow function, we  create a dictionary reporting how many words and how many times those words appear per document

In [48]:
nyt_markets_corpus = [dictionary.doc2bow(doc) for doc in article_content]

# The below code vectorizes the corpus using the TF_IDF model. This is then fed into the LDA model.

In [49]:
from gensim import corpora, models
tfidf = models.TfidfModel(nyt_markets_corpus)
corpus_tfidf = tfidf[nyt_markets_corpus]

# After testing different iterations of the LDA model to see what the ideal topic number was, it appears that 9 topics creates the most distinct and separable clusters. 

# Following the LDA model, we can view the topic keywords both per topic, and using pyLDA Vis. We can see that the NMF model did a better job in creating clearly separable topics. This fact is also made evident when viewing the pyLDA bubble chart shown below. 

In [52]:
import gensim
from gensim.models.ldamodel import LdaModel

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=9, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.001*"new_york" + 0.001*"wall_street" + 0.001*"united_state" + 0.001*"hong_kong" + 0.001*"private_equity" + 0.001*"restaurant" + 0.001*"chief_executive" + 0.001*"hedge_fund" + 0.001*"nyt" + 0.001*"cooking"
Topic: 1 Word: 0.002*"mr_trump" + 0.001*"united_state" + 0.001*"trump" + 0.001*"saudi_arabia" + 0.001*"new_york" + 0.001*"european_union" + 0.001*"oil" + 0.001*"saudi" + 0.001*"china" + 0.001*"trade"
Topic: 2 Word: 0.002*"bond" + 0.001*"revenue_bond" + 0.001*"refinancing" + 0.001*"refinancing_bond" + 0.001*"million_general" + 0.001*"revenue" + 0.001*"obligation_bond" + 0.001*"general_obligation" + 0.001*"obligation" + 0.001*"million"
Topic: 3 Word: 0.002*"mr_trump" + 0.002*"united_state" + 0.001*"fed" + 0.001*"stock_market" + 0.001*"health_care" + 0.001*"bank" + 0.001*"central_bank" + 0.001*"trump" + 0.001*"china" + 0.001*"rate"
Topic: 4 Word: 0.001*"mr_trump" + 0.001*"united_state" + 0.001*"central_bank" + 0.000*"bank" + 0.000*"european_union" + 0.000*"last_year" + 0

In [67]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_tfidf, nyt_markets_corpus, dictionary)
vis

KeyboardInterrupt: 

In [58]:
from gensim.models.coherencemodel import CoherenceModel

# Compute Coherence Score using c_v
coherence_model_lda = CoherenceModel(model=lda_model_tfidf, texts=article_content, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3599638987787557


In [54]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model=LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [63]:
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=nyt_markets_corpus, texts=article_content, start=2, limit=40, step=10)
# Show graph
import matplotlib.pyplot as plt
limit=40; start=2; step=6;
x = range(start, limit, step)
'''plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()'''

KeyboardInterrupt: 

In [61]:
x

range(2, 40, 6)

In [65]:
coherence_values

[0.34052796879146147,
 0.40480407027879856,
 0.4284799575224078,
 0.39850560948587516]

# As we can tell fromt the coherence values, we run the LDA with ~30 topics as that generates a higher score. Holistically however, the output from NMF still appears to be more seperable. 

In [66]:
import gensim
from gensim.models.ldamodel import LdaModel

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=30, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.002*"mr_trump" + 0.001*"united_state" + 0.001*"health_care" + 0.001*"trump" + 0.001*"republican" + 0.001*"charger" + 0.001*"white_house" + 0.001*"mr_obama" + 0.001*"affordable_care" + 0.001*"republican_leader"
Topic: 1 Word: 0.001*"united_state" + 0.001*"mr_trump" + 0.001*"stock_market" + 0.001*"new_york" + 0.001*"auction_house" + 0.001*"apple" + 0.001*"wall_street" + 0.001*"ge" + 0.001*"sale" + 0.001*"auction"
Topic: 2 Word: 0.001*"mr_trump" + 0.001*"united_state" + 0.001*"china" + 0.001*"new_york" + 0.001*"renminbi" + 0.001*"currency" + 0.001*"wall_street" + 0.001*"last_year" + 0.001*"stock_market" + 0.001*"saudi_arabia"
Topic: 3 Word: 0.001*"united_state" + 0.001*"mr_trump" + 0.001*"lone_star" + 0.001*"china" + 0.001*"new_york" + 0.001*"last_year" + 0.000*"the_new" + 0.000*"boko_haram" + 0.000*"york_time" + 0.000*"hong_kong"
Topic: 4 Word: 0.003*"the_new" + 0.003*"york_time" + 0.002*"central_bank" + 0.001*"mr_musk" + 0.001*"the_new_york_time" + 0.001*"rate_cut" + 0.