# News headline topic analysis with LDA
The purpose of this analysis is to find dominant topics across news headlines (25 per day, over 1 year). These topics will later be correlated with daily stock market loss/gain to understand if certain topics influence the stock market.

This work borrows heavily from Susan Li's ["Topic Modeling and LDA in Python"](https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24) article.

In [1]:
# import dependencies
import pandas as pd

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

import numpy as np
np.random.seed(22)

[nltk_data] Downloading package wordnet to /Users/stacy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Import the data
data = pd.read_csv("../Data/RedditNews.csv")

In [3]:
data.head()

# 73,608 records from 2016-2018

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...


In [4]:
# Getting just the headlines for our corpus
headlines = data[['News']]
headlines.head()

Unnamed: 0,News
0,A 117-year-old woman in Mexico City finally re...
1,IMF chief backs Athens as permanent Olympic host
2,"The president of France says if Brexit won, so..."
3,British Man Who Must Give Police 24 Hours' Not...
4,100+ Nobel laureates urge Greenpeace to stop o...


## Data pre-processing
### Lemmitize

In [5]:
# Lemmatize the words keeping the context (stemming is "dumb" so we won't)
# However if we have a much larger corpus, we might consider stemming (as it is faster)
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v') # pos='v' means it peforms stemming with context

### Remove stopwords and words shorter than 3 chars

In [6]:
# Remove stopwords and words shorter than 3 characters, then lemmatize
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize(token))
    return result

### Check outputs

In [7]:
sample = headlines['News'][2]

print('original document: ')
words = []
for word in sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(sample))

original document: 
['The', 'president', 'of', 'France', 'says', 'if', 'Brexit', 'won,', 'so', 'can', 'Donald', 'Trump']


 tokenized and lemmatized document: 
['president', 'france', 'say', 'brexit', 'donald', 'trump']


### Preprocess the headlines and save the results

In [8]:
cleaned_headlines = headlines['News'].map(preprocess)
cleaned_headlines[:5] # Check the results

0    [year, woman, mexico, city, finally, receive, ...
1      [chief, back, athens, permanent, olympic, host]
2      [president, france, say, brexit, donald, trump]
3    [british, police, hours, notice, threaten, hun...
4    [nobel, laureates, urge, greenpeace, stop, opp...
Name: News, dtype: object

## Count the word occurences using Bag of Words

In [9]:
# corpora.Dictionary implements the concept of a Dictionary – a mapping between words and their integer ids.
# https://radimrehurek.com/gensim/corpora/dictionary.html
dictionary = gensim.corpora.Dictionary(cleaned_headlines)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 alvarez
1 bear
2 birth
3 certificate
4 city
5 die
6 finally
7 hours
8 later
9 lira
10 mexico


In [10]:
# Filter out irrelevant words
'''
    less than 15 documents (absolute number) or
    more than 0.5 documents (fraction of total corpus size, not absolute number).
    after the above two steps, keep only the first 100000 most frequent tokens.
'''

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [11]:
'''
For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.
'''

bow_corpus = [dictionary.doc2bow(doc) for doc in cleaned_headlines]
bow_corpus[2]

[(21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1)]

In [12]:
bow_doc_2 = bow_corpus[2]

for i in range(len(bow_doc_2)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_2[i][0], 
                                            dictionary[bow_doc_2[i][0]], 
                                            bow_doc_2[i][1]))

Word 21 ("brexit") appears 1 time.
Word 22 ("donald") appears 1 time.
Word 23 ("france") appears 1 time.
Word 24 ("president") appears 1 time.
Word 25 ("say") appears 1 time.
Word 26 ("trump") appears 1 time.


## TD-IDF
Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.

In [13]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.2600564898594514),
 (1, 0.28994526204184407),
 (2, 0.3744826130954538),
 (3, 0.2051174112907887),
 (4, 0.21074667376840217),
 (5, 0.2818126498910203),
 (6, 0.24752380792284462),
 (7, 0.2842753904922502),
 (8, 0.2101120460560261),
 (9, 0.32010680937189695),
 (10, 0.253807102500421),
 (11, 0.3026294517497535),
 (12, 0.20803861949214233),
 (13, 0.16333461952636777),
 (14, 0.16583903979070258)]


## Running LDA using Bag of Words

Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’.

In [14]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [15]:
# For each topic, we will explore the words occuring in that topic and its relative weight.

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.045*"israel" + 0.020*"israeli" + 0.018*"china" + 0.018*"state" + 0.015*"right" + 0.014*"say" + 0.014*"human" + 0.011*"palestinian" + 0.010*"unite" + 0.010*"iran"
Topic: 1 
Words: 0.018*"bank" + 0.013*"government" + 0.010*"protest" + 0.009*"court" + 0.008*"chinese" + 0.008*"china" + 0.008*"million" + 0.008*"police" + 0.006*"people" + 0.006*"billion"
Topic: 2 
Words: 0.034*"kill" + 0.021*"attack" + 0.016*"bomb" + 0.014*"force" + 0.010*"army" + 0.010*"soldier" + 0.010*"troop" + 0.009*"pakistan" + 0.008*"taliban" + 0.008*"police"
Topic: 3 
Words: 0.023*"north" + 0.022*"korea" + 0.021*"kill" + 0.019*"south" + 0.015*"pakistan" + 0.011*"strike" + 0.008*"year" + 0.008*"death" + 0.008*"children" + 0.007*"say"
Topic: 4 
Words: 0.036*"gaza" + 0.020*"israel" + 0.013*"ship" + 0.013*"hamas" + 0.009*"israeli" + 0.007*"children" + 0.006*"say" + 0.006*"georgia" + 0.006*"sell" + 0.006*"report"
Topic: 5 
Words: 0.017*"world" + 0.010*"india" + 0.009*"power" + 0.009*"water" + 0.008*"japa

In [16]:
# Running LDA using TF-IDF

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.005*"world" + 0.004*"global" + 0.004*"china" + 0.004*"food" + 0.004*"karadzic" + 0.004*"crisis" + 0.003*"berlusconi" + 0.003*"say" + 0.003*"israel" + 0.003*"rise"
Topic: 1 Word: 0.006*"ahmadinejad" + 0.005*"china" + 0.005*"japan" + 0.004*"world" + 0.003*"fukushima" + 0.003*"people" + 0.003*"tsunami" + 0.003*"ship" + 0.003*"quake" + 0.003*"kill"
Topic: 2 Word: 0.006*"zimbabwe" + 0.005*"police" + 0.004*"israel" + 0.004*"ossetia" + 0.004*"bank" + 0.003*"die" + 0.003*"somalia" + 0.003*"year" + 0.003*"india" + 0.003*"china"
Topic: 3 Word: 0.013*"kill" + 0.009*"iraq" + 0.008*"bomb" + 0.007*"pakistan" + 0.007*"attack" + 0.007*"iran" + 0.007*"troop" + 0.006*"israel" + 0.006*"strike" + 0.006*"afghanistan"
Topic: 4 Word: 0.005*"police" + 0.004*"olympic" + 0.004*"kill" + 0.004*"pope" + 0.004*"china" + 0.004*"people" + 0.003*"gaza" + 0.003*"abuse" + 0.003*"world" + 0.003*"vatican"
Topic: 5 Word: 0.007*"israel" + 0.006*"gaza" + 0.005*"right" + 0.005*"israeli" + 0.005*"kill" + 0.005

In [17]:
# Performance evaluation by classifying sample document using LDA Bag of Words model

# We will check where our test document would be classified.

cleaned_headlines[2]

['president', 'france', 'say', 'brexit', 'donald', 'trump']

In [18]:
for index, score in sorted(lda_model[bow_corpus[2]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.8713300824165344	 
Topic: 0.010*"election" + 0.009*"say" + 0.009*"drug" + 0.009*"party" + 0.009*"vote" + 0.008*"government" + 0.007*"german" + 0.007*"mexico" + 0.006*"people" + 0.006*"country"

Score: 0.014299782924354076	 
Topic: 0.026*"police" + 0.013*"president" + 0.013*"russian" + 0.009*"russia" + 0.009*"protest" + 0.008*"woman" + 0.008*"protesters" + 0.008*"sentence" + 0.008*"year" + 0.008*"women"

Score: 0.014299423433840275	 
Topic: 0.009*"internet" + 0.008*"government" + 0.007*"right" + 0.006*"say" + 0.006*"house" + 0.006*"muslim" + 0.006*"people" + 0.006*"israelis" + 0.006*"demand" + 0.005*"church"

Score: 0.014297443442046642	 
Topic: 0.037*"iran" + 0.018*"nuclear" + 0.018*"russia" + 0.014*"iraq" + 0.014*"world" + 0.008*"say" + 0.007*"china" + 0.007*"missile" + 0.007*"obama" + 0.006*"weapons"

Score: 0.0142962746322155	 
Topic: 0.034*"kill" + 0.021*"attack" + 0.016*"bomb" + 0.014*"force" + 0.010*"army" + 0.010*"soldier" + 0.010*"troop" + 0.009*"pakistan" + 0.008*"ta

In [19]:
# Performance evaluation by classifying sample document using LDA TF-IDF model.

for index, score in sorted(lda_model_tfidf[bow_corpus[2]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8713445067405701	 
Topic: 0.005*"police" + 0.004*"olympic" + 0.004*"kill" + 0.004*"pope" + 0.004*"china" + 0.004*"people" + 0.003*"gaza" + 0.003*"abuse" + 0.003*"world" + 0.003*"vatican"

Score: 0.014296788722276688	 
Topic: 0.004*"olympics" + 0.004*"putin" + 0.004*"russia" + 0.004*"say" + 0.004*"president" + 0.004*"sarkozy" + 0.003*"world" + 0.003*"east" + 0.003*"peace" + 0.003*"murdoch"

Score: 0.01429586112499237	 
Topic: 0.013*"korea" + 0.011*"north" + 0.010*"nuclear" + 0.008*"iran" + 0.007*"south" + 0.005*"russia" + 0.005*"missile" + 0.005*"say" + 0.004*"sanction" + 0.004*"mugabe"

Score: 0.014295285567641258	 
Topic: 0.005*"internet" + 0.004*"iran" + 0.004*"government" + 0.004*"chavez" + 0.004*"court" + 0.004*"protest" + 0.004*"sentence" + 0.003*"china" + 0.003*"arrest" + 0.003*"google"

Score: 0.014294859021902084	 
Topic: 0.010*"georgia" + 0.006*"israel" + 0.005*"israeli" + 0.004*"kill" + 0.004*"iran" + 0.004*"assange" + 0.004*"world" + 0.004*"protest" + 0.004*"russia

In [20]:
# Testing model on unseen document

unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

# lda_model[bow_vector] # Gets topic probabilities for unseen document 
# # (7, 0.66596633) is the highest score

# for topic in lda_model[bow_vector]:
#     print(topic)

# lda_model.print_topic(index, 7)

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6666535139083862	 Topic: 0.037*"iran" + 0.018*"nuclear" + 0.018*"russia" + 0.014*"iraq" + 0.014*"world"
Score: 0.19996194541454315	 Topic: 0.009*"internet" + 0.008*"government" + 0.007*"right" + 0.006*"say" + 0.006*"house"
Score: 0.016678424552083015	 Topic: 0.018*"bank" + 0.013*"government" + 0.010*"protest" + 0.009*"court" + 0.008*"chinese"
Score: 0.01667626015841961	 Topic: 0.017*"world" + 0.010*"india" + 0.009*"power" + 0.009*"water" + 0.008*"japan"
Score: 0.016672149300575256	 Topic: 0.023*"north" + 0.022*"korea" + 0.021*"kill" + 0.019*"south" + 0.015*"pakistan"
Score: 0.016672130674123764	 Topic: 0.036*"gaza" + 0.020*"israel" + 0.013*"ship" + 0.013*"hamas" + 0.009*"israeli"
Score: 0.016672059893608093	 Topic: 0.026*"police" + 0.013*"president" + 0.013*"russian" + 0.009*"russia" + 0.009*"protest"
Score: 0.016671735793352127	 Topic: 0.010*"election" + 0.009*"say" + 0.009*"drug" + 0.009*"party" + 0.009*"vote"
Score: 0.01667146384716034	 Topic: 0.045*"israel" + 0.020*"israel

## Export the model

In [21]:
from gensim.test.utils import datapath
temp_file = datapath("model")

In [22]:
lda_model.save(temp_file)
# https://radimrehurek.com/gensim/models/ldamodel.html

In [25]:
import joblib
joblib.dump(lda_model, "lda.gz")
# LdaMulticore

['lda.gz']