### Topic Modeling with Gensim's LDA algorithm on a selection of BBC News Articles.
#### Topic Modeling Workshop at Northwestern University, August, 2019.

In [None]:
import pandas as pd

**Handle collections import warnings.**

In [None]:
try:
    from collections.abc import Iterable
except ImportError:
    from collections import Iterable

**Suppress other annoying warnings.**

In [None]:
import logging
import warnings

# Keep library chatter out of the notebook: only log records at ERROR level
# or above are reported, using a timestamped format.
log_format = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig( level=logging.ERROR, format=log_format )

# Silence deprecation warnings, gensim's user warnings and future warnings.
warnings.filterwarnings( action="ignore", category=DeprecationWarning )
warnings.filterwarnings( 'ignore', module='gensim', category=UserWarning )
warnings.simplefilter( 'ignore', category=FutureWarning )

**Display progress bars.**

In [None]:
# `tqdm.auto` selects the notebook widget when running under Jupyter and
# falls back to the console bar elsewhere.  It replaces the deprecated
# `tqdm_notebook` helper, which also had to be instantiated (creating an
# empty progress bar) just to reach `.pandas()`.
from tqdm.auto import tqdm

# Register `progress_map` / `progress_apply` on pandas objects.
tqdm.pandas()

# Read Article Data.

In [None]:
data = pd.read_csv( 'data/bbc-articles.csv' )

**Display number of rows (articles) and columns in data.**
**The first column is the language of the article.**
**The second column is the article text.**

In [None]:
data.shape

**Drop rows that contain missing values and renumber the remaining rows.**

In [None]:
data = data.dropna().reset_index( drop=True )

**Display number of rows and columns again.**

In [None]:
data.shape

# Clean Data.

**Detect the language of each article, then count the number of articles in each language.**

In [None]:
from langdetect import detect


In [None]:
# langdetect's algorithm is randomized; pin its seed before the first
# detection so the language labels (and therefore the set of articles kept
# downstream) are the same on every run.
from langdetect import DetectorFactory
DetectorFactory.seed = 0

data['language'] = data.articles.progress_map( detect )

**The article language is specified by a two-letter ISO code.** 

In [None]:
data.language.value_counts()

#### Keep only the English language articles.

In [None]:
# Renumber the rows after filtering so index labels equal row positions.
# Later cells address articles by number (e.g. article 0, article 91) while
# the corpus and the document-topic matrix are positional; without the reset
# those look-ups would hit the wrong article or raise a KeyError.
data = data.loc[data.language=='en'].reset_index( drop=True )

**Split articles into sentences.**

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
import nltk

# Sentence tokenizer data.  Older NLTK releases load 'punkt'; recent releases
# look for 'punkt_tab' instead, so fetch both to work with either.
# NOTE(review): confirm against the NLTK version pinned for the workshop.
for resource in ( 'punkt', 'punkt_tab' ):
    nltk.download( resource, quiet=True )

In [None]:
data['sentences'] = data.articles.progress_map( sent_tokenize )


**Display first three sentences of first article.**

In [None]:
data['sentences'].head(1).tolist()[0][:3] 

**Tokenize words in each sentence.**

In [None]:
from nltk.tokenize import word_tokenize
nltk.download('averaged_perceptron_tagger',quiet=True)

In [None]:
# For every article, turn each sentence into its list of word tokens.
data['tokens_sentences'] = data['sentences'].progress_map(
    lambda article_sentences: list( map( word_tokenize, article_sentences ) ) )

# Show the tokens of the first three sentences of the first article.
print( data['tokens_sentences'].head(1).tolist()[0][:3] )

#### Lemmatize words (with part of speech tagging).

In [None]:
from nltk import pos_tag

In [None]:
# Part-of-speech tagger model.  Recent NLTK releases load the resource under
# the '_eng' name, older ones under the original name; fetch both, quietly,
# like the other downloads in this notebook.
# NOTE(review): confirm against the NLTK version pinned for the workshop.
for resource in ( 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng' ):
    nltk.download( resource, quiet=True )

In [None]:
# Tag every tokenized sentence, producing (token, treebank tag) pairs.
data['POS_tokens'] = data['tokens_sentences'].progress_map(
    lambda article_sentences: list( map( pos_tag, article_sentences ) ) )

# Show the tagged tokens of the first three sentences of the first article.
print( data['POS_tokens'].head(1).tolist()[0][:3] )

**Create a lemmatizer.**

In [None]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Translate a treebank POS tag into the matching WordNet POS constant.

    Tags starting with 'J', 'V', 'N' and 'R' map to the WordNet adjective,
    verb, noun and adverb constants respectively.  Any other tag yields an
    empty string, which callers use as "do not lemmatize this token".
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return ''

from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

**Lemmatize each word.**

In [None]:
nltk.download('wordnet', quiet=True)

In [None]:
# Lemmatize each (token, treebank tag) pair.  Tokens whose tag has no WordNet
# equivalent (get_wordnet_pos returns '') are kept unchanged.
data['tokens_sentences_lemmatized'] = data['POS_tokens'].progress_map(
    lambda tagged_sentences: [
        [
            lemmatizer.lemmatize( word, wordnet_pos ) if wordnet_pos != '' else word
            for word, wordnet_pos in (
                ( token, get_wordnet_pos( tag ) ) for token, tag in tagged_sentence
            )
        ]
        for tagged_sentence in tagged_sentences
    ]
)

In [None]:
data['tokens_sentences_lemmatized'].head(1).tolist()[0][:3]

#### Regroup tokens and remove stop words.

**We'll use the list of stopwords built-in to nltk plus a few others.**

In [None]:
from nltk.corpus import stopwords

# Extra stopwords defined inline: very common verbs, plus words that show up
# in this collection without carrying topical meaning.
stopwords_verbs = [
    'say', 'get', 'go', 'know', 'may', 'need', 'like', 'make',
    'see', 'want', 'come', 'take', 'use', 'would', 'can',
]
stopwords_other = [
    'one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption',
    'also', 'copyright', 'something',
]

# Start from nltk's default English stopwords and append the custom ones.
my_stopwords = stopwords.words('english') + stopwords_verbs + stopwords_other

**You can also add custom stopwords from a text file.**
**Let's add the Buckley-Salton stopwords as an example.**

In [None]:
buckley_salton_stopwords = []

# Every whitespace-separated word in the file is one stopword.
with open( "data/buckley-salton-stopwords.txt", "r") as stopword_file:
    buckley_salton_stopwords.extend( stopword_file.read().split() )

# Append the file-based stopwords to the list built so far.
my_stopwords = my_stopwords + buckley_salton_stopwords

In [None]:
from itertools import chain 

**Flatten list of sentences of tokens to list of tokens.**

In [None]:
# Build the stopword lookup once as a set: membership is tested for every
# token of every article, and testing against the (long) list is a linear
# scan each time.
stopword_set = set( my_stopwords )

# Flatten each article's sentences into a single token list, lower-case the
# tokens, and keep only purely alphabetic tokens longer than one character
# that are not stopwords.
data['tokens'] = data['tokens_sentences_lemmatized'].map(
    lambda sentences: [
        token.lower()
        for token in chain.from_iterable( sentences )
        if token.isalpha()
        and len( token ) > 1
        and token.lower() not in stopword_set
    ] )

In [None]:
data['tokens'].head(1).tolist()[0][:30]

# Latent Dirichlet Allocation (LDA).

## Data preparation

#### Prepare bigrams and trigrams.

In [None]:
from gensim.models.phrases import Phraser
from gensim.models.phrases import Phrases

In [None]:
# One list of cleaned tokens per article.
tokens = data['tokens'].tolist()
# First pass: learn phrases (bigrams) from the token lists.
bigram_model = Phrases( tokens )
# Second pass over the bigram-transformed text, so an already merged pair can
# combine with a neighbouring token (trigrams).
trigram_model = Phrases( bigram_model[tokens], min_count=1 )
# Replace each article's tokens with the phrase-merged version; `tokens` is
# what the dictionary, the corpus and the coherence score are built from.
tokens = list( trigram_model[bigram_model[tokens]] )

#### Prepare objects for LDA gensim implementation.

In [None]:
from gensim import corpora

In [None]:
# Assign an integer id to every distinct token (phrases included).
dictionary_LDA = corpora.Dictionary( tokens )

# Prune the vocabulary; no_below=3 removes tokens found in fewer than three
# articles (filter_extremes' other thresholds stay at their defaults).
dictionary_LDA.filter_extremes( no_below=3 )

# Bag-of-words representation of each article, in the same order as `tokens`.
corpus = list( map( dictionary_LDA.doc2bow, tokens ) )

## Run LDA.

In [None]:
from gensim import models
import numpy as np

In [None]:
# Seed numpy's global generator and (below) the model itself with the same
# value so that repeated runs produce the same topics.
np.random.seed( 32767 ) # Set pseudorandom number generator seed.

num_topics = 20 # We'll start by extracting 20 topics.

# Train the model, making 10 passes over the corpus; %time reports how long
# the training took.
%time lda_model = models.LdaModel( corpus, num_topics=num_topics, \
                                   id2word=dictionary_LDA, \
                                   passes=10, \
                                   random_state=32767 )

## Overview of LDA results

**Compute perplexity, a measure of how good the model is.**  
**The lower the perplexity value the better.**

In [None]:
# Evaluate the bound once and reuse it: each call is a full pass over the
# corpus, and reusing the value guarantees that both printed figures come
# from the same estimate.
log_perplexity = lda_model.log_perplexity( corpus )

print( 'Log Perplexity: ', log_perplexity )
# NOTE(review): gensim's own log output reports 2 ** (-bound) as its
# perplexity estimate; confirm which base is intended here.
print( 'Perplexity: ', np.exp( -1.0 * log_perplexity ) )


**Compute Coherence Score.**  
**The higher the coherence value the better.**  

In [None]:
from gensim.models.coherencemodel import CoherenceModel

# Score the trained model with the 'c_v' coherence measure, using the
# phrase-merged token lists as reference texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=tokens,
    corpus=corpus,
    dictionary=dictionary_LDA,
    coherence='c_v',
)

coherence_lda = coherence_model_lda.get_coherence()

print( '\nCoherence Score (c_v): ' , coherence_lda )


#### Look at topics.

In [None]:
# Print every topic as "<id>: <20 highest-weighted words>", one blank line
# between topics.
topic_listing = lda_model.show_topics( num_topics=num_topics,
                                       num_words=20,
                                       formatted=True )
for topic_id, topic_terms in topic_listing:
    print( topic_id, topic_terms, sep=': ' )
    print()

#### Allocate topics to documents.

In [None]:
# corpus[0] is the first remaining article by position, so fetch the text by
# position too; a label look-up with .loc[0] fails if the row labelled 0 was
# removed by the language filter.
print( data.articles.iloc[0][:500] )

In [None]:
lda_model[corpus[0]]

#### Predict topics for unseen documents.

In [None]:
document = '''Eric Tucker, a 35-year-old co-founder of a marketing company in Austin, Tex., had just about 40 Twitter followers. But his recent tweet about paid protesters being bused to demonstrations against President-elect Donald J. Trump fueled a nationwide conspiracy theory — one that Mr. Trump joined in promoting. 

Mr. Tucker's post was shared at least 16,000 times on Twitter and more than 350,000 times on Facebook. The problem is that Mr. Tucker got it wrong. There were no such buses packed with paid protesters.

But that didn't matter.

While some fake news is produced purposefully by teenagers in the Balkans or entrepreneurs in the United States seeking to make money from advertising, false information can also arise from misinformed social media posts by regular people that are seized on and spread through a hyperpartisan blogosphere.

Here, The New York Times deconstructs how Mr. Tucker’s now-deleted declaration on Twitter the night after the election turned into a fake-news phenomenon. It is an example of how, in an ever-connected world where speed often takes precedence over truth, an observation by a private citizen can quickly become a talking point, even as it is being proved false.'''
# Give the unseen document the same preprocessing as the training articles
# (sentence split, POS-aware lemmatization, lower-casing, stopword removal,
# phrase merging); otherwise most of its tokens cannot match dictionary
# entries.  Dedicated names are used so that the corpus-level `tokens` list
# is not overwritten.
document_stopwords = set( my_stopwords )
document_tagged = [ tagged_token
                    for sentence in sent_tokenize( document )
                    for tagged_token in pos_tag( word_tokenize( sentence ) ) ]
document_lemmas = [ lemmatizer.lemmatize( word, get_wordnet_pos( tag ) )
                    if get_wordnet_pos( tag ) != '' else word
                    for word, tag in document_tagged ]
document_tokens = [ lemma.lower()
                    for lemma in document_lemmas
                    if lemma.isalpha()
                    and len( lemma ) > 1
                    and lemma.lower() not in document_stopwords ]
document_tokens = list( trigram_model[bigram_model[document_tokens]] )
document_bow = dictionary_LDA.doc2bow( document_tokens )

# Topic descriptions, one per topic, used to label the predicted topics.
topics = lda_model.show_topics(formatted=True,
                               num_topics=num_topics, num_words=20)

# One row per topic assigned to the document: id, rounded weight, top words.
pd.DataFrame([(topic_id, round(weight,2), topics[topic_id][1])
              for topic_id, weight in lda_model[document_bow]],
             columns=['topic #', 'weight', 'words in topic'])

## Look at LDA results more closely.

#### Allocate topics for all documents.

In [None]:
# Topic distribution (list of (topic id, weight) pairs) for every article,
# in corpus order.
topics = [lda_model[bag_of_words] for bag_of_words in corpus]

In [None]:
def topics_document_to_dataframe( topics_document, num_topics ):
    """Turn one document's topic weights into a single-row DataFrame.

    Parameters
    ----------
    topics_document : iterable of (topic id, weight) pairs
        The topics assigned to one document.
    num_topics : int
        Total number of topics; the result has columns 0 .. num_topics-1.

    Returns
    -------
    pandas.DataFrame
        Exactly one row (index 0) with numeric columns.  Topics that are not
        listed in `topics_document` are NaN.  An empty `topics_document`
        still yields one all-NaN row, so a document is never silently
        dropped when the rows are concatenated.
    """
    weights = dict( topics_document )
    # Building from a list of records always produces one row per record and
    # numeric (float) columns.
    return pd.DataFrame( [weights], columns=range( num_topics ) )

# Example: a document assigned to topics 9, 15 and 18 out of 20.
topics_document_to_dataframe([(9, 0.03853655432967504), 
                              (15, 0.09130117862212643), 
                              (18, 0.8692868808484044)], 20)

**Create matrix of topic weights.  Documents are rows and topics are columns.**

In [None]:
# Stack the single-row frames (one per document), number the rows by
# position, and treat unassigned topics as weight 0.  The explicit cast
# guarantees numeric columns for the sorting, idxmax and heatmap cells below
# instead of relying on fillna to downcast object columns.
document_topic = (
    pd.concat( [topics_document_to_dataframe( topics_document,
                                              num_topics=num_topics )
                for topics_document in topics] )
      .reset_index( drop=True )
      .fillna( 0 )
      .astype( float )
)

In [None]:
document_topic.head()

**Which documents are about topic 10?**

In [None]:
# The 20 documents with the highest weight for topic 10.  The index values
# shown are row positions in document_topic.
document_topic.sort_values( 10, ascending=False )[10].head(20)

In [None]:
# Rows of document_topic are numbered by position, so fetch the article by
# position (.iloc) rather than by index label.
# 91 is a row number of document_topic (see the topic 10 ranking).
print( data.articles.iloc[91][:1000] )

#### Look at distribution of topics in all documents.

In [None]:
# Render figures inside the notebook.
%matplotlib inline
# Tall figure: the heatmap has one row per document.
import seaborn as sns; sns.set( rc={'figure.figsize':(10,20)} )
# Order the rows by each document's highest-weighted topic, so documents that
# share a dominant topic are adjacent in the heatmap.
sns.heatmap( document_topic.loc[document_topic.idxmax(axis=1).
                                sort_values().index] )

In [None]:
sns.set( rc={'figure.figsize':(10,5)} )

# Number of documents for which each topic has the highest weight.
dominant_topic_counts = document_topic.idxmax( axis=1 ).value_counts()
dominant_topic_counts.plot.bar( color='lightblue' )

#### Visualize topics.

Size of bubble: proportional to the proportions of the topics across the N total tokens in the corpus  
Red bars: estimated number of times a given term was generated by a given topic  
Blue bars: overall frequency of each term in the corpus  
-- Relevance of words is computed with a parameter lambda  
-- Lambda optimal value ~0.6


In [None]:
%matplotlib inline
import pyLDAvis

# pyLDAvis 3.x renamed its gensim adapter module from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell runs with either release.
try:
    import pyLDAvis.gensim_models as gensim_vis
except ImportError:
    import pyLDAvis.gensim as gensim_vis

# Convert the trained model and corpus into pyLDAvis' display data.
vis = gensim_vis.prepare( topic_model=lda_model, corpus=corpus,
                          dictionary=dictionary_LDA )
pyLDAvis.enable_notebook()
pyLDAvis.display( vis )