##### Social Media Analytics
### Introduction to Text Mining
## Topic Modeling
(c) Nuno Antonio 2019-2022 v1.02

### Initial setup

In [1]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

import gensim         
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import pyLDAvis
import pyLDAvis.gensim

# For pyLDAvis new version - replace the previous line
# import pyLDAvis.gensim_models as gensimvis
#pyLDAvis.enable_notebook()


import matplotlib.pyplot as plt

In [2]:
# Silence DeprecationWarning messages (needed because of the library versions on my computer)
import warnings
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

In [3]:
# Load dataset
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` keeps the same behavior: malformed rows are dropped and
# a warning is emitted for each of them.
dtypes = {'title':'category','author':'category','text':'category'}
ds = pd.read_csv("CNNArticles.csv", sep=",",
                 on_bad_lines='warn', dtype=dtypes, decimal=',',
                 index_col='Unnamed: 0', parse_dates=['date'])



  ds = pd.read_csv("CNNArticles.csv", sep=",",


### Functions

In [4]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r"\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\|[0-9]|--| [ ] |'s" , removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    """Clean a raw text string through a fixed sequence of optional steps.

    Steps, applied in this order (each one can be switched off):
      1. removeHTML: keep only the text extracted by BeautifulSoup.
      2. charsToRemove: regex whose matches are replaced by a space
         (skipped when the pattern is an empty string).
      3. removeNumbers: runs of digits are replaced by a space.
      4. removeLineBreaks: newlines become a space, carriage returns are dropped.
      5. specialCharsToRemove: regex whose matches are replaced by a space
         (skipped when the pattern is an empty string).
      6. convertToLower: lower-case the text.
      7. removeConsecutiveSpaces: collapse runs of spaces into a single space.

    Returns the cleaned string. Any value that is not a str is returned unchanged.
    """
    # Anything that is not exactly a str is handed back untouched
    if type(rawText) != str:
        return rawText

    # Step 1: strip HTML markup
    cleaned = BeautifulSoup(rawText,'html.parser').get_text() if removeHTML else rawText

    # Step 2: blank out punctuation and other unwanted sequences
    if len(charsToRemove) > 0:
        cleaned = re.sub(charsToRemove, ' ', cleaned)

    # Step 3: blank out digit runs
    if removeNumbers:
        cleaned = re.sub(r'\d+', ' ', cleaned)

    # Step 4: flatten line breaks
    if removeLineBreaks:
        cleaned = cleaned.replace('\n', ' ')
        cleaned = cleaned.replace('\r', '')

    # Step 5: blank out characters matched by the special-character pattern
    if len(specialCharsToRemove) > 0:
        cleaned = re.sub(specialCharsToRemove, ' ', cleaned)

    # Step 6: case normalization
    if convertToLower:
        cleaned = cleaned.lower()

    # Step 7: squeeze repeated spaces
    if removeConsecutiveSpaces:
        cleaned = re.sub(' +', ' ', cleaned)

    return cleaned

In [5]:
# Tokenize words
def tokenize_words(words):
    """Split a text into word tokens with NLTK's word_tokenize.

    Parameters
    ----------
    words : str
        Text to tokenize.

    Returns
    -------
    list of str, or np.nan when `words` is not a string or yields no tokens.
    """
    if not isinstance(words, str):
        return np.nan
    # Tokenize once. The previous check compared the token list with '',
    # which is never equal, so empty texts returned [] instead of NaN and
    # every text was tokenized twice.
    tokens = word_tokenize(words)
    return tokens if tokens else np.nan

In [6]:
# Function to create text from words
def recreateText(words):
    """Join a list of tokens into one space-separated string.

    Returns np.nan for any input that is not exactly a list.
    """
    if type(words) != list:
        return np.nan
    return ' '.join(words)

In [7]:
# Function to remove stop words
def removeStopWords(t, stop_words):
    """Return the tokens of `t` that are not contained in `stop_words`.

    `t` must be exactly a list; any other input yields np.nan.
    """
    if type(t) != list:
        return np.nan
    return list(filter(lambda token: token not in stop_words, t))

In [8]:
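# Function to lemmatize words. The lemmatizer is called without a part-of-speech tag, so mainly noun forms are reduced (e.g. "sanctions" -> "sanction"); verb forms such as "running" or "ran" are not turned into "run".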
def lemmatize(words):
    """Lemmatize every token of a list with NLTK's WordNetLemmatizer.

    Each token is passed to `lemmatize` without a part-of-speech argument.
    Returns the list of lemmas, or np.nan when `words` is not exactly a list.
    """
    if type(words) != list:
        return np.nan
    wnl = WordNetLemmatizer()
    return list(map(wnl.lemmatize, words))

### Analysis

In [9]:
# Build a one-column dataframe with the pre-processed article text, keeping the original index
dsprocessedText = pd.DataFrame(
    data=ds['text'].apply(textPreProcess).values,
    index=ds.index,
    columns=['PreProcessedText'],
)

In [10]:
# Tokenize words
dsprocessedText['Words'] =  dsprocessedText['PreProcessedText'].apply(tokenize_words)

# Remove stopwords
stop_words = set(stopwords.words('english'))
dsprocessedText['WordsCleaned'] = dsprocessedText['Words'].apply(removeStopWords,stop_words=stop_words)

# Remove all articles with no words (rows whose value is NaN are dropped too,
# because the length comparison evaluates to False for them).
# .copy() makes the filtered frame independent of the original, so adding a
# column below does not raise SettingWithCopyWarning.
dsprocessedText = dsprocessedText[dsprocessedText['WordsCleaned'].str.len()>0].copy()

# Lemmatize words
dsprocessedText['WordsLemmatized'] = dsprocessedText['WordsCleaned'].apply(lemmatize)

In [11]:
# Create dictionary (needed for LDA)
id2word = corpora.Dictionary(dsprocessedText['WordsLemmatized'])


# LDA is a probabilistic model used to discover latent topics within a collection of documents.
# To run LDA, a dictionary is first created that maps each word to a unique integer ID.
# The dictionary lets the words in the text be represented as numerical tokens that the LDA algorithm can process.

In [12]:
# Create corpus: the lemmatized token list of each remaining article
corpus = dsprocessedText['WordsLemmatized']

In [13]:
# Create TDM (Frequency): convert every document of the corpus with the dictionary's doc2bow
tdm = list(map(id2word.doc2bow, corpus))

In [14]:
# Train the LDA model (can take from 2 to 10 minutes depending on the computer)
# The settings are gathered in one dictionary so they can be read at a glance.
lda_params = dict(
    corpus=tdm,
    id2word=id2word,
    num_topics=8,
    random_state=123,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

In [15]:
# Print the 10 highest-weighted keywords of each of the 8 topics
print(lda_model.print_topics())
doc_lda = lda_model[tdm]
# Interpretation: Topic 0 is represented as 0.034*"ukraine" + 0.026*"said" + 0.024*"russia" + ...

[(0, '0.034*"ukraine" + 0.026*"said" + 0.024*"russia" + 0.021*"russian" + 0.014*"u" + 0.011*"president" + 0.010*"ukrainian" + 0.010*"country" + 0.009*"sanction" + 0.008*"putin"'), (1, '0.011*"people" + 0.008*"n\'t" + 0.007*"one" + 0.006*"day" + 0.005*"say" + 0.005*"ukraine" + 0.005*"country" + 0.005*"time" + 0.005*"many" + 0.005*"like"'), (2, '0.025*"russian" + 0.020*"said" + 0.017*"u" + 0.016*"russia" + 0.011*"medium" + 0.010*"government" + 0.009*"cnn" + 0.009*"official" + 0.009*"griner" + 0.008*"information"'), (3, '0.014*"iran" + 0.013*"saudi" + 0.011*"world" + 0.010*"team" + 0.010*"international" + 0.009*"athlete" + 0.009*"arabia" + 0.008*"israel" + 0.008*"football" + 0.008*"uefa"'), (4, '0.016*"%" + 0.015*"oil" + 0.015*"price" + 0.014*"russia" + 0.012*"said" + 0.012*"gas" + 0.010*"energy" + 0.009*"$" + 0.009*"company" + 0.008*"sanction"'), (5, '0.023*"putin" + 0.016*"u" + 0.015*"russia" + 0.011*"biden" + 0.010*"china" + 0.009*"war" + 0.008*"ukraine" + 0.007*"would" + 0.007*"world"

In [16]:
# Let pyLDAvis render its visualizations inline in the notebook
pyLDAvis.enable_notebook()

In [17]:
# Prepare and show the interactive topic visualization from the model, the bag-of-words corpus and the dictionary.
# NOTE: with the new pyLDAvis version the module is `pyLDAvis.gensim_models` (see the import cell).
# NOTE(review): sort_topics=False presumably keeps the topics in the model's own order instead of re-sorting them — confirm in the pyLDAvis docs.
vis = pyLDAvis.gensim.prepare(lda_model, tdm, id2word, sort_topics=False)
pyLDAvis.display(vis)

  default_term_info = default_term_info.sort_values(


### Evaluation of topic models
Good description at: https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

In [18]:
# Compute Perplexity
# NOTE(review): per the gensim documentation, log_perplexity() returns the per-word likelihood bound
# (a log-scale value, hence negative) rather than the perplexity itself; gensim derives the perplexity
# as 2**(-bound), so a bound closer to zero means a lower (better) perplexity — confirm for the installed version.
print('\nPerplexity: ', lda_model.log_perplexity(tdm))  # lower perplexity is better (some literature does not recommend the use of this measure)


Perplexity:  -8.102372485469402


In [None]:
# Compute Coherence Score
# Uses the 'c_v' coherence measure, built from the trained model, the tokenized documents (`corpus`) and the dictionary.
coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# HOW TO SELECT K (based on https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every value in range(start, limit, step),
    always with the same fixed training settings, and its c_v coherence is
    computed.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First number of topics to try
    step : Increment between consecutive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Use the `dictionary` argument. The previous code read the global
        # `id2word`, silently ignoring whatever dictionary the caller passed.
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics,
                                           random_state=123,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# ATTENTION!!!! Can take a LONG time to run.
lowerBound = 2
upperBound = 20
step = 6
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=tdm, texts=corpus, start=lowerBound, limit=upperBound, step= step)

# Show graph
x = range(lowerBound, upperBound, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a list: ("coherence_values") is just a string, so
# matplotlib iterated over its characters and labelled the line "c".
plt.legend(["coherence_values"], loc='best')
plt.show()
# The coherence score is decreasing. A higher value should be chosen, because as K increases the probability of having repeated keywords increases.
# ...So, in this case we may choose 8, as there is an inflection at K=8