# Class 7: Topic Modeling and Dictionary-based Analysis - Tutorial

In this tutorial, we will see how to preprocess text for a topic model and estimate an LDA model in Python.

The tutorial is based on this: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/?utm_content=cmp-true

In [None]:
# Basic imports
import os
import re                                                      # for regular expressions
import platform
import numpy as np
import pandas as pd
from pprint import pprint                                      # for nice prints
from tqdm.autonotebook import tqdm as notebook_tqdm

# Gensim modules
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

# Spacy module
import spacy

# NLTK module
import nltk

In [None]:
# # # # Working Directory # # # #

# Pick the base path depending on the operating system
wd = '/home/rask/' if platform.system() == 'Linux' else 'C:/Users/au535365/'

# Append the course folder to the base path
wd = os.path.join(wd, 'Dropbox/teaching/css_fall2023')

# Make the course folder the working directory
os.chdir(wd)

# Display the working directory to confirm that the change took effect
os.getcwd()

## Stopwords

In [None]:
# NLTK also has stopwords - we first download them
nltk.download('stopwords')

# and then define a list of English stopwords
# NOTE: `stop_words` is reassigned to spaCy's stopword list in the
# "Basic Text Cleaning" section further down.
stop_words = nltk.corpus.stopwords.words('english')

In [None]:
pprint(stop_words)

## Reading Data

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Fetch the training subset of the 20 Newsgroups data, asking the loader to
# remove headers, footers and quotes from the posts
data_dict = fetch_20newsgroups(subset='train',  remove=('headers', 'footers', 'quotes'))

In [None]:
# Target categories
data_dict.target_names

In [None]:
# Inspect
data_dict.target[:10]

In [None]:
# Labels
topic_labels = data_dict.target

In [None]:
# Map each position in target_names to the corresponding category name
target_dict = dict(enumerate(data_dict.target_names))

In [None]:
# Extract corpus and convert it to a numpy array
# (the array is later indexed with lists/arrays of positions when empty
# documents are dropped and when the random sample is drawn)
corpus = np.array(data_dict['data'])
print(f"Length of corpus before removing empty docs: {len(corpus)}")

In [None]:
# Keep only non-empty documents. The same positions are applied to the labels
# so that corpus[i] and topic_labels[i] keep referring to the same document.
indices = [ix for ix, doc in enumerate(corpus) if len(doc) > 0]
corpus = corpus[indices]
topic_labels = topic_labels[indices]

# Reported AFTER the filter, so the count can be compared with the one printed before it
print(f"Length of corpus after removing empty docs: {len(corpus)}")

In [None]:
# Inspect corpus
corpus[0]

In [None]:
# Draw a random sample of 3000 documents

# Number of documents to draw
n_samples = 3000

# Seed NumPy's global random number generator before drawing
np.random.seed(10)

# Positions of the sampled documents, drawn without replacement
sample_indices = np.random.choice(len(corpus), size=n_samples, replace=False)

In [None]:
# Subset the documents and their labels with the same random indices,
# so that samples[i] and topic_labels_samples[i] belong together
samples = corpus[sample_indices]
topic_labels_samples = topic_labels[sample_indices]

In [None]:
# Count number of docs for each topic.
# np.bincount with `minlength` returns one count per label id (0 for a topic
# that does not occur in the sample), so `count` always has exactly one entry
# per topic name, in label order. np.unique(..., return_counts=True) only
# reports labels that are present, which would leave `count` too short to pair
# with the topic names if a topic were missing from the sample.
count = np.bincount(topic_labels_samples, minlength=len(target_dict))

In [None]:
# Generate pandas dataframe from the count
# `count` must hold one entry per topic, ordered by label id, so that each
# count lines up with the topic name in the same row
topic_count_df = pd.DataFrame({'topic': target_dict.values(), 'count': count})

In [None]:
# Print the counts
pprint(topic_count_df)

## Basic Text Cleaning

In [None]:
# Download the spaCy model for English if you have not already done so
# !python -m spacy download en_core_web_sm

In [None]:
# Load spaCy's small English pipeline
spacy_pipeline_en = spacy.load("en_core_web_sm")

# Sorted list of the pipeline's default stopwords
# (this replaces the NLTK `stop_words` list defined earlier)
stop_words = sorted(spacy_pipeline_en.Defaults.stop_words)

In [None]:
# Regex patterns used for cleaning, compiled once.
# They are raw strings on purpose: in a plain literal, '\S' and '\s' are
# invalid string escape sequences, which Python reports with a warning.
email_pattern = re.compile(r'\S*@\S*\s?')
whitespace_pattern = re.compile(r'\s+')
space_run_pattern = re.compile(r' +')


def clean_document(text):
    """Return `text` with e-mails, line breaks and single quotes removed.

    Args:
        text (str): The raw document.

    Returns:
        str: The cleaned document, with whitespace normalised to single
        spaces and no leading or trailing whitespace.
    """
    # Remove e-mails (any run of non-whitespace containing '@', plus at most
    # one whitespace character following it)
    text = email_pattern.sub('', text)

    # Collapse every whitespace run into one space. This already removes new
    # line characters, so no separate '\n' substitution is needed.
    text = whitespace_pattern.sub(' ', text)

    # Remove distracting single quotes
    text = text.replace("'", "")

    # Removing a free-standing quote can leave two spaces next to each other,
    # so squeeze consecutive spaces once more
    text = space_run_pattern.sub(' ', text)

    # Remove trailing and leading whitespaces
    return text.strip()


docs_clean = [clean_document(sent) for sent in samples]

## Tokenization

In [None]:
# Tokenize every cleaned document with the spaCy pipeline.
# Language.pipe streams the texts through the pipeline in batches and yields
# one Doc per text in input order, instead of one pipeline call per document.
# The generator has no length, so tqdm is told the total explicitly.
docs_parsed = spacy_pipeline_en.pipe(docs_clean)
tokens_raw = [
    list(doc)  # the spaCy Token objects of this document
    for doc in notebook_tqdm(docs_parsed, total=len(docs_clean), position=0, leave=True)
]

In [None]:
# Filter the tokens of every document in a single pass.

# Set lookup instead of scanning the stopword list once per token
stop_word_set = set(stop_words)


def keep_token(token):
    """Return True if the spaCy token should be kept for the topic model.

    A token is dropped if it is a stopword, punctuation, made up of digits,
    or shorter than 3 characters.
    """
    # Match stopwords on the exact text AND on the lower-cased form: comparing
    # only the exact text lets capitalised stopwords (e.g. "The") slip through
    # when the stop list holds the lower-case form, and they would then enter
    # the vocabulary once the tokens are lower-cased.
    if token.text in stop_word_set or token.lower_ in stop_word_set:
        return False
    # Remove punctuation
    if token.is_punct:
        return False
    # Remove digits
    if token.is_digit:
        return False
    # Remove tokens shorter than 3 characters
    return len(token) >= 3


tokens_cleaned = [[tok for tok in doc if keep_token(tok)] for doc in tokens_raw]

## Prepare Data for LDA

The LDA models take a vocabulary and a vectorized version of our corpus as inputs.

In `gensim`, this is done using the `Dictionary()` class and its `doc2bow` method. The former creates the vocabulary and the latter creates our BoW. 

In [None]:
# Apply Dictionary class from gensim on our tokens
# NOTE: this call is expected to fail here - the tokens are still spaCy
# objects rather than strings; the next cell converts them.
vocab = Dictionary(tokens_cleaned)

In [None]:
# We get an error because the tokens are not strings. They are still spaCy token objects.
# We can convert them to strings using the .text or .lower_ attribute.
# Here .lower_ is used, so every token is stored as a lower-cased string.
tokens_cleaned = [[x.lower_ for x in doc] for doc in tokens_cleaned]

In [None]:
# Word frequency and word-document frequency, both as Counter objects
from collections import Counter

# Total number of occurrences of each token across all documents
word_freq = Counter(token for doc in tokens_cleaned for token in doc)

# Number of documents each token occurs in: going through set(doc) makes a
# token count at most once per document
word2doc_freq = Counter(token for doc in tokens_cleaned for token in set(doc))

In [None]:
# Inspect
word_freq

In [None]:
# Inspect
word2doc_freq

In [None]:
# Remove frequent and rare occurring tokens.
# A token is kept only if its total count and its document count are both at
# least 5 and both at most 10% of the number of sampled documents (note that
# the total count is divided by the number of documents as well).
n_docs = len(samples)

# Decide once per distinct token instead of once per occurrence
vocab_kept = {
    word
    for word, total in word_freq.items()
    if total / n_docs <= 0.10
    and word2doc_freq[word] / n_docs <= 0.10
    and total >= 5
    and word2doc_freq[word] >= 5
}

tokens_final = [[tok for tok in doc if tok in vocab_kept] for doc in tokens_cleaned]

In [None]:
# Construct vocabulary from the filtered, string-valued tokens
vocab = Dictionary(tokens_final)

In [None]:
# Construct BoW: one vectorized document per entry of tokens_final,
# built with the vocabulary's doc2bow method
bow = [vocab.doc2bow(doc) for doc in tokens_final]

## Building the Topic Model

In [None]:
# Fit an LDA model with 20 topics on the bag-of-words corpus.
# random_state seeds the model's random number generator.
lda_model = LdaModel(
    corpus=bow,
    id2word=vocab,
    num_topics=20,
    random_state=100,
)

In [None]:
# Print topics
pprint(lda_model.print_topics())

Below, I define a class called `TopicInspector` that is helpful in interpreting the results of the LDA. 

1. Initialization:
    * The __init__ method is the constructor for the class. It takes several parameters:
        
        * lda_model: This is the LDA model that you want to inspect.
        * vocab: This is a vocabulary mapping from word IDs to words.
        * corpus: This is the corpus of documents used to train the LDA model.
        * topn: This is an optional parameter specifying the number of top words in each topic to consider. It defaults to 10.
        
2. `id2token`:
    * The id2token method is used to convert a word ID (wid) to its corresponding word in the vocabulary.

3. `get_topic_words`:
    * The get_topic_words method takes a topic ID (tid) and retrieves the top words associated with that topic from the LDA model. It returns a list of words.


4. `get_topic_word_prob`:
    * The get_topic_word_prob method is similar to get_topic_words, but it returns the probabilities (scores) of the top words in the topic instead of the actual words.

5. `topic_word_df`:
    * The topic_word_df method creates a DataFrame that represents the top words for each topic. It iterates through all topics, retrieves the top words for each topic using get_topic_words, and constructs a DataFrame where each column is labeled with the topic number (e.g., 'topic0', 'topic1').


6. `topic_doc_df` :
    * The topic_doc_df method creates a DataFrame that represents the distribution of topics for each document in the corpus.
    * It uses the LDA model's get_document_topics method to obtain the topic distribution for each document.
    * The resulting DataFrame has columns labeled with topic numbers ('topic0', 'topic1', etc.).
    * Additionally, it can include two optional columns:
        * max_topic: If add_max_topic is True, it adds a column with the topic that has the highest probability for each document.
        * max_score: If add_max_score is True, it adds a column with the highest probability score for each document.

In [None]:
class TopicInspector:
    """ A class for inspecting and analyzing Latent Dirichlet Allocation (LDA) models.

        Attributes:
            lda_model (gensim.models.LdaModel): The LDA model to be inspected.
            vocab (mapping): A vocabulary mapping from word IDs to words (anything indexable by word ID).
            corpus (list of list of tuples): The corpus of documents used to train the LDA model.
            num_topics (int): The number of topics in the LDA model.
            topn (int, optional): The number of top words in each topic to consider (default is 10).

        Methods:
            id2token(wid): Convert a word ID to its corresponding word in the vocabulary.
            get_topic_words(tid): Get the top words associated with a given topic.
            get_topic_word_prob(tid): Get the probabilities of the top words in a given topic.
            topic_word_df(): Create a DataFrame representing the top words for each topic.
            topic_doc_df(add_max_topic=True, add_max_score=True): Create a DataFrame representing the topic distribution for each document in the corpus.
    """

    def __init__(self, lda_model, vocab, corpus, topn=10):
        self.lda_model = lda_model
        self.vocab = vocab
        self.corpus = corpus
        self.num_topics = self.lda_model.num_topics
        self.topn = topn

    def id2token(self, wid):
        """Return the word stored in the vocabulary under word ID `wid`."""
        return self.vocab[wid]

    def _top_terms(self, tid):
        """Return the model's (word ID, probability) pairs for the `topn` top words of topic `tid`."""
        return self.lda_model.get_topic_terms(tid, topn=self.topn)

    def get_topic_words(self, tid):
        """Return the top words of topic `tid` as a list of strings."""
        # Iterating over the pairs (rather than unpacking zip(*pairs)) also
        # works when the model returns no terms at all.
        return [self.id2token(wid) for wid, _ in self._top_terms(tid)]

    def get_topic_word_prob(self, tid):
        """Return the probabilities of the top words of topic `tid` as a tuple."""
        return tuple(prob for _, prob in self._top_terms(tid))

    def topic_word_df(self):
        """Return a DataFrame with one column per topic ('topic0', 'topic1', ...).

        Each column holds the top words of that topic, one word per row.
        """
        # range(self.num_topics) covers EVERY topic id, 0 .. num_topics - 1;
        # stopping at num_topics - 1 would silently drop the last topic.
        topic_columns = [
            pd.Series(self.get_topic_words(tid=k), name=f'topic{k}')
            for k in range(self.num_topics)
        ]

        return pd.concat(topic_columns, axis=1)

    def topic_doc_df(self, add_max_topic=True, add_max_score=True):
        """Return a DataFrame with the topic distribution of every document.

        Args:
            add_max_topic (bool): If True, add a 'max_topic' column holding the
                name of the column with the highest probability in each row.
            add_max_score (bool): If True, add a 'max_score' column holding the
                highest probability in each row.

        Returns:
            pandas.DataFrame: One row per document and one column per topic
            ('topic0', 'topic1', ...), plus the optional columns above.
        """
        topic_docs = self.lda_model.get_document_topics(self.corpus, minimum_probability=0)

        # Place every probability in the slot of its own topic id. Relying on
        # the position inside the returned list would misalign the columns if
        # the model leaves a topic out for some document.
        doc_dist_list = []
        for doc_topics in topic_docs:
            doc_dist = [0.0] * self.num_topics
            for tid, prob in doc_topics:
                doc_dist[tid] = prob
            doc_dist_list.append(doc_dist)

        topic_doc_df = pd.DataFrame(doc_dist_list, columns=[f'topic{x}' for x in range(self.num_topics)])

        # Both summaries are computed before either column is added, so they
        # are based on the topic columns only.
        max_topics = topic_doc_df.idxmax(axis=1) if add_max_topic else None
        max_scores = topic_doc_df.max(axis=1) if add_max_score else None

        if add_max_topic:
            topic_doc_df['max_topic'] = max_topics

        if add_max_score:
            topic_doc_df['max_score'] = max_scores

        return topic_doc_df


In [None]:
# Define instance of the TopicInspector
lda_inspect = TopicInspector(lda_model=lda_model, vocab=vocab, corpus=bow, topn=10)

In [None]:
# Get top words for each topic
topic_word_df = lda_inspect.topic_word_df()

In [None]:
# Print topic word df
print(topic_word_df)

In [None]:
# Get topic distribution for each doc
topic_doc_df = lda_inspect.topic_doc_df()

In [None]:
# Print
topic_doc_df

# Sentiment Analysis Examples

## Vader

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# VADER sentiment analyzer instance
sentiment = SentimentIntensityAnalyzer()

# Two documents from the (uncleaned) random sample
text_1, text_2 = samples[0], samples[6]

# Polarity scores for each of the two documents
sent_1 = sentiment.polarity_scores(text_1)
sent_2 = sentiment.polarity_scores(text_2)

for label, scores in (("Sentiment of text 1:", sent_1), ("Sentiment of text 2:", sent_2)):
    print(label, scores)

## BERT

In [None]:
from danlp.models import load_bert_tone_model
classifier = load_bert_tone_model()

In [None]:
# using the classifier
classifier.predict('Analysen viser, at økonomien bliver forfærdelig dårlig')

In [None]:
classifier.predict('Jeg tror alligvel, det bliver godt')

In [None]:
# Get probabilities and matching classes names
probs = classifier.predict_proba('Analysen viser, at økonomien bliver forfærdelig dårlig')

In [None]:
print(f"Polarity probabilities for classes 'positive', 'neutral' and 'negative': {probs[0]}")

In [None]:
print(f"Subjectivity/objectivity probabilities for classes 'objective' and 'subjective': {probs[1]}")