# LDA TOPIC MODELING 

This notebook applies LDA topic modeling to a dataset using the Gensim toolkit:

__Steps__

- Text processing
- Building the model

__Referred links:__

- https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

- https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

- https://medium.com/@lettier/how-does-lda-work-ill-explain-using-emoji-108abf40fa7d

- https://stackoverflow.com/questions/20984841/topic-distribution-how-do-we-see-which-document-belong-to-which-topic-after-doi

- https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

__Required Libraries__

In [None]:
import pandas as pd
import numpy as np


import string
import re
from pprint import pprint
from bs4 import BeautifulSoup

# NLTK 
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english') #this depends on each language

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import html

import ast

from langdetect import detect

__Loading data__

In [None]:
# Load the merged listings CSV; the first CSV column is used as the row index.
df = pd.read_csv("./listing_merged2.csv", index_col = 0) 

In [None]:
# Row count of the raw frame.
df.shape[0]

In [None]:
# Peek at a single row to check the available columns.
df.head(1)

In [None]:
# Keep only the identifier, the three text fields modeled below
# (description, title, tags) and the url/img1 columns.
data = df[['listing_id', 'description', 'title', 'tags', 'url', 'img1']]

In [None]:
# Drop every row that has a missing value in any of the selected columns.
# Note: the surviving rows keep their original index labels.
data = data.dropna()

In [None]:
# (rows, columns) left after dropping incomplete rows.
data.shape

In [None]:
# Sanity check: per-column count of missing values (expected to be all zero after dropna).
data.isnull().sum(axis = 0)

In [None]:
def escape(texts):
    """Return every item of ``texts`` as a string with HTML entities decoded.

    Each item is first converted with ``str`` and then passed through
    ``html.unescape`` (so ``"&amp;"`` becomes ``"&"``).
    """
    decoded = []
    for item in texts:
        decoded.append(html.unescape(str(item)))
    return decoded

In [None]:
# Decode HTML entities (e.g. "&amp;") in the free-text columns.
data.description = data.description.map(html.unescape)
data.title = data.title.map(html.unescape)
# Each `tags` cell holds a Python literal stored as text; parse it back
# into a Python object with the safe literal evaluator.
data.tags = data.tags.map(ast.literal_eval)

In [None]:
# Flatten each parsed tag collection into one comma-separated string.
data.tags = data.tags.apply(','.join)

In [None]:
# Decode HTML entities inside the joined tag strings.
data.tags = data.tags.map(html.unescape)

In [None]:
# Keep only rows whose joined tag string is longer than 3 characters.
mask = data.tags.str.len() > 3
data = data.loc[mask]

In [None]:
# Remove a German listing that was not filtered out earlier: flag every row
# whose tag string does NOT contain the literal below.
# NOTE(review): the literal looks like mis-decoded text (mojibake); it
# presumably matches the raw data as stored — confirm before "fixing" it.
mask1 = data.tags.apply(lambda x: "ROSENKNOPF,PUPPENKÃœCHE" not in x)

In [None]:
# Apply the mask computed above; surviving rows keep their index labels.
data = data.loc[mask1, ] # end of the German-listing removal

In [None]:
# Plain Python lists of the three text fields, in the row order of `data`:
# ds = descriptions, tl = titles, tag = joined tag strings.
ds = data['description'].tolist()
tl = data['title'].tolist()
tag = data['tags'].tolist()

In [None]:
# remove url
def remove_html(texts):
    """Strip URLs from every item of ``texts``.

    Despite its name, this does not touch HTML markup: it deletes every
    run of non-whitespace characters that starts with ``http:`` or
    ``https:``. Items are converted with ``str`` first, and a new list of
    strings is returned.
    """
    url_pattern = re.compile(r"https?:\S+")
    return [url_pattern.sub("", str(item)) for item in texts]

# Strip URLs from all three text fields (the lists are replaced in place by name).
ds = remove_html(ds)
tl = remove_html(tl)
tag = remove_html(tag)

In [None]:
# Preview the first two cleaned descriptions.
pprint(ds[0:2])

__Tokenize words and Clean-up text__

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Every item is converted with ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one token list
    is yielded per input item.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens
# Materialize the token lists for descriptions, titles and tags.
ds_words = list(sent_to_words(ds))
tl_words = list(sent_to_words(tl))
tag_words = list(sent_to_words(tag))
# Show the tokens of the first document of each field as a sanity check.
print(ds_words[:1])
print(tl_words[:1])
print(tag_words[:1])

In [None]:
# English NLTK stop words plus a few extra words excluded from the topics.
# Rebuilt from scratch, so re-running the cell does not duplicate entries.
stop_words = stopwords.words('english') + ['toy', 'children', 'child', 'kid']

In [None]:
# Display the full stop-word list for inspection.
stop_words

__Process Text__

In [None]:
def process_words(data_words):
    """Turn tokenized documents into the inputs needed for topic modeling.

    Steps: train bigram/trigram phrase models on ``data_words``, remove
    stop words (module-level ``stop_words``), merge frequent bigrams,
    lemmatize with spaCy keeping only nouns, adjectives, verbs and
    adverbs, then build a gensim dictionary and bag-of-words corpus.

    Parameters
    ----------
    data_words : list of list of str
        One token list per document.

    Returns
    -------
    data_lemmatized : list of list of str
        Lemmatized tokens per document.
    corpus : list
        ``id2word.doc2bow`` output for each lemmatized document.
    id2word : gensim.corpora.Dictionary
        Dictionary built from ``data_lemmatized``.
    """
    # Bigrams are two words frequently occurring together in the document. Trigrams are 3 words frequently occurring.
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # See trigram example (skipped when there is no document to show,
    # instead of failing with IndexError).
    if len(data_words) > 0:
        print(trigram_mod[bigram_mod[data_words[0]]])

    # Set membership is O(1); the module-level list is scanned linearly.
    stop_word_set = set(stop_words)

    # Initialize the English spaCy model, keeping only the tagger component
    # (for efficiency). The 'en' shortcut link no longer exists in spaCy 3,
    # so load the package by its full name and fall back to the legacy
    # shortcut for old installations.
    # python3 -m spacy download en_core_web_sm
    try:
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    except OSError:
        nlp = spacy.load('en', disable=['parser', 'ner'])

    def remove_stopwords(texts):
        """Re-tokenize each document and drop the stop words."""
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_word_set] for doc in texts]

    def make_bigrams(texts):
        """Merge frequent word pairs into single tokens."""
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        """Merge frequent word triples; not used by the pipeline below."""
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        """Lemmatize and keep only tokens whose POS tag is allowed.

        https://spacy.io/api/annotation
        """
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]

    return data_lemmatized, corpus, id2word

# Run the pipeline once per text field; each call returns its own
# lemmatized texts, bag-of-words corpus and dictionary.
ds_data_lemmatized, ds_corpus, ds_id2word = process_words(ds_words)
tl_data_lemmatized, tl_corpus, tl_id2word = process_words(tl_words)
tag_data_lemmatized, tag_corpus, tag_id2word = process_words(tag_words)

__Create the Dictionary and Corpus needed for Topic Modeling__

If you want to see what word a given id corresponds to, pass the id as a key to the dictionary

In [None]:
# Word stored under id 0 in the description dictionary. `process_words`
# only exposes per-field dictionaries, so there is no bare `id2word` here.
ds_id2word[0]

Or, you can see a human-readable form of the corpus itself.

In [None]:
# Human readable format of corpus (term-frequency) for the first two
# descriptions. Uses the description corpus/dictionary returned by
# process_words and avoids shadowing the builtin `id`.
[[(ds_id2word[token_id], freq) for token_id, freq in cp] for cp in ds_corpus[:2]]

__Building the Topic Model__

In [None]:
#estimate of number of topics
from gensim.models import HdpModel
def hdp_topic(corpus, id2word):
    """Fit and return a gensim ``HdpModel`` on ``corpus`` with chunksize 10000."""
    return HdpModel(corpus, id2word, chunksize=10000)
# Fit one HDP model per text field.
hdp_ds = hdp_topic(ds_corpus, ds_id2word)
hdp_tl = hdp_topic(tl_corpus, tl_id2word)
hdp_tag = hdp_topic(tag_corpus, tag_id2word)
# NOTE(review): print_topics() is called with its default arguments, so these
# counts are presumably capped at the default number of topics shown rather
# than the number of topics HDP actually inferred — confirm against gensim docs.
print("ds:", len(hdp_ds.print_topics()))
print("tl:", len(hdp_tl.print_topics()))
print("tag:", len(hdp_tag.print_topics()))
# hdp.print_topics(num_topics=20)

In [None]:
# Number of topics listed for the description HDP model (there is no bare
# `hdp` variable in this notebook; the description model is `hdp_ds`).
len(hdp_ds.print_topics())

In [None]:
# Show up to 20 topics of the description HDP model (`hdp_ds`; a bare
# `hdp` is never defined in this notebook).
hdp_ds.print_topics(num_topics=20)

In [None]:
# Build one LDA model per text field.
# random_state pins the model's random initialization so that re-running the
# notebook does not start from a different random state each time (with
# several workers the order of updates can still introduce small variations).
# NOTE(review): eval_every=1 evaluates perplexity after every update, which
# slows training; logging is set to ERROR above, so that output is not shown.
ds_lda_model = gensim.models.LdaMulticore(ds_corpus, num_topics=3, id2word=ds_id2word, eval_every=1, passes=4, workers=5, random_state=42)
tl_lda_model = gensim.models.LdaMulticore(tl_corpus, num_topics=9, id2word=tl_id2word, eval_every=1, passes=2, workers=6, random_state=42)
tag_lda_model = gensim.models.LdaMulticore(tag_corpus, num_topics=2, id2word=tag_id2word, eval_every=1, passes=2, workers=6, random_state=42)

In [None]:
# Keyword weights of every topic, for the description, tag and title models.
pprint(ds_lda_model.print_topics())
pprint(tag_lda_model.print_topics())
pprint(tl_lda_model.print_topics())

In [None]:
# Print the keywords of each topic of the description model
pprint(ds_lda_model.print_topics())
# Topic distribution of every description; the description corpus is
# `ds_corpus` (a bare `corpus` is never defined at notebook scope).
doc_lda = ds_lda_model[ds_corpus]

In [None]:
# run tests
# Infer topics for two tiny hand-written documents, using the dictionary
# the description model was trained with, and print the topics of each
# document from most to least probable.
test = [['toddler', 'color'], ['free', 'shipping']]
from gensim.corpora import Dictionary
corpus_test = [ds_id2word.doc2bow(text) for text in test]
for bow in corpus_test:
    row = sorted(ds_lda_model[bow], key=lambda pair: pair[1], reverse=True)
    print(row)

In [None]:
# Topics of the description model again, for reference next to the test above.
pprint(ds_lda_model.print_topics())

__View the topics in LDA model__

__Compute Model Perplexity and Coherence Score__

In [None]:
# Evaluate the description model (the notebook defines ds_/tl_/tag_ variants
# only; bare `lda_model`, `corpus`, `data_lemmatized`, `id2word` do not exist).

# Compute Perplexity
# log_perplexity returns the per-word likelihood bound (a negative number):
# a higher bound, i.e. closer to zero, is better. The perplexity itself is
# 2 ** (-bound), for which lower is better.
print('\nPerplexity: ', ds_lda_model.log_perplexity(ds_corpus))

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=ds_lda_model, texts=ds_data_lemmatized, dictionary=ds_id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

__Visualize the topics-keywords__

In [None]:
# Visualize the topics of the description model (bare `lda_model`, `corpus`
# and `id2word` are not defined at notebook scope).
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ds_lda_model, ds_corpus, ds_id2word)
vis

In [None]:
def format_topics_sentences(ldamodel=tl_lda_model, corpus=tl_corpus, texts=tl):
    """Build a per-document table of the dominant topic.

    Parameters
    ----------
    ldamodel : trained LDA model
        Must support ``ldamodel[corpus]`` and ``ldamodel.show_topic(n)``.
    corpus : bag-of-words corpus
        One entry per document.
    texts : list
        Original documents, in the same order as ``corpus``.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns ``Dominant_Topic``,
        ``Perc_Contribution`` and ``Topic_Keywords``, followed by an
        unnamed column (``0``) holding ``texts``.

    Notes
    -----
    The defaults are evaluated once, when this cell is run; pass the
    arguments explicitly after retraining a model.
    """
    keywords_by_topic = {}  # show_topic() result per topic, computed once
    records = []

    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # Keep a placeholder row so rows stay aligned with `texts`.
            records.append((np.nan, np.nan, None))
            continue

        # Dominant topic = highest probability (first one wins on ties,
        # like a stable descending sort).
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

In [None]:
# Format
# Dominant-topic table for the descriptions; reset_index() turns the row
# position into the first column, which is then named Document_No.
df_topic_sents_keywords_ds = format_topics_sentences(ldamodel = ds_lda_model, corpus=ds_corpus, texts=ds)
df_dominant_topic_ds = df_topic_sents_keywords_ds.reset_index()
df_dominant_topic_ds.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic_ds.head(30)

In [None]:
# Format
# Dominant-topic table for the tags; reset_index() turns the row position
# into the first column, which is then named Document_No.
df_topic_sents_keywords_tag = format_topics_sentences(ldamodel = tag_lda_model, corpus=tag_corpus, texts=tag)
df_dominant_topic_tag = df_topic_sents_keywords_tag.reset_index()
df_dominant_topic_tag.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic_tag.iloc[21:40]

In [None]:
# Format
# Dominant-topic table for the titles; reset_index() turns the row position
# into the first column, which is then named Document_No.
df_topic_sents_keywords_tl = format_topics_sentences(ldamodel = tl_lda_model, corpus=tl_corpus, texts=tl)
df_dominant_topic_tl = df_topic_sents_keywords_tl.reset_index()
df_dominant_topic_tl.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic_tl.head(10)#.iloc[21:40]

In [None]:
# add to data
# `data` still carries the row labels of the original CSV (rows were removed
# by dropna and the filters), while the topic table has a fresh 0..n-1 index.
# Assigning a Series would align on those labels and attach topics to the
# wrong rows, so assign by position instead: both are in the same row order.
assert len(df_dominant_topic_ds) == len(data), "expected one topic row per row of data"
data['dmt_ds'] = df_dominant_topic_ds['Dominant_Topic'].values
data['tpc_ds'] = df_dominant_topic_ds['Topic_Perc_Contrib'].values
data['kw_ds'] = df_dominant_topic_ds['Keywords'].values

In [None]:
# add to data
# `data` still carries the row labels of the original CSV (rows were removed
# by dropna and the filters), while the topic table has a fresh 0..n-1 index.
# Assigning a Series would align on those labels and attach topics to the
# wrong rows, so assign by position instead: both are in the same row order.
assert len(df_dominant_topic_tag) == len(data), "expected one topic row per row of data"
data['dmt_tag'] = df_dominant_topic_tag['Dominant_Topic'].values
data['tpc_tag'] = df_dominant_topic_tag['Topic_Perc_Contrib'].values
data['kw_tag'] = df_dominant_topic_tag['Keywords'].values

In [None]:
# (rows, columns) of the tag topic table, to compare with the size of `data`.
df_dominant_topic_tag.shape

In [None]:
# Persist the listings together with the topic columns added above.
data.to_csv('result.csv')

__Find the most representative document for each topic__

In [None]:
# Group the top 3 documents (highest topic contribution) under each topic
sent_topics_outdf_grpd = df_topic_sents_keywords_tag.groupby('Dominant_Topic')

# Collect the per-topic slices first and concatenate once, instead of growing
# a frame inside the loop starting from an empty DataFrame.
sent_topics_sorteddf_mallet = pd.concat(
    [grp.sort_values(['Perc_Contribution'], ascending=False).head(3)
     for _, grp in sent_topics_outdf_grpd],
    axis=0,
)

# Reset Index
sent_topics_sorteddf_mallet = sent_topics_sorteddf_mallet.reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Tags"]

# Show
sent_topics_sorteddf_mallet

__Topic distribution across documents__

In [None]:
# Number of Documents for Each Topic (indexed by topic number)
topic_counts = df_topic_sents_keywords_tag['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords: one row per topic, indexed by topic number.
# The per-document frame cannot be concatenated directly with the per-topic
# counts, because concat(axis=1) would match document positions against
# topic numbers.
topic_num_keywords = (
    df_topic_sents_keywords_tag[['Dominant_Topic', 'Topic_Keywords']]
    .dropna(subset=['Dominant_Topic'])
    .drop_duplicates(subset='Dominant_Topic')
    .set_index('Dominant_Topic')
)

# Concatenate Column wise, aligned on the topic number
df_dominant_topics = (
    pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
    .sort_index()
    .reset_index()
)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics.head(10)

In [None]:

test = [['busy', 'engage', 'gift', 'puzzle']]

# Reuse the dictionary the description model was trained with: a dictionary
# built from `test` alone would assign ids unrelated to the model's vocabulary.
# NOTE(review): the original referenced an undefined `optimal_model`; the
# description model is assumed here — switch to tl_/tag_ variants if intended.
id2word_test = ds_id2word

# Term Document Frequency
corpus_test = [id2word_test.doc2bow(text) for text in test]

print(corpus_test[0])
# Topics of the single test document, most probable first.
topics = sorted(ds_lda_model[corpus_test[0]],
                key=lambda x: x[1],
                reverse=True)
print(topics)