# LDA TOPIC MODELING 

This notebook applies LDA modeling to a dataset from a Kaggle dataset of news headlines:

https://www.kaggle.com/therohk/million-headlines

using GENSIM toolkit:

https://radimrehurek.com/gensim/

and following a really useful tutorial from machinelearningplus website:

https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/



__Steps__

- Text porcessing
- Bulding the model

__Other interesting links:__

- https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

- https://www.analyticsvidhya.com/blog/2016/08/beginners-guide-to-topic-modeling-in-python/

- https://medium.com/@lettier/how-does-lda-work-ill-explain-using-emoji-108abf40fa7d

- https://stackoverflow.com/questions/20984841/topic-distribution-how-do-we-see-which-document-belong-to-which-topic-after-doi

- https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

__Required Libraries__

In [1]:
import pandas as pd
import numpy as np


import string
import re
from pprint import pprint
from bs4 import BeautifulSoup

# NLTK 
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = stopwords.words('english') #this depends on each language

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import html

import ast

from langdetect import detect

__Loading data__

In [5]:
df = pd.read_csv("./listing_merged2.csv", index_col = 0) 

In [6]:
df.shape[0]

75979

In [7]:
df.head(1)

Unnamed: 0,category_id,listing_id,title,description,user_id,tags,price,url,img1
0,69150393.0,663885606,St. Martin de Porres Dominican saint crochet a...,Handmade crochet doll of saint Martin de Porre...,10324319.0,"['Catholic saint', 'crochet doll', 'catholic b...",18.0,https://www.etsy.com/listing/663885606/st-mart...,https://i.etsystatic.com/5930941/c/699/555/0/5...


In [8]:
data = df[['listing_id', 'description', 'title', 'tags', 'url', 'img1']]

In [9]:
data = data.dropna()

In [10]:
data.shape

(75979, 6)

In [11]:
data.columns

Index(['listing_id', 'description', 'title', 'tags', 'url', 'img1'], dtype='object')

In [12]:
data.isnull().sum(axis = 0)

listing_id     0
description    0
title          0
tags           0
url            0
img1           0
dtype: int64

In [13]:
def escape(texts): 
    return [html.unescape(str(text)) for text in texts]

In [14]:
data.description = data.description.apply(lambda x: html.unescape(x))
data.title = data.title.apply(lambda x: html.unescape(x))
data.tags = data.tags.apply(lambda x: ast.literal_eval(x))                                                                                               

In [15]:
#flatten list
data.tags = [','.join(x) for x in data.tags]

In [16]:
data.tags = data.tags.apply(lambda x: html.unescape(x))

In [17]:
mask = data.tags.apply(lambda x: len(x) > 3)
data = data.loc[mask, ]

In [885]:
# try not use filter of en
#mask1 = data['tags'].apply(lambda x: detect(x) == 'en')

In [886]:
#mask1

0        True
1        True
2        True
3        True
4        True
         ... 
75974    True
75975    True
75976    True
75977    True
75978    True
Name: tags, Length: 75212, dtype: bool

In [18]:
mk = data.tags.apply(lambda x: "ROSENKNOPF,PUPPENKÜCHE" not in x)

In [19]:
detect(data.loc[mk, 'tags'].iloc[0])

'en'

In [929]:
#data = data.loc[mask1, ]
# save data to 
#data_en = data

In [20]:
data.shape

(75212, 6)

In [21]:
data = data.loc[mk, ] #fine remove german

In [22]:
ds = data['description'].values.tolist()
tl = data['title'].values.tolist()
tag = data['tags'].values.tolist()

In [23]:
def remove_html(texts):
    return [ re.sub(r"https?:\S+", "", str(text)) for text in texts]

ds = remove_html(ds)
tl = remove_html(tl)
tag = remove_html(tag)

In [25]:
pprint(tl[1:2])

['Knitted kids Toy Umigurumi Deer - Knit baby toys - Cute animals- new year- '
 'Photo shoot props- Newborn photography accessories- READY TO SHIP']


__Tokenize words and Clean-up text__

In [26]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
ds_words = list(sent_to_words(ds))
tl_words = list(sent_to_words(tl))
tag_words = list(sent_to_words(tag))
print(ds_words[:1])
print(tl_words[:1])
print(tag_words[:1])

[['handmade', 'crochet', 'doll', 'of', 'saint', 'martin', 'de', 'porres', 'this', 'doll', 'makes', 'great', 'baby', 'shower', 'or', 'baptism', 'gift', 'all', 'details', 'including', 'the', 'eyes', 'and', 'the', 'mouse', 'at', 'his', 'feet', 'are', 'embroidered', 'making', 'this', 'doll', 'safe', 'for', 'babies', 'and', 'toddlers', 'the', 'materials', 'used', 'are', 'sturdy', 'medium', 'weight', 'acrylic', 'yarn', 'and', 'polyester', 'stuffing', 'the', 'doll', 'cannot', 'stand', 'on', 'its', 'own', 'but', 'can', 'be', 'propped', 'up', 'if', 'desired', 'saint', 'martin', 'de', 'porres', 'is', 'the', 'patron', 'saint', 'of', 'mixed', 'race', 'barbers', 'public', 'health', 'workers', 'and', 'innkeepers', 'size', 'tall']]
[['st', 'martin', 'de', 'porres', 'dominican', 'saint', 'crochet', 'amigurumi', 'kokeshi', 'doll', 'catholic', 'kids', 'gift', 'easter', 'basket', 'stuffer', 'baby', 'toy', 'nursery', 'decor']]
[['catholic', 'saint', 'crochet', 'doll', 'catholic', 'baby', 'shower', 'cathol

In [27]:
stop_words = stopwords.words('english')
stop_words.extend(['toy', 'children', 'child', 'kid'])

In [28]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

__Bigrams & Trigrams__

Bigrams are two words frequently occurring together in the document. Trigrams are 3 words frequently occurring.

In [30]:
def process_words(data_words):   
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # See trigram example
    print(trigram_mod[bigram_mod[data_words[0]]])


    #Remove Stopwords, Make Bigrams and Lemmatize
    # Define functions for stopwords, bigrams, trigrams and lemmatization
    

    def remove_stopwords(texts):
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        """https://spacy.io/api/annotation"""
        texts_out = []
        for sent in texts:
            doc = nlp(" ".join(sent)) 
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out
    
    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Initialize spacy 'es' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download es
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    texts = data_lemmatized

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    
    return data_lemmatized, corpus, id2word

ds_data_lemmatized, ds_corpus, ds_id2word = process_words(ds_words)
tl_data_lemmatized, tl_corpus, tl_id2word = process_words(tl_words)
tag_data_lemmatized, tag_corpus, tag_id2word = process_words(tag_words)

['handmade', 'crochet', 'doll', 'of', 'saint', 'martin_de', 'porres', 'this', 'doll', 'makes', 'great', 'baby', 'shower', 'or', 'baptism', 'gift', 'all', 'details', 'including', 'the', 'eyes', 'and', 'the', 'mouse', 'at', 'his', 'feet', 'are', 'embroidered', 'making', 'this', 'doll', 'safe', 'for', 'babies', 'and', 'toddlers', 'the', 'materials', 'used', 'are', 'sturdy', 'medium', 'weight', 'acrylic', 'yarn', 'and', 'polyester', 'stuffing', 'the', 'doll', 'cannot', 'stand', 'on', 'its', 'own', 'but', 'can', 'be', 'propped', 'up', 'if', 'desired', 'saint', 'martin_de', 'porres', 'is', 'the', 'patron_saint', 'of', 'mixed', 'race', 'barbers', 'public', 'health_workers', 'and', 'innkeepers', 'size', 'tall']
['st', 'martin', 'de', 'porres', 'dominican', 'saint', 'crochet', 'amigurumi', 'kokeshi', 'doll', 'catholic', 'kids', 'gift', 'easter', 'basket_stuffer', 'baby', 'toy', 'nursery', 'decor']
['catholic', 'saint', 'crochet', 'doll', 'catholic', 'baby', 'shower', 'catholic', 'baptism', 'whi

__Create the Dictionary and Corpus needed for Topic Modeling__

If you want to see what word a given id corresponds to, pass the id as a key to the dictionary

In [836]:
id2word[0]

'acrylic'

Or, you can see a human-readable form of the corpus itself.

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:2]]

__Building the Topic Model__

In [308]:
# Build LDA model
def build_model(corpus, id2word, num_topics):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=20, 
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=4,
                                               alpha='auto',
                                               per_word_topics=True)
    return lda_model

#model_ds_20 = build_model(ds_corpus, ds_id2word, 20)
# model_tl_20 = build_model(tl_corpus, tl_id2word, 20)
# model_tag_20 = build_model(tag_corpus, tag_id2word, 20)

#ds_lda_model = gensim.models.LdaMulticore(ds_corpus, num_topics=3, id2word=ds_id2word, eval_every =1, passes=4, workers=5)
#tl_lda_model = gensim.models.LdaMulticore(tl_corpus, num_topics=9, id2word=tl_id2word, eval_every = 1, passes=2, workers=6)
tag_lda_model = gensim.models.LdaMulticore(tag_corpus, num_topics = 2, id2word=tag_id2word, eval_every=1, passes=2, workers=6)

In [327]:
#pprint(ds_lda_model.print_topics())
pprint(tag_lda_model.print_topics())
#pprint(tl_lda_model.print_topics())

[(0,
  '0.083*"baby" + 0.047*"gift" + 0.036*"toy" + 0.032*"doll" + 0.022*"gym" + '
  '0.018*"wooden" + 0.016*"block" + 0.015*"crochet" + 0.011*"vintage" + '
  '0.011*"game"'),
 (1,
  '0.042*"book" + 0.035*"gift" + 0.028*"activity" + 0.027*"play" + 0.026*"kid" '
  '+ 0.025*"toy" + 0.021*"feel" + 0.019*"puzzle" + 0.018*"quiet" + '
  '0.018*"toddler"')]


In [314]:
test = [['toddler', 'color'], ['free', 'shipping']]
from gensim.corpora import Dictionary
#dt = Dictionary.load('ds_id2word.dict')
corpus_test = [ds_id2word.doc2bow(text) for text in test]
for n in range(len(corpus_test)):
#     top_topics = tag_lda_model.get_document_topics(corpus_test[n], minimum_probability=0.0, per_word_topics=True)
#     top_topics = tag_lda_model[corpus_test[n]]
#     topic_vec = [top_topics[i][1] for i in range(20)] 
    row = ds_lda_model[corpus_test[n]]
    row = sorted(row, key=lambda x: (x[1]), reverse=True)
    print(row)

[(1, 0.7530213), (2, 0.12964052), (0, 0.117338195)]
[(0, 0.75882804), (2, 0.12591636), (1, 0.11525562)]


In [None]:
from gensim.models import HdpModel
# ds_corpus, ds_id2word
# tl_corpus, tl_id2word
# tag_corpus, tag_id2word
def hdp_topic(corpus, id2word):
    hdp = HdpModel(corpus, id2word, chunksize=10000)
    return hdp
hdp_ds = hdp_topic(ds_corpus, ds_id2word)
hdp_tl = hdp_topic(tl_corpus, tl_id2word)
hdp_tag = hdp_topic(tag_corpus, tag_id2word)
print("ds:", len(hdp_ds.print_topics()))
print("tl:", len(hdp_tl.print_topics()))
print("tag:", len(hdp_tag.print_topics()))
# hdp.print_topics(num_topics=20)

In [None]:
len(hdp.print_topics())

In [None]:
hdp.print_topics(num_topics=20)

In [None]:
ds_corpus[1]
sorted(model_ds_20[ds_corpus[1]], key=lambda : -1*tup[1])

# for index, score in sorted(model_ds_20[ds_corpus[4310]], key=lambda tup: -1*tup[1]):
#     print("\nScore: {}\t \nTopic: {}".format(score, model_ds_20.print_topic(index, 10)))

In [None]:
pprint(model_ds_20.print_topics())

__View the topics in LDA model__

In [None]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

__Compute Model Perplexity and Coherence Score__

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

__Visualize the topics-keywords__

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

__Building LDA Mallet Model__

The MALLET topic model package includes an extremely fast and highly scalable implementation of Gibbs sampling, efficient methods for document-topic hyperparameter optimization, and tools for inferring topics for new documents given trained models. 

In [None]:
import os
os.environ.update({'MALLET_HOME':r'C:/Work/Projects/Insight/data/LDA-Topic-Modeling-master/Python/mallet-2.0.8/'})
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = 'C:\\Work\\Projects\\Insight\\data\\LDA-Topic-Modeling-master\\Python\\mallet-2.0.8\\bin\mallet'#./mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=ds_corpus, num_topics=20, id2word=ds_id2word)

In [None]:
ldamallet_tag = gensim.models.wrappers.LdaMallet(mallet_path, corpus=tag_corpus, num_topics=20, id2word=tag_id2word)

In [None]:
# ds_lda_model.save('ds_lda_model.model')
# tag_lda_model.save('tag_lda_model.model')
ds_id2word.save('ds_id2word.dict')
tag_id2word.save('tag_id2word.dict')

In [None]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))
# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

__How to find the optimal number of topics for LDA?__

My approach to finding the optimal number of topics is to build many LDA models with different values of number of topics (k) and pick the one that gives the highest coherence value.

Choosing a ‘k’ that marks the end of a rapid growth of topic coherence usually offers meaningful and interpretable topics. Picking an even higher value can sometimes provide more granular sub-topics.

If you see the same keywords being repeated in multiple topics, it’s probably a sign that the ‘k’ is too large.

The compute_coherence_values() (see below) trains multiple LDA models and provides the models and their corresponding coherence scores.

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run.
model_list_ds, coherence_values_ds = compute_coherence_values(dictionary=ds_id2word, corpus=ds_corpus, texts=ds_data_lemmatized, start=2, limit=40, step=6)

In [None]:
model_list_1, coherence_values_1 = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=2)

In [None]:
# Show graph
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[3]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=20))

__Finding the dominant topic in each sentence__

In [321]:
pd.set_option('display.max_colwidth', -1)

In [None]:
ds = data.description.values.tolist()
tag = data.tags.values.tolist()

In [None]:
data = data.reset_index()

In [317]:
def format_topics_sentences(ldamodel=tl_lda_model, corpus=tl_corpus, texts=tl):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

# # Format
# df_dominant_topic_ds = df_topic_sents_keywords_ds.reset_index()
# df_dominant_topic_ds.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# df_dominant_topic_tag = df_topic_sents_keywords_ds.reset_index()
# df_dominant_topic_tag.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

#add to data
# data['dmt_ds'] = df_dominant_topic_ds['Dominant_Topic']
# data['tpc_ds'] = df_dominant_topic_ds['Topic_Perc_Contrib']
# data['kw_ds'] = df_dominant_topic_ds['Keywords']

# data['dmt_tag'] = df_dominant_topic_tag['Dominant_Topic']
# data['tpc_tag'] = df_dominant_topic_tag['Topic_Perc_Contrib']
# data['kw_tag'] = df_dominant_topic_tag['Keywords']


# Show
#df_dominant_topic_ds.head(30)

In [None]:
# Format
df_topic_sents_keywords_ds = format_topics_sentences(ldamodel = ds_lda_model, corpus=ds_corpus, texts=ds)
df_dominant_topic_ds = df_topic_sents_keywords_ds.reset_index()
df_dominant_topic_ds.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic_ds.head(30)

In [None]:
# Format
df_topic_sents_keywords_tag = format_topics_sentences(ldamodel = tag_lda_model, corpus=tag_corpus, texts=tag)
df_dominant_topic_tag = df_topic_sents_keywords_tag.reset_index()
df_dominant_topic_tag.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic_tag.iloc[21:40]

In [324]:
# Format
# df_topic_sents_keywords_tl = format_topics_sentences(ldamodel = tl_lda_model, corpus=tl_corpus, texts=tl)
# df_dominant_topic_tl = df_topic_sents_keywords_tl.reset_index()
# df_dominant_topic_tl.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic_tl.head(10)#.iloc[21:40]

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,6.0,0.7328,"doll, wooden, gift, toy, montessori, baby, toddler, board, waldorf, set","St. Martin de Porres Dominican saint crochet amigurumi kokeshi doll, catholic, kids gift, Easter basket stuffer, baby toy, nursery decor"
1,1,2.0,0.4472,"baby, gym, gift, toy, wooden, play, rattle, teether, crochet, shower",Knitted kids Toy Umigurumi Deer - Knit baby toys - Cute animals- new year- Photo shoot props- Newborn photography accessories- READY TO SHIP
2,2,2.0,0.7621,"baby, gym, gift, toy, wooden, play, rattle, teether, crochet, shower",Knitted newborn Toy gray rabbit - Knit baby bunny toys - Photo shoot props - Newborn photography accessories - Boy - Girl - READY TO SHIP
3,3,3.0,0.9506,"stuff, animal, plush, crochet, gift, handmade, amigurumi, soft, doll, cat",Bat Crochet Bat Halloween bat Amigurumi plush bat vampire bat creepy monster Stuffed animals Handmade toy adult gift halloween decoration
4,4,2.0,0.5779,"baby, gym, gift, toy, wooden, play, rattle, teether, crochet, shower","Crocheted Stroller Necklace, crochet maxi cosi pendant, baby gym, wood-gripping, heart, wooden beads, beautiful and homemade, baby gym"
5,5,4.0,0.476,"game, play, food, feel, set, pretend, activity, kitchen, vintage, bag","Enchanted Forest Board Game 1994, Vintage Board Game, Children's Board Game,Vintage Children's Board Game, :)s***"
6,6,7.0,0.4489,"kid, activity, gift, toy, feel, wooden, car, wood, toddler, educational","Vintage Preschool Rainbow Works Puzzles Little Rabbits and Insects, Vintage Childs Puzzle, Babies Nursery Decor, Childs Room Decor, :)s*"
7,7,3.0,0.8222,"stuff, animal, plush, crochet, gift, handmade, amigurumi, soft, doll, cat",Handmade Knit Jellyfish Plushie
8,8,8.0,0.8251,"puzzle, gift, wooden, name, personalize, vintage, set, kid, custom, girl",Colonial American history book for children of the Boston Tea Party. 1982 paperback with great pencil sketch illustrations.
9,9,5.0,0.4869,"ball, mat, baby, crochet, pattern, montessori, activity, unicorn, fairy, dinosaur",FALL SUBSCRIPTION BOX


In [None]:
#add to data
data['dmt_ds'] = df_dominant_topic_ds['Dominant_Topic']
data['tpc_ds'] = df_dominant_topic_ds['Topic_Perc_Contrib']
data['kw_ds'] = df_dominant_topic_ds['Keywords']

In [None]:
# add to data
data['dmt_tag'] = df_dominant_topic_tag['Dominant_Topic']
data['tpc_tag'] = df_dominant_topic_tag['Topic_Perc_Contrib']
data['kw_tag'] = df_dominant_topic_tag['Keywords']

In [657]:
df_dominant_topic_tag.shape

(72666, 5)

In [None]:
data.to_csv('result_v2.csv')

__Find the most representative document for each topic__

In [764]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords_tag.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(3)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Tags"]

# Show
sent_topics_sorteddf_mallet#.head(30)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Tags
0,0.0,0.975,"doll, gift, wooden, handmade, card, waldorf, toy, art, girl, vintage","Waldorf doll outfit,14 inch doll pants,Steiner doll pant,Miniland doll pants,White doll pajamas,Waldorf Doll Clothes,stripe doll pants,cotton doll pajamas,wool doll pants,cotton doll pants,gift for children,waldorf baby doll,stripe doll pajamas"
1,0.0,0.9743,"doll, gift, wooden, handmade, card, waldorf, toy, art, girl, vintage","Floral doll pants,Waldorf doll outfit,Steiner doll clothes,Miniland doll pajama,14 inch Doll pants,Waldorf Doll Clothes,liberty fabric pants,doll accessory,cotton doll pants,wool doll pants,miniland doll outfit,gift for children,Bitty baby clothes"
2,0.0,0.9736,"doll, gift, wooden, handmade, card, waldorf, toy, art, girl, vintage","playing card,playing card holder,children card holder,senior card holder,canasta card holder,wooden card holder,rummy card holder,playing card rack,engraved card holder,custom card holder,adult card holder,vinyl card holder,flash card holder"
3,1.0,0.9712,"board, game, vintage, busy, baby, toddler, fox, montessori, sensory, toy","mystery game,13 dead end drive,vintage board game,childrens game,learning games,halloween board game,1990s board game,mystery board game,childrens board game,dead end board game,3d board game,milton bradley,board game"
4,1.0,0.9712,"board, game, vintage, busy, baby, toddler, fox, montessori, sensory, toy","childrens game,learning games,family game night,board game,vintage games,ben casey,ben casey game,hospital game,vintage board game,vintage board games,board games,board game complete,family board game"
5,1.0,0.9712,"board, game, vintage, busy, baby, toddler, fox, montessori, sensory, toy","vintage board games,board game,board games,games,childrens games,children board games,board game toys,horse racing board,horse racing game,horse board game,vintage horses,jockey board game,horse racing jockey"
6,2.0,0.9768,"toy, baby, car, wooden, gift, gym, play, montessori, chain, wood","Children's car necklace,Stroller necklace star,Children's car chain koala,Children's car chain stoffstern,Children's car chain with name,Stroller Chain Name,Stroller Chain Girl,Children's car chain pink,Children's car chain cloud,Children's car necklace grey,Children's car chain name girl,Children's car chain girl name,Children's car necklace white"
7,2.0,0.9768,"toy, baby, car, wooden, gift, gym, play, montessori, chain, wood","Children's car necklace,Stroller necklace star,Children's car chain koala,Children's car chain stoffstern,Children's car chain with name,Stroller Chain Name,Stroller Chain Girl,Children's car chain pink,Children's car chain cloud,Children's car necklace grey,Children's car chain name girl,Children's car chain girl name,Children's car necklace white"
8,2.0,0.9768,"toy, baby, car, wooden, gift, gym, play, montessori, chain, wood","Children's car necklace,Stroller necklace star,Children's car chain koala,Children's car chain stoffstern,Children's car chain with name,Stroller Chain Name,Stroller Chain Girl,Children's car chain pink,Children's car chain cloud,Children's car necklace grey,Children's car chain name girl,Children's car chain girl name,Children's car necklace white"
9,3.0,0.9721,"game, baby, montessori, gift, learn, toy, card, educational, memory, wood","KIDS CARD GAMES,CRAZY EIGHT GAME,OLD MAID GAME,VINTAGE CARD GAME,SNAP CARD GAME,HEARTS CARD GAME,RUMMY CARD GAME,BOXED CARD GAME SET,WORD MATCH GAME,WORD CARDS,KIDS WORD GAME,VINTAGE WORD MATCH,CHILD WORD LEARN"


__Topic distribution across documents__

In [763]:
# Number of Documents for Each Topic
topic_counts = df_topic_sents_keywords_tag['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_topic_sents_keywords_tag[['Dominant_Topic', 'Topic_Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics.head(10)

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0.0,11.0,"block, baby, wooden, puzzle, wood, toy, gift, name, shower, decor",3041.0,0.0443
1.0,4.0,"gift, baby, stuff, animal, birthday, plush, toy, crochet, doll, easter",1557.0,0.0227
2.0,4.0,"gift, baby, stuff, animal, birthday, plush, toy, crochet, doll, easter",2731.0,0.0398
3.0,13.0,"chart, mask, gift, chore, party, costume, halloween, animal, dress, children",2918.0,0.0425
4.0,2.0,"toy, baby, car, wooden, gift, gym, play, montessori, chain, wood",7541.0,0.1099
5.0,1.0,"board, game, vintage, busy, baby, toddler, fox, montessori, sensory, toy",3612.0,0.0527
6.0,14.0,"puzzle, vintage, disney, jigsaw, school, game, wood, wooden, children, learn",2328.0,0.0339
7.0,4.0,"gift, baby, stuff, animal, birthday, plush, toy, crochet, doll, easter",2646.0,0.0386
8.0,5.0,"play, toy, set, pretend, bag, sensory, montessori, tea, vintage, gift",1990.0,0.029
9.0,5.0,"play, toy, set, pretend, bag, sensory, montessori, tea, vintage, gift",5338.0,0.0778


In [None]:
# example
#>>> other_texts = [ ['computer', 'time', 'graph'], ['survey', 'response', 'eps'], ['human', 'system', 'computer']]
# other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
# unseen_doc = other_corpus[0]
# vector = lda[unseen_doc] 

test = [['busy', 'engage', 'gift', 'puzzle']]

id2word_test = corpora.Dictionary(test)

# Create Corpus

# Term Document Frequency
corpus_test = [id2word_test.doc2bow(text) for text in test]

print(corpus_test[0])
topics = sorted(optimal_model[corpus_test],
                key=lambda 
                x:x[1],
                reverse=True)
print(topics)