In [1]:
%%capture
# Move the working directory up one level, presumably so that the project-level
# `src` package imported further down can be resolved (TODO confirm the layout).
# NOTE: the path is relative, so re-running this cell climbs one more level each time.
%cd ..

# Case study - LDA for the reviews of robotic vacuums

## 1. Initialisation

In [2]:
%%capture

import pyLDAvis
import pyLDAvis.gensim_models

# Switch on automatic in-notebook display of pyLDAvis visualisations.
pyLDAvis.enable_notebook()

import gc
import gensim.utils
from pyhelpers.dirs import cdd
from pyhelpers.store import save_data, load_data

from src.modeller import LatentDirichletAllocation

In [3]:
# Set up the LDA helper for the reviews of robotic vacuum cleaners; running this
# cell loads the preprocessed review data (see the "Loading ..." message below).
lda = LatentDirichletAllocation(
    product_category='vacuum',
    product_type='robotic',
)

# Work on a copy so that `lda.data` itself is not touched by this notebook.
data = lda.data.copy()

# Preview each review text next to its positive/negative sentiment label.
data.loc[:, ['review_text', 'sentiment_on_dual_scale']]

Loading "data\amazon_reviews\vacuum_cleaners\robotic\preprocd_data\preprocd_data.pkl" ... Done.


Unnamed: 0,review_text,sentiment_on_dual_scale
0,hate talk person need tech support email really,negative
1,excited receive vacuum today well package manu...,positive
2,great vacuum price work well hardwood floor gr...,positive
3,honestly want give try robot thing really work...,positive
4,hesitant buy robot simplicity robot look compl...,positive
...,...,...
77770,set run night morning wake find stick various ...,negative
77771,excellent robot vacuum come fully feature wet ...,positive
77772,good job cleaning laminate floor carpet guess ...,positive
77773,easy set use black lab pick dog hair love,positive


## 2. Preparation of input data - An example

### 2.1 Positive reviews

In [4]:
sentiment = 'positive'

# Select, in a single `.loc` step, the review texts whose label equals `sentiment`.
docs = data.loc[data[lda.sentiment_column_name] == sentiment, lda.review_column_name]

# First review of the selected subset.
docs.iloc[0]

'excited receive vacuum today well package manufacturer great visual appeal surprisingly easy set install first thing notice sleek elegant design pretty impressed tire make rubber pretty solid different textured flooring'

#### 2.1.1 Get tokenized docs

In [5]:
# Tokenise the selected reviews. Comparing the first result with the raw text of
# the same review shows that some words (e.g. 'vacuum', 'great') are dropped.
tokenized_docs = lda.get_tokenized_docs(docs=docs, sentiment=sentiment)
tokenized_docs[0]

['excited',
 'receive',
 'package',
 'manufacturer',
 'visual',
 'appeal',
 'easy',
 'set',
 'install',
 'notice',
 'sleek',
 'elegant',
 'design',
 'pretty',
 'tire',
 'rubber',
 'pretty',
 'solid',
 'different',
 'textured',
 'flooring']

#### 2.1.2 Make a corpus

In [6]:
# Phrase-detection settings handed to `lda.make_corpus` right below.
ngram = 3         # requested n-gram order
min_count = 1     # presumably the minimum frequency for a phrase - TODO confirm
threshold = 1e-4  # i.e. 0.0001, as listed in the evaluation summary tables

# Turn the tokenised reviews into the model inputs: `texts` (documents after
# phrase detection), `corpus` (their encoded form) and `id2word` (presumably the
# id-to-token dictionary - TODO confirm).
corpus, id2word, texts = lda.make_corpus(
    tokenized_docs=tokenized_docs,
    ngram=ngram,
    min_count=min_count,
    threshold=threshold,
    scoring='npmi',
)

In [7]:
# `texts` is the n-gram version of `tokenized_docs`: neighbouring tokens are joined
# with "_" into phrases. `ngram=3` was requested, yet this first document only
# contains two-word phrases (plus one token left on its own).
texts[0]

['excited_receive',
 'package_manufacturer',
 'visual_appeal',
 'easy_set',
 'install_notice',
 'sleek_elegant',
 'design_pretty',
 'tire_rubber',
 'pretty_solid',
 'different_textured',
 'flooring']

In [8]:
# `corpus` is an encoded version of `texts`: each document becomes a list of
# (id, count) pairs - presumably token ids from `id2word` with their frequencies.
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1)]

### 2.2 Negative reviews

In [9]:
sentiment = 'negative'

# Select, in a single `.loc` step, the review texts whose label equals `sentiment`.
docs = data.loc[data[lda.sentiment_column_name] == sentiment, lda.review_column_name]

# First review of the selected subset.
docs.iloc[0]

'hate talk person need tech support email really'

#### 2.2.1 Get tokenized docs

In [10]:
# Tokenise the negative reviews. Compared with the raw text of the first review,
# the words 'need' and 'really' no longer appear in its token list.
tokenized_docs = lda.get_tokenized_docs(docs=docs, sentiment=sentiment)
tokenized_docs[0]

['hate', 'talk', 'person', 'tech', 'support', 'email']

#### 2.2.2 Make a corpus

In [11]:
# Same phrase-detection settings as used for the positive reviews.
ngram = 3         # requested n-gram order
min_count = 1     # presumably the minimum frequency for a phrase - TODO confirm
threshold = 1e-4  # i.e. 0.0001, as listed in the evaluation summary tables

# Rebuild `corpus`, `id2word` and `texts`, this time from the negative reviews
# (the variables created for the positive reviews are overwritten).
corpus, id2word, texts = lda.make_corpus(
    tokenized_docs=tokenized_docs,
    ngram=ngram,
    min_count=min_count,
    threshold=threshold,
    scoring='npmi',
)

In [12]:
# First negative review after phrase detection (tokens joined with "_").
texts[0]

['hate_talk', 'person_tech', 'support_email']

In [13]:
# Encoded form of the first negative review: a list of (id, count) pairs.
corpus[0]

[(0, 1), (1, 1), (2, 1)]

## 3. Evaluation of LDA models on the review texts

In [14]:
# Uncomment to (re-)evaluate LDA models with different sets of hyper-parameters:
# lda.evaluate_models()

### 3.1 Positive reviews

In [15]:
# Evaluation summary for the positive reviews, leaving out every run whose
# `alpha` setting is "auto".
pos_lda_eval_summary = (
    lda.fetch_evaluation_summary(sentiment='positive')
    .query('alpha != "auto"')
)
pos_lda_eval_summary.head()

Unnamed: 0,min_count,threshold,corpus_proportion,num_topics,alpha,eta,coherence_score
76,5,0.0001,90%,3,asymmetric,1.0,0.598883
98,1,0.0001,80%,3,asymmetric,symmetric,0.594837
123,1,0.0001,85%,3,asymmetric,auto,0.58838
129,5,0.0001,95%,3,asymmetric,1.0,0.587385
136,1,0.0001,90%,3,asymmetric,symmetric,0.586332


In [16]:
# Index label of the chosen model: the first row of the positive summary table.
pos_idx = 76
# Lookup key for that model, zero-padded to three digits (here 'LDA_076').
pos_key = f'LDA_{pos_idx:03d}'

# Variant with the HTML-export, update and verbose flags switched on (kept for reference):
# pos_lda_vis_data = lda.get_vis_data(sentiment='positive', i=pos_idx, export_to_html=True, update=True, verbose=True)
# Default call: fetch the visualisation data of the selected positive-review model.
pos_lda_vis_data = lda.get_vis_data(sentiment='positive', i=pos_idx)

In [17]:
# Show the entry stored under `pos_key` (presumably rendered inline by pyLDAvis).
pos_lda_vis_data[pos_key]

### 3.2 Negative reviews

In [18]:
# Evaluation summary for the negative reviews. Unlike the positive case, runs
# with alpha == "auto" are kept here.
# NOTE(review): confirm that this asymmetry is intended.
neg_eval_result_summary = lda.fetch_evaluation_summary(sentiment='negative')
neg_eval_result_summary.head()

Unnamed: 0,min_count,threshold,corpus_proportion,num_topics,alpha,eta,coherence_score
0,5,0.0001,85%,3,auto,1.0,0.667406
1,1,0.0001,90%,10,auto,symmetric,0.62678
2,1,0.0001,90%,10,auto,auto,0.62678
3,1,0.0001,85%,10,auto,auto,0.622343
4,1,0.0001,85%,10,auto,symmetric,0.622343


In [19]:
# Index label of the chosen model: the first row of the negative summary table.
neg_idx = 0
# Lookup key for that model, zero-padded to three digits (here 'LDA_000').
neg_key = f'LDA_{neg_idx:03d}'

# Variant with the HTML-export, update and verbose flags switched on (kept for reference):
# neg_lda_vis_data = lda.get_vis_data(sentiment='negative', i=neg_idx, export_to_html=True, update=True, verbose=True)
# Default call: fetch the visualisation data of the selected negative-review model.
neg_lda_vis_data = lda.get_vis_data(sentiment='negative', i=neg_idx)

In [20]:
# Show the entry stored under `neg_key` (presumably rendered inline by pyLDAvis).
neg_lda_vis_data[neg_key]

---