# LDA model using sklearn

In [2]:
# Usual imports
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
import os
print(os.listdir("../data"))

# Plotly based imports for visualization
from plotly import tools
import chart_studio.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff

# spaCy based imports
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import bz2
import re

#!python -m spacy download en_core_web_lg

['Bag_Reviews.xlsx', 'productReviewShopee_1.csv']


## Read file

In [3]:
reviews_ms = pd.read_excel('../data/Bag_Reviews.xlsx')
reviews_ms = reviews_ms[['rating','comments']]
reviews_ms.head()

Unnamed: 0,rating,comments
0,4,Give 4 stars because order at the price 37 but...
1,5,Ordered at a discount of 10 baht per piece. It...
2,5,"Small, cute, compact, good But the sash looks ..."
3,1,The size is not as large as it is down. The st...
4,1,The product is compared to the price. Okay. Se...


In [5]:
reviews_ms.shape

(3182, 2)

### Remove Duplicates 

In [6]:
reviews_ms.drop_duplicates().shape

(2677, 2)

In [12]:
reviews_ms=reviews_ms.drop_duplicates()
reviews_ms.head()

Unnamed: 0,rating,comments
0,4,Give 4 stars because order at the price 37 but...
1,5,Ordered at a discount of 10 baht per piece. It...
2,5,"Small, cute, compact, good But the sash looks ..."
3,1,The size is not as large as it is down. The st...
4,1,The product is compared to the price. Okay. Se...


### Remove very small text

In [13]:
reviews_ms.comments=reviews_ms.comments.astype(str)
reviews_ms['len_review']=reviews_ms.comments.apply(len)

In [15]:
reviews_ms.len_review.describe()

count    2677.000000
mean      163.776242
std       131.114011
min         2.000000
25%        77.000000
50%       140.000000
75%       202.000000
max      1236.000000
Name: len_review, dtype: float64

In [16]:
reviews_ms.comments=reviews_ms.comments.apply(lambda x: x.replace('👍','good '))
reviews_ms.head()

Unnamed: 0,rating,comments,len_review
0,4,Give 4 stars because order at the price 37 but...,66
1,5,Ordered at a discount of 10 baht per piece. It...,167
2,5,"Small, cute, compact, good But the sash looks ...",150
3,1,The size is not as large as it is down. The st...,135
4,1,The product is compared to the price. Okay. Se...,302


In [17]:
s_limit=50
max_limit=1300
reviews=reviews_ms.loc[(reviews.len_review>=s_limit) & (reviews.len_review<max_limit),:]
reviews.head()

Unnamed: 0,rating,comments,len_review
0,4,Give 4 stars because order at the price 37 but...,66
1,5,Ordered at a discount of 10 baht per piece. It...,167
2,5,"Small, cute, compact, good But the sash looks ...",150
3,1,The size is not as large as it is down. The st...,135
4,1,The product is compared to the price. Okay. Se...,302


In [18]:
reviews.shape

(2204, 3)

### loading spacy

In [19]:
# Creating a spaCy object
nlp = spacy.load('en_core_web_lg')

### Named Entity Recognition
 Named Entity Recognition is an information extraction task where named entities in unstructured sentences are located and classified  in some pre-defined categories such as the person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.

In [20]:
doc = nlp(reviews["comments"][0])
spacy.displacy.render(doc, style='ent',jupyter=True)

In [21]:
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

### Lemmatization
It is the  process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form. Words like "ran" and "running" are converted to "run" to avoid having words with similar meanings in our data.

In [22]:
review = str(" ".join([i.lemma_ for i in doc]))

In [23]:
doc = nlp(review)
spacy.displacy.render(doc, style='ent',jupyter=True)

### Parts of Speech tagging

This is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech,[1] based on both its definition and its context—i.e., its relationship with adjacent and related words in a phrase, sentence, or paragraph. A simplified form of this is commonly taught to school-age children, in the identification of words as nouns, verbs, adjectives, adverbs, etc.

In [24]:
# Parser for reviews
parser = English()
def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [25]:
tqdm.pandas()
reviews["processed_description"] = reviews["comments"].progress_apply(spacy_tokenizer)

100%|██████████| 2204/2204 [00:00<00:00, 4001.81it/s]


In [26]:
reviews

Unnamed: 0,rating,comments,len_review,processed_description
0,4,Give 4 stars because order at the price 37 but...,66,4 star order price 37 today 29 baht sorry
1,5,Ordered at a discount of 10 baht per piece. It...,167,ordered discount 10 baht piece worth note leav...
2,5,"Small, cute, compact, good But the sash looks ...",150,small cute compact good sash look like little ...
3,1,The size is not as large as it is down. The st...,135,size large stitch wrong bag line contemplate l...
4,1,The product is compared to the price. Okay. Se...,302,product compare price okay send slowly bad thi...
5,4,Beautiful work Sewing Good compact Suitable fo...,111,beautiful work sewing good compact suitable pr...
6,4,Worth the price ordered When holding the bag S...,116,worth price order hold bag send fast think wra...
7,5,This bag is like a shoulder bag. Colorful shap...,262,bag like shoulder bag colorful shape regret sh...
9,5,"Beautiful, cheap, very worthwhile. ðŸ‘ðŸ‘ðŸ‘...",531,beautiful cheap worthwhile ðÿ‘ðÿ‘ðÿ‘ðÿ‘ðÿ‘...
10,5,"A cute seller, very worthwhile. Very worthwhil...",285,cute seller worthwhile worthwhile good value b...


# Topic-modelling

In [27]:
# Creating a vectorizer
vectorizer = CountVectorizer(min_df=0.005, max_df=0.85, stop_words='english', lowercase=True, token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(reviews["processed_description"])

In [32]:
NUM_TOPICS = 4

In [33]:
# Latent Dirichlet Allocation Model
#lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=50, learning_method='online',verbose=True)
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=50, learning_method='batch',verbose=True)
data_lda = lda.fit_transform(data_vectorized)

iteration: 1 of max_iter: 50
iteration: 2 of max_iter: 50
iteration: 3 of max_iter: 50
iteration: 4 of max_iter: 50
iteration: 5 of max_iter: 50
iteration: 6 of max_iter: 50
iteration: 7 of max_iter: 50
iteration: 8 of max_iter: 50
iteration: 9 of max_iter: 50
iteration: 10 of max_iter: 50
iteration: 11 of max_iter: 50
iteration: 12 of max_iter: 50
iteration: 13 of max_iter: 50
iteration: 14 of max_iter: 50
iteration: 15 of max_iter: 50
iteration: 16 of max_iter: 50
iteration: 17 of max_iter: 50
iteration: 18 of max_iter: 50
iteration: 19 of max_iter: 50
iteration: 20 of max_iter: 50
iteration: 21 of max_iter: 50
iteration: 22 of max_iter: 50
iteration: 23 of max_iter: 50
iteration: 24 of max_iter: 50
iteration: 25 of max_iter: 50
iteration: 26 of max_iter: 50
iteration: 27 of max_iter: 50
iteration: 28 of max_iter: 50
iteration: 29 of max_iter: 50
iteration: 30 of max_iter: 50
iteration: 31 of max_iter: 50
iteration: 32 of max_iter: 50
iteration: 33 of max_iter: 50
iteration: 34 of ma

In [34]:
# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]]) 

In [35]:
# Keywords for topics clustered by Latent Dirichlet Allocation
print("LDA Model:")
selected_topics(lda, vectorizer)

LDA Model:
Topic 0:
[('bag', 462.22210623582856), ('color', 345.224477425405), ('small', 226.23761116131783), ('price', 197.83415878933263), ('cute', 187.13583001553062), ('beautiful', 164.22872186273645), ('little', 153.22814877371633), ('reasonable', 131.20025352797862), ('order', 101.5812555559852), ('okay', 92.67848282764608)]
Topic 1:
[('good', 4779.207975164057), ('product', 1326.6553665364395), ('quality', 1053.22284665684), ('value', 943.2224253175722), ('delivery', 887.9548696507887), ('fast', 705.018886235592), ('service', 560.227662971118), ('shop', 211.9990757574814), ('money', 184.9761734540279), ('company', 153.2419734973546)]
Topic 2:
[('beautiful', 523.2432358221826), ('like', 510.2216167941465), ('product', 372.490390578382), ('reed', 331.2456380353618), ('order', 308.12143927175754), ('lot', 196.22731582816857), ('send', 186.49226958957686), ('receive', 158.2195982968233), ('okay', 137.97673202772745), ('item', 114.75717950756022)]
Topic 3:
[('price', 1172.64285927268

In [36]:
# Transforming an individual sentence
text = reviews.comments.iloc[0]
x = lda.transform(vectorizer.transform([text]))#[0]
print(x)

[[0.05081709 0.05000103 0.40788681 0.49129506]]


In [38]:
reviews.comments.iloc[0]

'Give 4 stars because order at the price 37 but today 29 baht sorry'

#### Finding the main topic of each reviews which has length >1300 and length < 50

In [39]:
reviews_test = reviews_ms[reviews_ms.len_review>=max_limit]
reviews_test.shape

(0, 3)