# <p style="text-align: center;"> Topic Modelling of TrustPilot Reviews with LDA </p>

In [1]:
# general imports 
from pre_process import get_all_reviews, get_rate_setter_dictionary_corpus, pretty_print_html 
from gensim import corpora, models, similarities
from gensim.models import LdaMulticore
from gensim.models.ldamodel import LdaModel
from pyLDAvis.gensim import prepare
from random import randint
import pyLDAvis 
from IPython.display import HTML
from pre_process import ReviewsRetriever, ReviewNormalizer

#### We get the reviews by scraping TrustPilot's website and we take a look at a couple of them

In [5]:
# Fetch the reviews via ReviewsRetriever (cached=True is forwarded to get()).
rr = ReviewsRetriever()
reviews = rr.get(cached=True)

# Show two randomly chosen reviews. random.randint includes BOTH endpoints, so
# the upper bound has to be len(reviews) - 1; the previous bound of
# len(reviews) could produce an out-of-range index and raise IndexError.
last_review_idx = len(reviews) - 1
pretty_print_html([reviews[randint(0, last_review_idx)],
                   reviews[randint(0, last_review_idx)]])

#### We now normalize the reviews: remove punctuation, make everything lowercase and stem. Then we take a look at another couple of them. 

In [10]:
# Turn every review into a list of normalized tokens with ReviewNormalizer.tokenize.
rn = ReviewNormalizer()
normalized_reviews = [rn.tokenize(r) for r in reviews]

# Preview two random normalized reviews, re-joined with spaces for display.
# random.randint includes its upper bound, so it must be len(...) - 1; using
# len(...) itself could index one past the end and raise IndexError.
last_normalized_idx = len(normalized_reviews) - 1
pretty_print_html([" ".join(normalized_reviews[randint(0, last_normalized_idx)]),
                   " ".join(normalized_reviews[randint(0, last_normalized_idx)])])

#### Training the model (this might take a while...)

In [12]:
# Build the token <-> id mapping from the normalized reviews.
dictionary = corpora.Dictionary(normalized_reviews)

# Convert each normalized review into its bag-of-words representation.
corpus = [dictionary.doc2bow(r) for r in normalized_reviews]

# Fit a 5-topic LDA model with 100 passes over the corpus.
# random_state pins the RNG used by LdaModel so that this stochastic fit is
# not re-randomised on every run of the cell.
lda = LdaModel(
    corpus=corpus,
    num_topics=5,
    id2word=dictionary,
    passes=100,
    random_state=42,
)

In [16]:
# Print the dictionary's summary line (unique-token count plus a sample of tokens).
print(dictionary)
# Left disabled: uncomment to dump the bag-of-words corpus (one doc2bow entry per review).
#print(corpus)

Dictionary(1986 unique tokens: ['onboard', 'pre', 'mainstream', 'situat', 'dealer']...)


In [8]:
# NOTE(review): this repeats the LdaModel fit from the training cell above
# (same corpus, dictionary, num_topics and passes). On a top-to-bottom run the
# model is therefore trained twice, and this second fit is the one the cells
# below use. Consider deleting one of the two cells.
# random_state pins the RNG used by LdaModel so that this stochastic fit is
# not re-randomised on every run of the cell.
lda = LdaModel(
    corpus=corpus,
    num_topics=5,
    id2word=dictionary,
    passes=100,
    random_state=42,
)

In [10]:
# Show the fitted topics; the output lists (topic id, "weight*term + ..." string) pairs.
lda.print_topics()

[(0,
  '0.031*straight + 0.029*forward + 0.020*time + 0.020*loan + 0.016*bank + 0.016*thank + 0.015*compani + 0.014*servic + 0.013*rate + 0.012*easi'),
 (1,
  '0.079*servic + 0.057*easi + 0.045*recommend + 0.044*quick + 0.040*excel + 0.033*would + 0.031*rate + 0.031*fast + 0.027*loan + 0.024*use'),
 (2,
  '0.039*loan + 0.032*easi + 0.029*process + 0.023*simpl + 0.021*applic + 0.021*servic + 0.020*quick + 0.013*bank + 0.012*rate + 0.012*appli'),
 (3,
  '0.016*help + 0.015*credit + 0.013*compani + 0.012*bank + 0.011*term + 0.010*lend + 0.009*look + 0.008*would + 0.008*score + 0.007*even'),
 (4,
  '0.037*rate + 0.028*loan + 0.013*setter + 0.010*pay + 0.010*good + 0.010*bank + 0.009*thank + 0.009*money + 0.009*earli + 0.009*get')]

In [11]:
# Take the first entry returned by get_topic_terms for topic 0, pull out its
# term id (element 0 of the entry) and map that id back to the token string.
topic0_terms = lda.get_topic_terms(0)
leading_term_id = topic0_terms[0][0]
dictionary[leading_term_id]


'straight'

#### Prepare data and visualize!

In [14]:
# Hand the fitted model, the bag-of-words corpus and the dictionary to
# pyLDAvis' gensim adapter, then display the prepared visualisation data.
prepared_data = prepare(
    topic_model=lda,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(prepared_data)