In [2]:
import pandas as pd

df = pd.read_csv('reviews.csv')
df['rating'] = df['rating'].astype(dtype='int64')

In [3]:
from typing import Generator, List
import pickle

from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import spacy
from spacy.lang.en import English

In [4]:
def sentences_to_words(sentences: List[str]) -> Generator:
    for sentence in sentences:
        # https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess
        yield(simple_preprocess(str(sentence), deacc=True))  # deacc=True elimina la puntuación

In [5]:
def remove_stopwords(documents: List[List[str]]) -> List[List[str]]:
    return [[word for word in simple_preprocess(str(doc)) if word not in stopwords.words('english')]
            for doc in documents]

In [6]:
def learn_bigrams(documents: List[List[str]]) -> List[List[str]]:
    # We learn bigrams
    #  https://radimrehurek.com/gensim/models/phrases.html#gensim.models.phrases.Phrases
    bigram = Phrases(documents, min_count=5, threshold=10)

    # we reduce the bigram model to its minimal functionality
    bigram_mod = Phraser(bigram)

    # we apply the bigram model to our documents
    return [bigram_mod[doc] for doc in documents]

In [7]:
def lemmatization(nlp: English, texts: List[List[str]], allowed_postags: List = None) -> List[List[str]]:
    if allowed_postags is None:
        allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [8]:
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])

In [9]:
def tokenize(documents: List[str]) -> List[List[str]]:

    document_words = list(sentences_to_words(documents))
    document_words = remove_stopwords(document_words)
    document_words = learn_bigrams(document_words)
    document_words = lemmatization(nlp, document_words)

    return document_words

In [10]:
tokenize_reviews = tokenize(df['review'])

In [43]:
import gensim.corpora as corpora

id2word = corpora.Dictionary(tokenize_reviews)

In [44]:
corpus = [id2word.doc2bow(text) for text in tokenize_reviews]

In [45]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [46]:
from pprint import pprint

pprint(lda_model.print_topics())

[(0,
  '0.035*"treat" + 0.034*"ambiance" + 0.024*"cool" + 0.022*"step" + '
  '0.021*"rest" + 0.021*"hostel" + 0.021*"music" + 0.019*"amaze" + '
  '0.018*"afternoon" + 0.017*"touch"'),
 (1,
  '0.048*"romantic" + 0.042*"huge" + 0.038*"employee" + 0.023*"advance" + '
  '0.021*"hard" + 0.019*"impressive" + 0.017*"probably" + 0.016*"rather" + '
  '0.016*"life" + 0.015*"corridor"'),
 (2,
  '0.076*"ground" + 0.044*"take" + 0.032*"many" + 0.028*"think" + '
  '0.020*"provide" + 0.018*"door" + 0.018*"candle" + 0.017*"eat" + '
  '0.016*"leave" + 0.016*"ever"'),
 (3,
  '0.058*"gorgeous" + 0.033*"central" + 0.031*"less" + 0.028*"space" + '
  '0.027*"country" + 0.023*"business" + 0.021*"fill" + 0.020*"type" + '
  '0.020*"consider" + 0.019*"next"'),
 (4,
  '0.122*"hotel" + 0.093*"room" + 0.036*"beautiful" + 0.033*"great" + '
  '0.025*"service" + 0.021*"breakfast" + 0.020*"restaurant" + 0.016*"staff" + '
  '0.015*"comfortable" + 0.015*"location"'),
 (5,
  '0.048*"free" + 0.046*"local" + 0.036*"must" +

In [14]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

  pickler.file_handle.write(chunk.tostring('C'))
  pickler.file_handle.write(chunk.tostring('C'))


In [61]:
import pprint
tokens = tokenize(['this is the best hotel i have stayed'])
id2word = corpora.Dictionary(tokens)
corpus = [id2word.doc2bow(text) for text in tokens]
lda_prediction = pd.DataFrame(lda_model[corpus][0][0])
lda_prediction.columns = ['Index', 'Prob']
lda_prediction[['Prob']].idxmax() 
topics = ['Cool Music Hostel','Romantic Stay','Great to Eat','Gorgeous for Business Trip','Great Staff and Location','Top and Cozy','Old with Garden Patio','Dissapointed with Hotel','Good Place to Stay','Like a Museum']
topics[lda_prediction['Index'][7]]

'Good Place to Stay'

In [67]:
from typing import List

from gensim.models import LdaModel
from gensim import corpora
import pandas as pd


def predict(documents: List[str]):

    words = tokenize(documents)
    print(words)

    #lda_model = LdaModel.load('models/lda_model')

    topics = ['Cool Music Hostel', 'Romantic Stay', 'Great to Eat', 'Gorgeous for Business Trip',
              'Great Staff and Location', 'Top and Cozy', 'Old with Garden Patio', 'Dissapointed with Hotel',
              'Good Place to Stay', 'Like a Museum']

    predictions = []
    for document in words:
        print(document)
        id2word = corpora.Dictionary(document)
        corpus = id2word.doc2bow(document)

        lda_prediction = pd.DataFrame(lda_model[corpus][0][0])
        lda_prediction.columns = ['Index', 'Prob']
        topic = topics[lda_prediction['Index'][lda_prediction[['Prob']].idxmax()]]

        predictions.append(topic)

    return predictions


In [68]:
predict(['We were on the second floor. We arrived late at night time and the front desk had everything ready to go. The rooms were a little small but nothing to fuss over. They were clean and well kept. What a surprise when we opened the door in the morning - best view in Antigua. Breakfast was simple, and the service is wonderful. Location is just outside the main area, which means it was nice and quiet. Still very walkable to sights. I would stay here again.','We came here with a group and this was a five star surprise. Lush landscaping, wonderful beds, and close to the square  The water is drinkable but they go sell bottled water. The is a large grocery store only 3 blocks away for anything you might need '])

[['second', 'floor', 'arrive', 'late', 'night', 'time', 'front', 'desk', 'ready', 'go', 'room', 'little', 'small', 'fuss', 'clean', 'well', 'keep', 'surprise', 'open', 'door', 'morning', 'good', 'service', 'wonderful', 'location', 'main', 'area', 'mean', 'nice', 'quiet', 'still', 'walkable', 'sight', 'would', 'stay'], ['come', 'star', 'surprise', 'lush', 'landscaping', 'wonderful', 'bed', 'close', 'square', 'water', 'drinkable', 'go', 'sell', 'bottled', 'water', 'large', 'grocery', 'store', 'block', 'away', 'may', 'need']]
['second', 'floor', 'arrive', 'late', 'night', 'time', 'front', 'desk', 'ready', 'go', 'room', 'little', 'small', 'fuss', 'clean', 'well', 'keep', 'surprise', 'open', 'door', 'morning', 'good', 'service', 'wonderful', 'location', 'main', 'area', 'mean', 'nice', 'quiet', 'still', 'walkable', 'sight', 'would', 'stay']


TypeError: doc2bow expects an array of unicode tokens on input, not a single string