In [1]:
# Importa as bibliotecas necessárias
import gensim
import gensim.corpora as corpora
import pandas as pd
import re
import spacy

from gensim.models import LdaModel
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
from tqdm import tqdm
tqdm.pandas()

In [14]:
# Carrega o dataset referente às reviews do Airbnb Rio de Janeiro
reviews = pd.read_csv("data/reviews.csv")

In [15]:
# Apresenta as primeiras 5 reviews
reviews.head(5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,17878,64852,2010-07-15,135370,Tia,This apartment is in a perfect location -- two...
1,17878,76744,2010-08-11,10206,Mimi,we had a really great experience staying in Ma...
2,17878,91074,2010-09-06,80253,Jan,Staying in Max appartment is like living in a ...
3,17878,137528,2010-11-12,230449,Orene,In general very good and reasonable price.\r\n...
4,17878,147594,2010-12-01,219338,David,The apt was nice and in a great location only ...


In [4]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292516 entries, 0 to 292515
Data columns (total 7 columns):
listing_id       292516 non-null int64
id               292516 non-null int64
date             292516 non-null object
reviewer_id      292516 non-null int64
reviewer_name    292516 non-null object
comments         292446 non-null object
language         292076 non-null object
dtypes: int64(3), object(4)
memory usage: 15.6+ MB


In [5]:
# Converte coluna comments para string
reviews["comments"] = reviews["comments"].astype(str)

In [None]:
# Adiciona coluna com idioma da review
def try_detect_language(text):
    """
    Dado uma string text, tenta obter o idioma predominante.
    
    Argumentos:
    text -- (str) o texto para o qual se deseja detectar o idioma
    Retorna: (str) o código do idioma (e.g. 'en' para inglês). 
             Retorna uma string vazia caso não seja possível detectar o idioma
    """
    try:
        return detect(text)
    except:
        return ""

reviews["language"] = reviews["comments"].apply(try_detect_language)

In [None]:
# Verifica total de reviews para cada idioma identificado
reviews.language.value_counts().head()

## Preprocessamento dos Dados

In [6]:
nlp_pt = spacy.load("pt")

def preprocess(text, 
               min_token_len = 2, 
               irrelevant_pos = ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE'],
               nlp = nlp_pt): 
    """
    Dado text, min_token_len, e irrelevant_pos, faz o preprocessamento do texto
    
    Argumentos:
    text -- (str) texto a ser pré-processado
    min_token_len -- (int) o menor comprimento de token a ser considerado
    irrelevant_pos -- (list) lista as classes gramaticais a serem ignoradas
    
    Retorna: (str) texto pré processado
    """
    
    # Converte todo os texto para apenas caractéres minúsculos
    text = text.lower()
    
    # Remove termos utilizados pelo airbnb para ocultar informações
    text = re.sub(r'\((.*hidden by airbnb)\)', "", text)
    
    # Processa o texto usando spacy
    doc = nlp(text)
    
    preprocessed = []
    
    # Remove stop words, pontuação, tokens menores que min_token_len e tokens cuja 
    # classe gramatical identificada é irrelavante para análise
    for token in doc:
        if not token.is_stop and not token.is_punct and len(token) >= min_token_len and token.pos_ not in irrelevant_pos:
            # Armazena o lemma do token na variável de saída
            preprocessed.append(token.lemma_)
            
    return " ".join(preprocessed)

def preprocess_pt(text):
    return preprocess(text, nlp = nlp_pt)

In [7]:
reviews_pt = reviews[reviews.language == "pt"]

In [8]:
reviews_pt["clean_comments"] = reviews_pt.comments.progress_apply(preprocess_pt)

100%|██████████| 138315/138315 [1:07:43<00:00, 34.04it/s]    
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [9]:
reviews_pt.to_csv('data/reviews_pt_preprocessed.csv', index = False)

In [10]:
preprocessed_corpus = [doc.split() for doc in reviews_pt.clean_comments.tolist()]
# Cria um vocabulário para o modelo LDA e converte
# nosso corpus em uma matriz documento-termo para LDA
dictionary = corpora.Dictionary(preprocessed_corpus)

# Opcional: Remove palavras que aparecem em menos de no_below reviews e em mais do que no_above % das reviews
# dictionary.filter_extremes(no_below = 15, no_above = 0.1, keep_n = 10000)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in preprocessed_corpus]

In [11]:
print(dictionary)

Dictionary(32413 unique tokens: ['apt', 'conhecer', 'excelente', 'hotel', 'limpar']...)


In [12]:
num_topics = range(5, 15)
passes = range(1, 6)

lda_models = {}

for n in num_topics:
    for p in passes:
        print('Number of topics = ', n)
        print('Number of passes = ', p, '\n')
        lda = LdaModel(corpus = doc_term_matrix, 
                       id2word = dictionary, 
                       num_topics = n, 
                       passes = p)
        lda_models[str(n) + '_' + str(p)] = lda

        print(lda.print_topics(), '\n')

Number of topics =  5
Number of passes =  1 

[(0, '0.030*"casar" + 0.021*"voltar" + 0.019*"super" + 0.018*"ficar" + 0.016*"rir" + 0.014*"maravilhoso" + 0.014*"sentir" + 0.013*"anfitrião" + 0.012*"recomendar" + 0.011*"deixar"'), (1, '0.055*"apartamento" + 0.054*"localização" + 0.045*"excelente" + 0.039*"super" + 0.039*"recomendar" + 0.037*"ótima" + 0.036*"limpar" + 0.035*"anfitrião" + 0.031*"atencioso" + 0.024*"localizar"'), (2, '0.025*"cama" + 0.019*"apartamento" + 0.016*"cozinhar" + 0.012*"banheiro" + 0.012*"problema" + 0.012*"ar" + 0.012*"roupar" + 0.011*"ficar" + 0.010*"condicionar" + 0.010*"ter"'), (3, '0.051*"praia" + 0.032*"apartamento" + 0.030*"restaurante" + 0.025*"metrô" + 0.020*"ficar" + 0.019*"localização" + 0.016*"mercar" + 0.016*"bar" + 0.015*"localizar" + 0.015*"supermercado"'), (4, '0.050*"valer" + 0.047*"visto" + 0.042*"peno" + 0.023*"lindo" + 0.016*"otimo" + 0.013*"mar" + 0.011*"otima" + 0.010*"top" + 0.010*"lapa" + 0.009*"varanda"')] 

Number of topics =  5
Number of

[(0, '0.034*"cama" + 0.029*"apartamento" + 0.022*"cozinhar" + 0.017*"banheiro" + 0.016*"ar" + 0.016*"roupar" + 0.014*"condicionar" + 0.013*"custar" + 0.012*"chuveiro" + 0.011*"funcionar"'), (1, '0.049*"apartamento" + 0.045*"super" + 0.044*"localização" + 0.038*"excelente" + 0.037*"recomendar" + 0.033*"anfitrião" + 0.032*"limpar" + 0.031*"atencioso" + 0.031*"ótima" + 0.025*"voltar"'), (2, '0.047*"casar" + 0.031*"rir" + 0.023*"ficar" + 0.022*"sentir" + 0.016*"dica" + 0.016*"maravilhoso" + 0.014*"dar" + 0.014*"voltar" + 0.014*"melhor" + 0.012*"incrível"'), (3, '0.051*"praia" + 0.042*"apartamento" + 0.029*"restaurante" + 0.027*"localização" + 0.024*"metrô" + 0.019*"localizar" + 0.018*"excelente" + 0.016*"mercar" + 0.016*"bar" + 0.016*"recomendar"'), (4, '0.093*"check" + 0.068*"in" + 0.043*"flexível" + 0.038*"out" + 0.030*"horário" + 0.013*"flexibilidade" + 0.011*"the" + 0.011*"andré" + 0.010*"cumprir" + 0.010*"and"'), (5, '0.022*"ficar" + 0.015*"chegar" + 0.012*"dia" + 0.012*"ter" + 0.012*

[(0, '0.036*"cama" + 0.027*"apartamento" + 0.024*"cozinhar" + 0.018*"banheiro" + 0.017*"ar" + 0.017*"roupar" + 0.014*"condicionar" + 0.012*"chuveiro" + 0.011*"funcionar" + 0.010*"localização"'), (1, '0.058*"super" + 0.054*"apartamento" + 0.045*"localizar" + 0.039*"limpar" + 0.031*"atencioso" + 0.030*"recomendar" + 0.026*"voltar" + 0.023*"ótimo" + 0.021*"anfitrião" + 0.015*"rápido"'), (2, '0.052*"casar" + 0.032*"rir" + 0.025*"maravilhoso" + 0.023*"visto" + 0.021*"ficar" + 0.021*"sentir" + 0.019*"dica" + 0.014*"lindo" + 0.014*"voltar" + 0.014*"incrível"'), (3, '0.022*"casar" + 0.021*"deixar" + 0.019*"check" + 0.018*"voltar" + 0.015*"hospitalidade" + 0.014*"sentir" + 0.014*"cuidar" + 0.013*"receber" + 0.013*"atenção" + 0.013*"agradecer"'), (4, '0.027*"ficar" + 0.020*"problema" + 0.019*"ter" + 0.018*"dia" + 0.014*"chegar" + 0.013*"ser" + 0.010*"haver" + 0.010*"sair" + 0.009*"apartamento" + 0.009*"entrar"'), (5, '0.081*"localização" + 0.060*"excelente" + 0.057*"ótima" + 0.048*"apartamento" 

[(0, '0.021*"quentar" + 0.012*"restar" + 0.012*"the" + 0.011*"mau" + 0.011*"and" + 0.009*"sujo" + 0.009*"sujar" + 0.009*"suite" + 0.008*"ok" + 0.008*"chão"'), (1, '0.034*"cama" + 0.028*"apartamento" + 0.022*"cozinhar" + 0.017*"banheiro" + 0.016*"ar" + 0.016*"roupar" + 0.014*"condicionar" + 0.013*"ficar" + 0.013*"problema" + 0.012*"chuveiro"'), (2, '0.068*"valer" + 0.059*"custar" + 0.056*"peno" + 0.043*"benefício" + 0.032*"preço" + 0.022*"otima" + 0.021*"otimo" + 0.015*"justar" + 0.015*"beneficiar" + 0.013*"nao"'), (3, '0.021*"check" + 0.020*"ficar" + 0.015*"chegar" + 0.014*"in" + 0.013*"horário" + 0.012*"rir" + 0.012*"centrar" + 0.011*"sair" + 0.010*"dia" + 0.009*"check-in"'), (4, '0.044*"rir" + 0.026*"ficar" + 0.025*"dica" + 0.024*"dar" + 0.022*"voltar" + 0.021*"melhor" + 0.021*"apartamento" + 0.015*"ajudar" + 0.015*"hospedar" + 0.014*"anfitrião"'), (5, '0.060*"apartamento" + 0.058*"localização" + 0.048*"excelente" + 0.043*"ótima" + 0.043*"recomendar" + 0.043*"super" + 0.041*"limpar" 

[(0, '0.083*"praia" + 0.053*"restaurante" + 0.043*"apartamento" + 0.033*"metrô" + 0.031*"localização" + 0.030*"mercar" + 0.029*"bar" + 0.026*"supermercado" + 0.024*"farmácia" + 0.021*"copacabana"'), (1, '0.044*"fácil" + 0.038*"acesso" + 0.033*"rir" + 0.027*"ficar" + 0.018*"praia" + 0.016*"metrô" + 0.016*"centrar" + 0.014*"tranquilo" + 0.014*"bairro" + 0.012*"cidade"'), (2, '0.026*"the" + 0.024*"and" + 0.019*"cristina" + 0.017*"to" + 0.015*"larissa" + 0.015*"clean" + 0.013*"hostel" + 0.013*"balsa" + 0.012*"is" + 0.011*"hosts"'), (3, '0.029*"preço" + 0.023*"pessoa" + 0.019*"prédio" + 0.016*"ficar" + 0.015*"pequeno" + 0.015*"ar" + 0.015*"apartamento" + 0.014*"piscina" + 0.013*"quarto" + 0.013*"sala"'), (4, '0.042*"cama" + 0.025*"cozinhar" + 0.023*"apartamento" + 0.019*"roupar" + 0.019*"banheiro" + 0.014*"chuveiro" + 0.013*"ar" + 0.012*"toalha" + 0.012*"condicionar" + 0.011*"funcionar"'), (5, '0.033*"check" + 0.032*"problema" + 0.023*"in" + 0.021*"horário" + 0.019*"chegar" + 0.019*"ter" + 

[(0, '0.119*"localização" + 0.082*"ótima" + 0.080*"excelente" + 0.056*"apartamento" + 0.036*"anfitrião" + 0.036*"recomendar" + 0.025*"atencioso" + 0.024*"limpar" + 0.023*"voltar" + 0.020*"estadia"'), (1, '0.050*"super" + 0.047*"voltar" + 0.033*"casar" + 0.024*"ficar" + 0.023*"apartamento" + 0.022*"atencioso" + 0.019*"maravilhoso" + 0.018*"recomendar" + 0.017*"sentir" + 0.016*"limpar"'), (2, '0.080*"praia" + 0.047*"restaurante" + 0.041*"apartamento" + 0.039*"metrô" + 0.030*"localização" + 0.026*"mercar" + 0.026*"bar" + 0.023*"supermercado" + 0.022*"farmácia" + 0.019*"ficar"'), (3, '0.045*"rir" + 0.031*"acesso" + 0.029*"fácil" + 0.022*"ficar" + 0.019*"cidade" + 0.017*"dica" + 0.017*"casar" + 0.015*"janeiro" + 0.013*"opção" + 0.012*"centrar"'), (4, '0.047*"espaçar" + 0.043*"custar" + 0.039*"ótimo" + 0.031*"benefício" + 0.028*"café" + 0.024*"manhã" + 0.024*"tranquilo" + 0.019*"ruir" + 0.019*"simples" + 0.015*"lapa"'), (5, '0.070*"super" + 0.060*"apartamento" + 0.055*"localizar" + 0.050*"re

[(0, '0.063*"praia" + 0.060*"localização" + 0.050*"apartamento" + 0.026*"copacabana" + 0.025*"perfeito" + 0.024*"ótima" + 0.023*"excelente" + 0.021*"foto" + 0.019*"recomendar" + 0.019*"voltar"'), (1, '0.023*"cama" + 0.018*"roupar" + 0.018*"apartamento" + 0.016*"problema" + 0.015*"ter" + 0.013*"chegar" + 0.012*"deixar" + 0.012*"ficar" + 0.012*"haver" + 0.011*"toalha"'), (2, '0.125*"fácil" + 0.104*"acesso" + 0.037*"preço" + 0.036*"lapa" + 0.025*"otimo" + 0.022*"centrar" + 0.020*"otima" + 0.018*"justar" + 0.014*"localizacao" + 0.013*"transportar"'), (3, '0.052*"praia" + 0.044*"restaurante" + 0.034*"metrô" + 0.030*"apartamento" + 0.024*"mercar" + 0.024*"bar" + 0.022*"supermercado" + 0.021*"ficar" + 0.020*"farmácia" + 0.015*"padaria"'), (4, '0.037*"ar" + 0.032*"condicionar" + 0.030*"cama" + 0.028*"apartamento" + 0.025*"banheiro" + 0.022*"cozinhar" + 0.019*"ficar" + 0.019*"funcionar" + 0.019*"sala" + 0.018*"pequeno"'), (5, '0.041*"limpeza" + 0.027*"apartamento" + 0.025*"utensílio" + 0.025*"p

[(0, '0.029*"melhor" + 0.024*"noto" + 0.024*"hóspede" + 0.022*"detalhe" + 0.021*"experiência" + 0.020*"airbnb" + 0.019*"10" + 0.019*"cuidar" + 0.014*"poder" + 0.014*"ser"'), (1, '0.069*"apartamento" + 0.068*"super" + 0.063*"limpar" + 0.045*"localizar" + 0.045*"recomendar" + 0.044*"atencioso" + 0.040*"anfitrião" + 0.027*"excelente" + 0.026*"localização" + 0.023*"ótimo"'), (2, '0.030*"problema" + 0.025*"ficar" + 0.021*"chegar" + 0.019*"dia" + 0.018*"ter" + 0.017*"apartamento" + 0.015*"haver" + 0.014*"sair" + 0.014*"entrar" + 0.013*"check-in"'), (3, '0.048*"fácil" + 0.041*"acesso" + 0.034*"praia" + 0.024*"tranquilo" + 0.024*"metrô" + 0.021*"espaçar" + 0.019*"localizar" + 0.019*"ficar" + 0.018*"copacabana" + 0.017*"bairro"'), (4, '0.086*"host" + 0.036*"pai" + 0.031*"the" + 0.028*"and" + 0.020*"to" + 0.019*"in" + 0.017*"clean" + 0.014*"joana" + 0.014*"is" + 0.013*"hosts"'), (5, '0.097*"visto" + 0.052*"maravilhoso" + 0.045*"valer" + 0.040*"incrível" + 0.038*"peno" + 0.031*"expectativa" + 0.0

[(0, '0.086*"casar" + 0.040*"sentir" + 0.026*"super" + 0.026*"deixar" + 0.023*"vontade" + 0.019*"anfitrião" + 0.018*"pessoa" + 0.018*"voltar" + 0.018*"recomendar" + 0.017*"maravilhoso"'), (1, '0.038*"the" + 0.034*"and" + 0.026*"host" + 0.024*"to" + 0.021*"clean" + 0.019*"copo" + 0.018*"joana" + 0.017*"is" + 0.016*"hosts" + 0.014*"conexão"'), (2, '0.056*"dica" + 0.049*"check" + 0.048*"dar" + 0.034*"in" + 0.029*"flexível" + 0.028*"horário" + 0.022*"check-in" + 0.020*"out" + 0.018*"deixar" + 0.018*"super"'), (3, '0.126*"cama" + 0.077*"roupar" + 0.047*"toalha" + 0.042*"banhar" + 0.041*"preço" + 0.036*"limpo" + 0.029*"qualidade" + 0.018*"limpeza" + 0.017*"justar" + 0.015*"sofá"'), (4, '0.051*"voltar" + 0.044*"apartamento" + 0.043*"maravilhoso" + 0.030*"super" + 0.030*"visto" + 0.029*"rir" + 0.028*"ficar" + 0.022*"incrível" + 0.019*"melhor" + 0.017*"limpar"'), (5, '0.090*"limpeza" + 0.072*"impecável" + 0.049*"rápido" + 0.047*"noto" + 0.040*"atendimento" + 0.040*"respostar" + 0.040*"10" + 0.0

[(0, '0.059*"condomínio" + 0.041*"visto" + 0.040*"acomodação" + 0.040*"flat" + 0.039*"piscina" + 0.024*"serviço" + 0.022*"hotel" + 0.021*"imóvel" + 0.021*"possuir" + 0.018*"varanda"'), (1, '0.081*"host" + 0.049*"top" + 0.038*"custo-benefício" + 0.029*"the" + 0.026*"and" + 0.023*"recém" + 0.019*"arena" + 0.019*"fofar" + 0.018*"to" + 0.017*"marcia"'), (2, '0.035*"problema" + 0.025*"apartamento" + 0.018*"foto" + 0.016*"atender" + 0.015*"ficar" + 0.014*"anúncio" + 0.014*"resolver" + 0.014*"pequeno" + 0.014*"espaçar" + 0.013*"anfitrião"'), (3, '0.074*"praia" + 0.051*"restaurante" + 0.042*"metrô" + 0.041*"apartamento" + 0.028*"mercar" + 0.028*"bar" + 0.027*"localização" + 0.025*"supermercado" + 0.023*"farmácia" + 0.020*"ficar"'), (4, '0.072*"apartamento" + 0.052*"localização" + 0.041*"atencioso" + 0.040*"ótima" + 0.040*"limpar" + 0.038*"excelente" + 0.037*"anfitrião" + 0.029*"responder" + 0.029*"recomendar" + 0.029*"rápido"'), (5, '0.052*"rir" + 0.038*"dica" + 0.036*"dar" + 0.026*"valer" + 0