# Training Topic Model (LDA) on the Airbnb Comments

In [19]:

# import and setup modules we'll be using in this notebook
import logging
import itertools
import pandas as pd
import numpy as np
import gensim
import sys
reload(sys)
import cPickle
import re
sys.setdefaultencoding('utf8')
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

In [20]:
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS

def tokenize(text):
    return [token for token in simple_preprocess(text) if token.lower() not in STOPWORDS]

def iter_token(id_,comment):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
  #  ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for id_,comment in zip(id_,comment):
        tokens=tokenize(comment)
       # if len(tokens) < 50 :
        #    continue  # ignore short articles and various meta-articles
        yield id_, tokens
def person_remover(comment,person):
    return comment.replace(person,'')    

# Reading in data

In [21]:
col_to_read = ['listing_id','id','lan']
df_all_reviews= pd.read_csv('../DataFiles/merged_reviews_withCitiesLan.csv',usecols=col_to_read)

df_person = pd.read_csv('../DataFiles/sentences_withPersonName_NYC_.csv',header=None,sep='\t')
df_person.columns=['no','year','city','id','person','comment']

df_person=df_person.dropna()
df_person.year=df_person.year.astype(int)
df_person.id=df_person.id.astype(int)
df=pd.merge(df_person,df_all_reviews,on='id',how='left')
df=df[df.lan=='en']
df.listing_id.nunique()

27247

In [22]:
df['comment'] = df.apply(lambda row: person_remover(row['comment'], row['person']), axis=1)

df_text=df.groupby(['listing_id','year'])['comment'].apply(lambda x: '.'.join(x)).reset_index()
df_text=df_text.sort_values(['listing_id','year'], ascending=True)




In [23]:
#df_text=df_text[df_text.year>2009]
#df_=pd.DataFrame(df_text.groupby(['listing_id']).year.count())
#indices=df_[df_.year==7].index.tolist()
#df_text=df_text[df_text['listing_id'].isin(indices)]

In [24]:
df_text.to_csv('../DataFiles/sentences_withPersonName_NYC_listing.csv')

In [25]:
df_text=pd.read_csv('../DataFiles/sentences_withPersonName_NYC_listing.csv')

# Tokenizing documents

In [26]:
# Tokenize the documents.

from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')

docs = list(df_text.dropna().comment)
years= list(df_text.dropna().year)

for idx in range(len(docs)):

    docs[idx]=re.sub(r'[^\x00-\x7F]+',' ', docs[idx])
    docs[idx] = docs[idx].lower()# Convert to lowercase.
    docs[idx] =tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
#docs = [[token for token in doc if not token.isdigit()] for doc in docs]


# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [27]:
from nltk.stem.wordnet import WordNetLemmatizer


            
# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [28]:
# Compute bigrams.

from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [29]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)
dictionary.save('nyc_lda.dic')

with open(r"docs.pickle", "wb") as output_file:
    cPickle.dump(docs, output_file)

In [30]:
print len(docs),len(years)

# Vectorizing documents, and save the corpus

In [31]:

# Vectorize data.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

with open(r"corpus.pickle", "wb") as output_file:
    cPickle.dump(corpus, output_file)
with open(r"years.pickle", "wb") as output_file:
    cPickle.dump(years, output_file)

In [32]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# Train and Save the LDA model

In [33]:
# Train LDA model.

from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

model.save('lda.pkl')

# High frequent words in each topic

In [34]:
model = gensim.models.LdaModel.load('lda.pkl')
#top_topics = model.top_topics(corpus, num_words=5)
top_topics=model.show_topics()
top_topics

[(0,
  u'0.056*"exactly" + 0.039*"described" + 0.038*"clean" + 0.030*"bed" + 0.027*"towel" + 0.027*"picture" + 0.026*"kitchen" + 0.020*"air" + 0.018*"exactly_described" + 0.018*"provided"'),
 (1,
  u'0.057*"restaurant" + 0.047*"recommendation" + 0.041*"area" + 0.034*"local" + 0.032*"gave" + 0.031*"bar" + 0.028*"eat" + 0.027*"information" + 0.026*"tip" + 0.021*"neighborhood"'),
 (2,
  u'0.060*"williamsburg" + 0.055*"loft" + 0.043*"hidden" + 0.037*"view" + 0.020*"url_hidden" + 0.020*"url" + 0.019*"sensitive" + 0.017*"art" + 0.017*"content" + 0.016*"sensitive_content"'),
 (3,
  u'0.039*"home" + 0.035*"wonderful" + 0.025*"family" + 0.023*"brooklyn" + 0.021*"beautiful" + 0.020*"lovely" + 0.017*"husband" + 0.017*"thank" + 0.016*"house" + 0.012*"wife"'),
 (4,
  u'0.043*"subway" + 0.036*"close" + 0.028*"location" + 0.026*"located" + 0.021*"manhattan" + 0.021*"minute" + 0.021*"walk" + 0.019*"away" + 0.017*"station" + 0.016*"train"'),
 (5,
  u'0.085*"recommend" + 0.065*"nyc" + 0.057*"new" + 0.05

In [35]:
model = gensim.models.LdaModel.load('lda.pkl')
#top_topics = model.top_topics(corpus, num_words=5)
top_topics=model.show_topics()
# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
#avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
#print('Average topic coherence: %.4f.' % avg_topic_coherence)

topics=[]
top_topics=model.show_topics()
for t in top_topics:
    topic={}
    for chunk in t[1].split('+'):
        topic[chunk.split('*')[1].strip("'")] = float(chunk.split('*')[0])
    topics.append(topic)
    

#pprint(top_topics)
df_list=[]

for topic in topics:

    df_= pd.DataFrame([(str(k),v) for k,v in topic.iteritems() ])
    df_.sort_values([1], ascending=False)
    
    df_list.append( df_[0])

df = pd.concat(df_list, axis = 1)
df.columns=[ [ 'Topic'+str(i) for i in range(len(df_list))]]
df

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9
0,"""exactly""","""recommendation""","""art""","""brooklyn""","""manhattan""","""nyc""","""sure""","""question""","""good""","""david"""
1,"""exactly_described""","""information""","""url_hidden""","""husband""","""walk""","""new""","""arrived""","""quick""","""helpful""","""timely_manner"""
2,"""described""","""tip""","""url""","""lovely""","""station""","""york""","""let""","""helpful""","""comfortable""","""went"""
3,"""clean""","""neighborhood""","""loft""","""home""","""minute""","""definitely""","""meet""","""accommodating""","""clean""","""matt"""
4,"""provided""","""gave""","""williamsburg""","""wonderful""","""subway""","""recommend""","""late""","""thanks""","""time""","""welcome"""
5,"""picture""","""bar""","""sensitive""","""thank""","""located""","""highly""","""got""","""check""","""welcoming""","""way"""
6,"""towel""","""area""","""sensitive_content""","""family""","""away""","""new_york""","""arrival""","""responsive""","""room""","""feel_welcome"""
7,"""air""","""local""","""view""","""wife""","""location""","""highly_recommend""","""day""","""easy""","""nice""","""went_way"""
8,"""bed""","""restaurant""","""hidden""","""house""","""train""","""staying""","""needed""","""perfect""","""kind""","""feel"""
9,"""kitchen""","""eat""","""content""","""beautiful""","""close""","""trip""","""met""","""communication""","""friendly""","""mile"""


In [36]:
model.get_document_topics(corpus[0])

[(0, 0.035872025760852314),
 (1, 0.035589528067941689),
 (2, 0.014092863641369651),
 (3, 0.12345158179667924),
 (4, 0.060668127144712189),
 (5, 0.083196714904605645),
 (6, 0.15860428997473705),
 (7, 0.17155117691777058),
 (8, 0.30384290796439439),
 (9, 0.013130783826937388)]