In [1]:
# Run in terminal or command prompt:
#python3 -m spacy download en

# Packages
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pprint import pprint

# Import stopwords and other word packages
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models # as gensimvis  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.corpus import stopwords

from scipy.sparse import csr_matrix, hstack, coo_matrix


import string
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mattparker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mattparker/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mattparker/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
airbnb = pd.read_csv('../data/airbnb_gentrification.csv')

In [3]:
airbnb.head(1)

Unnamed: 0,listing_id,comments_concatenated,name,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,listing_url,description,neighborhood_overview,host_since,host_listings_count,property_type,accommodates,bathrooms_text,bedrooms,beds,amenities,minimum_nights_avg_ntm,maximum_nights_avg_ntm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,zip_code,GEOID,house_price_2021-01-31,house_pct_change,rentals_2021-01-31,rental_price_pct_change,new_restaurants,available_beer,str_permits_2020,str_permits_growth,crimes,total_pop_2010,total_pop_2019,total_pop_change,total_pop_pct_change,pop_over25_2010,pop_over25_2019,pop_over25_change,pop_over25_pcg_change,total_households_2010,total_households_2019,total_households_change,total_households_pct_change,white_pct_2010,white_pct_2019,white_value_change,white_pct_change,bach_pct_2010,bach_pct_2019,bach_value_change,bach_pct_change,rent_pct_2010,rent_pct_2019,rent_value_change,renter_pct_change,median_hhi_2010,median_hhi_2019,median_hhi_value_change,median_hhi_pct_change,poverty_pct_2010,poverty_pct_2019,poverty_value_change,poverty_pct_change,gentrifying
0,6422,I can't say enough about how wonderful it was ...,Nashville Charm,12172,36.17315,-86.73581,40,30,674,4.69,1,267,https://www.airbnb.com/rooms/6422,30 day or more rental during COVID. Show COVID...,Historic East Nashville is home to many new an...,2009-04-03,0.0,Private room in house,2,1 private bath,2.0,3.0,"[""Hair dryer"", ""Bathtub"", ""Lock on bedroom doo...",30.0,365.0,99.0,10.0,10.0,10.0,10.0,10.0,10.0,37206.0,47037010000.0,412476.0,38.31,,,1.0,2.0,114.0,114.0,1165.0,2544.0,2100.0,-444.0,-0.174528,1703.0,1639.0,-64.0,-0.037581,1140.0,926.0,-214.0,-0.187719,0.657626,0.940952,0.283327,0.430833,0.408691,0.585723,0.177032,0.43317,0.320175,0.240821,-0.079355,-0.247848,46000.0,91643.0,45643.0,0.992239,10.6,10.2,-0.4,-0.037736,False


In [4]:
airbnb.shape

(5205, 76)

In [5]:
airbnb = airbnb[airbnb['comments_concatenated'].notna()]

In [6]:
airbnb_nlp = airbnb[['listing_id', 'comments_concatenated', 'gentrifying']]

In [7]:
airbnb_nlp.comments_concatenated.isna().sum()

0

## Split into two datasets - gentrifying and non-gentrifying

In [8]:
gentrifying = airbnb_nlp[airbnb_nlp['gentrifying']==True]
non_gentrifying = airbnb_nlp[airbnb_nlp['gentrifying']==False]

In [11]:
# create two lists with training and testing apns
train_listings_gent, test_listings_gen = tts(gentrifying['listing_id'].to_list(), 
                                            random_state = 42)

In [15]:
# Create train and test dataframes from the lists of apns
train_gentrifying = airbnb_nlp[airbnb_nlp['listing_id'].isin(train_listings_gent)].sort_values('listing_id')
test_gentrifying = airbnb_nlp[airbnb_nlp['listing_id'].isin(test_listings_gen)].sort_values('listing_id')

In [16]:
# create two lists with training and testing apns
train_listings_non, test_listings_non = tts(non_gentrifying['listing_id'].to_list(), 
                                            random_state = 42)

In [32]:
# Create train and test dataframes from the lists of apns
train_non_gentrifying = airbnb_nlp[airbnb_nlp['listing_id'].isin(train_listings_non)].sort_values('listing_id')
test_non_gentrifying = airbnb_nlp[airbnb_nlp['listing_id'].isin(test_listings_non)].sort_values('listing_id')

## Gensim LDA - Gentrifying
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [18]:
# Convert to list
data = train_gentrifying.comments_concatenated.values.tolist()

In [19]:
# Clean review break symbols
data = [re.sub("\\r\\n", "", comment) for comment in data]

In [20]:
# Tokenize each sentence to words, removing uneeded words/characters
def sent_to_words(sentences):
    for sentence in sentences:
        tagged_words = nltk.tag.pos_tag(sentence.split()) 
        no_names = [word for word,tag in tagged_words if tag != 'NNP' and tag != 'NNPS'] # Remove proper nouns
        yield(gensim.utils.simple_preprocess(str(no_names), deacc=True)) #Clean and remove punctuation

data_words = list(sent_to_words(data))

#print(data_words[0:1])

In [21]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words,
                               min_count=5,
                               threshold=100)#,  # higher threshold fewer phrases.
                               #connector_words=phrases.ENGLISH_CONNECTOR_WORDS) ***I think I need to download this.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)#, connector_words=phrases.ENGLISH_CONNECTOR_WORDS)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Test trigram on first review
#print(trigram_mod[bigram_mod[data_words[0]]])

In [22]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [23]:
# Build list of stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'stay', 'place', 'location', 'home', 'house', 'host', 'great'])

In [24]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#print(data_lemmatized[:1])

In [25]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
#print(corpus[:1])

In [26]:
# How to view a single word within the corpus
id2word[500]

'eaterie'

In [27]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('aanvoelen', 1),
  ('aardige', 1),
  ('abd', 1),
  ('ability', 1),
  ('able', 20),
  ('abode', 1),
  ('absence', 1),
  ('absolute', 3),
  ('absolutely', 26),
  ('absorb', 1),
  ('abundance', 1),
  ('accept', 2),
  ('access', 7),
  ('accessible', 1),
  ('accomadate', 1),
  ('accommadition', 1),
  ('accommodate', 34),
  ('accommodation', 12),
  ('accomodation', 1),
  ('accompany', 1),
  ('accomplish', 1),
  ('account', 1),
  ('accueil', 1),
  ('accurately', 1),
  ('across', 1),
  ('action', 2),
  ('active', 1),
  ('actively', 1),
  ('activist', 1),
  ('activity', 2),
  ('actual', 1),
  ('actually', 3),
  ('add', 2),
  ('addition', 2),
  ('additional', 1),
  ('address', 1),
  ('adequately', 1),
  ('adjustable', 1),
  ('adjustment', 1),
  ('adore', 2),
  ('adresse', 1),
  ('advance', 1),
  ('advantage', 1),
  ('adventure', 1),
  ('advertise', 2),
  ('advice', 16),
  ('advise', 3),
  ('aesthetically_please', 1),
  ('affordable', 1),
  ('afternoon', 1),
  ('age', 1),
  ('agin', 1),
  ('ag

In [28]:
# Build LDA model
lda_model = LdaMulticore(corpus=corpus,        # Replace with gensim.models.ldamodel.LdaModel()
                       id2word=id2word,
                       num_topics=8, #number of topics to identify
                       random_state=100,
                       #update_every=1,                          #Add back in with LdaModel
                       chunksize=100, #number of documents to pass per chunk
                       passes=10, #number of training passes
                       #alpha='auto',                            #Add back in with LdaModel
                       per_word_topics=True)

In [29]:
# Print the top 10 Keywords in each grouped Topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.019*"view" + 0.016*"rooftop" + 0.015*"clean" + 0.014*"recommend" + '
  '0.014*"downtown" + 0.013*"space" + 0.011*"nice" + 0.011*"group" + '
  '0.010*"beautiful" + 0.010*"amazing"'),
 (1,
  '0.019*"clean" + 0.016*"group" + 0.015*"recommend" + 0.014*"perfect" + '
  '0.014*"downtown" + 0.013*"definitely" + 0.012*"space" + 0.011*"need" + '
  '0.010*"time" + 0.010*"comfortable"'),
 (2,
  '0.160*"cottage" + 0.013*"cozy" + 0.011*"tiny" + 0.010*"cute" + 0.010*"need" '
  '+ 0.009*"little" + 0.007*"chicken" + 0.006*"perfect" + 0.006*"recommend" + '
  '0.006*"clean"'),
 (3,
  '0.021*"clean" + 0.019*"walk" + 0.017*"downtown" + 0.013*"comfortable" + '
  '0.013*"nice" + 0.012*"room" + 0.012*"recommend" + 0.011*"easy" + '
  '0.010*"definitely" + 0.010*"apartment"'),
 (4,
  '0.016*"clean" + 0.014*"comfortable" + 0.014*"recommend" + 0.012*"space" + '
  '0.012*"need" + 0.011*"perfect" + 0.011*"definitely" + 0.011*"stay" + '
  '0.011*"love" + 0.010*"well"'),
 (5,
  '0.035*"dog" + 0.031*"clean" 

In [30]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score - Likely more helpful. Takes a while to run.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.546841952897288

Coherence Score:  0.27604132114600477


### Visual for Viewing each Topic

In [31]:
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics=False)
pyLDAvis.save_html(vis, 'lda_gentrifying.html')

#### Skipping step 17 about finding the best number of topics - Tim recommends 8-12

### Find dominant Topic in each Review

In [82]:
bow = corpora.Dictionary(data_lemmatized)

In [71]:
contents = []

for pdf, doc in tqdm(zip(pdfs, docs)):
    bow = corpora.Dictionary(data_lemmatized)
    topics = lda_model.get_document_topics(bow)
    for res in topics:
        topic, pct = res
        contents.append({'pdf': pdf, 'topic': topic, 'percent': pct})

topics = pd.DataFrame(contents)

NameError: name 'pdfs' is not defined

In [70]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

TypeError: '<' not supported between instances of 'int' and 'tuple'

### Find most representative Reviews for each Topic

In [None]:
for i in range(6):
    for j in range(5):

        file = topics[topics.topic == i].sort_values('percent', ascending = False).head().iloc[j, 0]

        cmd = f'cp {file} representative_docs/topic_{i + 1}/.'

        os.system(cmd)

## Gensim LDA - Non-Gentrifying
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [33]:
# Convert to list
data = train_non_gentrifying.comments_concatenated.values.tolist()

In [34]:
# Clean review break symbols
data = [re.sub("\\r\\n", "", comment) for comment in data]

In [35]:
# Tokenize each sentence to words, removing uneeded words/characters
def sent_to_words(sentences):
    for sentence in sentences:
        tagged_words = nltk.tag.pos_tag(sentence.split()) 
        no_names = [word for word,tag in tagged_words if tag != 'NNP' and tag != 'NNPS'] # Remove proper nouns
        yield(gensim.utils.simple_preprocess(str(no_names), deacc=True)) #Clean and remove punctuation

data_words = list(sent_to_words(data))

#print(data_words[0:1])

In [36]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words,
                               min_count=5,
                               threshold=100)#,  # higher threshold fewer phrases.
                               #connector_words=phrases.ENGLISH_CONNECTOR_WORDS) ***I think I need to download this.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)#, connector_words=phrases.ENGLISH_CONNECTOR_WORDS)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Test trigram on first review
#print(trigram_mod[bigram_mod[data_words[0]]])

In [37]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [38]:
# Build list of stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'stay', 'place', 'location', 'home', 'house', 'host', 'great'])

In [39]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#print(data_lemmatized[:1])

In [40]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
#print(corpus[:1])

In [41]:
# How to view a single word within the corpus
id2word[500]

'trail'

In [42]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('able', 3),
  ('absolutely', 2),
  ('access', 3),
  ('accessible', 1),
  ('accommodate', 3),
  ('accommodation', 2),
  ('acre', 1),
  ('action', 1),
  ('activity', 1),
  ('actor', 1),
  ('actual', 1),
  ('actually', 1),
  ('add', 1),
  ('advertised', 1),
  ('affordable', 1),
  ('air', 2),
  ('airbnb', 5),
  ('airport', 2),
  ('allow', 1),
  ('alone', 2),
  ('already', 1),
  ('also', 12),
  ('always', 3),
  ('amazing', 5),
  ('amenity', 5),
  ('ample', 1),
  ('answer', 2),
  ('anymore', 1),
  ('apartment', 57),
  ('appearance', 1),
  ('appliance', 1),
  ('appoint', 1),
  ('appreciator', 1),
  ('apt', 4),
  ('area', 9),
  ('arrival', 4),
  ('arrive', 5),
  ('art', 4),
  ('artwork', 1),
  ('ask', 5),
  ('aspect', 1),
  ('attend', 1),
  ('available', 5),
  ('aware', 1),
  ('away', 5),
  ('awesome', 2),
  ('back', 10),
  ('background', 1),
  ('backyard', 11),
  ('bad', 1),
  ('bake', 1),
  ('baked_cookie', 1),
  ('ball', 1),
  ('base', 1),
  ('basement', 1),
  ('beat', 2),
  ('beautiful'

In [43]:
# Build LDA model
lda_model = LdaMulticore(corpus=corpus,        # Replace with gensim.models.ldamodel.LdaModel()
                       id2word=id2word,
                       num_topics=8, #number of topics to identify
                       random_state=100,
                       #update_every=1,                          #Add back in with LdaModel
                       chunksize=100, #number of documents to pass per chunk
                       passes=10, #number of training passes
                       #alpha='auto',                            #Add back in with LdaModel
                       per_word_topics=True)

In [44]:
# Print the top 10 Keywords in each grouped Topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.039*"clean" + 0.023*"space" + 0.020*"recommend" + 0.019*"comfortable" + '
  '0.018*"definitely" + 0.016*"perfect" + 0.014*"love" + 0.014*"need" + '
  '0.012*"super" + 0.012*"close"'),
 (1,
  '0.027*"walk" + 0.024*"restaurant" + 0.017*"neighborhood" + 0.015*"perfect" '
  '+ 0.014*"clean" + 0.014*"distance" + 0.013*"downtown" + 0.013*"comfortable" '
  '+ 0.013*"shop" + 0.012*"recommend"'),
 (2,
  '0.045*"room" + 0.026*"clean" + 0.020*"nice" + 0.013*"downtown" + '
  '0.013*"comfortable" + 0.013*"friendly" + 0.012*"good" + 0.012*"recommend" + '
  '0.012*"bathroom" + 0.010*"stay"'),
 (3,
  '0.015*"make" + 0.014*"need" + 0.013*"recommend" + 0.013*"downtown" + '
  '0.012*"clean" + 0.012*"host" + 0.011*"comfortable" + 0.010*"definitely" + '
  '0.010*"perfect" + 0.009*"time"'),
 (4,
  '0.026*"walk" + 0.023*"clean" + 0.023*"apartment" + 0.012*"recommend" + '
  '0.012*"definitely" + 0.011*"distance" + 0.011*"condo" + 0.010*"perfect" + '
  '0.010*"nice" + 0.010*"downtown"'),
 (5,
  '0.01

In [45]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score - Likely more helpful. Takes a while to run.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.569990494841519

Coherence Score:  0.28406596548923907


### Visual for Viewing each Topic

In [46]:
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics=False)
pyLDAvis.save_html(vis, 'lda_non_gentrifying.html')

#### Skipping step 17 about finding the best number of topics - Tim recommends 8-12

### Find dominant Topic in each Review

In [82]:
bow = corpora.Dictionary(data_lemmatized)

In [71]:
contents = []

for pdf, doc in tqdm(zip(pdfs, docs)):
    bow = corpora.Dictionary(data_lemmatized)
    topics = lda_model.get_document_topics(bow)
    for res in topics:
        topic, pct = res
        contents.append({'pdf': pdf, 'topic': topic, 'percent': pct})

topics = pd.DataFrame(contents)

NameError: name 'pdfs' is not defined

In [70]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

TypeError: '<' not supported between instances of 'int' and 'tuple'

### Find most representative Reviews for each Topic

In [None]:
for i in range(6):
    for j in range(5):

        file = topics[topics.topic == i].sort_values('percent', ascending = False).head().iloc[j, 0]

        cmd = f'cp {file} representative_docs/topic_{i + 1}/.'

        os.system(cmd)

## Gensim LDA - Location Review of 7 or Less
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [76]:
# Convert to list
data = airbnb[airbnb['review_scores_location']<=7].comments_concatenated.values.tolist()

In [63]:
# Clean review break symbols
data = [re.sub("\\r\\n", "", comment) for comment in data]

In [64]:
# Tokenize each sentence to words, removing uneeded words/characters
def sent_to_words(sentences):
    for sentence in sentences:
        tagged_words = nltk.tag.pos_tag(sentence.split()) 
        no_names = [word for word,tag in tagged_words if tag != 'NNP' and tag != 'NNPS'] # Remove proper nouns
        yield(gensim.utils.simple_preprocess(str(no_names), deacc=True)) #Clean and remove punctuation

data_words = list(sent_to_words(data))

#print(data_words[0:1])

In [65]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words,
                               min_count=5,
                               threshold=100)#,  # higher threshold fewer phrases.
                               #connector_words=phrases.ENGLISH_CONNECTOR_WORDS) ***I think I need to download this.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)#, connector_words=phrases.ENGLISH_CONNECTOR_WORDS)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Test trigram on first review
#print(trigram_mod[bigram_mod[data_words[0]]])

In [66]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [67]:
# Build list of stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'stay', 'place', 'location', 'home', 'house', 'host', 'great'])

In [68]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#print(data_lemmatized[:1])

In [69]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
#print(corpus[:1])

In [70]:
# How to view a single word within the corpus
id2word[500]

'bang'

In [71]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('bachelorette', 1),
  ('charming', 1),
  ('chic', 1),
  ('comfortable', 1),
  ('country', 1),
  ('eachother', 1),
  ('fit', 1),
  ('girl', 1),
  ('impeccable', 1),
  ('key', 1),
  ('low', 1),
  ('people', 1),
  ('property', 1),
  ('shape', 1),
  ('spot', 1),
  ('top', 1),
  ('weekend', 1)]]

In [72]:
# Build LDA model
lda_model = LdaMulticore(corpus=corpus,        # Replace with gensim.models.ldamodel.LdaModel()
                       id2word=id2word,
                       num_topics=8, #number of topics to identify
                       random_state=100,
                       #update_every=1,                          #Add back in with LdaModel
                       chunksize=100, #number of documents to pass per chunk
                       passes=10, #number of training passes
                       #alpha='auto',                            #Add back in with LdaModel
                       per_word_topics=True)

In [73]:
# Print the top 10 Keywords in each grouped Topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.014*"feel" + 0.010*"neighborhood" + 0.010*"hour" + 0.010*"bad" + '
  '0.010*"back" + 0.008*"issue" + 0.008*"building" + 0.007*"definitely" + '
  '0.006*"area" + 0.006*"come"'),
 (1,
  '0.015*"clean" + 0.012*"get" + 0.012*"area" + 0.011*"downtown" + '
  '0.010*"apartment" + 0.010*"nice" + 0.010*"neighborhood" + 0.009*"tell" + '
  '0.009*"book" + 0.009*"safe"'),
 (2,
  '0.020*"clean" + 0.013*"downtown" + 0.011*"group" + 0.011*"night" + '
  '0.010*"nice" + 0.008*"get" + 0.008*"well" + 0.008*"easy" + 0.007*"make" + '
  '0.007*"comfortable"'),
 (3,
  '0.023*"cottage" + 0.014*"enjoy" + 0.012*"clean" + 0.011*"perfect" + '
  '0.011*"love" + 0.009*"get" + 0.008*"look" + 0.008*"little" + 0.008*"need" + '
  '0.008*"cute"'),
 (4,
  '0.018*"nice" + 0.016*"clean" + 0.016*"area" + 0.014*"downtown" + '
  '0.013*"neighborhood" + 0.012*"feel" + 0.011*"group" + 0.010*"amazing" + '
  '0.009*"really" + 0.009*"get"'),
 (5,
  '0.013*"clean" + 0.013*"get" + 0.011*"refund" + 0.007*"lack" + 0.007*"bre

In [74]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score - Likely more helpful. Takes a while to run.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.028445041980549

Coherence Score:  0.35090525787610866


### Visual for Viewing each Topic

In [75]:
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics=False)
pyLDAvis.save_html(vis, 'lda_bad_location_score.html')

#### Skipping step 17 about finding the best number of topics - Tim recommends 8-12

### Find dominant Topic in each Review

In [82]:
bow = corpora.Dictionary(data_lemmatized)

In [71]:
contents = []

for pdf, doc in tqdm(zip(pdfs, docs)):
    bow = corpora.Dictionary(data_lemmatized)
    topics = lda_model.get_document_topics(bow)
    for res in topics:
        topic, pct = res
        contents.append({'pdf': pdf, 'topic': topic, 'percent': pct})

topics = pd.DataFrame(contents)

NameError: name 'pdfs' is not defined

In [70]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

TypeError: '<' not supported between instances of 'int' and 'tuple'

### Find most representative Reviews for each Topic

In [None]:
for i in range(6):
    for j in range(5):

        file = topics[topics.topic == i].sort_values('percent', ascending = False).head().iloc[j, 0]

        cmd = f'cp {file} representative_docs/topic_{i + 1}/.'

        os.system(cmd)