In [31]:
# Run in terminal or command prompt:
#python3 -m spacy download en

# Packages
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from pprint import pprint

# Import stopwords and other word packages
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models # as gensimvis  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.corpus import stopwords

from scipy.sparse import csr_matrix, hstack, coo_matrix


import string
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mattparker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mattparker/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mattparker/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
airbnb = pd.read_csv('../data/airbnb_gentrification.csv')

In [11]:
airbnb.head(1)

Unnamed: 0,listing_id,comments_concatenated,name,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,listing_url,description,neighborhood_overview,host_since,host_listings_count,property_type,accommodates,bathrooms_text,bedrooms,beds,amenities,minimum_nights_avg_ntm,maximum_nights_avg_ntm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,zip_code,GEOID,house_price_2021-01-31,house_pct_change,rentals_2021-01-31,rental_price_pct_change,new_restaurants,available_beer,str_permits_2020,str_permits_growth,crimes,total_pop_2010,total_pop_2019,total_pop_change,total_pop_pct_change,pop_over25_2010,pop_over25_2019,pop_over25_change,pop_over25_pcg_change,total_households_2010,total_households_2019,total_households_change,total_households_pct_change,white_pct_2010,white_pct_2019,white_value_change,white_pct_change,bach_pct_2010,bach_pct_2019,bach_value_change,bach_pct_change,rent_pct_2010,rent_pct_2019,rent_value_change,renter_pct_change,median_hhi_2010,median_hhi_2019,median_hhi_value_change,median_hhi_pct_change,poverty_pct_2010,poverty_pct_2019,poverty_value_change,poverty_pct_change,gentrifying
0,6422,I can't say enough about how wonderful it was ...,Nashville Charm,12172,36.17315,-86.73581,40,30,674,4.69,1,267,https://www.airbnb.com/rooms/6422,30 day or more rental during COVID. Show COVID...,Historic East Nashville is home to many new an...,2009-04-03,0.0,Private room in house,2,1 private bath,2.0,3.0,"[""Hair dryer"", ""Bathtub"", ""Lock on bedroom doo...",30.0,365.0,99.0,10.0,10.0,10.0,10.0,10.0,10.0,37206.0,47037010000.0,412476.0,38.31,,,1.0,2.0,114.0,114.0,1165.0,2544.0,2100.0,-444.0,-0.174528,1703.0,1639.0,-64.0,-0.037581,1140.0,926.0,-214.0,-0.187719,0.657626,0.940952,0.283327,0.430833,0.408691,0.585723,0.177032,0.43317,0.320175,0.240821,-0.079355,-0.247848,46000.0,91643.0,45643.0,0.992239,10.6,10.2,-0.4,-0.037736,False


In [12]:
airbnb.shape

(5205, 76)

In [13]:
airbnb = airbnb[airbnb['comments_concatenated'].notna()]

In [14]:
airbnb_nlp = airbnb[['listing_id', 'comments_concatenated', 'gentrifying']]

In [15]:
airbnb_nlp.comments_concatenated.isna().sum()

0

In [16]:
# create two lists with training and testing apns
train_listings, test_listings = tts(airbnb_nlp['listing_id'].to_list(), 
                                    random_state = 42, 
                                    stratify=airbnb_nlp['gentrifying'])

In [17]:
len(train_listings)

3903

In [18]:
# Create train and test dataframes from the lists of apns
airbnb_train = airbnb_nlp[airbnb_nlp['listing_id'].isin(train_listings)].sort_values('listing_id')
airbnb_test = airbnb_nlp[airbnb_nlp['listing_id'].isin(test_listings)].sort_values('listing_id')

In [19]:
# Create the y_train and y_test dataframes from the lists of apns
y_train = airbnb_nlp['gentrifying']
y_test = airbnb_nlp['gentrifying']

In [20]:
print(airbnb_train.gentrifying.value_counts(normalize=True))
print(airbnb_test.gentrifying.value_counts(normalize=True))

False    0.755829
True     0.244171
Name: gentrifying, dtype: float64
False    0.756341
True     0.243659
Name: gentrifying, dtype: float64


## Gensim Blog - Machine Learning Plus
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [21]:
# Convert to list
data = airbnb_train.comments_concatenated.iloc[0:200].values.tolist()

In [22]:
# Clean review break symbols
data = [re.sub("\\r\\n", "", comment) for comment in data]

#### Set to Markdown for Safety

#Tokenize each sentence to words, remove punctuations and unnecessary characters
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words_with_names = list(sent_to_words(data))

#print(data_words[:1])

In [23]:
# Tokenize each sentence to words, removing uneeded words/characters
def sent_to_words(sentences):
    for sentence in sentences:
        tagged_words = nltk.tag.pos_tag(sentence.split()) 
        no_names = [word for word,tag in tagged_words if tag != 'NNP' and tag != 'NNPS'] # Remove proper nouns
        yield(gensim.utils.simple_preprocess(str(no_names), deacc=True)) #Clean and remove punctuation

data_words = list(sent_to_words(data))

#print(data_words[0:1])

In [24]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words,
                               min_count=5,
                               threshold=100)#,  # higher threshold fewer phrases.
                               #connector_words=phrases.ENGLISH_CONNECTOR_WORDS) ***I think I need to download this.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)#, connector_words=phrases.ENGLISH_CONNECTOR_WORDS)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Test trigram on first review
#print(trigram_mod[bigram_mod[data_words[0]]])

In [25]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [72]:
# Build list of stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'stay', 'place', 'location', 'home', 'house', 'host', 'great'])

In [73]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#print(data_lemmatized[:1])

In [74]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
#print(corpus[:1])

In [75]:
# How to view a single word within the corpus
id2word[500]

'deu'

In [76]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('abend', 1),
  ('aber', 1),
  ('able', 23),
  ('abound', 1),
  ('aboyait', 1),
  ('absente', 1),
  ('absolument', 1),
  ('absolute', 7),
  ('absolute_pleasure', 3),
  ('absolutely', 45),
  ('access', 15),
  ('accessible', 7),
  ('acceuillante', 1),
  ('acclimate', 1),
  ('accom', 1),
  ('accommodate', 37),
  ('accommodating', 1),
  ('accommodation', 23),
  ('accomodate', 1),
  ('accomodation', 4),
  ('accompagne', 1),
  ('accompany', 1),
  ('accueil', 2),
  ('accueilli', 1),
  ('accueillir', 1),
  ('accurate', 1),
  ('acknowledge', 1),
  ('acogedore', 1),
  ('acommodation', 1),
  ('act', 2),
  ('activism', 1),
  ('activity', 5),
  ('actually', 7),
  ('add', 5),
  ('addition', 2),
  ('adequate', 2),
  ('adjacent', 5),
  ('adjoining', 2),
  ('adjust', 2),
  ('adopt', 1),
  ('adorable', 27),
  ('adoravel', 1),
  ('adore', 1),
  ('adresse', 1),
  ('advantage', 1),
  ('adventure', 4),
  ('adventurous', 1),
  ('advertize', 1),
  ('advice', 17),
  ('advise', 1),
  ('affect', 1),
  ('affect

In [77]:
# Build LDA model
lda_model = LdaMulticore(corpus=corpus,        # Replace with gensim.models.ldamodel.LdaModel()
                       id2word=id2word,
                       num_topics=8, #number of topics to identify
                       random_state=100,
                       #update_every=1,                          #Add back in with LdaModel
                       chunksize=100, #number of documents to pass per chunk
                       passes=10, #number of training passes
                       #alpha='auto',                            #Add back in with LdaModel
                       per_word_topics=True)

In [78]:
# Print the top 10 Keywords in each grouped Topic
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.018*"room" + 0.017*"clean" + 0.016*"hostel" + 0.011*"staff" + '
  '0.010*"good" + 0.010*"downtown" + 0.009*"friendly" + 0.009*"stay" + '
  '0.009*"recommend" + 0.009*"make"'),
 (1,
  '0.010*"beautiful" + 0.008*"comfortable" + 0.008*"art" + 0.008*"make" + '
  '0.007*"wonderful" + 0.006*"well" + 0.006*"feel" + 0.006*"neighborhood" + '
  '0.006*"perfect" + 0.006*"recommend"'),
 (2,
  '0.026*"apartment" + 0.014*"bar" + 0.012*"clean" + 0.011*"walk" + '
  '0.011*"restaurant" + 0.008*"night" + 0.008*"perfect" + 0.008*"get" + '
  '0.008*"definitely" + 0.008*"recommend"'),
 (3,
  '0.012*"perfect" + 0.012*"downtown" + 0.012*"recommend" + 0.011*"clean" + '
  '0.011*"group" + 0.010*"time" + 0.010*"space" + 0.010*"well" + 0.010*"need" '
  '+ 0.009*"comfortable"'),
 (4,
  '0.013*"room" + 0.012*"comfortable" + 0.012*"make" + 0.011*"recommend" + '
  '0.011*"stay" + 0.010*"feel" + 0.010*"time" + 0.010*"host" + 0.009*"clean" + '
  '0.009*"cottage"'),
 (5,
  '0.021*"clean" + 0.018*"downtown" + 

In [79]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score - Likely more helpful. Takes a while to run.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.65874981679059

Coherence Score:  0.2618261734711568


## Visual for Viewing each Topic

In [80]:
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, sort_topics=False)
pyLDAvis.save_html(vis, 'lda.html')

### Skipping step 17 about finding the best number of topics - Tim recommends 8-12

## Find dominant Topic in each Review

In [82]:
bow = corpora.Dictionary(data_lemmatized)

In [71]:
contents = []

for pdf, doc in tqdm(zip(pdfs, docs)):
    bow = corpora.Dictionary(data_lemmatized)
    topics = lda_model.get_document_topics(bow)
    for res in topics:
        topic, pct = res
        contents.append({'pdf': pdf, 'topic': topic, 'percent': pct})

topics = pd.DataFrame(contents)

NameError: name 'pdfs' is not defined

In [70]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

TypeError: '<' not supported between instances of 'int' and 'tuple'

## Find most representative Reviews for each Topic

In [None]:
for i in range(6):
    for j in range(5):

        file = topics[topics.topic == i].sort_values('percent', ascending = False).head().iloc[j, 0]

        cmd = f'cp {file} representative_docs/topic_{i + 1}/.'

        os.system(cmd)

## Sample from Housing Project (this isn't working)

In [None]:
# This is just a previous code, for safekeeping.
# Create a tokenizing function that takes text and removes all numbers
#def tokenizer(text):
#    return [x for x in re.findall(r'[a-z]+', text.lower()) if len(x) > 1]

In [None]:
airbnb_train.head(1)

In [None]:
# using a list comprehension, iterate over the four permit type contents and vectorize them using the TfidfVectorizer
vectorizer_test1 = TfidfVectorizer(
    tokenizer = tokenizer, 
    stop_words = 'english', 
    #min_df=50, 
    #max_df=0.4, 
    #ngram_range=(1,3)
).fit(airbnb_train)

In [None]:
# Check to see the shape of the first sparse matrix generated
vectorizer_test1.transform(airbnb_train)

## Sample from Gensim Blog
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

In [None]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [None]:
test_comment = airbnb.comments_concatenated.iloc[2]

In [None]:
print('original document: ')
words = []
for word in test_comment.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(test_comment))

## Try again from Michael's Notebook

In [None]:
docs = airbnb.copy()['comments_concatenated'].iloc[0:3]

In [None]:
docs[0]

In [None]:
# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in tqdm(range(len(docs))):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 2] for doc in docs]

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

## Rest of Michael's walkthrough below

In [None]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [None]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [None]:
# Set training parameters.
num_topics = 6
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)